import numpy as np

def test_multi_threaded_reader_output(self):
    """
    Check that the multi-threaded reader produces the same, correct
    output as a plain single-threaded read of the same files.
    """
    data_paths = [
        'mldp/tests/data/small_chunks/chunk1.csv',
        'mldp/tests/data/small_chunks/chunk2.csv',
        'mldp/tests/data/small_chunks/chunk3.csv'
    ]
    chunk_size = 2
    reader = CsvReader(chunk_size=chunk_size, worker_threads_num=3, sep=',')

    expected_data = read_data_from_csv_file(data_paths)

    # Accumulate all produced chunks into a single DataChunk,
    # concatenating field by field, so the result can be compared whole.
    actual_data_chunks = DataChunk()
    for data_chunk in reader.iter(data_path=data_paths):
        for key in data_chunk.keys():
            if key not in actual_data_chunks:
                actual_data_chunks[key] = np.array([])
            actual_data_chunks[key] = np.concatenate(
                [actual_data_chunks[key], data_chunk[key]])

    # Worker threads may deliver chunks out of order, so compare after
    # sorting both sides by the 'id' field.
    self.compare_unsorted_data_chunks(dc1=expected_data,
                                      dc2=actual_data_chunks,
                                      sort_key='id')
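# For context, a minimal sketch of what `compare_unsorted_data_chunks`
# could look like. This is an illustrative assumption -- the real helper
# is defined on the test base class and may differ in its details.
def compare_unsorted_data_chunks(self, dc1, dc2, sort_key):
    # Sort both chunks by `sort_key` so that chunks assembled from
    # differently ordered worker threads compare equal field by field.
    order1 = np.argsort(dc1[sort_key])
    order2 = np.argsort(dc2[sort_key])
    self.assertEqual(set(dc1.keys()), set(dc2.keys()))
    for key in dc1.keys():
        np.testing.assert_array_equal(np.asarray(dc1[key])[order1],
                                      np.asarray(dc2[key])[order2])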
def test_valid_paths(self):
    """
    Pass intentionally invalid input to the reader and expect it to
    raise a ValueError.
    """
    data_paths = ["a", "b", 123123123, 123.12313]
    reader = CsvReader()
    with self.assertRaises(ValueError):
        itr = reader.iter(data_path=data_paths)
        # `iter` may be lazy, so force it to produce the first chunk.
        chunk = next(iter(itr))
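# The test above relies on CsvReader rejecting malformed paths with a
# ValueError. A minimal sketch of such validation, purely illustrative --
# the name `_validate_data_paths` and the exact checks are assumptions,
# not the library's actual internals:
import os

def _validate_data_paths(data_paths):
    # Every entry must be a string pointing to an existing file;
    # anything else (e.g. an int or a float) is rejected.
    for path in data_paths:
        if not isinstance(path, str) or not os.path.isfile(path):
            raise ValueError("Invalid data path: %r" % path)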
from csv import QUOTE_NONE
from sacremoses import MosesTruecaser

def train_and_save_true_casing_model(input_fps, text_fname, output_fp):
    """Trains the Moses truecaser on tokenized CSV files and saves its
    parameters to `output_fp`.
    """
    mtr = MosesTruecaser(is_asr=True)
    reader = CsvReader(quoting=QUOTE_NONE, sep='\t', engine='python',
                       encoding='utf-8')
    texts = []
    logger.info("Loading data from: '%s'." % input_fps)
    for dc in reader.iter(data_path=input_fps):
        for du in dc.iter():
            texts.append(du[text_fname].split())
    logger.info("Loaded the data.")
    safe_mkfdir(output_fp)
    logger.info("Training the truecaser.")
    mtr.train(texts, save_to=output_fp, progress_bar=True, processes=1)
    logger.info("Done, saved the model to: '%s'." % output_fp)
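# A usage sketch for the function above. The file paths and the text
# column name are placeholders, not files that ship with the repo:
if __name__ == '__main__':
    train_and_save_true_casing_model(
        input_fps=['data/train.tsv', 'data/val.tsv'],  # placeholder TSVs
        text_fname='text',                             # placeholder column
        output_fp='models/truecaser.model')            # placeholder output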
def test_output(self):
    """Check that the data-chunks produced by the reader match the
    expected ones.
    """
    data_path = 'mltoolkit/mldp/tests/data/small_chunks/chunk2.csv'
    chunk_size = 2
    reader = CsvReader(chunk_size=chunk_size, worker_threads_num=1,
                       sep=',', encoding='utf-8', use_lists=False)

    data = read_data_from_csv_file(data_path, encoding='utf-8')
    expected_chunks = create_list_of_data_chunks(data,
                                                 chunk_size=chunk_size)

    itr = reader.iter(data_path=data_path)
    i = 0
    for actual_chunk, expected_chunk in zip(itr, expected_chunks):
        self.assertEqual(actual_chunk, expected_chunk)
        i += 1
    # Ensure the reader produced every expected chunk and that the
    # expected list was not empty to begin with.
    self.assertTrue(i == len(expected_chunks) and len(expected_chunks) > 0)
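# For reference, a minimal sketch of what `create_list_of_data_chunks`
# might do. This is an illustrative assumption -- the real helper ships
# with the test utilities and may differ:
def create_list_of_data_chunks(data, chunk_size):
    # Slice every field of `data` into consecutive windows of
    # `chunk_size` units and wrap each window in a DataChunk.
    keys = list(data.keys())
    total = len(data[keys[0]])
    chunks = []
    for start in range(0, total, chunk_size):
        chunk = DataChunk()
        for key in keys:
            chunk[key] = data[key][start:start + chunk_size]
        chunks.append(chunk)
    return chunks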