def test_simple_scenario(self):
    """Runs the pipeline end-to-end and considers the run successful if
    iteration completes and yields non-empty chunks. Tries different
    numbers of worker processes.
    """
    data_path = 'mldp/tests/data/small_chunks'
    field_names = ['first_name', 'email']
    # 0 presumably means running in the main process, while positive
    # values spawn that many separate worker processes
    worker_processes_nums = [0, 1, 2, 3, 4]
    reader = CsvReader(sep=",")

    for wpn in worker_processes_nums:
        dev_data_pipeline = Pipeline(reader=reader,
                                     worker_processes_num=wpn)
        dev_data_pipeline.add_step(FieldSelector(field_names))
        dev_data_pipeline.add_step(ChunkAccumulator(new_size=3))
        dev_data_pipeline.add_step(PandasFormatter())

        flag = False
        for data_chunk in dev_data_pipeline.iter(data_path=data_path):
            flag = True
            self.assertTrue(len(data_chunk) > 0)
        self.assertTrue(flag)
def test_readme_example(self):
    from mltoolkit.mldp.pipeline import Pipeline
    from mltoolkit.mldp.steps.readers import CsvReader
    from mltoolkit.mldp.steps.transformers.nlp import TokenProcessor, Padder
    from mltoolkit.mldp.steps.transformers.field import FieldSelector

    data_path = "mltoolkit/mldp/tests/data/tweets.csv"

    # creating steps
    csv_reader = CsvReader(sep='\t', chunk_size=30)
    fields_selector = FieldSelector(fnames=["tweets", "labels"])
    token_processor = TokenProcessor(fnames="tweets",
                                     tok_func=lambda x: x.split(),
                                     lowercase=True)
    padder = Padder(fname="tweets", new_mask_fname="tweets_mask",
                    pad_symbol="<PAD>")

    # creating the pipeline
    pipeline = Pipeline(reader=csv_reader, worker_processes_num=1)
    pipeline.add_step(fields_selector)
    pipeline.add_step(token_processor)
    pipeline.add_step(padder)

    # iterate over data chunks
    for data_chunk in pipeline.iter(data_path=data_path):
        pass

    # generate documentation and print it
    print(pipeline)
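# An added companion check for the README example above (not part of the
# original tests): instead of discarding chunks, it verifies that the padder
# produced the mask field. The data_chunk["field"] access assumes DataChunk
# supports mapping-style field lookup, which is an assumption about the API.
def test_readme_example_padder_output(self):
    from mltoolkit.mldp.pipeline import Pipeline
    from mltoolkit.mldp.steps.readers import CsvReader
    from mltoolkit.mldp.steps.transformers.nlp import TokenProcessor, Padder
    from mltoolkit.mldp.steps.transformers.field import FieldSelector

    data_path = "mltoolkit/mldp/tests/data/tweets.csv"

    pipeline = Pipeline(reader=CsvReader(sep='\t', chunk_size=30),
                        worker_processes_num=1)
    pipeline.add_step(FieldSelector(fnames=["tweets", "labels"]))
    pipeline.add_step(TokenProcessor(fnames="tweets",
                                     tok_func=lambda x: x.split(),
                                     lowercase=True))
    pipeline.add_step(Padder(fname="tweets", new_mask_fname="tweets_mask",
                             pad_symbol="<PAD>"))

    for data_chunk in pipeline.iter(data_path=data_path):
        # each padded tweet should be aligned with its mask
        self.assertEqual(len(data_chunk["tweets"]),
                         len(data_chunk["tweets_mask"]))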
def test_invalid_pipeline(self):
    """Tries to create an invalid data processing pipeline and expects
    an error to be raised.
    """
    reader = CsvReader()
    with self.assertRaises(ValueError):
        data_pipeline = Pipeline(reader)
        data_pipeline.add_step(FieldSelector(["dummy"]))
        data_pipeline.add_step(PandasFormatter())
        # a formatter must come last, so adding another step after
        # PandasFormatter should raise a ValueError
        data_pipeline.add_step(FunctionApplier({"dummy": lambda x: x}))
def test_empty_chunks(self):
    """Tests that empty chunks do not reach the user."""
    data_path = 'mltoolkit/mldp/tests/data/small_chunks'
    field_names = ['first_name', 'email']
    reader = CsvReader(chunk_size=1, sep=",")
    empty_chunk_transformer = EmptyChunkTransformer(max_count=3)

    dev_data_pipeline = Pipeline(reader=reader)
    dev_data_pipeline.add_step(empty_chunk_transformer)
    dev_data_pipeline.add_step(FieldSelector(field_names))

    flag = False
    for dc in dev_data_pipeline.iter(data_path=data_path):
        flag = True
        self.assertFalse(equal_to_constant(dc, EMPTY_CHUNK))
    self.assertTrue(flag)
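# For context, a minimal sketch of what a helper like the EmptyChunkTransformer
# used above could look like: it swaps the first `max_count` chunks for
# EMPTY_CHUNK so the pipeline's suppression of empty chunks can be observed.
# The BaseTransformer base class, its import path, and the _transform hook
# are assumptions about the mldp step interface, not the real test helper.
from mltoolkit.mldp.steps.transformers import BaseTransformer  # path assumed


class EmptyChunkTransformerSketch(BaseTransformer):
    """Replaces the first `max_count` passing chunks with EMPTY_CHUNK."""

    def __init__(self, max_count=0, **kwargs):
        super().__init__(**kwargs)
        self.max_count = max_count
        self.count = 0

    def _transform(self, data_chunk):
        if self.count < self.max_count:
            self.count += 1
            return EMPTY_CHUNK
        return data_chunk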
def test_invalid_steps(self):
    """Testing whether an error is raised if an invalid step is present."""
    data_path = 'mldp/tests/data/news.csv'
    data_source = {'data_path': data_path}

    inv_reader = InvalidCsvReader()
    val_reader = CsvReader()

    val_transf1 = FieldSelector("text")
    val_transf2 = TokenProcessor(fnames='text')
    inv_transf1 = InvalidTransformer()
    accum = ChunkAccumulator(new_size=3)
    formatter = PandasFormatter()

    # try only the invalid reader and valid steps
    dp = Pipeline(reader=inv_reader, error_on_invalid_chunk='error')
    for vs in [val_transf1, val_transf2, accum, formatter]:
        dp.add_step(vs)
    with self.assertRaises(DataChunkError):
        for _ in dp.iter(**data_source):
            pass

    # try the valid reader and invalid steps in every possible order
    steps = [val_transf1, val_transf2, inv_transf1, accum]
    for st in permutations(steps):
        dp = Pipeline(reader=val_reader, error_on_invalid_chunk='error')
        for s in st:
            dp.add_step(s)
        dp.add_step(formatter)
        with self.assertRaises(DataChunkError):
            for _ in dp.iter(**data_source):
                pass
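# In the same hedged spirit, a sketch of what the InvalidTransformer helper
# might do: emit a chunk whose field values have mismatching lengths, so that
# chunk validation fails and DataChunkError is raised during iteration. The
# DataChunk constructor and its import path are assumptions about the mldp
# API, as is the BaseTransformer base class reused from the sketch above.
import numpy as np
from mltoolkit.mldp.utils.tools import DataChunk  # path assumed


class InvalidTransformerSketch(BaseTransformer):
    """Produces a malformed chunk that should fail validation."""

    def _transform(self, data_chunk):
        # three 'text' values against one 'labels' value -> not a valid
        # rectangular chunk
        return DataChunk(text=np.array(["a", "b", "c"]),
                         labels=np.array([1]))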
def test_how_to_apply_run(self):
    data_path = os.path.join(self.tutorials_path, "data/tweets.csv")

    # paths where vocabs will be saved and later loaded from
    words_vocab_file_path = os.path.join(self.tutorials_path,
                                         "data/vocabs/words.txt")
    labels_vocab_file_path = os.path.join(self.tutorials_path,
                                          'data/vocabs/labels.txt')

    # creating step objects
    twitter_tokenizer = TweetTokenizer()
    preprocessor = TwitterFilesPreprocessor(
        input_cols_number=3, tweets_indx=2,
        add_header=['ids', 'labels', 'tweets'])
    csv_reader = CsvReader(sep='\t', chunk_size=30)
    fields_selector = FieldSelector(fnames=["tweets", "labels"])
    token_processor = TokenProcessor(
        fnames="tweets", tok_func=twitter_tokenizer.tokenize,
        tok_cleaning_func=twitter_text_cleaner, lowercase=True)

    # data pipeline for vocabularies creation
    vocab_data_pipeline = Pipeline(reader=csv_reader,
                                   preprocessor=preprocessor,
                                   worker_processes_num=0,
                                   name_prefix="vocabs")
    vocab_data_pipeline.add_step(fields_selector)
    vocab_data_pipeline.add_step(token_processor)

    # creating or loading vocabs
    words_vocab = Vocabulary(vocab_data_pipeline, name_prefix="words")
    words_vocab.load_or_create(words_vocab_file_path,
                               data_source={"data_path": data_path},
                               data_fnames="tweets")

    labels_vocab = Vocabulary(vocab_data_pipeline, name_prefix="labels")
    labels_vocab.load_or_create(labels_vocab_file_path,
                                data_source={"data_path": data_path},
                                data_fnames="labels")

    print(words_vocab)
    print(labels_vocab)
    print(vocab_data_pipeline)

    # extra steps for training and evaluation
    mapper = VocabMapper(field_names_to_vocabs={"tweets": words_vocab,
                                                "labels": labels_vocab})
    padder = Padder(fname="tweets", new_mask_fname="tweets_mask",
                    pad_symbol=words_vocab[PAD].id)
    formatter = FeaturesLabelsFormatter(features_field_name="tweets",
                                        labels_field_name="labels",
                                        classes_number=len(labels_vocab))

    # building the actual pipeline
    dev_data_pipeline = Pipeline(reader=csv_reader,
                                 preprocessor=preprocessor,
                                 worker_processes_num=1,
                                 name_prefix="dev")
    dev_data_pipeline.add_step(fields_selector)
    dev_data_pipeline.add_step(token_processor)
    dev_data_pipeline.add_step(mapper)
    dev_data_pipeline.add_step(padder)
    dev_data_pipeline.add_step(formatter)

    print(dev_data_pipeline)

    epochs = 2

    i_model = ISentiLSTM(dev_data_pipeline)
    i_model.init_model(words_vocab_size=len(words_vocab), input_dim=50,
                       lstm_hidden_dim=120,
                       number_of_classes=len(labels_vocab),
                       mask_symbol=words_vocab[PAD].id)
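    # The original tutorial stops after initialisation. A hedged sketch of
    # driving the finished pipeline for the declared number of epochs:
    # Pipeline.iter() is used exactly as elsewhere in this file, but the
    # batch structure produced by FeaturesLabelsFormatter is not shown here,
    # so the loop only checks that every epoch yields at least one batch.
    for epoch in range(epochs):
        batches_count = 0
        for batch in dev_data_pipeline.iter(data_path=data_path):
            batches_count += 1
        self.assertTrue(batches_count > 0)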