Example #1
    def test_simple_scenario(self):
        """
        Tries to run the pipeline, and if it works - it's considered to be
        successful. Tries different numbers of workers.
        """
        data_path = 'mldp/tests/data/small_chunks'
        field_names = ['first_name', 'email']
        worker_processes_nums = [0, 1, 2, 3, 4]

        reader = CsvReader(sep=",")

        for wpn in worker_processes_nums:

            dev_data_pipeline = Pipeline(reader=reader,
                                         worker_processes_num=wpn)
            dev_data_pipeline.add_step(FieldSelector(field_names))
            dev_data_pipeline.add_step(ChunkAccumulator(new_size=3))
            dev_data_pipeline.add_step(PandasFormatter())

            # 'flag' verifies that the pipeline yields at least one chunk
            # for every worker count, so the check belongs inside the loop
            flag = False
            for data_chunk in dev_data_pipeline.iter(data_path=data_path):
                flag = True
                self.assertTrue(len(data_chunk) > 0)

            self.assertTrue(flag)
Example #2
    def test_readme_example(self):
        from mltoolkit.mldp.pipeline import Pipeline
        from mltoolkit.mldp.steps.readers import CsvReader
        from mltoolkit.mldp.steps.transformers.nlp import TokenProcessor, Padder
        from mltoolkit.mldp.steps.transformers.field import FieldSelector

        data_path = "mltoolkit/mldp/tests/data/tweets.csv"

        # creating steps
        csv_reader = CsvReader(sep='\t', chunk_size=30)
        fields_selector = FieldSelector(fnames=["tweets", "labels"])
        token_processor = TokenProcessor(fnames="tweets",
                                         tok_func=lambda x: x.split(),
                                         lowercase=True)
        padder = Padder(fname="tweets", new_mask_fname="tweets_mask",
                        pad_symbol="<PAD>")

        # creating the pipeline
        pipeline = Pipeline(reader=csv_reader, worker_processes_num=1)
        pipeline.add_step(fields_selector)
        pipeline.add_step(token_processor)
        pipeline.add_step(padder)

        # iterate over data chunks
        for data_chunk in pipeline.iter(data_path=data_path):
            pass

        # generate documentation and print it
        print(pipeline)
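
The loop body above is a no-op. A hedged sketch of inspecting each padded chunk instead, assuming DataChunk supports dict-style field access (an assumption, not shown in this snippet):

        # assumes dict-style access to chunk fields (hypothetical)
        for data_chunk in pipeline.iter(data_path=data_path):
            tokens = data_chunk["tweets"]       # padded token sequences
            mask = data_chunk["tweets_mask"]    # 1 for tokens, 0 for padding
            assert len(tokens) == len(mask)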
Example #3
    def test_invalid_pipeline(self):
        """
        Tries to create an invalid data processing pipeline, and expects an
        error to be raised.
        """
        reader = CsvReader()
        with self.assertRaises(ValueError):
            data_pipeline = Pipeline(reader)
            data_pipeline.add_step(FieldSelector(["dummy"]))
            data_pipeline.add_step(PandasFormatter())
            data_pipeline.add_step(FunctionApplier({"dummy": lambda x: x}))
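
The ValueError above presumably fires because steps are appended after PandasFormatter, which is expected to be the terminal step. A minimal sketch of a valid ordering under that assumption:

        # assumed-valid ordering: the formatter comes last
        data_pipeline = Pipeline(CsvReader())
        data_pipeline.add_step(FieldSelector(["dummy"]))
        data_pipeline.add_step(FunctionApplier({"dummy": lambda x: x}))
        data_pipeline.add_step(PandasFormatter())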
Example #4
    def test_empty_chunks(self):
        """Testing whether empty chunks do not reach user."""
        data_path = 'mltoolkit/mldp/tests/data/small_chunks'
        field_names = ['first_name', 'email']
        reader = CsvReader(chunk_size=1, sep=",")
        empty_chunk_transformer = EmptyChunkTransformer(max_count=3)

        dev_data_pipeline = Pipeline(reader=reader)
        dev_data_pipeline.add_step(empty_chunk_transformer)
        dev_data_pipeline.add_step(FieldSelector(field_names))

        flag = False
        for dc in dev_data_pipeline.iter(data_path=data_path):
            flag = True
            self.assertFalse(equal_to_constant(dc, EMPTY_CHUNK))

        self.assertTrue(flag)
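
EmptyChunkTransformer is a test helper whose definition is not shown. A minimal sketch of what such a step might look like; BaseTransformer and its _transform(data_chunk) hook are assumed names, not taken from this snippet:

# hypothetical sketch; BaseTransformer and _transform are assumptions
class MyEmptyChunkTransformer(BaseTransformer):
    """Replaces the first `max_count` chunks with EMPTY_CHUNK, so the test
    can check that empty chunks never reach the user loop."""

    def __init__(self, max_count):
        super(MyEmptyChunkTransformer, self).__init__()
        self.max_count = max_count
        self._seen = 0

    def _transform(self, data_chunk):
        self._seen += 1
        if self._seen <= self.max_count:
            return EMPTY_CHUNK
        return data_chunk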
Example #5
    def test_invalid_steps(self):
        """Testing whether an error is raised if an invalid step is present."""
        data_path = 'mldp/tests/data/news.csv'
        data_source = {'data_path': data_path}

        inv_reader = InvalidCsvReader()
        val_reader = CsvReader()

        val_transf1 = FieldSelector("text")
        val_transf2 = TokenProcessor(fnames='text')
        inv_transf1 = InvalidTransformer()
        accum = ChunkAccumulator(new_size=3)
        formatter = PandasFormatter()

        # try the invalid reader with otherwise valid steps
        dp = Pipeline(reader=inv_reader, error_on_invalid_chunk='error')
        for vs in [val_transf1, val_transf2, accum, formatter]:
            dp.add_step(vs)
        with self.assertRaises(DataChunkError):
            for _ in dp.iter(**data_source):
                pass

        # try the valid reader with an invalid step in every position
        steps = [val_transf1, val_transf2, inv_transf1, accum]
        for st in permutations(steps):
            dp = Pipeline(reader=val_reader, error_on_invalid_chunk='error')
            for s in st:
                dp.add_step(s)
            dp.add_step(formatter)
            with self.assertRaises(DataChunkError):
                for _ in dp.iter(**data_source):
                    pass
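
InvalidCsvReader and InvalidTransformer are test doubles whose definitions are not shown. A hedged sketch of the idea, reusing the assumed _transform hook from the previous sketch; the pipeline's validation should then raise DataChunkError when error_on_invalid_chunk='error':

# hypothetical: emits something that is not a valid DataChunk, which the
# pipeline's chunk validation should reject with DataChunkError
class MyInvalidTransformer(BaseTransformer):
    def _transform(self, data_chunk):
        return "not a DataChunk"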
Example #6
    def test_how_to_apply_run(self):

        data_path = os.path.join(self.tutorials_path, "data/tweets.csv")

        # paths where vocabs will be saved and later loaded from
        words_vocab_file_path = os.path.join(self.tutorials_path,
                                             "data/vocabs/words.txt")
        labels_vocab_file_path = os.path.join(self.tutorials_path,
                                              'data/vocabs/labels.txt')

        # creating step objects
        twitter_tokenizer = TweetTokenizer()
        preprocessor = TwitterFilesPreprocessor(
            input_cols_number=3,
            tweets_indx=2,
            add_header=['ids', 'labels', 'tweets'])
        csv_reader = CsvReader(sep='\t', chunk_size=30)
        fields_selector = FieldSelector(fnames=["tweets", "labels"])
        token_processor = TokenProcessor(
            fnames="tweets",
            tok_func=twitter_tokenizer.tokenize,
            tok_cleaning_func=twitter_text_cleaner,
            lowercase=True)

        # data pipeline for vocabularies creation
        vocab_data_pipeline = Pipeline(reader=csv_reader,
                                       preprocessor=preprocessor,
                                       worker_processes_num=0,
                                       name_prefix="vocabs")
        vocab_data_pipeline.add_step(fields_selector)
        vocab_data_pipeline.add_step(token_processor)

        # creating or loading vocabs
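        # (load_or_create presumably reads the vocab file if it exists and
        # otherwise builds it by running the pipeline over the data source;
        # inferred from the call, the implementation is not shown here)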
        words_vocab = Vocabulary(vocab_data_pipeline, name_prefix="words")
        words_vocab.load_or_create(words_vocab_file_path,
                                   data_source={"data_path": data_path},
                                   data_fnames="tweets")

        labels_vocab = Vocabulary(vocab_data_pipeline, name_prefix="labels")
        labels_vocab.load_or_create(labels_vocab_file_path,
                                    data_source={"data_path": data_path},
                                    data_fnames="labels")

        print(words_vocab)
        print(labels_vocab)
        print(vocab_data_pipeline)

        # extra steps for training and evaluation
        mapper = VocabMapper(field_names_to_vocabs={
            "tweets": words_vocab,
            "labels": labels_vocab
        })
        padder = Padder(fname="tweets",
                        new_mask_fname="tweets_mask",
                        pad_symbol=words_vocab[PAD].id)
        formatter = FeaturesLabelsFormatter(features_field_name="tweets",
                                            labels_field_name="labels",
                                            classes_number=len(labels_vocab))

        # building the actual pipeline
        dev_data_pipeline = Pipeline(reader=csv_reader,
                                     preprocessor=preprocessor,
                                     worker_processes_num=1,
                                     name_prefix="dev")
        dev_data_pipeline.add_step(fields_selector)
        dev_data_pipeline.add_step(token_processor)
        dev_data_pipeline.add_step(mapper)
        dev_data_pipeline.add_step(padder)
        dev_data_pipeline.add_step(formatter)

        print(dev_data_pipeline)

        epochs = 2

        i_model = ISentiLSTM(dev_data_pipeline)
        i_model.init_model(words_vocab_size=len(words_vocab),
                           input_dim=50,
                           lstm_hidden_dim=120,
                           number_of_classes=len(labels_vocab),
                           mask_symbol=words_vocab[PAD].id)
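
The excerpt ends after model initialization, leaving epochs unused. A hedged sketch of how it might continue; train here is a hypothetical method name, not confirmed by the snippet:

        # hypothetical continuation: 'train' is an assumed method name
        for epoch in range(epochs):
            i_model.train(data_path=data_path)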