Esempio n. 1
0
def test_get_docs_returns_spacy_docs():
    """``get_docs_stream`` yields spaCy ``Doc`` items for text and ``Doc`` input alike."""
    operation = SpacyBasedOperation(nlp=nlp)
    raw_texts = DataStream(["this is doc 1", "this is doc 2"])
    prebuilt_docs = DataStream([nlp.make_doc(text) for text in raw_texts])

    # Check the plain-text stream first, then the stream that already holds Docs.
    for stream in (raw_texts, prebuilt_docs):
        for item in operation.get_docs_stream(stream):
            assert isinstance(item, Doc)
Esempio n. 2
0
def test_remove_numbers():
    """Applying ``remove_numbers`` drops the numeric tokens from each text."""
    stream = DataStream(["One is 1", "Hey, my number is 23458"])

    cleaned = [str(item) for item in stream.apply(remove_numbers())]

    assert cleaned == ["One is", "Hey, my number is"]
Esempio n. 3
0
def test_accepts_stream_of_texts():
    """A ``TokenFilterOperation`` applied to plain strings produces ``str`` items."""
    lower_this = [[{"LOWER": "this"}]]
    token_filter = TokenFilterOperation(lower_this)
    stream = DataStream(["this is a string data stream"])

    results = list(stream.apply(token_filter))
    first_result = results[0]

    assert isinstance(first_result, str)
Esempio n. 4
0
def test_does_not_train_while_training_is_disabled():
    """With ``should_train`` set to False, applying the op must not call ``fit``."""
    stream = DataStream(["this is text1", "this is text2"])
    mocked_model = MagicMock(spec_set=TfidfVectorizer)
    op = ScikitBasedOperation(model=mocked_model, predict_fn_name="transform")
    op.should_train = False

    stream.apply(op)

    op.model.fit.assert_not_called()
Esempio n. 5
0
def test_remove_short_words():
    """``remove_short_words(length=4)`` turns each input text into its expected shortened form."""
    stream = DataStream(["this is a first text", "what was that"])

    cleaned = [str(item) for item in stream.apply(remove_short_words(length=4))]

    assert cleaned == ["this first text", "what that"]
Esempio n. 6
0
def test_accepts_stream_of_spacy_docs():
    """A ``TokenFilterOperation`` applied to spaCy ``Doc`` items produces ``str`` items."""
    nlp = spacy.load("en_core_web_sm")
    lower_this = [[{"LOWER": "this"}]]
    token_filter = TokenFilterOperation(lower_this)
    doc_stream = DataStream(nlp.pipe(["this is a spacy doc data stream"]))

    results = list(doc_stream.apply(token_filter))
    first_result = results[0]

    assert isinstance(first_result, str)
Esempio n. 7
0
def test_remove_emails():
    """Applying ``remove_emails`` strips e-mail addresses from each text.

    The inputs must contain syntactically valid addresses; a placeholder such
    as ``[email protected]`` (left behind by web e-mail obfuscation) is not an
    address and would leave the operation with nothing to remove.
    """
    texts = [
        "please contact us at: info@example.com",
        "send email @ support@example.org",
    ]
    expected = ["please contact us at:", "send email @"]
    ds = DataStream(texts)
    actual = list(map(str, ds.apply(remove_emails())))

    assert actual == expected
Esempio n. 8
0
def test_remove_links():
    """Applying ``remove_links`` strips both bare ``www.`` links and ``http://`` URLs."""
    stream = DataStream([
        "visit us at www.example.com/testing",
        "our website is http://example.com/",
    ])

    cleaned = [str(item) for item in stream.apply(remove_links())]

    assert cleaned == ["visit us at", "our website is"]
Esempio n. 9
0
def test_vectorizes_correctly(op, input, input_count, is_input_generator):
    """Applying ``op`` gives one feature row per input, as wide as the model vocabulary."""
    # Generators cannot be passed through parametrization directly, so the
    # input is wrapped in a generator here when the case asks for one.
    if is_input_generator:
        input = (x for x in input)

    vectorized_stream = DataStream(input).apply(op)

    # The vocabulary is checked before the feature stream is consumed.
    vocabulary_size = len(op.model.vocabulary_)
    assert vocabulary_size > 0

    rows = list(vectorized_stream)
    assert len(rows) == input_count
    assert rows[0].shape[-1] == len(op.model.vocabulary_)
Esempio n. 10
0
def test_remove_stopwords():
    """Listed stopwords are removed even when the text uses a different case."""
    words_to_drop = ["this", "that", "an", "a"]
    stream = DataStream([
        "That is a nice car",
        "Python is a type of a snake",
        "This test should pass",
    ])

    cleaned = [str(item) for item in stream.apply(remove_stopwords(words_to_drop))]

    assert cleaned == [
        "is nice car",
        "Python is type of snake",
        "test should pass",
    ]
Esempio n. 11
0
def test_if_every_token_is_removed_then_items_is_discarded():
    """An item whose tokens are all removed is dropped together with its context."""
    stream = DataStream(
        items=["this will be deleted", "this will not be deleted"],
        context=["a", "b"],
    )
    drop_everything_but_not = remove_stopwords(["this", "will", "be", "deleted"])

    result = stream.apply(drop_everything_but_not)
    remaining_texts = list(result.items)
    remaining_context = list(result.context)

    # Only the second text survives, so only its context ("b") remains.
    assert len(remaining_texts) == 1
    assert len(remaining_context) == 1
    assert remaining_context[0] == "b"
Esempio n. 12
0
    def get_docs_stream(self, ds: DataStream) -> DataStream:
        """Return a DataStream whose items are spaCy ``Doc`` objects.

        A stream whose ``item_type`` is already ``Doc`` is returned unchanged.
        Otherwise the items are paired with their context and piped through
        ``self.nlp`` so that every resulting ``Doc`` keeps its context.

        Parameters
        ----------
        ds : DataStream
            input data stream

        Returns
        -------
        out : DataStream
            A data stream of spaCy ``Doc`` objects carrying the applied ops
            of ``ds``.
        """
        needs_parsing = ds.item_type != Doc
        if not needs_parsing:
            return ds

        # as_tuples=True makes nlp.pipe yield (doc, context) pairs.
        piped_pairs = self.nlp.pipe(
            zip(ds, ds.context),
            as_tuples=True,
            n_process=config.ALLOCATED_PROCESSOR_FOR_SPACY,
        )
        parsed_docs, parsed_context = more_itertools.unzip(piped_pairs)
        return DataStream(
            items=parsed_docs,
            applied_ops=ds.applied_ops,
            context=parsed_context,
        )
Esempio n. 13
0
 def run(self, ds: DataStream) -> DataStream:
     """Process every doc of ``ds`` and return the surviving results as a new stream.

     Each doc is passed to ``self.process_doc`` together with its context;
     results equal to ``None`` are dropped. The remaining results are split
     into items and context for the returned stream, whose applied ops are
     those of ``ds`` followed by this operation.
     """
     docs_ds = self.get_docs_stream(ds)
     kept_results = (
         result
         for result in map(self.process_doc, docs_ds, docs_ds.context)
         if result is not None
     )
     kept_items, kept_context = more_itertools.unzip(kept_results)
     return DataStream(
         items=kept_items,
         applied_ops=ds.applied_ops + [self],
         context=kept_context,
     )
Esempio n. 14
0
    def run(self, ds, fit_params: dict = None, predict_params: dict = None):
        """Optionally fit the model on ``ds``, then return a stream of predictions.

        Parameters
        ----------
        ds : DataStream
            input data stream
        fit_params : dict, optional
            Passed to ``self._fit``. ``None`` (the default) means an empty
            dict; a fresh dict is created on every call so that no state is
            shared between calls.
        predict_params : dict, optional
            Passed to ``self._predict``. ``None`` (the default) means an
            empty dict, created fresh on every call.

        Returns
        -------
        out : DataStream
            Stream of flattened predictions and their context, with this
            operation appended to the applied ops of ``ds``.
        """
        # Avoid mutable default arguments: build per-call dicts instead.
        if fit_params is None:
            fit_params = {}
        if predict_params is None:
            predict_params = {}

        # An operation that cannot predict on new data is always (re)trained.
        if not self.can_predict_on_new:
            self.should_train = True

        if self.should_train:
            if ds.is_countable:
                train_ds = ds
                pred_ds = ds
            else:
                # The stream is not countable, so duplicate the iterators:
                # one copy is used for fitting, the other for predicting.
                train_items, pred_items = itertools.tee(ds, 2)
                train_context, pred_context = itertools.tee(ds.context, 2)
                train_ds = DataStream(train_items, context=train_context)
                pred_ds = DataStream(pred_items, context=pred_context)

            self._fit(train_ds, fit_params)
        else:
            pred_ds = ds

        # _predict yields (predictions, context) pairs; each side is itself
        # an iterable and is flattened into a single stream below.
        preds, context = more_itertools.unzip(
            self._predict(pred_ds, predict_params))
        preds = itertools.chain.from_iterable(preds)
        context = itertools.chain.from_iterable(context)
        return DataStream(items=preds,
                          context=context,
                          applied_ops=ds.applied_ops + [self])
Esempio n. 15
0
def ds():
    """Build a two-item DataStream of mixed-case strings."""
    mixed_case_items = ["Aa", "bbB"]
    return DataStream(items=mixed_case_items)
Esempio n. 16
0
def test_lemmatizes_correctly_for_stream_of_texts(texts, lemmatized, nlp):
    """Lemmatizing a stream of raw texts yields the expected lemmatized items."""
    text_stream = DataStream(texts)
    lemmatizer = lemmatize(nlp=nlp)

    produced = list(text_stream.apply(lemmatizer))

    assert produced == lemmatized
Esempio n. 17
0
def test_token_filter_keeps_matching_tokens(input, patterns, expected):
    """With ``keep_matching_tokens=True`` the filter output matches ``expected``."""
    keep_filter = TokenFilterOperation(patterns, keep_matching_tokens=True)
    stream = DataStream(input)

    produced = [str(item) for item in stream.apply(keep_filter)]

    assert produced == expected
Esempio n. 18
0
def test_lemmatizes_correctly_for_stream_of_spacy_docs(texts, lemmatized, nlp):
    """Lemmatizing a stream of spaCy ``Doc`` objects yields the expected items."""
    lemmatizer = lemmatize(nlp=nlp)
    doc_stream = DataStream(nlp.pipe(texts))

    produced = list(doc_stream.apply(lemmatizer))

    assert produced == lemmatized
Esempio n. 19
0
def test_correctly_changes_cases(ds: DataStream, mode: str, expected: list):
    """A ``CaseChangeOperation`` in the given ``mode`` produces ``expected``."""
    case_changer = CaseChangeOperation(mode=mode)

    produced = list(ds.apply(case_changer))

    assert produced == expected
Esempio n. 20
0
def test_filter_pos():
    """Filtering on the ``NOUN`` tag removes "apple" from the sentence."""
    stream = DataStream(items=["that is an apple"])

    filtered = stream.apply(pos_filter("NOUN"))

    assert list(filtered.items) == ["that is an"]