    def test_lf_applier_pandas_spacy_preprocessor_memoized(self) -> None:
        spacy = SpacyPreprocessor(text_field="text", doc_field="doc")
        spacy.memoize = True

        @labeling_function(pre=[spacy])
        def first_is_name(x: DataPoint) -> int:
            return 0 if x.doc[0].pos_ == "PROPN" else -1

        @labeling_function(pre=[spacy])
        def has_verb(x: DataPoint) -> int:
            return 0 if sum(t.pos_ == "VERB" for t in x.doc) > 0 else -1

        df = pd.DataFrame(dict(text=TEXT_DATA))
        df = dd.from_pandas(df, npartitions=2)
        applier = DaskLFApplier([first_is_name, has_verb])
        L = applier.apply(df)
        np.testing.assert_equal(L, L_TEXT_EXPECTED)
    def test_spacy_preprocessor(self) -> None:
        x = SimpleNamespace(text="Jane plays soccer.")
        preprocessor = SpacyPreprocessor("text", "doc")
        x_preprocessed = preprocessor(x)
        assert x_preprocessed is not None
        self.assertEqual(len(x_preprocessed.doc), 4)
        token = x_preprocessed.doc[0]
        self.assertEqual(token.text, "Jane")
        self.assertEqual(token.pos_, "PROPN")
    def test_lf_applier_pandas_spacy_preprocessor(self) -> None:
        spacy = SpacyPreprocessor(text_field="text", doc_field="doc")

        @labeling_function(pre=[spacy])
        def first_is_name(x: DataPoint) -> int:
            return 0 if x.doc[0].pos_ == "PROPN" else -1

        @labeling_function(pre=[spacy])
        def has_verb(x: DataPoint) -> int:
            return 0 if sum(t.pos_ == "VERB" for t in x.doc) > 0 else -1

        df = pd.DataFrame(dict(text=TEXT_DATA))
        applier = PandasLFApplier([first_is_name, has_verb])
        L = applier.apply(df, progress_bar=False)
        np.testing.assert_equal(L, L_TEXT_EXPECTED)
Example #4
    def _create_preprocessor(
            cls, parameters: SpacyPreprocessorParameters) -> SpacyPreprocessor:
        return SpacyPreprocessor(**parameters._asdict())
Example #5
# For more info, see the [`SpacyPreprocessor` documentation](https://snorkel.readthedocs.io/en/master/packages/_autosummary/preprocess/snorkel.preprocess.nlp.SpacyPreprocessor.html#snorkel.preprocess.nlp.SpacyPreprocessor).
#
#
# If you prefer to use a different NLP tool, you can also wrap that as a `Preprocessor` and use it in the same way.
# For more info, see the [`preprocessor` documentation](https://snorkel.readthedocs.io/en/master/packages/_autosummary/preprocess/snorkel.preprocess.preprocessor.html#snorkel.preprocess.preprocessor).
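#
# As a rough sketch (not part of this tutorial), a custom preprocessor can be written with
# the `preprocessor` decorator from `snorkel.preprocess`; the `count_tokens` function and
# its whitespace tokenization below are illustrative assumptions.

# %%
from snorkel.preprocess import preprocessor


@preprocessor(memoize=True)
def count_tokens(x):
    # Attach a simple whitespace token count to the data point.
    x.num_tokens = len(x.text.split())
    return x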

# %% [markdown] {"tags": ["md-exclude"]}
# If the spaCy English model wasn't already installed, the next cell may raise an exception.
# If this happens, restart the kernel and re-execute the cells up to this point.

# %%
from snorkel.preprocess.nlp import SpacyPreprocessor

# The SpacyPreprocessor parses the text in text_field and
# stores the new enriched representation in doc_field
spacy = SpacyPreprocessor(text_field="text", doc_field="doc", memoize=True)


# %%
@labeling_function(pre=[spacy])
def has_person(x):
    """Ham comments mention specific people and are short."""
    if len(x.doc) < 20 and any([ent.label_ == "PERSON" for ent in x.doc.ents]):
        return HAM
    else:
        return ABSTAIN


# %% [markdown]
# Because spaCy is such a common preprocessor for NLP applications, we also provide a
# [prebuilt `labeling_function`-like decorator that uses spaCy](https://snorkel.readthedocs.io/en/master/packages/_autosummary/labeling/snorkel.labeling.lf.nlp.nlp_labeling_function.html#snorkel.labeling.lf.nlp.nlp_labeling_function).
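#
# A minimal sketch of the LF above rewritten with that decorator (the `has_person_nlp`
# name is ours); the decorator runs a `SpacyPreprocessor` for us and exposes the parsed
# `Doc` as `x.doc`:

# %%
from snorkel.labeling.lf.nlp import nlp_labeling_function


@nlp_labeling_function()
def has_person_nlp(x):
    """Ham comments mention specific people and are short."""
    if len(x.doc) < 20 and any([ent.label_ == "PERSON" for ent in x.doc.ents]):
        return HAM
    else:
        return ABSTAIN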
Example #6
    def _create_preprocessor(
            cls, parameters: SpacyPreprocessorParameters) -> SpacyPreprocessor:
        preprocessor = SpacyPreprocessor(**parameters._asdict())
        make_spark_preprocessor(preprocessor)
        return preprocessor
Example #7
    # inF is a JSON-lines file of dev-set tweets opened earlier (not shown here);
    # allDev collects the parsed records.
    for line in inF:
        allDev.append(json.loads(line))

df_dev = pd.DataFrame(allDev)

df_train = df_tweets.loc[~df_tweets['id'].isin(df_dev['id'])]

# label mappings
BE = 0
NL = 1
ABSTAIN = -1

# spaCy preprocessor for Dutch
spacy_preproc = SpacyPreprocessor('text',
                                  'doc',
                                  language='nl_core_news_sm',
                                  memoize=True,
                                  disable=['tagger', 'parser'])


## RULES
@labeling_function()
def country_code(x):
    # country_code based on tweet location:
    # precise but low coverage
    if x.country_code == 'BE':
        return BE
    elif x.country_code == 'NL':
        return NL
    else:
        return ABSTAIN
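

# A hedged sketch (not part of the original script) of a rule that actually uses the
# spacy_preproc defined above; NL_CITIES is an illustrative, made-up lookup set.
NL_CITIES = {"Amsterdam", "Rotterdam", "Utrecht", "Eindhoven", "Groningen"}


@labeling_function(pre=[spacy_preproc])
def mentions_dutch_city(x):
    # Exact token match against the illustrative city list on the parsed doc
    if any(tok.text in NL_CITIES for tok in x.doc):
        return NL
    return ABSTAIN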