Example no. 1
    def pos_regex_matches(self, pattern):
        """
        Extract sequences of consecutive tokens from a spacy-parsed doc whose
        part-of-speech tags match the specified regex pattern.

        Args:
            pattern (str): Pattern of consecutive POS tags whose corresponding words
                are to be extracted, inspired by the regex patterns used in NLTK's
                ``nltk.chunk.regexp``. Tags are uppercase, from the universal tag set;
                delimited by < and >, which are basically converted to parentheses
                with spaces as needed to correctly extract matching word sequences;
                white space in the input doesn't matter.

                Examples (see :obj:`POS_REGEX_PATTERNS <textacy.regexes_etc.POS_REGEX_PATTERNS>`):

                * noun phrase: r'<DET>? (<NOUN>+ <ADP|CONJ>)* <NOUN>+'
                * compound nouns: r'<NOUN>+'
                * verb phrase: r'<VERB>?<ADV>*<VERB>+'
                * prepositional phrase: r'<PREP> <DET>? (<NOUN>+<ADP>)* <NOUN>+'

        Yields:
            ``spacy.Span``: the next span of consecutive tokens whose parts-of-speech
                match ``pattern``, in order of appearance in the document
        """
        for match in extract.pos_regex_matches(self.spacy_doc, pattern):
            yield match
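A minimal usage sketch of the pattern syntax described in the docstring, using the module-level function (a hedged illustration: it assumes the spaCy "en_core_web_sm" model is installed and a textacy version that still ships extract.pos_regex_matches; sentence and expected spans are illustrative):

import spacy
from textacy import extract

nlp = spacy.load("en_core_web_sm")
doc = nlp("The fox jumped over the fence near the barn.")
# Noun-phrase pattern taken from the docstring examples above.
for span in extract.pos_regex_matches(doc, r"<DET>? (<NOUN>+ <ADP|CONJ>)* <NOUN>+"):
    print(span.text)  # expected to print spans like "The fox", "the fence", "the barn"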
Example no. 2
from textacy import make_spacy_doc
from textacy.extract import pos_regex_matches


def detect_verb_phrases(sentence, return_as_string: bool = True):
    # Optional verb, any adverbs, then one or more verbs (textacy POS regex syntax).
    pattern = r"(<VERB>?<ADV>*<VERB>+)"
    doc = make_spacy_doc(data=sentence, lang="en_core_web_sm")
    verb_phrases = pos_regex_matches(doc=doc, pattern=pattern)

    if return_as_string:
        return " ".join([c.text for c in verb_phrases])
    return verb_phrases
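A hypothetical call to the helper above (the sentence and output are illustrative; exact matches depend on the installed spaCy model's tagger):

# Illustrative usage; requires the "en_core_web_sm" model to be installed.
print(detect_verb_phrases("The dog quickly chased the cat."))  # e.g. "quickly chased"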
Example no. 3
 def test_complex(self, spacy_doc):
     pattern = constants.POS_REGEX_PATTERNS["en"]["NP"]
     valid_pos = set(re.findall(r"(\w+)", pattern))
     required_pos = {"NOUN", "PROPN"}
     result = list(extract.pos_regex_matches(spacy_doc, pattern))
     assert all(isinstance(span, Span) for span in result)
     assert all(tok.pos_ in valid_pos for span in result for tok in span)
     assert all(any(tok.pos_ in required_pos for tok in span) for span in result)
Example no. 4
 def test_pos_regex_matches(self):
     expected = [
         'Two weeks', 'Kuwait', 'an I.M.F. seminar', 'Arab educators',
         '30 minutes', 'the impact', 'technology trends', 'education',
         'the Middle East', 'an Egyptian education official', 'his hand',
         'a personal question', 'Donald Trump', 'mosques',
         'the United States', 'great sorrow', 'that what', 'our kids']
     observed = [span.text for span in extract.pos_regex_matches(
         self.spacy_doc, regexes_etc.POS_REGEX_PATTERNS['en']['NP'])]
     self.assertEqual(observed, expected)
Example no. 5
 def test_noun_conj(self, sentence, noun):
     """
     Given a sentence and a noun, look for conjunctions containing
     that noun
     param: sentence     A Spacy Span
     param: noun     A Spacy Token
     return: A Spacy span (the conjunction containing the noun)
     """
     possible_conjs = list(
         pos_regex_matches(sentence, r'<NOUN><CONJ><NOUN>'))
     for conj in possible_conjs:
         if noun in conj:
             return conj
Example no. 6
 def test_pos_regex_matches(self):
     expected = [
         'Two weeks', 'Kuwait', 'an I.M.F. seminar', 'Arab educators',
         '30 minutes', 'the impact', 'technology trends', 'education',
         'the Middle East', 'an Egyptian education official', 'his hand',
         'a personal question', 'Donald Trump', 'mosques',
         'the United States', 'great sorrow', 'that what', 'our kids'
     ]
     observed = [
         span.text for span in extract.pos_regex_matches(
             self.spacy_doc, regexes_etc.POS_REGEX_PATTERNS['en']['NP'])
     ]
     self.assertEqual(observed, expected)
Example no. 7
 def nouns_from_relative_clause(self, sentence, verb):
     """
     Given a sentence and verb, look for relative clauses and 
     identify nouns  
     param: sentence     A Spacy Span
     param: verb     A Spacy Token
     return: A Spacy token (the extracted noun)
     """
     possible_clauses = list(pos_regex_matches(sentence, r'<NOUN>+<VERB>'))
     for clause in possible_clauses:
         if verb in clause:
             for token in clause:
                 if token.tag_ == 'NNS':
                     return token
Example no. 8
def test_extract_functionality(doc):
    bigrams = list(
        extract.ngrams(doc,
                       2,
                       filter_stops=True,
                       filter_punct=True,
                       filter_nums=False))[:10]
    for bigram in bigrams:
        assert isinstance(bigram, SpacySpan)
        assert len(bigram) == 2

    trigrams = list(
        extract.ngrams(doc,
                       3,
                       filter_stops=True,
                       filter_punct=True,
                       min_freq=2))[:10]
    for trigram in trigrams:
        assert isinstance(trigram, SpacySpan)
        assert len(trigram) == 3

    nes = list(
        extract.named_entities(doc,
                               drop_determiners=False,
                               exclude_types='numeric'))[:10]
    for ne in nes:
        assert isinstance(ne, SpacySpan)
        assert ne.label_
        assert ne.label_ != 'QUANTITY'

    pos_regex_matches = list(
        extract.pos_regex_matches(
            doc, constants.POS_REGEX_PATTERNS['en']['NP']))[:10]
    for match in pos_regex_matches:
        assert isinstance(match, SpacySpan)

    stmts = list(extract.semistructured_statements(doc, 'I', cue='be'))[:10]
    for stmt in stmts:
        assert isinstance(stmt, list)
        assert isinstance(stmt[0], compat.unicode_)
        assert len(stmt) == 3

    kts = keyterms.textrank(doc, n_keyterms=10)
    for keyterm in kts:
        assert isinstance(keyterm, tuple)
        assert isinstance(keyterm[0], compat.unicode_)
        assert isinstance(keyterm[1], float)
        assert keyterm[1] > 0.0
Example no. 9
def test_extract_functionality(doc):
    bigrams = list(
        extract.ngrams(doc,
                       2,
                       filter_stops=True,
                       filter_punct=True,
                       filter_nums=False))[:10]
    for bigram in bigrams:
        assert isinstance(bigram, Span)
        assert len(bigram) == 2

    trigrams = list(
        extract.ngrams(doc,
                       3,
                       filter_stops=True,
                       filter_punct=True,
                       min_freq=2))[:10]
    for trigram in trigrams:
        assert isinstance(trigram, Span)
        assert len(trigram) == 3

    nes = list(
        extract.entities(doc, drop_determiners=False,
                         exclude_types="numeric"))[:10]
    for ne in nes:
        assert isinstance(ne, Span)
        assert ne.label_
        assert ne.label_ != "QUANTITY"

    pos_regex_matches = list(
        extract.pos_regex_matches(
            doc, constants.POS_REGEX_PATTERNS["en"]["NP"]))[:10]
    for match in pos_regex_matches:
        assert isinstance(match, Span)

    stmts = list(extract.semistructured_statements(doc, "I", cue="be"))[:10]
    for stmt in stmts:
        assert isinstance(stmt, list)
        assert isinstance(stmt[0], compat.unicode_)
        assert len(stmt) == 3

    kts = textacy.ke.textrank(doc, topn=10)
    for keyterm in kts:
        assert isinstance(keyterm, tuple)
        assert isinstance(keyterm[0], compat.unicode_)
        assert isinstance(keyterm[1], float)
        assert keyterm[1] > 0.0
Example no. 10
import spacy
import textacy
from textacy import extract


def get_pos_regex_matches(doc, pattern):
    assert isinstance(doc, (textacy.Doc, spacy.tokens.Doc)), \
        "Only textacy.Doc or spacy.tokens.Doc are supported"
    assert isinstance(pattern, str), "The pattern should be a string"
    return extract.pos_regex_matches(doc, pattern)
Example no. 11
def extract_pattern_list(doc, pptn):
    return list(extract.pos_regex_matches(doc, pptn))
Example no. 12
 def test_simple(self, spacy_doc):
     result = list(extract.pos_regex_matches(spacy_doc, r"<NOUN>+"))
     assert all(isinstance(span, Span) for span in result)
     assert all(tok.pos_ == "NOUN" for span in result for tok in span)
Example no. 13
 def test_deprecation_warning(self, spacy_doc):
     with pytest.warns(DeprecationWarning):
         _ = list(extract.pos_regex_matches(spacy_doc, r"<NOUN>"))