Example #1
def select_extractors(use_unigrams=False):
    """
    Extractors For Alignment
    :return: List of Extractor objects to use for text-text alignment
    note: ngram extractors below filter out stopwords and number words/symbols
    """
    noun_chunk_extractor = Extractor(
        lambda doc: list(filter(lambda x: len(x) > 3, list(noun_chunks(doc)))))
    tetragram_extractor = Extractor(
        lambda doc: list(ngrams(doc, 4, filter_stops=True, filter_nums=True)))
    trigram_extractor = Extractor(
        lambda doc: list(ngrams(doc, 3, filter_stops=True, filter_nums=True)))
    bigram_extractor = Extractor(lambda doc: list(
        ngrams(doc, 2, filter_stops=False, filter_nums=False)))
    unigram_extractor = Extractor(lambda doc: list(
        ngrams(doc, 1, filter_stops=False, filter_nums=False)))

    extractor_list = [
        noun_chunk_extractor,
        tetragram_extractor,
        trigram_extractor,
        bigram_extractor,
    ]

    if use_unigrams:
        extractor_list.append(unigram_extractor)

    return extractor_list
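For context, the callables wrapped above come straight from textacy. Below is a minimal sketch of what they yield on a toy document, assuming a small English spaCy model is installed (the sentence and model name are illustrative only):

import spacy
from textacy.extract import ngrams, noun_chunks

nlp = spacy.load('en_core_web_sm')
doc = nlp('The quick brown fox jumps over the lazy dog near the old stone bridge.')

# noun chunks longer than 3 tokens, as in noun_chunk_extractor above
print([nc.text for nc in noun_chunks(doc) if len(nc) > 3])
# trigrams with stopword/number filtering, as in trigram_extractor above
print([ng.text for ng in ngrams(doc, 3, filter_stops=True, filter_nums=True)])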
Example #2
 def test_pos(self, spacy_doc):
     result1 = list(extract.ngrams(spacy_doc, 2, include_pos={"NOUN"}))
     result2 = list(extract.ngrams(spacy_doc, 2, include_pos="NOUN"))
     assert all(tok.pos_ == "NOUN" for span in result1 for tok in span)
     assert all(tok.pos_ == "NOUN" for span in result2 for tok in span)
     result3 = list(extract.ngrams(spacy_doc, 2, exclude_pos={"NOUN"}))
     result4 = list(extract.ngrams(spacy_doc, 2, exclude_pos="NOUN"))
     assert not any(tok.pos_ == "NOUN" for span in result3 for tok in span)
     assert not any(tok.pos_ == "NOUN" for span in result4 for tok in span)
Example #3
def test_extract_functionality(doc):
    bigrams = list(
        extract.ngrams(doc,
                       2,
                       filter_stops=True,
                       filter_punct=True,
                       filter_nums=False))[:10]
    for bigram in bigrams:
        assert isinstance(bigram, SpacySpan)
        assert len(bigram) == 2

    trigrams = list(
        extract.ngrams(doc,
                       3,
                       filter_stops=True,
                       filter_punct=True,
                       min_freq=2))[:10]
    for trigram in trigrams:
        assert isinstance(trigram, SpacySpan)
        assert len(trigram) == 3

    nes = list(
        extract.named_entities(doc,
                               drop_determiners=False,
                               exclude_types='numeric'))[:10]
    for ne in nes:
        assert isinstance(ne, SpacySpan)
        assert ne.label_
        assert ne.label_ != 'QUANTITY'

    pos_regex_matches = list(
        extract.pos_regex_matches(
            doc, constants.POS_REGEX_PATTERNS['en']['NP']))[:10]
    for match in pos_regex_matches:
        assert isinstance(match, SpacySpan)

    stmts = list(extract.semistructured_statements(doc, 'I', cue='be'))[:10]
    for stmt in stmts:
        assert isinstance(stmt, list)
        assert isinstance(stmt[0], compat.unicode_)
        assert len(stmt) == 3

    kts = keyterms.textrank(doc, n_keyterms=10)
    for keyterm in kts:
        assert isinstance(keyterm, tuple)
        assert isinstance(keyterm[0], compat.unicode_)
        assert isinstance(keyterm[1], float)
        assert keyterm[1] > 0.0
Example #4
    def ngrams(self, n, **kwargs):
        """
        Extract an ordered sequence of n-grams (``n`` consecutive words) from doc,
        optionally filtering n-grams by the types and parts-of-speech of the
        constituent words.

        Args:
            n (int): number of tokens to include in n-grams;
                1 => unigrams, 2 => bigrams
            **kwargs:
                filter_stops (bool, optional): if True, remove ngrams that start or end
                    with a stop word
                filter_punct (bool, optional): if True, remove ngrams that contain
                    any punctuation-only tokens
                filter_nums (bool, optional): if True, remove ngrams that contain
                    any numbers or number-like tokens (e.g. 10, 'ten')
                good_pos_tags (set[str], optional): remove ngrams whose constituent
                    tokens' part-of-speech tags are NOT all in the specified tags,
                    using the universal POS tagset
                bad_pos_tags (set[str], optional): remove ngrams if any of their constituent
                    tokens' part-of-speech tags are in the specified tags,
                    using the universal POS tagset
                min_freq (int, optional): remove ngrams that occur in `doc` fewer than
                    `min_freq` times

        Yields:
            ``spacy.Span``: the next ngram passing all specified filters,
                in order of appearance in the document

        .. seealso:: :func:`extract.ngrams() <textacy.extract.ngrams>`
        """
        for ngram in extract.ngrams(self.spacy_doc, n, **kwargs):
            yield ngram
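A short usage sketch of the kwargs documented above, applied through the module-level function this method delegates to. The sentence is made up, and note that newer textacy releases spell the POS filter include_pos, while the docstring above uses the older good_pos_tags name:

import spacy
from textacy import extract

nlp = spacy.load('en_core_web_sm')
doc = nlp('Ten tired engineers shipped ten tired releases before the big deadline.')

# drop ngrams that start/end with a stop word or contain number-like tokens
filtered = list(extract.ngrams(doc, 2, filter_stops=True, filter_nums=True))
# keep only ngrams occurring at least twice in the doc
repeated = list(extract.ngrams(doc, 2, min_freq=2))
# keep only all-noun ngrams (include_pos in current releases, good_pos_tags in older ones)
nouny = list(extract.ngrams(doc, 2, include_pos={'NOUN'}))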
Example #5
    def align_data(self, text, data):

        if isinstance(text, str):
            doc = self.nlp(text)
        else:
            doc = text

        ngrams = []

        for n in range(1, min(self.max_n + 1, len(doc))):

            ngrams.extend(extract.ngrams(doc, n))

        subject_sims = [(ngram,
                         self.similarity_metric(ngram.text, data['subject']))
                        for ngram in ngrams]

        self.logger.debug("Similarities from subject %s",
                          list(zip(ngrams, subject_sims)))

        subject_span, subject_sim = max(subject_sims, key=lambda x: x[1])

        self.logger.debug(
            f"Selected subject_span [{subject_span}] with similarity [{subject_sim}] for [{data['subject']}]"
        )

        object_sims = [(ngram,
                        self.similarity_metric(ngram.text, data['object']))
                       for ngram in ngrams]

        self.logger.debug("Similarities from object %s",
                          list(zip(ngrams, object_sims)))

        object_span = None

        for span, sim in sorted(object_sims, key=lambda x: x[1], reverse=True):

            # skip the current span if it overlaps the subject span
            if overlaps(span, subject_span):

                self.logger.debug(
                    f"Span [{span.text}] overlaps subject_span [{subject_span.text}]"
                )
                continue

            object_span = span

            self.logger.debug(
                f"Selected object_span [{object_span.text}] with similarity [{sim}] for [{data['object']}]"
            )
            break

        if object_span is None:

            self.logger.warning("I can't extract object_span.")

        self.subject_align[data['subject']] = subject_span
        self.object_align[data['object']] = object_span

        return subject_span, object_span
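The selection above is greedy: score every candidate n-gram against the subject, keep the best, then take the highest-scoring object candidate that does not overlap it. A self-contained sketch of that idea follows, with a deliberately trivial exact-match similarity standing in for the class's similarity_metric (which is not shown here):

import spacy
from textacy import extract

nlp = spacy.load('en_core_web_sm')
doc = nlp('Barack Obama was the president of the United States.')

def similarity(a, b):
    # placeholder metric: exact lowercase match; the real class plugs in its own
    return 1.0 if a.lower() == b.lower() else 0.0

data = {'subject': 'Barack Obama', 'object': 'United States'}
candidates = [ng for n in range(1, 4) for ng in extract.ngrams(doc, n)]

subject_span = max(candidates, key=lambda s: similarity(s.text, data['subject']))
object_span = None
for span in sorted(candidates, key=lambda s: similarity(s.text, data['object']), reverse=True):
    # same overlap test as overlaps() above: skip spans touching the subject span
    if span.start < subject_span.end and subject_span.start < span.end:
        continue
    object_span = span
    break

print(subject_span.text, '|', object_span.text if object_span else None)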
Example #6
    def ngrams(self, n, **kwargs):
        """
        Extract an ordered sequence of n-grams (``n`` consecutive words) from doc,
        optionally filtering n-grams by the types and parts-of-speech of the
        constituent words.

        Args:
            n (int): number of tokens to include in n-grams;
                1 => unigrams, 2 => bigrams
            **kwargs:
                filter_stops (bool, optional): if True, remove ngrams that start or end
                    with a stop word
                filter_punct (bool, optional): if True, remove ngrams that contain
                    any punctuation-only tokens
                filter_nums (bool, optional): if True, remove ngrams that contain
                    any numbers or number-like tokens (e.g. 10, 'ten')
                good_pos_tags (set[str], optional): remove ngrams whose constituent
                    tokens' part-of-speech tags are NOT all in the specified tags,
                    using the universal POS tagset
                bad_pos_tags (set[str], optional): remove ngrams if any of their constituent
                    tokens' part-of-speech tags are in the specified tags,
                    using the universal POS tagset
                min_freq (int, optional): remove ngrams that occur in `doc` fewer than
                    `min_freq` times

        Yields:
            ``spacy.Span``: the next ngram passing all specified filters,
                in order of appearance in the document

        .. seealso:: :func:`extract.ngrams() <textacy.extract.ngrams>`
        """
        for ngram in extract.ngrams(self.spacy_doc, n, **kwargs):
            yield ngram
Example #7
 def test_min_freq(self, spacy_doc):
     n = 2
     counts = collections.Counter()
     counts.update(spacy_doc[i:i + n].lower_
                   for i in range(len(spacy_doc) - n + 1))
     result = list(extract.ngrams(spacy_doc, 2, min_freq=2))
     assert all(counts[span.lower_] >= 2 for span in result)
Example #8
def test_extract_functionality(doc):
    bigrams = list(
        extract.ngrams(doc,
                       2,
                       filter_stops=True,
                       filter_punct=True,
                       filter_nums=False))[:10]
    for bigram in bigrams:
        assert isinstance(bigram, Span)
        assert len(bigram) == 2

    trigrams = list(
        extract.ngrams(doc,
                       3,
                       filter_stops=True,
                       filter_punct=True,
                       min_freq=2))[:10]
    for trigram in trigrams:
        assert isinstance(trigram, Span)
        assert len(trigram) == 3

    nes = list(
        extract.entities(doc, drop_determiners=False,
                         exclude_types="numeric"))[:10]
    for ne in nes:
        assert isinstance(ne, Span)
        assert ne.label_
        assert ne.label_ != "QUANTITY"

    pos_regex_matches = list(
        extract.pos_regex_matches(
            doc, constants.POS_REGEX_PATTERNS["en"]["NP"]))[:10]
    for match in pos_regex_matches:
        assert isinstance(match, Span)

    stmts = list(extract.semistructured_statements(doc, "I", cue="be"))[:10]
    for stmt in stmts:
        assert isinstance(stmt, list)
        assert isinstance(stmt[0], compat.unicode_)
        assert len(stmt) == 3

    kts = textacy.ke.textrank(doc, topn=10)
    for keyterm in kts:
        assert isinstance(keyterm, tuple)
        assert isinstance(keyterm[0], compat.unicode_)
        assert isinstance(keyterm[1], float)
        assert keyterm[1] > 0.0
Example #9
 def test_ngrams_good_tag(self):
     expected = [
         'I.M.F. seminar', 'technology trends', 'Middle East', 'education official',
         'Donald Trump', 'United States', 'what we']
     observed = [span.orth_ for span in extract.ngrams(
         self.spacy_doc, 2, filter_stops=False, filter_punct=False, filter_nums=False,
         good_pos_tags={'NOUN'})]
     self.assertEqual(observed, expected)
Example #10
 def test_ngrams_1(self):
     expected = [
         'Two', 'weeks', 'ago', ',', 'I', 'was', 'in', 'Kuwait', 'participating',
         'in', 'an', 'I.M.F.', 'seminar', 'for', 'Arab', 'educators', '.', 'For',
         '30', 'minutes', ',', 'we', 'discussed', 'the', 'impact']
     observed = [span.orth_ for span in extract.ngrams(
         self.spacy_doc, 1, filter_stops=False, filter_punct=False, filter_nums=False)][:25]
     self.assertEqual(observed, expected)
Example #11
def test_extract_functionality(doc):
    bigrams = list(
        extract.ngrams(doc,
                       2,
                       filter_stops=True,
                       filter_punct=True,
                       filter_nums=False))[:10]
    for bigram in bigrams:
        assert isinstance(bigram, Span)
        assert len(bigram) == 2

    trigrams = list(
        extract.ngrams(doc,
                       3,
                       filter_stops=True,
                       filter_punct=True,
                       min_freq=2))[:10]
    for trigram in trigrams:
        assert isinstance(trigram, Span)
        assert len(trigram) == 3

    nes = list(
        extract.entities(doc, drop_determiners=False,
                         exclude_types="numeric"))[:10]
    for ne in nes:
        assert isinstance(ne, Span)
        assert ne.label_
        assert ne.label_ != "QUANTITY"

    regex_matches = list(extract.regex_matches(doc, r"Mr\. Speaker"))[:10]
    for match in regex_matches:
        assert isinstance(match, Span)

    stmts = list(extract.semistructured_statements(doc, entity="I",
                                                   cue="be"))[:10]
    for stmt in stmts:
        assert isinstance(stmt, list)
        assert isinstance(stmt[0], str)
        assert len(stmt) == 3

    kts = kt.textrank(doc, topn=10)
    for keyterm in kts:
        assert isinstance(keyterm, tuple)
        assert isinstance(keyterm[0], str)
        assert isinstance(keyterm[1], float)
        assert keyterm[1] > 0.0
Example #12
 def test_ngrams_filter(self):
     expected = [
         'weeks ago', 'Kuwait participating', 'I.M.F. seminar', 'Arab educators',
         'technology trends', 'Middle East', 'Egyptian education', 'education official',
         'official raised', 'personal question', 'heard Donald', 'Donald Trump',
         'close mosques', 'United States', 'great sorrow']
     observed = [span.orth_ for span in extract.ngrams(
         self.spacy_doc, 2, filter_stops=True, filter_punct=True, filter_nums=True)]
     self.assertEqual(observed, expected)
Example #13
 def test_filter(self, spacy_doc):
     result = list(
         extract.ngrams(
             spacy_doc, 2, filter_stops=True, filter_punct=True, filter_nums=True
         )
     )
     assert not any(span[0].is_stop or span[-1].is_stop for span in result)
     assert not any(tok.is_punct for span in result for tok in span)
     assert not any(tok.like_num for span in result for tok in span)
Example #14
def get_ngrams(document, n, min_freq=1, filter_punct=True):
    res = sorted([
        n.text for n in ngrams(nlp(document),
                               n,
                               filter_stops=False,
                               min_freq=min_freq,
                               filter_punct=filter_punct)
    ])
    return res
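A possible call, assuming nlp is the module-level spaCy pipeline this helper relies on (not shown above):

# e.g. nlp = spacy.load('en_core_web_sm') at module level
print(get_ngrams('the cat sat on the mat, and the cat slept', n=2, min_freq=2))
# expected to contain 'the cat' twice, since that bigram repeats and stop words are kept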
Example #15
 def test_ngrams_2(self):
     expected = [
         'Two weeks', 'weeks ago', 'ago,', ', I', 'I was', 'was in', 'in Kuwait',
         'Kuwait participating', 'participating in', 'in an', 'an I.M.F.', 'I.M.F. seminar',
         'seminar for', 'for Arab', 'Arab educators', 'educators.', '. For', 'For 30',
         '30 minutes', 'minutes,', ', we', 'we discussed', 'discussed the', 'the impact',
         'impact of']
     observed = [span.orth_ for span in extract.ngrams(
         self.spacy_doc, 2, filter_stops=False, filter_punct=False, filter_nums=False)][:25]
     self.assertEqual(observed, expected)
Example #16
def test_ngrams_good_tag(spacy_doc):
    result = [
        span for span in extract.ngrams(spacy_doc,
                                        2,
                                        filter_stops=False,
                                        filter_punct=False,
                                        filter_nums=False,
                                        include_pos={'NOUN'})
    ]
    assert all(tok.pos_ == 'NOUN' for span in result for tok in span)
Example #17
 def test_callable_args(self, spacy_doc):
     results = list(
         extract.terms(
             spacy_doc,
             ngs=lambda doc: extract.ngrams(doc, n=2),
             ents=extract.entities,
             ncs=extract.noun_chunks,
         ))
     assert results
     assert all(isinstance(result, Span) for result in results)
Example #18
 def test_ngrams_min_freq(self):
     expected = ['in the', 'in the']
     observed = [
         span.orth_ for span in extract.ngrams(self.spacy_doc,
                                               2,
                                               filter_stops=False,
                                               filter_punct=False,
                                               filter_nums=False,
                                               min_freq=2)
     ]
     self.assertEqual(observed, expected)
Example #19
def test_ngrams_n(spacy_doc):
    for n in (1, 2):
        result = [
            span for span in extract.ngrams(spacy_doc,
                                            n,
                                            filter_stops=False,
                                            filter_punct=False,
                                            filter_nums=False)
        ]
        assert all(len(span) == n for span in result)
        assert all(isinstance(span, SpacySpan) for span in result)
Example #20
 def test_ngrams_good_tag(self):
     expected = ['technology trends', 'education official']
     observed = [
         span.orth_ for span in extract.ngrams(self.spacy_doc,
                                               2,
                                               filter_stops=False,
                                               filter_punct=False,
                                               filter_nums=False,
                                               good_pos_tags={'NOUN'})
     ]
     self.assertEqual(observed, expected)
Example #21
def _get_pos_ngrams_sent(spacy_sent, n):
    """
    Returns a list (including duplicates) of the POS ngrams appearing in spacy_sent.
    """
    pos_ngrams = []
    for ngram in extract.ngrams(spacy_sent,
                                n=n,
                                filter_stops=False,
                                filter_punct=False):
        ngram_string = " ".join([word.pos_ for word in ngram])
        pos_ngrams.append(ngram_string)
    return pos_ngrams  # list of strings
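For instance, a small sketch assuming an English model is loaded (the exact tags depend on the model):

import spacy

nlp = spacy.load('en_core_web_sm')
sent = next(nlp('The cat sat on the mat.').sents)
print(_get_pos_ngrams_sent(sent, 2))
# something like ['DET NOUN', 'NOUN VERB', 'VERB ADP', 'ADP DET', 'DET NOUN', 'NOUN PUNCT']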
Example #22
    def ngrams(self, n, **kwargs):
        """
        Extract an ordered sequence of n-grams (``n`` consecutive words) from doc,
        optionally filtering n-grams by the types and parts-of-speech of the
        constituent words.

        Args:
            n (int): number of tokens to include in n-grams;
                1 => unigrams, 2 => bigrams

        .. seealso:: :func:`extract.ngrams() <textacy.extract.ngrams>` for all function kwargs.
        """
        return extract.ngrams(self.spacy_doc, n, **kwargs)
Example #23
    def ngrams(self, n, **kwargs):
        """
        Extract an ordered sequence of n-grams (``n`` consecutive words) from doc,
        optionally filtering n-grams by the types and parts-of-speech of the
        constituent words.

        Args:
            n (int): number of tokens to include in n-grams;
                1 => unigrams, 2 => bigrams

        .. seealso:: :func:`extract.ngrams() <textacy.extract.ngrams>` for all function kwargs.
        """
        return extract.ngrams(self.spacy_doc, n, **kwargs)
Example #24
 def transform(self, documents):
     result = []
     for doc_ in documents:
         doc = NGramTransformer.nlp(doc_)
         result.append(
             sorted([
                 t.text for t in ngrams(doc,
                                        n=self.n,
                                        filter_stops=False,
                                        min_freq=self.min_freq,
                                        filter_punct=self.filter_punct)
             ]))
     return result
Example #25
def test_ngrams_min_freq(spacy_doc):
    n = 2
    counts = collections.Counter()
    counts.update(spacy_doc[i:i + n].lower_
                  for i in range(len(spacy_doc) - n + 1))
    result = [
        span for span in extract.ngrams(spacy_doc,
                                        n,
                                        filter_stops=False,
                                        filter_punct=False,
                                        filter_nums=False,
                                        min_freq=2)
    ]
    assert all(counts[span.lower_] >= 2 for span in result)
Example #26
 def test_ngrams_filter(self):
     result = [
         span for span in extract.ngrams(self.spacy_doc,
                                         2,
                                         filter_stops=True,
                                         filter_punct=True,
                                         filter_nums=True)
     ]
     self.assertTrue(not any(span[0].is_stop or span[-1].is_stop
                             for span in result))
     self.assertTrue(not any(tok.is_punct for span in result
                             for tok in span))
     self.assertTrue(not any(tok.like_num for span in result
                             for tok in span))
Example #27
 def test_ngrams_1(self):
     expected = [
         'Two', 'weeks', 'ago', ',', 'I', 'was', 'in', 'Kuwait',
         'participating', 'in', 'an', 'I.M.F.', 'seminar', 'for', 'Arab',
         'educators', '.', 'For', '30', 'minutes', ',', 'we', 'discussed',
         'the', 'impact'
     ]
     observed = [
         span.orth_ for span in extract.ngrams(self.spacy_doc,
                                               1,
                                               filter_stops=False,
                                               filter_punct=False,
                                               filter_nums=False)
     ][:25]
     self.assertEqual(observed, expected)
Example #28
 def test_ngrams_filter(self):
     expected = [
         'weeks ago', 'Kuwait participating', 'I.M.F. seminar',
         'Arab educators', 'technology trends', 'Middle East',
         'Egyptian education', 'education official', 'official raised',
         'personal question', 'heard Donald', 'Donald Trump',
         'close mosques', 'United States', 'great sorrow'
     ]
     observed = [
         span.orth_ for span in extract.ngrams(self.spacy_doc,
                                               2,
                                               filter_stops=True,
                                               filter_punct=True,
                                               filter_nums=True)
     ]
     self.assertEqual(observed, expected)
Example #29
 def test_ngrams_2(self):
     expected = [
         'Two weeks', 'weeks ago', 'ago,', ', I', 'I was', 'was in',
         'in Kuwait', 'Kuwait participating', 'participating in', 'in an',
         'an I.M.F.', 'I.M.F. seminar', 'seminar for', 'for Arab',
         'Arab educators', 'educators.', '. For', 'For 30', '30 minutes',
         'minutes,', ', we', 'we discussed', 'discussed the', 'the impact',
         'impact of'
     ]
     observed = [
         span.orth_ for span in extract.ngrams(self.spacy_doc,
                                               2,
                                               filter_stops=False,
                                               filter_punct=False,
                                               filter_nums=False)
     ][:25]
     self.assertEqual(observed, expected)
Example #30
    def run_custom_task(self, temp_file, mongo_client: MongoClient):
        log('run custom task')
        n_num = self.get_integer('n', default=2)
        filter_stops = self.get_boolean('filter_stops', default=True)
        filter_punct = self.get_boolean('filter_punct', default=True)
        filter_nums = self.get_boolean('filter_nums', default=False)
        lemmas = self.get_boolean('lemmas', default=True)
        limit_to_termset = self.get_boolean('limit_to_termset', default=False)
        termset = self.pipeline_config.terms
        if not termset:
            termset = list()
        lower_termset = [x.lower() for x in termset]

        for doc in self.docs:
            ngrams = list()
            cln_txt = self.get_document_text(doc, clean=True)
            t_doc = make_spacy_doc(preprocess_text(cln_txt, lowercase=True), lang='en')
            res = extract.ngrams(t_doc, n_num, filter_stops=filter_stops, filter_punct=filter_punct,
                                 filter_nums=filter_nums)
            for r in res:
                if lemmas:
                    text = r.lemma_
                else:
                    text = r.text

                if limit_to_termset:
                    for t in lower_termset:
                        if text == t or t in text:
                            ngrams.append({
                                'text': text,
                                'count': 1
                            })
                else:
                    ngrams.append({
                        'text': text,
                        'count': 1
                    })
            self.write_multiple_result_data(temp_file, mongo_client, doc, ngrams)
Example #31
    def run_custom_task(self, temp_file, mongo_client: MongoClient):
        print('run custom task')
        n_num = self.get_integer('n', default=2)
        filter_stops = self.get_boolean('filter_stops', default=True)
        filter_punct = self.get_boolean('filter_punct', default=True)
        filter_nums = self.get_boolean('filter_nums', default=False)
        lemmas = self.get_boolean('lemmas', default=True)
        limit_to_termset = self.get_boolean('limit_to_termset', default=False)
        termset = self.pipeline_config.terms
        if not termset:
            termset = list()
        lower_termset = [x.lower() for x in termset]

        for doc in self.docs:
            ngrams = list()
            cln_txt = self.get_document_text(doc, clean=True)
            t_doc = Doc(preprocess_text(cln_txt, lowercase=True))
            res = extract.ngrams(t_doc, n_num, filter_stops=filter_stops, filter_punct=filter_punct,
                                 filter_nums=filter_nums)
            for r in res:
                if lemmas:
                    text = r.lemma_
                else:
                    text = r.text

                if limit_to_termset:
                    for t in lower_termset:
                        if text == t or t in text:
                            ngrams.append({
                                'text': text,
                                'count': 1
                            })
                else:
                    ngrams.append({
                        'text': text,
                        'count': 1
                    })
            self.write_multiple_result_data(temp_file, mongo_client, doc, ngrams)
Example #32
def sgrank(doc, normalize='lemma', window_width=1500, n_keyterms=10, idf=None):
    """
    Extract key terms from a document using the [SGRank]_ algorithm.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        normalize (str or callable): if 'lemma', lemmatize terms; if 'lower',
            lowercase terms; if None, use the form of terms as they appeared in
            ``doc``; if a callable, must accept a ``spacy.Span`` and return a str,
            e.g. :func:`textacy.spacy_utils.normalized_str()`
        window_width (int): width of sliding window in which term
            co-occurrences are said to occur
        n_keyterms (int or float): if int, number of top-ranked terms
            to return as keyterms; if float, must be in the open interval (0, 1),
            is converted to an integer by ``round(len(doc) * n_keyterms)``
        idf (dict): mapping of ``normalize(term)`` to inverse document frequency
            for re-weighting of unigrams (n-grams with n > 1 have df assumed = 1);
            NOTE: results are better with idf information

    Returns:
        List[Tuple[str, float]]: sorted list of top ``n_keyterms`` key terms and
            their corresponding SGRank scores

    Raises:
        ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0]

    References:
        .. [SGRank] Danesh, Sumner, and Martin. "SGRank: Combining Statistical and
           Graphical Methods to Improve the State of the Art in Unsupervised Keyphrase
           Extraction". Lexical and Computational Semantics (* SEM 2015) (2015): 117.
    """
    n_toks = len(doc)
    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError('`n_keyterms` must be an int, or a float between 0.0 and 1.0')
        n_keyterms = int(round(n_toks * n_keyterms))
    window_width = min(n_toks, window_width)
    min_term_freq = min(n_toks // 1000, 4)

    # build full list of candidate terms
    # if inverse doc freqs available, include nouns, adjectives, and verbs;
    # otherwise, just include nouns and adjectives
    # (without IDF downweighting, verbs dominate the results in a bad way)
    include_pos = {'NOUN', 'PROPN', 'ADJ', 'VERB'} if idf else {'NOUN', 'PROPN', 'ADJ'}
    terms = itertoolz.concat(
        extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False,
                       include_pos=include_pos, min_freq=min_term_freq)
        for n in range(1, 7))

    # get normalized term strings, as desired
    # paired with positional index in document and length in a 3-tuple
    if normalize == 'lemma':
        terms = [(term.lemma_, term.start, len(term)) for term in terms]
    elif normalize == 'lower':
        terms = [(term.orth_.lower(), term.start, len(term)) for term in terms]
    elif not normalize:
        terms = [(term.text, term.start, len(term)) for term in terms]
    else:
        terms = [(normalize(term), term.start, len(term)) for term in terms]

    # pre-filter terms to the top N ranked by TF or modified TF*IDF
    n_prefilter_kts = max(3 * n_keyterms, 100)
    term_text_counts = Counter(term[0] for term in terms)
    if idf:
        mod_tfidfs = {
            term: count * idf.get(term, 1) if ' ' not in term else count
            for term, count in term_text_counts.items()}
        terms_set = {
            term for term, _
            in sorted(mod_tfidfs.items(), key=itemgetter(1), reverse=True)[:n_prefilter_kts]}
    else:
        terms_set = {term for term, _ in term_text_counts.most_common(n_prefilter_kts)}
    terms = [term for term in terms if term[0] in terms_set]

    # compute term weights from statistical attributes:
    # not subsumed frequency, position of first occurrence, and num words
    term_weights = {}
    seen_terms = set()
    n_toks_plus_1 = n_toks + 1
    for term in terms:
        term_text = term[0]
        # we only want the *first* occurrence of a unique term (by its text)
        if term_text in seen_terms:
            continue
        seen_terms.add(term_text)
        pos_first_occ_factor = math.log(n_toks_plus_1 / (term[1] + 1))
        # TODO: assess how best to scale term len
        term_len = math.sqrt(term[2])  # term[2]
        term_count = term_text_counts[term_text]
        subsum_count = sum(term_text_counts[t2] for t2 in terms_set
                           if t2 != term_text and term_text in t2)
        term_freq_factor = term_count - subsum_count
        if idf and term[2] == 1:
            term_freq_factor *= idf.get(term_text, 1)
        term_weights[term_text] = term_freq_factor * pos_first_occ_factor * term_len

    # filter terms to only those with positive weights
    terms = [term for term in terms if term_weights[term[0]] > 0]

    n_coocs = defaultdict(lambda: defaultdict(int))
    sum_logdists = defaultdict(lambda: defaultdict(float))

    # iterate over windows
    log_ = math.log  # localize this, for performance
    for start_ind in range(n_toks):
        end_ind = start_ind + window_width
        window_terms = (term for term in terms
                        if start_ind <= term[1] <= end_ind)
        # get all token combinations within window
        for t1, t2 in itertools.combinations(window_terms, 2):
            n_coocs[t1[0]][t2[0]] += 1
            sum_logdists[t1[0]][t2[0]] += log_(window_width / max(abs(t1[1] - t2[1]), 1))
        if end_ind > n_toks:
            break

    # compute edge weights between co-occurring terms (nodes)
    edge_weights = defaultdict(lambda: defaultdict(float))
    for t1, t2s in sum_logdists.items():
        for t2 in t2s:
            edge_weights[t1][t2] = (sum_logdists[t1][t2] / n_coocs[t1][t2]) * term_weights[t1] * term_weights[t2]
    # normalize edge weights by sum of outgoing edge weights per term (node)
    norm_edge_weights = []
    for t1, t2s in edge_weights.items():
        sum_edge_weights = sum(t2s.values())
        norm_edge_weights.extend((t1, t2, {'weight': weight / sum_edge_weights})
                                 for t2, weight in t2s.items())

    # build the weighted directed graph from edges, rank nodes by pagerank
    graph = nx.DiGraph()
    graph.add_edges_from(norm_edge_weights)
    term_ranks = nx.pagerank_scipy(graph)

    return sorted(term_ranks.items(), key=itemgetter(1, 0), reverse=True)[:n_keyterms]
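A hedged usage sketch for the function above; the text is illustrative only, and passing idf would reweight unigrams as the docstring notes:

import spacy

nlp = spacy.load('en_core_web_sm')
text = ('Key term extraction identifies the most important words and phrases in a '
        'document. Graph-based key term extraction builds a co-occurrence graph of '
        'candidate terms and then ranks those terms with PageRank.')
doc = nlp(text)

# top key terms with their SGRank scores, lemmatized
for term, score in sgrank(doc, normalize='lemma', n_keyterms=5):
    print(round(score, 4), term)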
Example #33
def sgrank(doc, window_width=1500, n_keyterms=10, idf=None):
    """
    Extract key terms from a document using the [SGRank]_ algorithm.

    Args:
        doc (``spacy.Doc``)
        window_width (int, optional): width of sliding window in which term
            co-occurrences are said to occur
        n_keyterms (int or float, optional): if int, number of top-ranked terms
            to return as keyterms; if float, must be in the open interval (0, 1),
            representing the fraction of top-ranked terms to return as keyterms
        idf (dict, optional): mapping of
            {`normalized_str(term) <textacy.spacy_utils.normalized_str>`: inverse document frequency}
            for re-weighting of unigrams (n-grams with n > 1 have df assumed = 1);
            NOTE: results are better with idf information

    Returns:
        list[(str, float)]: sorted list of top ``n_keyterms`` key terms and their
            corresponding SGRank scores

    Raises:
        ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0]

    References:
        .. [SGRank] Danesh, Sumner, and Martin. "SGRank: Combining Statistical and
           Graphical Methods to Improve the State of the Art in Unsupervised Keyphrase
           Extraction". Lexical and Computational Semantics (* SEM 2015) (2015): 117.
    """
    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError('`n_keyterms` must be an int, or a float between 0.0 and 1.0')
    n_toks = len(doc)
    min_term_freq = min(n_toks // 1500, 4)

    # build full list of candidate terms
    terms = list(itertoolz.concat(
        extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False,
                       good_pos_tags={'NOUN', 'ADJ'}, min_freq=min_term_freq)
        for n in range(1, 7)))
    # if inverse document frequencies available, also add verbs
    # verbs without IDF downweighting dominate the results, and not in a good way
    if idf:
        terms.extend(itertoolz.concat(
            extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False,
                           good_pos_tags={'VERB'}, min_freq=min_term_freq)
            for n in range(1, 7)))

    terms_as_strs = {id(term): spacy_utils.normalized_str(term)
                     for term in terms}

    # pre-filter terms to the top 20% ranked by TF or modified TF*IDF, if available
    n_top_20pct = int(len(terms) * 0.2)
    term_counts = Counter(terms_as_strs[id(term)] for term in terms)
    if idf:
        mod_tfidfs = {term: count * idf[term] if ' ' not in term else count
                      for term, count in term_counts.items()}
        top_term_texts = {term for term, _ in sorted(
            mod_tfidfs.items(), key=itemgetter(1), reverse=True)[:n_top_20pct]}
    else:
        top_term_texts = {term for term, _ in term_counts.most_common(n_top_20pct)}

    terms = [term for term in terms
             if terms_as_strs[id(term)] in top_term_texts]

    # compute term weights from statistical attributes
    term_weights = {}
    set_terms_as_str = {terms_as_strs[id(term)] for term in terms}
    n_toks_plus_1 = n_toks + 1
    for term in terms:
        term_str = terms_as_strs[id(term)]
        pos_first_occ_factor = math.log(n_toks_plus_1 / (term.start + 1))
        # TODO: assess if len(t) puts too much emphasis on long terms
        # alternative: term_len = 1 if ' ' not in term else math.sqrt(len(term))
        term_len = 1 if ' ' not in term else len(term)
        term_count = term_counts[term_str]
        subsum_count = sum(term_counts[t2] for t2 in set_terms_as_str
                           if t2 != term_str and term_str in t2)
        term_freq_factor = (term_count - subsum_count)
        if idf and ' ' not in term_str:
            term_freq_factor *= idf[term_str]
        term_weights[term_str] = term_freq_factor * pos_first_occ_factor * term_len

    # filter terms to only those with positive weights
    terms = [term for term in terms
             if term_weights[terms_as_strs[id(term)]] > 0]

    n_coocs = defaultdict(lambda: defaultdict(int))
    sum_logdists = defaultdict(lambda: defaultdict(float))

    # iterate over windows
    for start_ind in range(n_toks):
        end_ind = start_ind + window_width
        window_terms = (term for term in terms
                        if start_ind <= term.start <= end_ind)
        # get all token combinations within window
        for t1, t2 in itertools.combinations(window_terms, 2):
            if t1 is t2:
                continue
            n_coocs[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += 1
            try:
                sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \
                    math.log(window_width / abs(t1.start - t2.start))
            except ZeroDivisionError:  # HACK: pretend that they're 1 token apart
                sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \
                    math.log(window_width)
        if end_ind > n_toks:
            break

    # compute edge weights between co-occurring terms (nodes)
    edge_weights = defaultdict(lambda: defaultdict(float))
    for t1, t2s in sum_logdists.items():
        for t2 in t2s:
            edge_weights[t1][t2] = (sum_logdists[t1][t2] / n_coocs[t1][t2]) * term_weights[t1] * term_weights[t2]
    # normalize edge weights by sum of outgoing edge weights per term (node)
    norm_edge_weights = []
    for t1, t2s in edge_weights.items():
        sum_edge_weights = sum(t2s.values())
        norm_edge_weights.extend((t1, t2, {'weight': weight / sum_edge_weights})
                                 for t2, weight in t2s.items())

    # build the weighted directed graph from edges, rank nodes by pagerank
    graph = nx.DiGraph()
    graph.add_edges_from(norm_edge_weights)
    term_ranks = nx.pagerank_scipy(graph)

    if isinstance(n_keyterms, float):
        n_keyterms = int(len(term_ranks) * n_keyterms)

    return sorted(term_ranks.items(), key=itemgetter(1), reverse=True)[:n_keyterms]
Example #34
# pip install textacy
# pip install spacy
# python -m spacy download en_core_web_sm

import pandas as pd
import spacy
from textacy.extract import ngrams

nlp = spacy.load('en_core_web_sm')

text = nlp(
    'Data science is an inter-disciplinary field that uses'
    ' scientific methods, processes, algorithms, and systems to extract'
    ' knowledge and insights from many structural and unstructured data.')

n_grams = 2  # length of each contiguous word sequence to extract
min_freq = 1  # minimum number of times an n-gram must occur to be kept

print(
    pd.Series([n.text
               for n in ngrams(text, n=n_grams, min_freq=min_freq)]).value_counts())
""" 
disciplinary field    1
scientific methods    1
unstructured data     1
Data science          1
extract knowledge     1
uses scientific       1
"""
Example #35
def sgrank(doc, window_width=1500, n_keyterms=10, idf=None):
    """
    Extract key terms from a document using the [SGRank]_ algorithm.

    Args:
        doc (``spacy.Doc``)
        window_width (int, optional): width of sliding window in which term
            co-occurrences are said to occur
        n_keyterms (int or float, optional): if int, number of top-ranked terms
            to return as keyterms; if float, must be in the open interval (0, 1),
            representing the fraction of top-ranked terms to return as keyterms
        idf (dict, optional): mapping of
            {`normalized_str(term) <textacy.spacy_utils.normalized_str>`: inverse document frequency}
            for re-weighting of unigrams (n-grams with n > 1 have df assumed = 1);
            NOTE: results are better with idf information

    Returns:
        list[(str, float)]: sorted list of top ``n_keyterms`` key terms and their
            corresponding SGRank scores

    Raises:
        ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0]

    References:
        .. [SGRank] Danesh, Sumner, and Martin. "SGRank: Combining Statistical and
           Graphical Methods to Improve the State of the Art in Unsupervised Keyphrase
           Extraction". Lexical and Computational Semantics (* SEM 2015) (2015): 117.
    """
    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError(
                '`n_keyterms` must be an int, or a float between 0.0 and 1.0')
    n_toks = len(doc)
    min_term_freq = min(n_toks // 1500, 4)

    # build full list of candidate terms
    terms = list(
        itertoolz.concat(
            extract.ngrams(doc,
                           n,
                           filter_stops=True,
                           filter_punct=True,
                           filter_nums=False,
                           good_pos_tags={'NOUN', 'ADJ'},
                           min_freq=min_term_freq) for n in range(1, 7)))
    # if inverse document frequencies available, also add verbs
    # verbs without IDF downweighting dominate the results, and not in a good way
    if idf:
        terms.extend(
            itertoolz.concat(
                extract.ngrams(doc,
                               n,
                               filter_stops=True,
                               filter_punct=True,
                               filter_nums=False,
                               good_pos_tags={'VERB'},
                               min_freq=min_term_freq) for n in range(1, 7)))

    terms_as_strs = {
        id(term): spacy_utils.normalized_str(term)
        for term in terms
    }

    # pre-filter terms to the top 20% ranked by TF or modified TF*IDF, if available
    n_top_20pct = int(len(terms) * 0.2)
    term_counts = Counter(terms_as_strs[id(term)] for term in terms)
    if idf:
        mod_tfidfs = {
            term: count * idf[term] if ' ' not in term else count
            for term, count in term_counts.items()
        }
        top_term_texts = {
            term
            for term, _ in sorted(mod_tfidfs.items(),
                                  key=itemgetter(1),
                                  reverse=True)[:n_top_20pct]
        }
    else:
        top_term_texts = {
            term
            for term, _ in term_counts.most_common(n_top_20pct)
        }

    terms = [
        term for term in terms if terms_as_strs[id(term)] in top_term_texts
    ]

    # compute term weights from statistical attributes
    term_weights = {}
    set_terms_as_str = {terms_as_strs[id(term)] for term in terms}
    n_toks_plus_1 = n_toks + 1
    for term in terms:
        term_str = terms_as_strs[id(term)]
        pos_first_occ_factor = log(n_toks_plus_1 / (term.start + 1))
        # TODO: assess if len(t) puts too much emphasis on long terms
        # alternative: term_len = 1 if ' ' not in term else sqrt(len(term))
        term_len = 1 if ' ' not in term else len(term)
        term_count = term_counts[term_str]
        subsum_count = sum(term_counts[t2] for t2 in set_terms_as_str
                           if t2 != term_str and term_str in t2)
        term_freq_factor = (term_count - subsum_count)
        if idf and ' ' not in term_str:
            term_freq_factor *= idf[term_str]
        term_weights[
            term_str] = term_freq_factor * pos_first_occ_factor * term_len

    # filter terms to only those with positive weights
    terms = [
        term for term in terms if term_weights[terms_as_strs[id(term)]] > 0
    ]

    n_coocs = defaultdict(lambda: defaultdict(int))
    sum_logdists = defaultdict(lambda: defaultdict(float))

    # iterate over windows
    for start_ind in range(n_toks):
        end_ind = start_ind + window_width
        window_terms = (term for term in terms
                        if start_ind <= term.start <= end_ind)
        # get all token combinations within window
        for t1, t2 in itertools.combinations(window_terms, 2):
            if t1 is t2:
                continue
            n_coocs[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += 1
            try:
                sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \
                    log(window_width / abs(t1.start - t2.start))
            except ZeroDivisionError:  # HACK: pretend that they're 1 token apart
                sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \
                    log(window_width)
        if end_ind > n_toks:
            break

    # compute edge weights between co-occurring terms (nodes)
    edge_weights = defaultdict(lambda: defaultdict(float))
    for t1, t2s in sum_logdists.items():
        for t2 in t2s:
            edge_weights[t1][t2] = (sum_logdists[t1][t2] / n_coocs[t1][t2]
                                    ) * term_weights[t1] * term_weights[t2]
    # normalize edge weights by sum of outgoing edge weights per term (node)
    norm_edge_weights = []
    for t1, t2s in edge_weights.items():
        sum_edge_weights = sum(t2s.values())
        norm_edge_weights.extend((t1, t2, {
            'weight': weight / sum_edge_weights
        }) for t2, weight in t2s.items())

    # build the weighted directed graph from edges, rank nodes by pagerank
    graph = nx.DiGraph()
    graph.add_edges_from(norm_edge_weights)
    term_ranks = nx.pagerank_scipy(graph)

    if isinstance(n_keyterms, float):
        n_keyterms = int(len(term_ranks) * n_keyterms)

    return sorted(term_ranks.items(), key=itemgetter(1),
                  reverse=True)[:n_keyterms]
Example #36
 def test_ngrams_less_than_1(self):
     with self.assertRaises(ValueError):
         list(extract.ngrams(self.spacy_doc, 0))
Example #37
 def test_ngrams_good_tag(self):
     expected = ['technology trends', 'education official']
     observed = [span.orth_ for span in extract.ngrams(
         self.spacy_doc, 2, filter_stops=False, filter_punct=False, filter_nums=False,
         good_pos_tags={'NOUN'})]
     self.assertEqual(observed, expected)
Example #38
 def test_ngrams_min_freq(self):
     expected = ['in the', 'in the']
     observed = [span.orth_ for span in extract.ngrams(
         self.spacy_doc, 2, filter_stops=False, filter_punct=False, filter_nums=False,
         min_freq=2)]
     self.assertEqual(observed, expected)
Example #39
 def test_n_less_than_1(self, spacy_doc):
     with pytest.raises(ValueError):
         _ = list(extract.ngrams(spacy_doc, 0))
Example #40
if __name__ == "__main__":
    content = "Can we forge against these enemies a grand and global alliance, North and South, East and West, that " \
              "can assure a more fruitful life for all mankind? Will you join in that historic effort? In the long " \
              "history of the world, only a few generations have been granted the role of defending freedom in its " \
              "hour of maximum danger. I do not shrink from this responsibility — I welcome it. I do not believe " \
              "that any of us would exchange places with any other people or any other generation. The energy, the " \
              "faith, the devotion which we bring to this endeavor will light our country and all who serve it — and " \
              "the glow from that fire can truly light the world."
    d = Doc(content)
    results = extract.ngrams(d, 3)
    print(results)
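Note that extract.ngrams is a generator of spaCy Span objects, so the print(results) call above shows a generator object rather than the trigrams themselves; materializing it first makes the output readable:

print([span.text for span in extract.ngrams(d, 3)])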
Example #41
 def test_n(self, spacy_doc):
     for n in (1, 2):
         result = list(extract.ngrams(spacy_doc, n))
         assert all(isinstance(span, Span) for span in result)
         assert all(len(span) == n for span in result)
Example #42
 def test_ngrams_less_than_1(self):
     with self.assertRaises(ValueError):
         list(extract.ngrams(self.spacy_doc, 0))