def select_extractors(use_unigrams=False):
    """
    Extractors for alignment.

    :return: list of Extractor objects to use for text-text alignment

    note: the tetragram and trigram extractors filter out stopwords and
    number words/symbols; the bigram and unigram extractors do not
    """
    noun_chunk_extractor = Extractor(
        lambda doc: list(filter(lambda x: len(x) > 3, list(noun_chunks(doc)))))
    tetragram_extractor = Extractor(
        lambda doc: list(ngrams(doc, 4, filter_stops=True, filter_nums=True)))
    trigram_extractor = Extractor(
        lambda doc: list(ngrams(doc, 3, filter_stops=True, filter_nums=True)))
    bigram_extractor = Extractor(
        lambda doc: list(ngrams(doc, 2, filter_stops=False, filter_nums=False)))
    unigram_extractor = Extractor(
        lambda doc: list(ngrams(doc, 1, filter_stops=False, filter_nums=False)))
    extractor_list = [
        noun_chunk_extractor,
        tetragram_extractor,
        trigram_extractor,
        bigram_extractor,
    ]
    if use_unigrams:
        extractor_list.append(unigram_extractor)
    return extractor_list
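# NOTE: a minimal usage sketch, not part of the original source. It assumes the
# `noun_chunks` and `ngrams` helpers referenced above are textacy.extract.noun_chunks
# and textacy.extract.ngrams, and that "en_core_web_sm" is the spaCy pipeline in use.
import spacy
from textacy import extract

nlp = spacy.load("en_core_web_sm")
doc = nlp("The alignment model matches noun chunks and n-grams between two documents.")

# what the noun-chunk extractor above yields: noun chunks longer than 3 tokens
print([nc.text for nc in extract.noun_chunks(doc) if len(nc) > 3])
# what the bigram extractor above yields: all bigrams, stopwords and numbers included
print([ng.text for ng in extract.ngrams(doc, 2, filter_stops=False, filter_nums=False)])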
def test_pos(self, spacy_doc): result1 = list(extract.ngrams(spacy_doc, 2, include_pos={"NOUN"})) result2 = list(extract.ngrams(spacy_doc, 2, include_pos="NOUN")) assert all(tok.pos_ == "NOUN" for span in result1 for tok in span) assert all(tok.pos_ == "NOUN" for span in result2 for tok in span) result3 = list(extract.ngrams(spacy_doc, 2, exclude_pos={"NOUN"})) result4 = list(extract.ngrams(spacy_doc, 2, exclude_pos="NOUN")) assert not any(tok.pos_ == "NOUN" for span in result3 for tok in span) assert not any(tok.pos_ == "NOUN" for span in result4 for tok in span)
def test_extract_functionality(doc): bigrams = list( extract.ngrams(doc, 2, filter_stops=True, filter_punct=True, filter_nums=False))[:10] for bigram in bigrams: assert isinstance(bigram, SpacySpan) assert len(bigram) == 2 trigrams = list( extract.ngrams(doc, 3, filter_stops=True, filter_punct=True, min_freq=2))[:10] for trigram in trigrams: assert isinstance(trigram, SpacySpan) assert len(trigram) == 3 nes = list( extract.named_entities(doc, drop_determiners=False, exclude_types='numeric'))[:10] for ne in nes: assert isinstance(ne, SpacySpan) assert ne.label_ assert ne.label_ != 'QUANTITY' pos_regex_matches = list( extract.pos_regex_matches( doc, constants.POS_REGEX_PATTERNS['en']['NP']))[:10] for match in pos_regex_matches: assert isinstance(match, SpacySpan) stmts = list(extract.semistructured_statements(doc, 'I', cue='be'))[:10] for stmt in stmts: assert isinstance(stmt, list) assert isinstance(stmt[0], compat.unicode_) assert len(stmt) == 3 kts = keyterms.textrank(doc, n_keyterms=10) for keyterm in kts: assert isinstance(keyterm, tuple) assert isinstance(keyterm[0], compat.unicode_) assert isinstance(keyterm[1], float) assert keyterm[1] > 0.0
def ngrams(self, n, **kwargs): """ Extract an ordered sequence of n-grams (``n`` consecutive words) from doc, optionally filtering n-grams by the types and parts-of-speech of the constituent words. Args: n (int): number of tokens to include in n-grams; 1 => unigrams, 2 => bigrams **kwargs: filter_stops (bool, optional): if True, remove ngrams that start or end with a stop word filter_punct (bool, optional): if True, remove ngrams that contain any punctuation-only tokens filter_nums (bool, optional): if True, remove ngrams that contain any numbers or number-like tokens (e.g. 10, 'ten') good_pos_tags (set[str], optional): remove ngrams whose constituent tokens' part-of-speech tags are NOT all in the specified tags, using the universal POS tagset bad_pos_tags (set[str], optional): remove ngrams if any of their constituent tokens' part-of-speech tags are in the specified tags, using the universal POS tagset min_freq (int, optional): remove ngrams that occur in `doc` fewer than `min_freq` times Yields: ``spacy.Span``: the next ngram passing all specified filters, in order of appearance in the document .. seealso:: :func:`extract.ngrams() <textacy.extract.ngrams>` """ for ngram in extract.ngrams(self.spacy_doc, n, **kwargs): yield ngram
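# NOTE: a minimal sketch, not from the original source, of the call this wrapper
# delegates to; the spaCy pipeline name and sample sentence are assumptions.
import spacy
from textacy import extract

nlp = spacy.load("en_core_web_sm")
spacy_doc = nlp("Two weeks ago, I was in Kuwait participating in an I.M.F. seminar.")

# roughly equivalent to calling doc.ngrams(2, filter_stops=True, filter_punct=True)
bigrams = list(extract.ngrams(spacy_doc, 2, filter_stops=True, filter_punct=True))
print([span.text for span in bigrams])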
def align_data(self, text, data):
    if isinstance(text, str):
        doc = self.nlp(text)
    else:
        doc = text
    ngrams = []
    for n in range(1, min(self.max_n + 1, len(doc))):
        ngrams.extend(extract.ngrams(doc, n))
    subject_sims = [(ngram, self.similarity_metric(ngram.text, data['subject']))
                    for ngram in ngrams]
    self.logger.debug("Similarities from subject %s", list(zip(ngrams, subject_sims)))
    subject_span, subject_sim = max(subject_sims, key=lambda x: x[1])
    self.logger.debug(
        f"Selected subject_span [{subject_span}] with similarity [{subject_sim}] for [{data['subject']}]"
    )
    object_sims = [(ngram, self.similarity_metric(ngram.text, data['object']))
                   for ngram in ngrams]
    self.logger.debug("Similarities from object %s", list(zip(ngrams, object_sims)))
    object_span = None
    for span, sim in sorted(object_sims, key=lambda x: x[1], reverse=True):
        # skip any candidate span that overlaps the selected subject span
        if overlaps(span, subject_span):
            self.logger.debug(
                f"Span [{span.text}] overlaps subject_span [{subject_span.text}]"
            )
            continue
        object_span = span
        self.logger.debug(
            f"Selected object_span [{object_span.text}] with similarity [{sim}] for [{data['object']}]"
        )
        break
    if object_span is None:
        self.logger.warning("Could not extract an object_span.")
    self.subject_align[data['subject']] = subject_span
    self.object_align[data['object']] = object_span
    return subject_span, object_span
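# NOTE: the overlaps() helper called above is defined elsewhere in this project;
# the sketch below is only an assumption about its behavior, included for context.
def overlaps(span_a, span_b):
    """Return True if two spaCy spans share at least one token position."""
    return span_a.start < span_b.end and span_b.start < span_a.end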
def test_min_freq(self, spacy_doc): n = 2 counts = collections.Counter() counts.update(spacy_doc[i:i + n].lower_ for i in range(len(spacy_doc) - n + 1)) result = list(extract.ngrams(spacy_doc, 2, min_freq=2)) assert all(counts[span.lower_] >= 2 for span in result)
def test_extract_functionality(doc): bigrams = list( extract.ngrams(doc, 2, filter_stops=True, filter_punct=True, filter_nums=False))[:10] for bigram in bigrams: assert isinstance(bigram, Span) assert len(bigram) == 2 trigrams = list( extract.ngrams(doc, 3, filter_stops=True, filter_punct=True, min_freq=2))[:10] for trigram in trigrams: assert isinstance(trigram, Span) assert len(trigram) == 3 nes = list( extract.entities(doc, drop_determiners=False, exclude_types="numeric"))[:10] for ne in nes: assert isinstance(ne, Span) assert ne.label_ assert ne.label_ != "QUANTITY" pos_regex_matches = list( extract.pos_regex_matches( doc, constants.POS_REGEX_PATTERNS["en"]["NP"]))[:10] for match in pos_regex_matches: assert isinstance(match, Span) stmts = list(extract.semistructured_statements(doc, "I", cue="be"))[:10] for stmt in stmts: assert isinstance(stmt, list) assert isinstance(stmt[0], compat.unicode_) assert len(stmt) == 3 kts = textacy.ke.textrank(doc, topn=10) for keyterm in kts: assert isinstance(keyterm, tuple) assert isinstance(keyterm[0], compat.unicode_) assert isinstance(keyterm[1], float) assert keyterm[1] > 0.0
def test_ngrams_good_tag(self): expected = [ 'I.M.F. seminar', 'technology trends', 'Middle East', 'education official', 'Donald Trump', 'United States', 'what we'] observed = [span.orth_ for span in extract.ngrams( self.spacy_doc, 2, filter_stops=False, filter_punct=False, filter_nums=False, good_pos_tags={'NOUN'})] self.assertEqual(observed, expected)
def test_ngrams_1(self): expected = [ 'Two', 'weeks', 'ago', ',', 'I', 'was', 'in', 'Kuwait', 'participating', 'in', 'an', 'I.M.F.', 'seminar', 'for', 'Arab', 'educators', '.', 'For', '30', 'minutes', ',', 'we', 'discussed', 'the', 'impact'] observed = [span.orth_ for span in extract.ngrams( self.spacy_doc, 1, filter_stops=False, filter_punct=False, filter_nums=False)][:25] self.assertEqual(observed, expected)
def test_extract_functionality(doc): bigrams = list( extract.ngrams(doc, 2, filter_stops=True, filter_punct=True, filter_nums=False))[:10] for bigram in bigrams: assert isinstance(bigram, Span) assert len(bigram) == 2 trigrams = list( extract.ngrams(doc, 3, filter_stops=True, filter_punct=True, min_freq=2))[:10] for trigram in trigrams: assert isinstance(trigram, Span) assert len(trigram) == 3 nes = list( extract.entities(doc, drop_determiners=False, exclude_types="numeric"))[:10] for ne in nes: assert isinstance(ne, Span) assert ne.label_ assert ne.label_ != "QUANTITY" regex_matches = list(extract.regex_matches(doc, "Mr\. Speaker"))[:10] for match in regex_matches: assert isinstance(match, Span) stmts = list(extract.semistructured_statements(doc, entity="I", cue="be"))[:10] for stmt in stmts: assert isinstance(stmt, list) assert isinstance(stmt[0], str) assert len(stmt) == 3 kts = kt.textrank(doc, topn=10) for keyterm in kts: assert isinstance(keyterm, tuple) assert isinstance(keyterm[0], str) assert isinstance(keyterm[1], float) assert keyterm[1] > 0.0
def test_ngrams_filter(self): expected = [ 'weeks ago', 'Kuwait participating', 'I.M.F. seminar', 'Arab educators', 'technology trends', 'Middle East', 'Egyptian education', 'education official', 'official raised', 'personal question', 'heard Donald', 'Donald Trump', 'close mosques', 'United States', 'great sorrow'] observed = [span.orth_ for span in extract.ngrams( self.spacy_doc, 2, filter_stops=True, filter_punct=True, filter_nums=True)] self.assertEqual(observed, expected)
def test_filter(self, spacy_doc): result = list( extract.ngrams( spacy_doc, 2, filter_stops=True, filter_punct=True, filter_nums=True ) ) assert not any(span[0].is_stop or span[-1].is_stop for span in result) assert not any(tok.is_punct for span in result for tok in span) assert not any(tok.like_num for span in result for tok in span)
def get_ngrams(document, n, min_freq=1, filter_punct=True): res = sorted([ n.text for n in ngrams(nlp(document), n, filter_stops=False, min_freq=min_freq, filter_punct=filter_punct) ]) return res
def test_ngrams_2(self): expected = [ 'Two weeks', 'weeks ago', 'ago,', ', I', 'I was', 'was in', 'in Kuwait', 'Kuwait participating', 'participating in', 'in an', 'an I.M.F.', 'I.M.F. seminar', 'seminar for', 'for Arab', 'Arab educators', 'educators.', '. For', 'For 30', '30 minutes', 'minutes,', ', we', 'we discussed', 'discussed the', 'the impact', 'impact of'] observed = [span.orth_ for span in extract.ngrams( self.spacy_doc, 2, filter_stops=False, filter_punct=False, filter_nums=False)][:25] self.assertEqual(observed, expected)
def test_ngrams_good_tag(spacy_doc): result = [ span for span in extract.ngrams(spacy_doc, 2, filter_stops=False, filter_punct=False, filter_nums=False, include_pos={'NOUN'}) ] assert all(tok.pos_ == 'NOUN' for span in result for tok in span)
def test_callable_args(self, spacy_doc): results = list( extract.terms( spacy_doc, ngs=lambda doc: extract.ngrams(doc, n=2), ents=extract.entities, ncs=extract.noun_chunks, )) assert results assert all(isinstance(result, Span) for result in results)
def test_ngrams_min_freq(self): expected = ['in the', 'in the'] observed = [ span.orth_ for span in extract.ngrams(self.spacy_doc, 2, filter_stops=False, filter_punct=False, filter_nums=False, min_freq=2) ] self.assertEqual(observed, expected)
def test_ngrams_n(spacy_doc): for n in (1, 2): result = [ span for span in extract.ngrams(spacy_doc, n, filter_stops=False, filter_punct=False, filter_nums=False) ] assert all(len(span) == n for span in result) assert all(isinstance(span, SpacySpan) for span in result)
def test_ngrams_good_tag(self): expected = ['technology trends', 'education official'] observed = [ span.orth_ for span in extract.ngrams(self.spacy_doc, 2, filter_stops=False, filter_punct=False, filter_nums=False, good_pos_tags={'NOUN'}) ] self.assertEqual(observed, expected)
def _get_pos_ngrams_sent(spacy_sent, n): """ Returns a list (including duplicates) of the POS ngrams appearing in spacy_sent. """ pos_ngrams = [] for ngram in extract.ngrams(spacy_sent, n=n, filter_stops=False, filter_punct=False): ngram_string = " ".join([word.pos_ for word in ngram]) pos_ngrams.append(ngram_string) return pos_ngrams # list of strings
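# NOTE: a small usage sketch, not from the original source; assumes an English
# spaCy pipeline ("en_core_web_sm") and that _get_pos_ngrams_sent is in scope.
import spacy

nlp = spacy.load("en_core_web_sm")
sent = next(nlp("The quick brown fox jumps over the lazy dog.").sents)
print(_get_pos_ngrams_sent(sent, n=2))
# e.g. ['DET ADJ', 'ADJ ADJ', 'ADJ NOUN', 'NOUN VERB', ...]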
def ngrams(self, n, **kwargs): """ Extract an ordered sequence of n-grams (``n`` consecutive words) from doc, optionally filtering n-grams by the types and parts-of-speech of the constituent words. Args: n (int): number of tokens to include in n-grams; 1 => unigrams, 2 => bigrams .. seealso:: :func:`extract.ngrams() <textacy.extract.ngrams>` for all function kwargs. """ return extract.ngrams(self.spacy_doc, n, **kwargs)
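# NOTE: a usage sketch, not from the original source, pairing the n-gram stream
# with a frequency count; the pipeline name and sample text are assumptions.
import collections

import spacy
from textacy import extract

nlp = spacy.load("en_core_web_sm")
spacy_doc = nlp("For 30 minutes, we discussed the impact of technology trends "
                "on technology trends in education.")

bigram_counts = collections.Counter(
    span.lower_
    for span in extract.ngrams(spacy_doc, 2, filter_stops=False, filter_punct=False))
print(bigram_counts.most_common(5))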
def transform(self, documents): result = [] for doc_ in documents: doc = NGramTransformer.nlp(doc_) result.append( sorted([ t.text for t in ngrams(doc, n=self.n, filter_stops=False, min_freq=self.min_freq, filter_punct=self.filter_punct) ])) return result
def test_ngrams_min_freq(spacy_doc): n = 2 counts = collections.Counter() counts.update(spacy_doc[i:i + n].lower_ for i in range(len(spacy_doc) - n + 1)) result = [ span for span in extract.ngrams(spacy_doc, n, filter_stops=False, filter_punct=False, filter_nums=False, min_freq=2) ] assert all(counts[span.lower_] >= 2 for span in result)
def test_ngrams_filter(self): result = [ span for span in extract.ngrams(self.spacy_doc, 2, filter_stops=True, filter_punct=True, filter_nums=True) ] self.assertTrue(not any(span[0].is_stop or span[-1].is_stop for span in result)) self.assertTrue(not any(tok.is_punct for span in result for tok in span)) self.assertTrue(not any(tok.like_num for span in result for tok in span))
def test_ngrams_1(self): expected = [ 'Two', 'weeks', 'ago', ',', 'I', 'was', 'in', 'Kuwait', 'participating', 'in', 'an', 'I.M.F.', 'seminar', 'for', 'Arab', 'educators', '.', 'For', '30', 'minutes', ',', 'we', 'discussed', 'the', 'impact' ] observed = [ span.orth_ for span in extract.ngrams(self.spacy_doc, 1, filter_stops=False, filter_punct=False, filter_nums=False) ][:25] self.assertEqual(observed, expected)
def test_ngrams_filter(self): expected = [ 'weeks ago', 'Kuwait participating', 'I.M.F. seminar', 'Arab educators', 'technology trends', 'Middle East', 'Egyptian education', 'education official', 'official raised', 'personal question', 'heard Donald', 'Donald Trump', 'close mosques', 'United States', 'great sorrow' ] observed = [ span.orth_ for span in extract.ngrams(self.spacy_doc, 2, filter_stops=True, filter_punct=True, filter_nums=True) ] self.assertEqual(observed, expected)
def test_ngrams_2(self): expected = [ 'Two weeks', 'weeks ago', 'ago,', ', I', 'I was', 'was in', 'in Kuwait', 'Kuwait participating', 'participating in', 'in an', 'an I.M.F.', 'I.M.F. seminar', 'seminar for', 'for Arab', 'Arab educators', 'educators.', '. For', 'For 30', '30 minutes', 'minutes,', ', we', 'we discussed', 'discussed the', 'the impact', 'impact of' ] observed = [ span.orth_ for span in extract.ngrams(self.spacy_doc, 2, filter_stops=False, filter_punct=False, filter_nums=False) ][:25] self.assertEqual(observed, expected)
def run_custom_task(self, temp_file, mongo_client: MongoClient): log('run custom task') n_num = self.get_integer('n', default=2) filter_stops = self.get_boolean('filter_stops', default=True) filter_punct = self.get_boolean('filter_punct', default=True) filter_nums = self.get_boolean('filter_nums', default=False) lemmas = self.get_boolean('lemmas', default=True) limit_to_termset = self.get_boolean('limit_to_termset', default=False) termset = self.pipeline_config.terms if not termset: termset = list() lower_termset = [x.lower() for x in termset] for doc in self.docs: ngrams = list() cln_txt = self.get_document_text(doc, clean=True) t_doc = make_spacy_doc(preprocess_text(cln_txt, lowercase=True), lang='en') res = extract.ngrams(t_doc, n_num, filter_stops=filter_stops, filter_punct=filter_punct, filter_nums=filter_nums) for r in res: if lemmas: text = r.lemma_ else: text = r.text if limit_to_termset: for t in lower_termset: if text == t or t in text: ngrams.append({ 'text': text, 'count': 1 }) else: ngrams.append({ 'text': text, 'count': 1 }) self.write_multiple_result_data(temp_file, mongo_client, doc, ngrams)
def run_custom_task(self, temp_file, mongo_client: MongoClient): print('run custom task') n_num = self.get_integer('n', default=2) filter_stops = self.get_boolean('filter_stops', default=True) filter_punct = self.get_boolean('filter_punct', default=True) filter_nums = self.get_boolean('filter_nums', default=False) lemmas = self.get_boolean('lemmas', default=True) limit_to_termset = self.get_boolean('limit_to_termset', default=False) termset = self.pipeline_config.terms if not termset: termset = list() lower_termset = [x.lower() for x in termset] for doc in self.docs: ngrams = list() cln_txt = self.get_document_text(doc, clean=True) t_doc = Doc(preprocess_text(cln_txt, lowercase=True)) res = extract.ngrams(t_doc, n_num, filter_stops=filter_stops, filter_punct=filter_punct, filter_nums=filter_nums) for r in res: if lemmas: text = r.lemma_ else: text = r.text if limit_to_termset: for t in lower_termset: if text == t or t in text: ngrams.append({ 'text': text, 'count': 1 }) else: ngrams.append({ 'text': text, 'count': 1 }) self.write_multiple_result_data(temp_file, mongo_client, doc, ngrams)
def sgrank(doc, normalize='lemma', window_width=1500, n_keyterms=10, idf=None):
    """
    Extract key terms from a document using the [SGRank]_ algorithm.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        normalize (str or callable): if 'lemma', lemmatize terms; if 'lower',
            lowercase terms; if None, use the form of terms as they appeared in
            ``doc``; if a callable, must accept a ``spacy.Span`` and return a str,
            e.g. :func:`textacy.spacy_utils.normalized_str()`
        window_width (int): width of sliding window in which term co-occurrences
            are said to occur
        n_keyterms (int or float): if int, number of top-ranked terms to return
            as keyterms; if float, must be in the open interval (0, 1), is
            converted to an integer by ``round(len(doc) * n_keyterms)``
        idf (dict): mapping of ``normalize(term)`` to inverse document frequency
            for re-weighting of unigrams (n-grams with n > 1 have df assumed = 1);
            NOTE: results are better with idf information

    Returns:
        List[Tuple[str, float]]: sorted list of top ``n_keyterms`` key terms and
        their corresponding SGRank scores

    Raises:
        ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0]

    References:
        .. [SGRank] Danesh, Sumner, and Martin. "SGRank: Combining Statistical and
           Graphical Methods to Improve the State of the Art in Unsupervised
           Keyphrase Extraction". Lexical and Computational Semantics (*SEM 2015)
           (2015): 117.
    """
    n_toks = len(doc)
    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError('`n_keyterms` must be an int, or a float between 0.0 and 1.0')
        n_keyterms = int(round(n_toks * n_keyterms))
    window_width = min(n_toks, window_width)
    min_term_freq = min(n_toks // 1000, 4)

    # build full list of candidate terms
    # if inverse doc freqs available, include nouns, adjectives, and verbs;
    # otherwise, just include nouns and adjectives
    # (without IDF downweighting, verbs dominate the results in a bad way)
    include_pos = {'NOUN', 'PROPN', 'ADJ', 'VERB'} if idf else {'NOUN', 'PROPN', 'ADJ'}
    terms = itertoolz.concat(
        extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False,
                       include_pos=include_pos, min_freq=min_term_freq)
        for n in range(1, 7))

    # get normalized term strings, as desired,
    # paired with positional index in document and length in a 3-tuple
    if normalize == 'lemma':
        terms = [(term.lemma_, term.start, len(term)) for term in terms]
    elif normalize == 'lower':
        terms = [(term.orth_.lower(), term.start, len(term)) for term in terms]
    elif not normalize:
        terms = [(term.text, term.start, len(term)) for term in terms]
    else:
        terms = [(normalize(term), term.start, len(term)) for term in terms]

    # pre-filter terms to the top N ranked by TF or modified TF*IDF
    n_prefilter_kts = max(3 * n_keyterms, 100)
    term_text_counts = Counter(term[0] for term in terms)
    if idf:
        mod_tfidfs = {
            term: count * idf.get(term, 1) if ' ' not in term else count
            for term, count in term_text_counts.items()}
        terms_set = {
            term for term, _
            in sorted(mod_tfidfs.items(), key=itemgetter(1), reverse=True)[:n_prefilter_kts]}
    else:
        terms_set = {term for term, _ in term_text_counts.most_common(n_prefilter_kts)}
    terms = [term for term in terms if term[0] in terms_set]

    # compute term weights from statistical attributes:
    # not-subsumed frequency, position of first occurrence, and num words
    term_weights = {}
    seen_terms = set()
    n_toks_plus_1 = n_toks + 1
    for term in terms:
        term_text = term[0]
        # we only want the *first* occurrence of a unique term (by its text)
        if term_text in seen_terms:
            continue
        seen_terms.add(term_text)
        pos_first_occ_factor = math.log(n_toks_plus_1 / (term[1] + 1))
        # TODO: assess how best to scale term len
        term_len = math.sqrt(term[2])  # term[2]
        term_count = term_text_counts[term_text]
        subsum_count = sum(term_text_counts[t2] for t2 in terms_set
                           if t2 != term_text and term_text in t2)
        term_freq_factor = term_count - subsum_count
        if idf and term[2] == 1:
            term_freq_factor *= idf.get(term_text, 1)
        term_weights[term_text] = term_freq_factor * pos_first_occ_factor * term_len

    # filter terms to only those with positive weights
    terms = [term for term in terms if term_weights[term[0]] > 0]

    n_coocs = defaultdict(lambda: defaultdict(int))
    sum_logdists = defaultdict(lambda: defaultdict(float))

    # iterate over windows
    log_ = math.log  # localize this, for performance
    for start_ind in range(n_toks):
        end_ind = start_ind + window_width
        window_terms = (term for term in terms
                        if start_ind <= term[1] <= end_ind)
        # get all token combinations within window
        for t1, t2 in itertools.combinations(window_terms, 2):
            n_coocs[t1[0]][t2[0]] += 1
            sum_logdists[t1[0]][t2[0]] += log_(window_width / max(abs(t1[1] - t2[1]), 1))
        if end_ind > n_toks:
            break

    # compute edge weights between co-occurring terms (nodes)
    edge_weights = defaultdict(lambda: defaultdict(float))
    for t1, t2s in sum_logdists.items():
        for t2 in t2s:
            edge_weights[t1][t2] = (sum_logdists[t1][t2] / n_coocs[t1][t2]) * term_weights[t1] * term_weights[t2]
    # normalize edge weights by sum of outgoing edge weights per term (node)
    norm_edge_weights = []
    for t1, t2s in edge_weights.items():
        sum_edge_weights = sum(t2s.values())
        norm_edge_weights.extend((t1, t2, {'weight': weight / sum_edge_weights})
                                 for t2, weight in t2s.items())

    # build the weighted directed graph from edges, rank nodes by pagerank
    graph = nx.DiGraph()
    graph.add_edges_from(norm_edge_weights)
    term_ranks = nx.pagerank_scipy(graph)

    return sorted(term_ranks.items(), key=itemgetter(1, 0), reverse=True)[:n_keyterms]
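# NOTE: a minimal sketch of calling the sgrank() defined above, not from the
# original source; it assumes the module-level imports that function relies on
# (math, itertools, cytoolz.itertoolz, collections.Counter and defaultdict,
# operator.itemgetter, networkx as nx, textacy.extract) are in place, plus a
# loaded spaCy pipeline.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Statistical and graphical methods can be combined to improve "
          "unsupervised keyphrase extraction, as the SGRank algorithm shows.")
print(sgrank(doc, normalize="lemma", n_keyterms=5))
# -> [(term, score), ...] sorted by SGRank score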
def sgrank(doc, window_width=1500, n_keyterms=10, idf=None):
    """
    Extract key terms from a document using the [SGRank]_ algorithm.

    Args:
        doc (``spacy.Doc``)
        window_width (int, optional): width of sliding window in which term
            co-occurrences are said to occur
        n_keyterms (int or float, optional): if int, number of top-ranked terms
            to return as keyterms; if float, must be in the open interval (0, 1),
            representing the fraction of top-ranked terms to return as keyterms
        idf (dict, optional): mapping of
            {`normalized_str(term) <textacy.spacy_utils.normalized_str>`: inverse document frequency}
            for re-weighting of unigrams (n-grams with n > 1 have df assumed = 1);
            NOTE: results are better with idf information

    Returns:
        list[(str, float)]: sorted list of top ``n_keyterms`` key terms and their
        corresponding SGRank scores

    Raises:
        ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0]

    References:
        .. [SGRank] Danesh, Sumner, and Martin. "SGRank: Combining Statistical and
           Graphical Methods to Improve the State of the Art in Unsupervised
           Keyphrase Extraction". Lexical and Computational Semantics (*SEM 2015)
           (2015): 117.
    """
    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError('`n_keyterms` must be an int, or a float between 0.0 and 1.0')
    n_toks = len(doc)
    min_term_freq = min(n_toks // 1500, 4)

    # build full list of candidate terms
    terms = list(itertoolz.concat(
        extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False,
                       good_pos_tags={'NOUN', 'ADJ'}, min_freq=min_term_freq)
        for n in range(1, 7)))
    # if inverse document frequencies available, also add verbs
    # verbs without IDF downweighting dominate the results, and not in a good way
    if idf:
        terms.extend(itertoolz.concat(
            extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False,
                           good_pos_tags={'VERB'}, min_freq=min_term_freq)
            for n in range(1, 7)))

    terms_as_strs = {id(term): spacy_utils.normalized_str(term) for term in terms}

    # pre-filter terms to the top 20% ranked by TF or modified TF*IDF, if available
    n_top_20pct = int(len(terms) * 0.2)
    term_counts = Counter(terms_as_strs[id(term)] for term in terms)
    if idf:
        mod_tfidfs = {term: count * idf[term] if ' ' not in term else count
                      for term, count in term_counts.items()}
        top_term_texts = {term for term, _ in sorted(
            mod_tfidfs.items(), key=itemgetter(1), reverse=True)[:n_top_20pct]}
    else:
        top_term_texts = {term for term, _ in term_counts.most_common(n_top_20pct)}
    terms = [term for term in terms if terms_as_strs[id(term)] in top_term_texts]

    # compute term weights from statistical attributes
    term_weights = {}
    set_terms_as_str = {terms_as_strs[id(term)] for term in terms}
    n_toks_plus_1 = n_toks + 1
    for term in terms:
        term_str = terms_as_strs[id(term)]
        pos_first_occ_factor = math.log(n_toks_plus_1 / (term.start + 1))
        # TODO: assess if len(t) puts too much emphasis on long terms
        # alternative: term_len = 1 if ' ' not in term else math.sqrt(len(term))
        term_len = 1 if ' ' not in term else len(term)
        term_count = term_counts[term_str]
        subsum_count = sum(term_counts[t2] for t2 in set_terms_as_str
                           if t2 != term_str and term_str in t2)
        term_freq_factor = (term_count - subsum_count)
        if idf and ' ' not in term_str:
            term_freq_factor *= idf[term_str]
        term_weights[term_str] = term_freq_factor * pos_first_occ_factor * term_len

    # filter terms to only those with positive weights
    terms = [term for term in terms if term_weights[terms_as_strs[id(term)]] > 0]

    n_coocs = defaultdict(lambda: defaultdict(int))
    sum_logdists = defaultdict(lambda: defaultdict(float))

    # iterate over windows
    for start_ind in range(n_toks):
        end_ind = start_ind + window_width
        window_terms = (term for term in terms
                        if start_ind <= term.start <= end_ind)
        # get all token combinations within window
        for t1, t2 in itertools.combinations(window_terms, 2):
            if t1 is t2:
                continue
            n_coocs[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += 1
            try:
                sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \
                    math.log(window_width / abs(t1.start - t2.start))
            except ZeroDivisionError:
                # HACK: pretend that they're 1 token apart
                sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \
                    math.log(window_width)
        if end_ind > n_toks:
            break

    # compute edge weights between co-occurring terms (nodes)
    edge_weights = defaultdict(lambda: defaultdict(float))
    for t1, t2s in sum_logdists.items():
        for t2 in t2s:
            edge_weights[t1][t2] = (sum_logdists[t1][t2] / n_coocs[t1][t2]) * term_weights[t1] * term_weights[t2]
    # normalize edge weights by sum of outgoing edge weights per term (node)
    norm_edge_weights = []
    for t1, t2s in edge_weights.items():
        sum_edge_weights = sum(t2s.values())
        norm_edge_weights.extend((t1, t2, {'weight': weight / sum_edge_weights})
                                 for t2, weight in t2s.items())

    # build the weighted directed graph from edges, rank nodes by pagerank
    graph = nx.DiGraph()
    graph.add_edges_from(norm_edge_weights)
    term_ranks = nx.pagerank_scipy(graph)

    if isinstance(n_keyterms, float):
        n_keyterms = int(len(term_ranks) * n_keyterms)

    return sorted(term_ranks.items(), key=itemgetter(1), reverse=True)[:n_keyterms]
# pip install textacy
# pip install spacy
# python -m spacy download en_core_web_sm
import pandas as pd
import spacy
from textacy.extract import ngrams

nlp = spacy.load('en_core_web_sm')

text = nlp(
    'Data science is an inter-disciplinary field that uses'
    ' scientific methods, processes, algorithms, and systems to extract'
    ' knowledge and insights from many structured and unstructured data.')

n_grams = 2   # number of contiguous words per n-gram
min_freq = 1  # minimum number of times an n-gram must occur to be extracted

print(
    pd.Series([n.text for n in ngrams(text, n=n_grams, min_freq=min_freq)]).value_counts())
"""
disciplinary field    1
scientific methods    1
unstructured data     1
Data science          1
extract knowledge     1
uses scientific       1
"""
def sgrank(doc, window_width=1500, n_keyterms=10, idf=None):
    """
    Extract key terms from a document using the [SGRank]_ algorithm.

    Args:
        doc (``spacy.Doc``)
        window_width (int, optional): width of sliding window in which term
            co-occurrences are said to occur
        n_keyterms (int or float, optional): if int, number of top-ranked terms
            to return as keyterms; if float, must be in the open interval (0, 1),
            representing the fraction of top-ranked terms to return as keyterms
        idf (dict, optional): mapping of
            {`normalized_str(term) <textacy.spacy_utils.normalized_str>`: inverse document frequency}
            for re-weighting of unigrams (n-grams with n > 1 have df assumed = 1);
            NOTE: results are better with idf information

    Returns:
        list[(str, float)]: sorted list of top ``n_keyterms`` key terms and their
        corresponding SGRank scores

    Raises:
        ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0]

    References:
        .. [SGRank] Danesh, Sumner, and Martin. "SGRank: Combining Statistical and
           Graphical Methods to Improve the State of the Art in Unsupervised
           Keyphrase Extraction". Lexical and Computational Semantics (*SEM 2015)
           (2015): 117.
    """
    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError(
                '`n_keyterms` must be an int, or a float between 0.0 and 1.0')
    n_toks = len(doc)
    min_term_freq = min(n_toks // 1500, 4)

    # build full list of candidate terms
    terms = list(itertoolz.concat(
        extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False,
                       good_pos_tags={'NOUN', 'ADJ'}, min_freq=min_term_freq)
        for n in range(1, 7)))
    # if inverse document frequencies available, also add verbs
    # verbs without IDF downweighting dominate the results, and not in a good way
    if idf:
        terms.extend(itertoolz.concat(
            extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False,
                           good_pos_tags={'VERB'}, min_freq=min_term_freq)
            for n in range(1, 7)))

    terms_as_strs = {id(term): spacy_utils.normalized_str(term) for term in terms}

    # pre-filter terms to the top 20% ranked by TF or modified TF*IDF, if available
    n_top_20pct = int(len(terms) * 0.2)
    term_counts = Counter(terms_as_strs[id(term)] for term in terms)
    if idf:
        mod_tfidfs = {term: count * idf[term] if ' ' not in term else count
                      for term, count in term_counts.items()}
        top_term_texts = {term for term, _ in sorted(
            mod_tfidfs.items(), key=itemgetter(1), reverse=True)[:n_top_20pct]}
    else:
        top_term_texts = {term for term, _ in term_counts.most_common(n_top_20pct)}
    terms = [term for term in terms if terms_as_strs[id(term)] in top_term_texts]

    # compute term weights from statistical attributes
    term_weights = {}
    set_terms_as_str = {terms_as_strs[id(term)] for term in terms}
    n_toks_plus_1 = n_toks + 1
    for term in terms:
        term_str = terms_as_strs[id(term)]
        pos_first_occ_factor = log(n_toks_plus_1 / (term.start + 1))
        # TODO: assess if len(t) puts too much emphasis on long terms
        # alternative: term_len = 1 if ' ' not in term else sqrt(len(term))
        term_len = 1 if ' ' not in term else len(term)
        term_count = term_counts[term_str]
        subsum_count = sum(term_counts[t2] for t2 in set_terms_as_str
                           if t2 != term_str and term_str in t2)
        term_freq_factor = (term_count - subsum_count)
        if idf and ' ' not in term_str:
            term_freq_factor *= idf[term_str]
        term_weights[term_str] = term_freq_factor * pos_first_occ_factor * term_len

    # filter terms to only those with positive weights
    terms = [term for term in terms if term_weights[terms_as_strs[id(term)]] > 0]

    n_coocs = defaultdict(lambda: defaultdict(int))
    sum_logdists = defaultdict(lambda: defaultdict(float))

    # iterate over windows
    for start_ind in range(n_toks):
        end_ind = start_ind + window_width
        window_terms = (term for term in terms
                        if start_ind <= term.start <= end_ind)
        # get all token combinations within window
        for t1, t2 in itertools.combinations(window_terms, 2):
            if t1 is t2:
                continue
            n_coocs[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += 1
            try:
                sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \
                    log(window_width / abs(t1.start - t2.start))
            except ZeroDivisionError:
                # HACK: pretend that they're 1 token apart
                sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \
                    log(window_width)
        if end_ind > n_toks:
            break

    # compute edge weights between co-occurring terms (nodes)
    edge_weights = defaultdict(lambda: defaultdict(float))
    for t1, t2s in sum_logdists.items():
        for t2 in t2s:
            edge_weights[t1][t2] = (sum_logdists[t1][t2] / n_coocs[t1][t2]) * term_weights[t1] * term_weights[t2]
    # normalize edge weights by sum of outgoing edge weights per term (node)
    norm_edge_weights = []
    for t1, t2s in edge_weights.items():
        sum_edge_weights = sum(t2s.values())
        norm_edge_weights.extend((t1, t2, {'weight': weight / sum_edge_weights})
                                 for t2, weight in t2s.items())

    # build the weighted directed graph from edges, rank nodes by pagerank
    graph = nx.DiGraph()
    graph.add_edges_from(norm_edge_weights)
    term_ranks = nx.pagerank_scipy(graph)

    if isinstance(n_keyterms, float):
        n_keyterms = int(len(term_ranks) * n_keyterms)

    return sorted(term_ranks.items(), key=itemgetter(1), reverse=True)[:n_keyterms]
def test_ngrams_less_than_1(self): with self.assertRaises(ValueError): list(extract.ngrams(self.spacy_doc, 0))
def test_ngrams_good_tag(self): expected = ['technology trends', 'education official'] observed = [span.orth_ for span in extract.ngrams( self.spacy_doc, 2, filter_stops=False, filter_punct=False, filter_nums=False, good_pos_tags={'NOUN'})] self.assertEqual(observed, expected)
def test_ngrams_min_freq(self): expected = ['in the', 'in the'] observed = [span.orth_ for span in extract.ngrams( self.spacy_doc, 2, filter_stops=False, filter_punct=False, filter_nums=False, min_freq=2)] self.assertEqual(observed, expected)
def test_n_less_than_1(self, spacy_doc): with pytest.raises(ValueError): _ = list(extract.ngrams(spacy_doc, 0))
text = r.lemma_ else: text = r.text if limit_to_termset: for t in lower_termset: if text == t or t in text: ngrams.append({ 'text': text, 'count': 1 }) else: ngrams.append({ 'text': text, 'count': 1 }) self.write_multiple_result_data(temp_file, mongo_client, doc, ngrams) if __name__ == "__main__": content = "Can we forge against these enemies a grand and global alliance, North and South, East and West, that " \ "can assure a more fruitful life for all mankind? Will you join in that historic effort? In the long " \ "history of the world, only a few generations have been granted the role of defending freedom in its " \ "hour of maximum danger. I do not shrink from this responsibility — I welcome it. I do not believe " \ "that any of us would exchange places with any other people or any other generation. The energy, the " \ "faith, the devotion which we bring to this endeavor will light our country and all who serve it — and " \ "the glow from that fire can truly light the world." d = Doc(content) results = extract.ngrams(d, 3) print(results)
def test_n(self, spacy_doc): for n in (1, 2): result = list(extract.ngrams(spacy_doc, n)) assert all(isinstance(span, Span) for span in result) assert all(len(span) == n for span in result)