# the key-term extractors (textrank, yake, scake, sgrank) and the softmax
# scoring helper are assumed to be imported/defined elsewhere in this module
def extract_key_terms(doc, num_terms=50, ngrams=(1, 2, 3), algo='yake'):
    """Compute a post's most important terms.

    This is particularly useful for search and for finding related posts.

    Args:
        doc (spacy.tokens.Doc): Doc to extract terms from.
        num_terms (int, optional): How many terms to return. Defaults to 50.
        ngrams (tuple, optional): Which sizes of ngrams to consider.
        algo (str, optional): Which algorithm to use to find key terms.
    """
    if not len(doc):
        return []

    # special case: the doc is too short to rank, so weight its words directly
    if len(doc) < 3:
        return softmax(str(doc).split(' '))

    if algo == 'textrank':
        return softmax(textrank(doc, n_keyterms=num_terms))
    elif algo == 'yake':
        # YAKE scores are lower-is-better, hence the reversed weighting
        return softmax(yake(doc, ngrams=ngrams, topn=num_terms), reverse=True)
    elif algo == 'scake':
        return softmax(scake(doc, topn=num_terms))
    elif algo == 'sgrank':
        return softmax(sgrank(doc, ngrams=ngrams, n_keyterms=num_terms))
    else:
        raise ValueError('Unknown key term extraction method: %s' % algo)
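# A minimal usage sketch for extract_key_terms; assumptions: the
# en_core_web_sm model is installed, the softmax helper used above is defined
# in this module, and the sample sentence is purely illustrative
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('Key term extraction powers both search and related-post lookups.')
print(extract_key_terms(doc, num_terms=10, algo='yake'))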
import pandas as pd

# textrank, yake, sgrank, and scake are textacy's keyterm extractors, assumed
# to be imported elsewhere in this module
def get_vote_ranking(Q_A, include_POS=('NOUN', 'PROPN'), topN=0.99):
    """Rank terms by combining the votes of four keyterm extractors."""
    # textrank results arrive best-first, so after the (stable) descending
    # sort the original index doubles as the per-extractor rank
    x = pd.DataFrame(
        textrank(Q_A, normalize='lemma', include_pos=include_POS,
                 window_size=3, edge_weighting='binary',
                 position_bias=False, topn=topN),
        columns=['NOUN', 'rank']).sort_values(
            by=['rank'], ascending=False).reset_index().set_index(
                keys=['NOUN']).drop(columns=['rank']).add_prefix('textrank_')

    # yake scores are lower-is-better, so sort ascending (best first) and
    # re-index to get the per-extractor rank
    y = pd.DataFrame(
        yake(Q_A, normalize='lemma', ngrams=(1, 2, 3, 4, 5),
             include_pos=include_POS, window_size=3, topn=topN),
        columns=['NOUN', 'rank']).sort_values(
            by=['rank'], ascending=True).reset_index(
                drop=True).reset_index().set_index(
                    keys=['NOUN']).drop(columns=['rank']).add_prefix('yake_')

    z = pd.DataFrame(
        sgrank(Q_A, normalize='lemma', ngrams=(1, 2, 3, 4, 5),
               include_pos=include_POS, window_size=1500, topn=topN,
               idf=None),
        columns=['NOUN', 'rank']).sort_values(
            by=['rank'], ascending=False).reset_index().set_index(
                keys=['NOUN']).drop(columns=['rank']).add_prefix('sgrank_')

    h = pd.DataFrame(
        scake(Q_A, normalize='lemma', include_pos=include_POS, topn=topN),
        columns=['NOUN', 'rank']).sort_values(
            by=['rank'], ascending=False).reset_index().set_index(
                keys=['NOUN']).drop(columns=['rank']).add_prefix('scake_')

    # join the four rank columns on the term; a term missing from an extractor
    # gets that extractor's mean rank, then the ranks are summed into one vote
    # and re-indexed into an overall ordinal rank
    d = pd.concat([x, y, z, h], axis=1)
    dd = d.fillna(d.mean(axis=0)).sum(axis=1).sort_values(
    ).reset_index().reset_index().drop(columns=0)
    dd.columns = ['rank', 'NOUN']
    dd['rank'] = 1 / (1 + dd['rank'])  # inverse weight: best vote -> 1.0
    return dd[['NOUN', 'rank']]
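# A usage sketch for get_vote_ranking; assumptions: pandas, a spaCy pipeline,
# and the four textacy extractors are available, and the question/answer text
# is illustrative
import spacy

nlp = spacy.load('en_core_web_sm')
Q_A = nlp('What is keyterm extraction? It finds the most important words '
          'and phrases in a document.')
ranking = get_vote_ranking(Q_A, include_POS=('NOUN', 'PROPN'), topN=0.99)
print(ranking.head())  # best-voted terms first, weighted by 1 / (1 + rank)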
# nlp (a spaCy pipeline), the yake extractor, and the NGRAMS and TOPN
# configuration constants are assumed to be defined at module level
import re

def process_file(key, input_filename, output_filename):
    # open the given file and read it in
    with open(input_filename, 'r') as f:
        text = f.read()

    # consolidate whitespace
    text = re.sub(r'\n+', ' ', text)
    doc = nlp(text)

    with open(output_filename, 'w') as handle:
        # output a header
        print("id\tkeyword", file=handle)

        # process and output each keyword, skipping very short ones, and done;
        # can't get much simpler
        for keyword, score in yake(doc, ngrams=NGRAMS, topn=TOPN):
            if len(keyword) < 3:
                continue
            print("\t".join([key, keyword]), file=handle)
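# A usage sketch; assumptions: nlp, NGRAMS, and TOPN are the module-level
# names noted above, and the file paths are illustrative
process_file('my-document', 'input/my-document.txt', 'output/my-document.tsv')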
import os
import sys

import spacy
import textacy.preprocessing
from textacy.ke import yake  # textacy's YAKE extractor (0.10-era API)

# NGRAMS and TOPN are configuration constants assumed to be set above

# sanity check
if len(sys.argv) != 2:
    sys.stderr.write('Usage: ' + sys.argv[0] + " <file>\n")
    sys.exit(1)

# initialize
file = sys.argv[1]

# open the given file and unwrap it
text = open(file).read()
text = textacy.preprocessing.normalize.normalize_quotation_marks(text)
text = textacy.preprocessing.normalize.normalize_hyphenated_words(text)
text = textacy.preprocessing.normalize.normalize_whitespace(text)

# compute the identifier
id = os.path.basename(os.path.splitext(file)[0])

# initialize the model, raising its character limit to fit the whole file
model = spacy.load('en_core_web_sm')
model.max_length = len(text) + 1
doc = model(text)

# output a header
print("id\tkeyword")

# process and output each keyword, and done; can't get much simpler
for keyword, score in yake(doc, ngrams=NGRAMS, topn=TOPN):
    print("\t".join([id, keyword]))
sys.exit()
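# Example invocation (the script name and file path are hypothetical):
#
#     python keywords.py ./corpus/moby-dick.txt > moby-dick.tsv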
# continuation of a script like the one above: file, text, TOPN, and the
# imports are already defined at this point
text = textacy.preprocessing.normalize.normalize_whitespace(text)

# compute the identifier
id = os.path.basename(os.path.splitext(file)[0])

# initialize the model, raising its character limit to fit the whole file
model = spacy.load('en_core_web_sm')
model.max_length = len(text) + 1
doc = model(text)

# output a header
print("id\tkeyword")

# track found keywords to avoid duplicates
keywords = set()

# process and output each keyword with yake, which will produce unigrams
for keyword, score in yake(doc, topn=TOPN):
    if keyword not in keywords:
        print("\t".join([id, keyword]))
        keywords.add(keyword)

# process and output each keyword with scake, which will typically produce
# keyphrases; removing lemmatization with normalize=None seems to produce
# better results
for keyword, score in scake(doc, normalize=None, topn=TOPN):
    if keyword not in keywords:
        print("\t".join([id, keyword]))
        keywords.add(keyword)

sys.exit()
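# The duplicate-suppression pattern above generalizes to any number of
# extractors; a small sketch of the same idea as a helper (the function name
# is illustrative, not part of the original script):
def emit_unique(id, results, seen):
    """Print id/keyword pairs, skipping keywords that were already emitted."""
    for keyword, score in results:
        if keyword not in seen:
            print("\t".join([id, keyword]))
            seen.add(keyword)

# seen = set()
# emit_unique(id, yake(doc, topn=TOPN), seen)
# emit_unique(id, scake(doc, normalize=None, topn=TOPN), seen)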