def test_textrank_norm_normalized_str(spacy_doc):
    expected = ['Friedman', 'Beirut', 'New', 'Award', 'foreign']
    observed = [
        term for term, _ in keyterms.textrank(
            spacy_doc, normalize=spacy_utils.get_normalized_text, n_keyterms=5)
    ]
    assert len(expected) == len(observed)
def test_norm_none(self, spacy_doc):
    expected = ["Friedman", "Beirut", "New", "Arab", "Award"]
    observed = [
        term for term, _ in keyterms.textrank(
            spacy_doc, normalize=None, n_keyterms=5)
    ]
    assert len(expected) == len(observed)
def key_terms(self, algorithm='sgrank', n=10):
    """
    Extract key terms from a document using `algorithm`.

    Args:
        algorithm (str {'sgrank', 'textrank', 'singlerank'}, optional):
            name of algorithm to use for key term extraction
        n (int or float, optional): if int, number of top-ranked terms to
            return as keyterms; if float, must be in the open interval
            (0.0, 1.0), representing the fraction of top-ranked terms to
            return as keyterms

    Returns:
        list[(str, float)]: sorted list of top `n` key terms and their
        corresponding scores

    Raises:
        ValueError: if ``algorithm`` not in {'sgrank', 'textrank', 'singlerank'}

    .. seealso:: :func:`keyterms.sgrank() <textacy.keyterms.sgrank>`
    .. seealso:: :func:`keyterms.textrank() <textacy.keyterms.textrank>`
    .. seealso:: :func:`keyterms.singlerank() <textacy.keyterms.singlerank>`
    """
    if algorithm == 'sgrank':
        return keyterms.sgrank(self.spacy_doc, window_width=1500, n_keyterms=n)
    elif algorithm == 'textrank':
        return keyterms.textrank(self.spacy_doc, n_keyterms=n)
    elif algorithm == 'singlerank':
        return keyterms.singlerank(self.spacy_doc, n_keyterms=n)
    else:
        raise ValueError(
            'algorithm {} not a valid option'.format(algorithm))
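# A brief usage sketch for `key_terms` (hedged: assumes the method lives on a
# textacy `Doc`-style wrapper, as `self.spacy_doc` suggests; variable names
# here are illustrative, not part of the original code):
#
#     doc = textacy.Doc(text, lang='en_core_web_sm')
#     doc.key_terms(algorithm='textrank', n=10)   # top 10 (term, score) pairs
#     doc.key_terms(algorithm='sgrank', n=0.05)   # top 5% of ranked terms
#     doc.key_terms(algorithm='lda')              # raises ValueError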
def keywords():
    # print(request.get_json())
    arg = request.get_json()
    doc = textacy.Doc(arg['content'],
                      metadata={'title': arg['title']},
                      lang=u'en_core_web_sm')
    sgrank_keywords = dict(keyterms.sgrank(doc))
    singlerank_keywords = dict(keyterms.singlerank(doc))
    textrank_keywords = dict(keyterms.textrank(doc))
    # weight each algorithm's scores before merging
    sgrank_keywords.update((x, y * 0.9) for x, y in sgrank_keywords.items())
    textrank_keywords.update(
        (x, y * 0.05) for x, y in textrank_keywords.items())
    singlerank_keywords.update(
        (x, y * 0.05) for x, y in singlerank_keywords.items())
    # Counter addition sums the weighted scores term by term
    keywords = dict(
        Counter(sgrank_keywords) + Counter(textrank_keywords) +
        Counter(singlerank_keywords))
    sorted_keywords = sorted(keywords.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    keyword_string = ""
    for i, key in enumerate(sorted_keywords):
        if i == int(len(sorted_keywords) / 2):
            keyword_string += "||"
        if (i == len(sorted_keywords) - 1 or
                i == int(len(sorted_keywords) / 2) - 1):
            keyword_string += key[0]
        else:
            keyword_string += key[0] + ",,"
    return keyword_string
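# Minimal standalone sketch of the score-merging idea used in `keywords()`
# above (hedged: the function name, the 0.9/0.05 weights, and the sample
# scores are illustrative, not from the original code):
from collections import Counter


def merge_weighted_scores(score_dicts_with_weights):
    """Scale each algorithm's {term: score} dict by its weight, then sum the
    scores per term across algorithms via Counter addition."""
    merged = Counter()
    for scores, weight in score_dicts_with_weights:
        merged = merged + Counter(
            {term: score * weight for term, score in scores.items()})
    return dict(merged)

# e.g. merge_weighted_scores([({'beirut': 0.5}, 0.9), ({'beirut': 0.4}, 0.05)])
# -> {'beirut': 0.47}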
def preprocess_job_market(in_path, out_path):
    import os
    import json
    import codecs
    job_texts = []
    for filename in glob.glob(in_path + '*.json'):
        try:
            with codecs.open(filename, encoding='utf-8') as job_file:
                content = json.load(job_file)
            job_texts.append(content.get('description', u''))
        except Exception:
            print("===Exception reading file " + filename)
            continue
    spacy_lang = en_core_web_sm.load()
    corpus = textacy.corpus.Corpus(spacy_lang)
    corpus_text = '\n'.join(text for text in job_texts)
    corpus.add_text(corpus_text)
    res_file = out_path + 'job_market.csv'
    if not os.path.isfile(res_file):
        termList1 = term_list(
            keyterms.textrank(corpus[0], normalize=u'lower', n_keyterms=30))
        termList2 = term_list(
            keyterms.sgrank(corpus[0], ngrams=(1, 2), normalize=u'lower',
                            window_width=100, n_keyterms=70, idf=None))
        termSet1 = set(termList1)
        termSet2 = set(termList2)
        diffSet = termSet1 - termSet2
        termList = termList2 + list(diffSet)
        save_terms_text(res_file, termList)
def preprocess_kags(in_path, out_path):
    import os
    spacy_lang = en_core_web_sm.load()
    for kag_path in glob.glob(KAG_BASE_PATH + '/*'):
        _, kag_name = os.path.split(kag_path)
        corpus = textacy.corpus.Corpus(spacy_lang)
        texts = []
        for comp_path in glob.glob(kag_path + '/*'):
            for filename in glob.glob(comp_path + '/*.txt'):
                texts.append(open(filename, 'r').read().decode('utf-8'))
        corpus_text = '\n'.join(text for text in texts)
        corpus.add_text(corpus_text)
        # _, comp_file = os.path.split(comp_path)
        # sindex = len(kag_name) + 1
        # eindex = sindex + comp_file[sindex:].index('_')
        res_file = '{}.csv'.format(get_kag(kag_name))
        termList1 = term_list(
            keyterms.textrank(corpus[0], normalize=u'lower', n_keyterms=30))
        doc_idf = corpus.word_doc_freqs(lemmatize=None, weighting='idf',
                                        lowercase=True, as_strings=True)
        termList2 = term_list(
            keyterms.sgrank(corpus[0], ngrams=(1, 2, 3), normalize=u'lower',
                            window_width=500, n_keyterms=70, idf=doc_idf))
        termSet1 = set(termList1)
        termSet2 = set(termList2)
        diffSet = termSet1 - termSet2
        termList = termList2 + list(diffSet)
        # save_terms_csv(out_path + res_file, termList)
        save_terms_text(out_path + res_file, termList)
def test_textrank(spacy_doc):
    expected = [
        'friedman', 'beirut', 'reporting', 'arab', 'new',
        'award', 'foreign', 'year', 'times', 'jerusalem'
    ]
    observed = [term for term, _ in keyterms.textrank(spacy_doc)]
    assert len(expected) == len(observed)
def test_textrank_norm_none(spacy_doc):
    expected = ['Friedman', 'Beirut', 'New', 'Arab', 'Award']
    observed = [
        term for term, _ in keyterms.textrank(
            spacy_doc, normalize=None, n_keyterms=5)
    ]
    assert len(expected) == len(observed)
def test_norm_normalized_str(self, spacy_doc):
    expected = ["Friedman", "Beirut", "New", "Award", "foreign"]
    observed = [
        term for term, _ in keyterms.textrank(
            spacy_doc, normalize=spacy_utils.get_normalized_text, n_keyterms=5)
    ]
    assert len(expected) == len(observed)
def test_textrank_norm_normalized_str(self):
    expected = ['Friedman', 'Beirut', 'New', 'Award', 'foreign']
    observed = [
        term for term, _ in keyterms.textrank(
            self.spacy_doc, normalize=spacy_utils.normalized_str, n_keyterms=5)
    ]
    self.assertEqual(len(expected), len(observed))
def _apply_keyterm_ranking(self, doc, params=None):
    if self.method == 'sgrank':
        keywords = textacy.keyterms.sgrank(doc, **params) \
            if params else tck.sgrank(doc)
    elif self.method == 'textrank':
        keywords = textacy.keyterms.textrank(doc, **params) \
            if params else tck.textrank(doc)
    elif self.method == 'singlerank':
        keywords = textacy.keyterms.singlerank(doc, **params) \
            if params else tck.singlerank(doc)
    else:
        # guard against an unknown method, which would otherwise leave
        # `keywords` unbound and raise an UnboundLocalError below
        raise ValueError('unknown keyterm method: {}'.format(self.method))
    return keywords
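# Usage sketch for the dispatcher above (hedged: `ranker` is a hypothetical
# instance of the class owning `_apply_keyterm_ranking`, with `method` set
# beforehand; the parameter dict is illustrative):
#
#     ranker.method = 'sgrank'
#     kws = ranker._apply_keyterm_ranking(doc, params={'n_keyterms': 20})
#     kws_defaults = ranker._apply_keyterm_ranking(doc)  # tck defaults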
def test_textrank_norm_lower(self):
    expected = ['friedman', 'beirut', 'reporting', 'arab', 'new']
    observed = [
        term for term, _ in keyterms.textrank(
            self.spacy_doc, normalize='lower', n_keyterms=5)
    ]
    self.assertEqual(len(expected), len(observed))
    # can't do this owing to randomness of results
    # for e, o in zip(expected, observed):
    #     self.assertEqual(e, o)
    for term in observed:
        self.assertEqual(term, term.lower())
def test_textrank_norm_lower(spacy_doc):
    expected = ['friedman', 'beirut', 'reporting', 'arab', 'new']
    observed = [
        term for term, _ in keyterms.textrank(
            spacy_doc, normalize='lower', n_keyterms=5)
    ]
    assert len(expected) == len(observed)
    # can't do this owing to randomness of results
    # for e, o in zip(expected, observed):
    #     assert e == o
    for term in observed:
        assert term == term.lower()
def test_norm_lower(self, spacy_doc):
    expected = ["friedman", "beirut", "reporting", "arab", "new"]
    observed = [
        term for term, _ in keyterms.textrank(
            spacy_doc, normalize="lower", n_keyterms=5)
    ]
    assert len(expected) == len(observed)
    # can't do this owing to randomness of results
    # for e, o in zip(expected, observed):
    #     assert e == o
    for term in observed:
        assert term == term.lower()
def test_base(self, spacy_doc):
    expected = [
        "friedman", "beirut", "reporting", "arab", "new",
        "award", "foreign", "year", "times", "jerusalem",
    ]
    observed = [term for term, _ in keyterms.textrank(spacy_doc)]
    assert len(expected) == len(observed)
def test_extract_functionality(doc):
    bigrams = list(
        extract.ngrams(doc, 2, filter_stops=True, filter_punct=True,
                       filter_nums=False))[:10]
    for bigram in bigrams:
        assert isinstance(bigram, SpacySpan)
        assert len(bigram) == 2

    trigrams = list(
        extract.ngrams(doc, 3, filter_stops=True, filter_punct=True,
                       min_freq=2))[:10]
    for trigram in trigrams:
        assert isinstance(trigram, SpacySpan)
        assert len(trigram) == 3

    nes = list(
        extract.named_entities(doc, drop_determiners=False,
                               exclude_types='numeric'))[:10]
    for ne in nes:
        assert isinstance(ne, SpacySpan)
        assert ne.label_
        assert ne.label_ != 'QUANTITY'

    pos_regex_matches = list(
        extract.pos_regex_matches(
            doc, constants.POS_REGEX_PATTERNS['en']['NP']))[:10]
    for match in pos_regex_matches:
        assert isinstance(match, SpacySpan)

    stmts = list(extract.semistructured_statements(doc, 'I', cue='be'))[:10]
    for stmt in stmts:
        assert isinstance(stmt, list)
        assert isinstance(stmt[0], compat.unicode_)
        assert len(stmt) == 3

    kts = keyterms.textrank(doc, n_keyterms=10)
    for keyterm in kts:
        assert isinstance(keyterm, tuple)
        assert isinstance(keyterm[0], compat.unicode_)
        assert isinstance(keyterm[1], float)
        assert keyterm[1] > 0.0
def preprocess_category(in_path, out_path, category_name):
    import os
    spacy_lang = en_core_web_sm.load()
    print('===GIVEN CATEGORY: ' + category_name)
    for cat_path in glob.glob(in_path + '*'):
        _, cat_name = os.path.split(cat_path)
        print('===CATEGORY: ' + cat_name)
        if category_name == cat_name:
            print('###Fine, found category directory ...')
            corpus = textacy.corpus.Corpus(spacy_lang)
            texts = []
            for filename in glob.glob(cat_path + '/*.txt'):
                texts.append(open(filename, 'r').read().decode('utf-8'))
            corpus_text = '\n'.join(text for text in texts)
            corpus.add_text(corpus_text)
            # res_file = '{}.csv'.format(comp_file[sindex:eindex])
            res_file = '{}.csv'.format(category_name.lower())
            termList1 = term_list(
                keyterms.textrank(corpus[0], normalize=u'lower',
                                  n_keyterms=30))
            doc_idf = corpus.word_doc_freqs(lemmatize=None, weighting='idf',
                                            lowercase=True, as_strings=True)
            termList2 = term_list(
                keyterms.sgrank(corpus[0], ngrams=(1, 2, 3),
                                normalize=u'lower', window_width=500,
                                n_keyterms=70, idf=doc_idf))
            termSet1 = set(termList1)
            termSet2 = set(termList2)
            diffSet = termSet1 - termSet2
            termList = termList2 + list(diffSet)
            save_terms_text(out_path + res_file, termList)
            break
def preprocess_jobs_or_cvs_combined(in_path, out_path):
    import os
    for filename in glob.glob(in_path + '*.json'):
        _, cv_file = os.path.split(filename)
        res_file = out_path + '{0}.csv'.format(cv_file[0:cv_file.index('.')])
        if not os.path.isfile(res_file):
            corpus = read_cv2(filename)
            termList1 = term_list(
                keyterms.textrank(corpus[0], normalize=u'lower',
                                  n_keyterms=30))
            termList2 = term_list(
                keyterms.sgrank(corpus[0], ngrams=(1, 2), normalize=u'lower',
                                window_width=100, n_keyterms=70, idf=None))
            termSet1 = set(termList1)
            termSet2 = set(termList2)
            diffSet = termSet1 - termSet2
            termList = termList2 + list(diffSet)
            # save_terms_csv(res_file, termList)
            save_terms_text(res_file, termList)
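# The preprocess_* functions above all share one merging pattern: take
# sgrank's terms as the base list and append any textrank terms not already
# in it. A standalone sketch of that pattern (hedged: the function name is
# illustrative; it assumes both inputs are lists of (term, score) pairs as
# returned by keyterms.textrank/sgrank):

def combine_rankings(textrank_results, sgrank_results):
    """Union of two keyterm rankings, preserving sgrank's ordering and
    appending textrank-only terms at the end."""
    textrank_terms = [term for term, _ in textrank_results]
    sgrank_terms = [term for term, _ in sgrank_results]
    extras = set(textrank_terms) - set(sgrank_terms)
    return sgrank_terms + list(extras)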
def test_textrank_n_keyterms(spacy_doc):
    expected = ['friedman', 'beirut', 'reporting', 'arab', 'new']
    observed = [term for term, _ in keyterms.textrank(spacy_doc, n_keyterms=5)]
    assert len(expected) == len(observed)
import os

import spacy
from textacy.keyterms import textrank


def get_inspec():
    # `inspec_folder` and `process` are defined elsewhere in this script
    inspec_files = os.listdir(inspec_folder)
    result = []
    for myfile in inspec_files:
        if myfile.endswith("abstr"):
            # this is the actual content
            content = open(os.path.join(inspec_folder, myfile),
                           encoding="utf-8", errors="ignore").read().strip()
            keywords_file = myfile.replace(".abstr", ".uncontr")
            keywords = [
                process(item)
                for item in open(os.path.join(inspec_folder, keywords_file),
                                 encoding="utf-8",
                                 errors="ignore").read().strip().split(";")
            ]
            result.append((content, keywords))
    return result


inspec = get_inspec()
print(len(inspec))

spacymodel = spacy.load('en_core_web_sm', disable=('parser', 'ner'))
fw = open("tempKPEout-allinspec-textrank.txt", "w")
for content, keywords in inspec:
    fw.write(content + "\n")
    fw.write(str(keywords))
    fw.write("\n")
    spacified = spacymodel(content)
    fw.write(str([term[0] for term in textrank(spacified)]))
    fw.write("\n\n")
fw.close()
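# Optional follow-up sketch (hedged: not part of the original script): since
# the loop above writes gold .uncontr keywords next to textrank's output, a
# rough quality signal is the exact string overlap between the two lists.
# The function name and the lowercasing choice are assumptions.

def exact_match_overlap(predicted_terms, gold_keywords):
    """Fraction of predicted terms that literally appear in the gold list."""
    gold = {kw.lower() for kw in gold_keywords}
    hits = sum(1 for term in predicted_terms if term.lower() in gold)
    return hits / len(predicted_terms) if predicted_terms else 0.0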
def test_textrank_n_keyterms(self, spacy_doc):
    expected = ["friedman", "beirut", "reporting", "arab", "new"]
    observed = [
        term for term, _ in keyterms.textrank(spacy_doc, n_keyterms=5)
    ]
    assert len(expected) == len(observed)
def test_textrank_n_keyterms(self):
    expected = ['friedman', 'beirut', 'reporting', 'arab', 'new']
    observed = [
        term for term, _ in keyterms.textrank(self.spacy_doc, n_keyterms=5)
    ]
    self.assertEqual(len(expected), len(observed))
for r in res:
    print(r)
print("---------------")

print("sgrank:")
res = keyterms.sgrank(doc, n_keyterms=50)
for r in res:
    print(r)
print("---------------")

print("singlerank:")
res = keyterms.singlerank(doc, n_keyterms=50)
for r in res:
    print(r)
print("---------------")

print("textrank:")
res = keyterms.textrank(doc, n_keyterms=50)
for r in res:
    print(r)
print("---------------")

print("key_terms_from_semantic_network:")
res = keyterms.key_terms_from_semantic_network(doc, n_keyterms=50)
for r in res:
    print(r)