def get_doc_metadata(self, doc):
    phrase_counter = Counter()

    try:
        for phrase in doc._.phrases:
            if self._include_chunks:
                for chunk in phrase.chunks:
                    phrase_counter[str(chunk)] += (
                        phrase.rank + self._rank_smoothing_constant)
            else:
                phrase_counter[phrase.text] += phrase.count * (
                    phrase.rank + self._rank_smoothing_constant)
    except Exception:
        # Support for pytextrank<3: run TextRank on the doc manually
        import pytextrank
        tr = pytextrank.TextRank()
        tr.doc = doc
        phrases = tr.calc_textrank()

        for phrase in phrases:
            if self._include_chunks:
                for chunk in phrase.chunks:
                    phrase_counter[str(chunk)] += (
                        phrase.rank + self._rank_smoothing_constant)
            else:
                phrase_counter[phrase.text] += phrase.count * (
                    phrase.rank + self._rank_smoothing_constant)

    return phrase_counter
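
# A minimal usage sketch, not part of the original source: get_doc_metadata is a
# method, so it needs an owner exposing _include_chunks and
# _rank_smoothing_constant. The _Holder class, its attribute values, and the
# pipeline setup below are illustrative assumptions.
import spacy
import pytextrank
from collections import Counter  # used inside get_doc_metadata

class _Holder:
    _include_chunks = False
    _rank_smoothing_constant = 0.0

_Holder.get_doc_metadata = get_doc_metadata

nlp = spacy.load("en_core_web_sm")
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
doc = nlp("Graph-based ranking surfaces the most central noun phrases in a text.")
print(_Holder().get_doc_metadata(doc).most_common(5))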
def main(args):
    """
    For each publication, search for the abstract and extract key phrases
    if the abstract exists and is not null. Report if the abstract is missing.
    """
    graph = rc_graph.RCGraph("keyphr")
    graph.load_stopwords()

    # add PyTextRank into the spaCy pipeline
    nlp = spacy.load("en_core_web_sm")
    tr = pytextrank.TextRank(logger=None)
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    for partition, pub_iter in graph.iter_publications(graph.BUCKET_STAGE,
                                                       filter=args.partition):
        pub_list = []

        for pub in tqdm(pub_iter, ascii=True, desc=partition[:30]):
            extract_phrases(graph, nlp, partition, pub, pub_list)

        graph.write_partition(graph.BUCKET_STAGE, partition, pub_list)

    # report errors
    status = "{} publications parsed keyphrases from abstracts".format(
        graph.publications.key_hits)
    trouble = "publications which could not parse keyphrases"
    graph.report_misses(status, trouble)
def InitNLPPyTextRank():
    nlpPyRank = spacy.load("en_core_web_sm")
    tr = pytextrank.TextRank()
    # add PyTextRank to the spaCy pipeline
    nlpPyRank.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    return nlpPyRank
def __init__(self, sl_flag=1, should_remove_stop_words=True):
    """
    Initialize handler.
    :param sl_flag: stem/lemmatize flag - 0: stem, 1: lemmatize
    :param should_remove_stop_words: default: True
    """
    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('wordnet')

    self.sl_flag = sl_flag
    if sl_flag not in [0, 1]:
        raise ValueError("Invalid sl flag provided")

    self.should_remove_stop_words = should_remove_stop_words
    self.porter_stemmer = PorterStemmer()
    self.lemmatizer = WordNetLemmatizer()
    self.stop_words = set(stopwords.words('english'))
    self.punctuation_filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

    self.tr = pytextrank.TextRank()
    # FIXME: sanga.s add abstractive techniques.
    # pytextrank computes a similarity graph and returns the most important
    # sentence as the one that is most similar to all the others.
    self.spacy_pipeline = spacy.load("en_core_web_sm")
    self.spacy_pipeline.add_pipe(self.tr.PipelineComponent,
                                 name="TextRank",
                                 last=True)

    # FIXME
    self.word2vec_trained = True
    if not self.word2vec_trained:
        self.train_word2vec()
    self.word2vec_model = self.retrieve_model()
def text_summary(text, model_en, model_fr):
    # Initialize dictionary to contain all the topics of the text considered
    summ = {}

    # Initialize TextRank (graph-based) algorithm for text semantic identification
    tr = pytextrank.TextRank()

    # Load French and English pre-trained spaCy models
    nlp_en = model_en
    nlp_fr = model_fr

    # If the English model has not been added to the nlp pipe, do it
    try:
        nlp_en.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    except ValueError:
        pass

    # If the French model has not been added to the nlp pipe, do it
    try:
        nlp_fr.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    except ValueError:
        pass

    # Detect the language to know which pipeline to choose
    if detect(text) == 'en':
        doc = nlp_en(text)
    else:
        # In my case, if the language detected is not English it is most certainly French
        doc = nlp_fr(text)

    # Retrieve the top 20 tags considered most reflective of the content of
    # the text, based on the spaCy model
    tags = doc._.phrases[:20]

    # Save in the initialized dictionary
    summ["tags"] = tags

    return summ
def setUp(self):
    """set up a spaCy pipeline"""
    self.nlp = spacy.load("en_core_web_sm")
    self.tr = pytextrank.TextRank(logger=None)
    self.nlp.add_pipe(self.tr.PipelineComponent, name="textrank", last=True)
def breakIntoWordsAndPhrases(text):
    nlp = spacy.load("en_core_web_sm")
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    doc = nlp(text)

    # collect every noun chunk belonging to a ranked phrase
    hashset = set()
    for p in doc._.phrases:
        for q in p.chunks:
            hashset.add(str(q))

    # map the start index of each phrase's first occurrence to the phrase
    indextophrases = {}
    for s in hashset:
        indextophrases[text.find(s)] = s

    # walk the text, alternating between plain runs and ranked phrases
    i = 0
    end = len(text)
    chunks = []
    string = ""
    while i < end:
        if i in indextophrases:
            chunks.append(string)
            chunks.append(indextophrases[i])
            i += len(indextophrases[i])
            string = ""
        else:
            string += text[i]
            i += 1
            if i == end:
                chunks.append(string)
    return chunks
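
# A small usage sketch, not from the original source: breakIntoWordsAndPhrases
# returns the input text split into an alternating list of plain runs and
# TextRank key phrases. The sample sentence is an illustrative assumption.
sample_text = "Compatibility of systems of linear constraints over the set of natural numbers."
for segment in breakIntoWordsAndPhrases(sample_text):
    print(repr(segment))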
def key_phrases(self):
    # example text
    text = self.text

    # load a spaCy model, depending on language, scale, etc.
    nlp = spacy.load("en_core_web_sm")

    # add PyTextRank to the spaCy pipeline
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    doc = nlp(text)

    # to examine the top-ranked phrases in the document:
    # for p in doc._.phrases:
    #     print("{:.4f} {:5d} {}".format(p.rank, p.count, p.text))
    #     print(p.chunks)

    sentences = []
    quiz_length = random.randint(4, 7)
    for sentence in doc._.textrank.summary(limit_phrases=1,
                                           limit_sentences=quiz_length):
        sentences.append(str(sentence).replace("\n", ""))
    return sentences
def get_TextRank(article, n):
    nlp = spacy.load("en_core_web_sm")

    # add PyTextRank to the spaCy pipeline
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    doc = nlp(article)

    # collect the top n ranked phrases in the document
    topKeywords = []
    for p in doc._.phrases[:n]:
        topKeywords.append(p.text)

    return topKeywords
def __init__(self, model="distilbert-base-nli-stsb-mean-tokens"):
    self.model = SentenceTransformer(model)
    self.tokenizer = spacy.load("en_core_web_lg")
    tr = pytextrank.TextRank()
    self.tokenizer.add_pipe(tr.PipelineComponent, name="textrank", last=True)
def extract_keyterms():
    nlp = en_core_web_sm.load()
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)

    texts = glob.glob("./text-extraction/*.txt")

    for text in texts:
        final_result_name = './results/' + get_file_name(text).replace('txt', 'csv')
        print('Extracting key terms from text: ' + get_file_name(text) + '.')
        print('This process may take time, please wait ...')

        # Reading the text file
        arquivo = open(text, 'r', encoding='utf8')
        text = arquivo.read()
        doc = nlp(text)

        # examine the top-ranked phrases in the document
        dictFinal = defaultdict(list)
        for p in doc._.phrases:
            # print('{:.4f} {:5d} {}'.format(p.rank, p.count, p.text))
            if len(p.text) > 3:
                dictFinal['phrases'].append(p.text)
                dictFinal['count'].append(p.count)
                dictFinal['rank'].append(p.rank)

        print('process finished, the result is in the ' + final_result_name + ' file')
        df = pd.DataFrame(dictFinal)
        df.sort_values(by=['rank', 'count'], ascending=False, inplace=True)
        df.to_csv(final_result_name)
def pytextrank_output(text, ratio_val):
    # Insert path to "en_core_web_sm-2.2.5" in the local file
    nlp = spacy.load("insert path here")

    # add PyTextRank into the spaCy pipeline
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    # strip characters that interfere with sentence splitting
    bad_chars = [';', ':', '!', "*", '•', '’', '\ufeff']
    for i in bad_chars:
        text = text.replace(i, '')

    doc = nlp(text)

    # summary length proportional to the number of sentences in the input
    split = int(len(text.split('.')) * ratio_val)
    list1 = ''
    for sent in doc._.textrank.summary(limit_phrases=split, limit_sentences=split):
        list1 = list1 + str(sent)
    return list1
def __init__(self, argv):
    super().__init__(command=__file__, argv=argv)
    spacy.prefer_gpu()
    self.nlp = spacy.load('en_core_web_sm')

    # coref = neuralcoref.NeuralCoref(self.nlp.vocab)
    # self.nlp.add_pipe(coref, name='neuralcoref')

    tr = pytextrank.TextRank()
    self.nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)

    self.__text_processor = TextProcessor(self.nlp, self._driver)
    self.create_constraints()
def run_textrank_model(entry_id, phrase_limit):
    # extract paragraph and header text from the given data entry and
    # extract the topics from that
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("textrank model started", str(data[0]['_id']), data[0]['link'])

    try:
        # do topic extraction on paragraph and header text combined
        h_p_data = data[0]["paragraph_text"] + data[0]["header_text"]
        combined_text = " ".join(h_p_data)

        # load a spaCy model, depending on language, scale, etc.
        nlp = spacy.load("en_core_web_sm")

        # add PyTextRank to the spaCy pipeline
        tr = pytextrank.TextRank()
        nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
        nlp.max_length = 150000000
        doc = nlp(combined_text)

        # examine the top-ranked phrases in the document
        tr_results = []
        tr_words = []
        for p in doc._.phrases[:phrase_limit]:
            tr_results.append([p.rank, p.count, p.text])
            tr_words.append(p.text)

        if len(tr_words):
            print(tr_words)
            mycol.update_one({'_id': entry_id},
                             {'$set': {'textrank_results': tr_words}})
            print("Successfully extended the data entry with textrank results",
                  entry_id)
        else:
            mycol.update_one({'_id': entry_id},
                             {'$set': {'textrank_results': []}})
            print("vocabulary is empty")
    except Exception:
        mycol.update_one({'_id': entry_id}, {'$set': {'textrank_results': []}})
        print("vocabulary is empty")

# run_textrank_model("F://Armitage_project//crawl_n_depth//extracted_json_files//0_www.sureway.com.au_data.json",50,5)
def test_extraction_with_TEXTRANK():
    tr = pytextrank.TextRank()
    pos_el = spacy.load("el_core_news_md")
    pos_el.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    while True:
        input_doc = input()
        if input_doc == "end":
            break
        output = extract_keywords_TEXTRANK(pos_el, input_doc, 5)
        print(output)
def phrase_rank(text, count):
    nlp = spacy.load("en_core_web_sm")
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    doc = nlp(text)

    res = []
    for phrase in doc._.phrases[:count]:
        res.append(str(phrase).capitalize())
    return res
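
# Illustrative call, not from the original source: phrase_rank returns the top
# `count` ranked phrases, each capitalized; the sample text is an assumption.
print(phrase_rank("TextRank builds a graph of candidate phrases and ranks them by centrality.", 3))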
def get_keywords(self, text):
    lang = self.get_language(text)
    model = lang + "_core_news_sm"
    print("loading model: '" + model + "'")
    nlp = spacy.load(model)

    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    doc = nlp(text)

    key_words = []
    for p in doc._.phrases:
        print("{:.4f} {:5d} {}".format(p.rank, p.count, p.text))
        key_words.append(p.text)
    return key_words[:10]
def yelpTrendyPhrases(business_id='Iq7NqQD-sESu3vr9iEGuTA', periods=12,
                      bagging_periods=3, days_per_period=30, topk=10):
    '''
    1. Get Yelp review texts
    2. Bag review texts within a certain period, e.g. 6 periods (180 days)
    3. Use TextRank to get scores
    4. Return JSON format for the frontend visualization
    '''
    # In Google Colab, running 6-period bagging:
    # CPU times: user 24.5 s, sys: 520 ms, total: 25 s
    # Wall time: 25 s
    # https://colab.research.google.com/drive/1r4uvFA6RNV35lO3JcYoO5Psz_EVhmNu0
    df_reviews = pd.DataFrame(columns=['date', 'text'])
    current_date = datetime.strptime('2018-11-30', '%Y-%m-%d')
    past_date = current_date - timedelta(days=days_per_period * periods - 1)
    getYelpReviews(business_id, starting_date=current_date, ending_date=past_date)

    # load a spaCy model, depending on language, scale, etc.
    nlp = spacy.load("en_core_web_sm/en_core_web_sm-2.2.5")

    # customize lemmatizer
    # https://spacy.io/api/lemmatizer
    # ...

    textrank = pytextrank.TextRank()
    nlp.add_pipe(textrank.PipelineComponent, name="textrank", last=True)

    keywords = []
    for period in range(periods):
        # [starting_date, ending_date] = 180 days,
        # or ending_date - starting_date = 179 days
        ending_date = current_date - timedelta(days=days_per_period * period)
        starting_date = ending_date - timedelta(
            days=days_per_period * bagging_periods - 1)
        condition = ((df_reviews['date'] >= starting_date) &
                     (df_reviews['date'] <= ending_date))
        df_texts = df_reviews[condition][['text', 'date']]
        text = " ".join(df_texts['text'].to_list())
        doc = nlp(text)

        for i, p in enumerate(doc._.phrases):
            keywords.append([ending_date, p.rank, p.count, p.text])
            if i >= topk - 1:
                break

    del df_reviews

    df_keywords = pd.DataFrame(keywords,
                               columns=['date', 'rank', 'count', 'keywords'])
    df_keywords = df_keywords['keywords'].value_counts().index[:topk]
def word_bag_list(org_text):
    """Summarize the text and return the summary as a single-item sentence list."""
    # load language model
    nlp = spacy.load("en_core_web_sm")

    # init pytextrank, then add pipe
    tr = pytextrank.TextRank(logger=None)
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    # parse the text
    doc = nlp(org_text)

    # merge results into one string
    whole_sent = ""
    for sent in doc._.textrank.summary(limit_phrases=15, limit_sentences=5):
        whole_sent = whole_sent + repr(sent).rstrip() + " "
    return [whole_sent]
def textrank(corpus):
    # assumes a spaCy pipeline `nlp` is already loaded at module level
    import pytextrank

    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    doc = nlp(corpus)

    print("=" * 50)
    for phrase in doc._.phrases:
        print(phrase)
        print("{:.4f} {:5d} {}".format(phrase.rank, phrase.count, phrase.text))
        print(phrase.chunks)

    print("=" * 50)
    for sent in doc._.textrank.summary(limit_phrases=15, limit_sentences=5):
        print(sent)
def __init__(self, name):
    super(App, self).__init__(name)

    # Load models
    print("[INFO] Loading spacy model")
    self.nlp = spacy.load('en_core_web_md')

    # Adding pipe
    print("[INFO] Adding pipe")
    tr = pytextrank.TextRank()
    self.nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    # Load classifier
    self.classifier = onnxruntime.InferenceSession("./models/classifier.onnx")
def keywords(self, text, ratio=0.2):
    nlp = spacy.load(self.model_name)
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    doc = nlp(text)

    if doc._.phrases:
        phrase_count = len(doc._.phrases)
        lower_limit_num = max(int(ratio * phrase_count), 1)
        return dict([(p.text, p.rank)
                     for p in doc._.phrases[:lower_limit_num]])
    else:
        return dict()
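
# Illustrative usage sketch, not from the original source: keywords is a method
# that reads self.model_name, so the _KeywordHolder class below is a
# hypothetical stand-in for the owning class.
class _KeywordHolder:
    model_name = "en_core_web_sm"

_KeywordHolder.keywords = keywords
print(_KeywordHolder().keywords(
    "Graph-based ranking surfaces the most central noun phrases in a document.",
    ratio=0.5))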
def get_key_phrases(textstr):
    nlp = spacy.load("en_core_web_sm")
    doc_list = []

    # add PyTextRank to the spaCy pipeline
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    doc = nlp(textstr)

    # examine the top-ranked phrases in the document
    for p in doc._.phrases:
        # print("{}".format(p.text))
        doc_list.append(p)
    return doc_list
def get_keywords(text):
    # load a spaCy model, depending on language, scale, etc.
    lang = get_language(text)
    os.system('python3 -m spacy download ' + lang)
    nlp = spacy.load(lang)
    nlp.max_length = 29204346

    # add PyTextRank to the spaCy pipeline
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    doc = nlp(text)

    # examine the top-ranked phrases in the document
    key_words = []
    for p in doc._.phrases:
        # print("{:.4f} {:5d} {}".format(p.rank, p.count, p.text))
        key_words.append(p.text)
    return key_words
def setup(base_path=".", testing=False):
    """
    add PyTextRank into the spaCy pipeline, then set up the input
    directory path for test vs. production env
    """
    nlp = spacy.load("en_core_web_sm")
    tr = pytextrank.TextRank(logger=None)
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    if testing:
        resource_path = Path(base_path) / "example/pub"
    else:
        resource_path = Path(base_path) / "resources/pub"

    return nlp, resource_path
def generate_base_map(self, text):
    tr = pytextrank.TextRank()
    self.nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)
    doc = self.nlp(text.lower())

    # extract and post-process the top key phrases, then cluster them
    aux_key = self.get_key_phrases(doc, 10)
    key_phrases = self.process_key_phrases(aux_key)
    key = [str(word) for word in key_phrases]
    clus = self.cluster_texts(key, int(len(key[0]) / 2))
    key = array(key)

    # the first phrase of each cluster becomes the key, the rest its children
    base_map = dict()
    for label, indexs in clus.items():
        if len(key[indexs].tolist()) > 1:
            base_map[key[indexs][0]] = key[indexs].tolist()[1:]
    return base_map
def run_textrank_model(posts, phrase_limit, summery_limit):
    # extract the topics from the given list of posts
    # data_words = list(sent_to_words(posts))
    # data_words_nostops = remove_stopwords(data_words)
    # data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ'])
    # all_tokens = [j for i in data_lemmatized for j in i]
    # combined_text = " ".join(all_tokens)
    combined_text = " ".join(posts)

    print("running textrank model")

    # load a spaCy model, depending on language, scale, etc.
    nlp = spacy.load("en_core_web_sm")

    # add PyTextRank to the spaCy pipeline
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    nlp.max_length = 150000000
    doc = nlp(combined_text)

    # examine the top-ranked phrases in the document
    tr_results = []
    tr_words = []
    for p in doc._.phrases[:phrase_limit]:
        print("{:.4f} {:5d} {}".format(p.rank, p.count, p.text))
        tr_results.append([p.rank, p.count, p.text])
        tr_words.append(p.text)

    # summery_res = []
    # for sent in doc._.textrank.summary(limit_sentences=summery_limit):
    #     summery_res.append(str(sent))

    return tr_words

# run_textrank_model("F://Armitage_project//crawl_n_depth//extracted_json_files//0_www.sureway.com.au_data.json",50,5)
def top_keywords(self, n=50):
    nlp = spacy.load("en_core_web_sm")
    nlp.max_length = 4000000

    # keep only adjectives, nouns and proper nouns as phrase candidates
    pos = ['ADJ', 'NOUN', 'PROPN']
    tr = pytextrank.TextRank(pos_kept=pos)
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    doc = nlp(self._text)

    # collect the top n ranked phrases
    keywords = []
    for p in doc._.phrases[:n]:
        logging.debug("{:.4f} {:5d} {}".format(p.rank, p.count, p.text))
        keywords.append(p.text)
    return keywords
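
# Illustrative usage sketch, not from the original source: top_keywords reads
# self._text, so the _TextHolder class below is a hypothetical stand-in for the
# owning class; logging is imported here because the method body uses it.
import logging

class _TextHolder:
    _text = "Deep learning models can extract key phrases from long technical documents."

_TextHolder.top_keywords = top_keywords
print(_TextHolder().top_keywords(n=5))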
def load_nlp_pipe(self, pipes):
    """
    This function creates and loads all the pipes into the nlp-er

    :pipes: list of pipe names as strings
    """
    for pipe in pipes:
        if pipe == 'sentencizer':  # needs to go before the parser
            nlp_pipe = self.nlp.create_pipe(pipe)
            self.nlp.add_pipe(nlp_pipe, before='parser')
        elif pipe == 'textrank':
            tr = pytextrank.TextRank()
            self.nlp.add_pipe(tr.PipelineComponent, name=pipe, last=True)
        else:
            nlp_pipe = self.nlp.create_pipe(pipe)
            self.nlp.add_pipe(nlp_pipe)
def main():
    nlp = spacy.load("en_core_web_sm")
    tr = pytextrank.TextRank(logger=None)
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    dir_path = Path("resources/pub")
    txt_path = dir_path / "txt"
    task_ids = []

    ray.init()

    for txt_file in tqdm(list(txt_path.glob("*txt")), ascii=True,
                         desc="extracted text files"):
        id = extract_phrases.remote(txt_file, dir_path, nlp)
        task_ids.append(id)

    ray.get(task_ids)