def wordRank():
    # Retrieve the source text from Elasticsearch
    results = es.get(index='nkdb', doc_type='nkdb', id='5dc9fc5033ec463330e97e94')
    texts = json.dumps(results['_source'], ensure_ascii=False)

    # Split the text into sentences
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', texts)

    # Normalize each sentence, keeping digits
    texts = [normalize(text, number=True) for text in sentences]

    wordrank_extractor = KRWordRank(
        min_count=3,    # minimum frequency of a word
        max_length=10,  # maximum length of a word
        verbose=True)

    beta = 0.85   # decaying factor beta of PageRank
    max_iter = 10
    keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)

    # Build a [{"y": weight, "label": word}, ...] list for the top 30 keywords
    result = []
    for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:30]:
        result.append({"y": r, "label": word})
    return json.dumps(result, ensure_ascii=False)
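# A minimal sketch of the module-level setup wordRank() assumes. The index,
# doc_type, and id come from the call above; the Elasticsearch host and the
# krwordrank import paths are assumptions for illustration.
import json
import re

from elasticsearch import Elasticsearch
from krwordrank.hangle import normalize  # assumed source of normalize()
from krwordrank.word import KRWordRank

es = Elasticsearch()  # assumed default host (localhost:9200); adjust as needed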
def sentrank_keyword(self):
    if self.content is not None:
        top_sents = []
        if self.title:
            top_sents.append(self.title)
        # The first sentence of the body is usually keyword-rich
        first_sent = sent_tokenize(self.content)[0]
        if first_sent:
            top_sents.append(first_sent)
        if top_sents:
            top_sents += self.sentrank()

        # Keep only lemmas that appear among the top-16 TF-IDF terms
        tfidf_kv = self.tf_idf()[:16]
        top_tfidf = {k: v for k, v in tfidf_kv}
        keywords = {}
        for sent in top_sents:
            lower = sent.strip().lower()
            sub = re.sub(r"\d+", " ", lower)  # drop digits
            tokens = word_tokenize(sub)
            for token in tokens:
                lemma = self.lemmatizer.lemmatize(token)
                if lemma in top_tfidf:
                    keywords[lemma] = top_tfidf[lemma]
        return sorted(keywords.items(), key=operator.itemgetter(1), reverse=True)
    else:
        return []
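# A minimal sketch of the class scaffolding that sentrank_keyword() and the
# other sentrank_* methods in this file assume. The attribute and method
# names (content, title, lemmatizer, tf_idf, sentrank) are taken from the
# call sites; the class name and the stub bodies are hypothetical.
import operator
import re

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

class Article:  # hypothetical host class for the sentrank_* methods
    def __init__(self, title, content):
        self.title = title
        self.content = content
        self.lemmatizer = WordNetLemmatizer()

    def tf_idf(self):
        # Expected to return [(term, score), ...] sorted by score, descending.
        raise NotImplementedError

    def sentrank(self):
        # Expected to return a list of the top-ranked sentences of self.content.
        raise NotImplementedError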
def keyword_gensim_lda(docs, k=5, num_topics=10, num_words=5):
    # Load the stopword list (one word per line)
    with open('SmartStoplist.txt') as f:
        stop_list = set(line.rstrip() for line in f)

    # Tokenize, lowercase, strip accents, and drop stopwords
    texts = [[
        word for word in gensim.utils.tokenize(
            document, lowercase=True, deacc=True, errors='replace')
        if word not in stop_list
    ] for document in docs]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics)

    # show_topics(formatted=False) yields (topic_id, [(word, probability), ...])
    gensim_topics = [
        t[1] for t in lda.show_topics(
            num_topics=num_topics, num_words=num_words, formatted=False)
    ]
    topics = [[(prob, word) for word, prob in t] for t in gensim_topics]

    # Sum each token's probability over all topics
    keywords = {}
    for topic in topics:
        for pr, token in topic:
            keywords[token] = keywords.get(token, 0.0) + pr

    # Multiply each token's summed probability by the log of its corpus frequency
    matrix = gensim.matutils.corpus2csc(corpus)  # terms x documents
    for token, pr in keywords.items():
        token_index = dictionary.token2id[token]
        token_freq = matrix.getrow(token_index).sum()
        keywords[token] = pr * math.log(token_freq)

    # Return the k keywords with the highest score
    return sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:k]
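# A hedged usage sketch for keyword_gensim_lda(). It assumes the imports
# below at module level; the sample documents are invented, and
# SmartStoplist.txt must exist in the working directory.
import math

import gensim
from gensim import corpora

if __name__ == '__main__':
    sample_docs = [
        "The central bank raised interest rates to curb inflation.",
        "Inflation pressures eased as energy prices fell sharply.",
        "Markets rallied after the bank signalled a pause in rate hikes.",
    ]
    for word, score in keyword_gensim_lda(sample_docs, k=5, num_topics=3):
        print(word, score)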
def sentrank_entity(self):
    if self.content is None:
        return [], []
    top_sents = []
    if self.title:
        top_sents.append(self.title)
    top_sents += self.sentrank()
    if not top_sents:
        return [], []

    entities = []
    keywords = {}
    tfidf_kv = self.tf_idf()[:16]
    top_tfidf = {k: v for k, v in tfidf_kv}
    for sent in top_sents:
        strip = sent.strip()
        # ner.ner() returns the named entities and the sentence tokens
        ners, words = ner.ner(strip)
        for ne in ners:
            if ne not in entities:
                entities.append(ne)
        # Score the remaining tokens against the top-16 TF-IDF terms
        sent_words = " ".join(words)
        lower = sent_words.strip().lower()
        sub = re.sub(r"\d+", " ", lower)  # drop digits
        tokens = word_tokenize(sub)
        for token in tokens:
            lemma = self.lemmatizer.lemmatize(token)
            if lemma in top_tfidf:
                keywords[lemma] = top_tfidf[lemma]
    return entities, sorted(keywords.items(), key=operator.itemgetter(1), reverse=True)
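# The ner module used above is project-internal and not shown. From its call
# site, ner.ner(sentence) is assumed to return (entities, tokens). A
# hypothetical stand-in built on NLTK's chunker (requires the punkt,
# averaged_perceptron_tagger, maxent_ne_chunker, and words corpora):
import nltk

class ner:  # stand-in matching the assumed interface
    @staticmethod
    def ner(sentence):
        tokens = nltk.word_tokenize(sentence)
        tree = nltk.ne_chunk(nltk.pos_tag(tokens))
        entities = [" ".join(leaf[0] for leaf in node.leaves())
                    for node in tree if hasattr(node, 'label')]
        return entities, tokens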
def sentrank_keyword_0(self):
    if self.content is not None:
        top_sents = self.sentrank()
        if top_sents:
            tfidf_kv = self.tf_idf()[:20]
            top_tfidf = {k: v for k, v in tfidf_kv}
            keywords = {}
            if self.title:
                # list.append returns None, so do not rebind top_sents to it
                top_sents.append(self.title)
            for sent in top_sents:
                strip = sent.strip()
                sub = re.sub(r"\d+", " ", strip)  # drop digits
                tokens = word_tokenize(sub)
                for token in tokens:
                    if token.lower() in top_tfidf:
                        keywords[token] = top_tfidf[token.lower()]
            return sorted(keywords.items(), key=operator.itemgetter(1), reverse=True)
    return []