def test_tfidf():
    """Print a tour of jieba's keyword, POS-tagging and tokenize APIs.

    Reads the sample corpus from a hard-coded Windows path, then prints the
    TF-IDF keywords, two TextRank keyword lists (with different POS filters),
    the POS tags of a fixed sentence and its token offsets.
    """
    # The context manager closes the corpus file; the handle used to leak.
    with open('D:\\Python\\Data\\NBA.txt', encoding='utf-8') as corpus:
        lines = corpus.read()
    print(type(lines))

    # Keyword extraction based on the TF-IDF algorithm.
    words = analyse.extract_tags(lines, topK=20, withWeight=True, allowPOS=())
    print(words)

    # Keyword extraction based on the TextRank algorithm.
    words = analyse.textrank(lines, topK=20, withWeight=False,
                             allowPOS=('ns', 'n', 'vn', 'v'))
    print(words)
    words = analyse.textrank(lines, topK=20, withWeight=False,
                             allowPOS=('ns', 'n'))
    print(words)

    # Part-of-speech tagging.
    words = pseg.cut('我爱自然语言处理')
    for word, flag in words:
        print(word, flag)

    # Tokenize: returns each word with its start/end offsets in the text.
    result = jieba.tokenize('我爱自然语言处理')
    print(list(result))
def _department_keyword_columns(text):
    """Return TextRank keywords and weights of *text* as two comma-terminated strings."""
    keywords, weights = [], []
    for keyword, weight in analyse.textrank(text, topK=50, withWeight=True):
        if keyword not in STOPWORD:
            keywords.append(keyword + ',')
            weights.append(str(weight) + ',')
    return ''.join(keywords), ''.join(weights)


def all_department_weight():
    """Store the top TextRank keywords of every department.

    Rows from ``sql_select_id_department_content()`` are consumed in order;
    the contents of consecutive rows sharing a department are concatenated
    and, whenever the department changes (and once more at the end), the
    keywords and weights of the accumulated text are written through
    ``insert_department_word_frequency``.

    When the query returns no rows nothing is inserted (previously a row
    with an empty department name was written).
    """
    departments = ''
    contents = ''
    seen_rows = False
    for _row_id, content, department in sql_select_id_department_content():
        seen_rows = True
        if departments == '':
            # First row of the very first group.
            departments = department
            contents = content
        elif departments == department:
            try:
                contents += str(content)
            except Exception as e:
                # Best effort: a row that cannot be appended is reported
                # and skipped, as before.
                print(e)
        else:
            # Department changed: flush the finished group, start the next.
            keywords, weights = _department_keyword_columns(contents)
            insert_department_word_frequency(department=departments,
                                             word=keywords, weight=weights)
            departments = department
            contents = content

    if seen_rows:
        # Flush the last group, which no department change follows.
        keywords, weights = _department_keyword_columns(contents)
        insert_department_word_frequency(department=departments,
                                         word=keywords, weight=weights)
    print('每个部门的词权重')
def generate_topics():
    """Rebuild the ``topic`` table from the ``news`` table, one row per day.

    News summaries are read in ``pubDate`` order and concatenated per
    calendar day. For every day the TextRank keywords (with weights) are
    stored together with the keywords that disappeared since the previous
    day (``dead``) and the ones that newly appeared (``new``).

    Days closed inside the loop use ``topK=40``; the final day uses
    ``topK=20``. The walk starts from the sentinel date 2019-12-31, so a row
    for that date is written as soon as the first differing date is met.
    """
    L.info("Start update topic.")
    db = Database()
    db.run("delete from topic")
    rows = db.select("select pubDate, summary from news order by pubDate asc")
    sql = "insert into topic (date, topic, dead, new) values (%s, %s, %s, %s)"

    def store_day(day, text, earlier, top_k):
        """Insert the topic row for *day* and return that day's keyword list."""
        weights = {}
        current = []
        for keyword, weight in textrank(text, topK=top_k, withWeight=True):
            weights[keyword] = weight
            current.append(keyword)
        born = [word for word in current if word not in earlier]
        gone = [word for word in earlier if word not in current]
        db.execute(sql, [
            day.strftime("%Y-%m-%d %H:%M:%S"),
            str(weights),
            str(gone),
            str(born),
        ])
        return current

    day = datetime.date(2019, 12, 31)
    text = ""
    earlier = []
    for line in rows:
        stamp_day = line[0].date()
        if stamp_day == day:
            text += line[1]
            continue
        # A new day begins: close the previous one first.
        earlier = store_day(day, text, earlier, 40)
        L.info("\tNow processing {}".format(day.strftime("%Y-%m-%d")))
        day = stamp_day
        text = line[1]

    # The last day is never followed by a date change, so close it here.
    store_day(day, text, earlier, 20)
    L.info("\tFinished update topic.")
def generate_weibo_topics():
    """Append one ``topic`` row per day computed from the ``weibo`` database.

    Posts are read in ``publish_time`` order from the local ``weibo`` MySQL
    database, concatenated per calendar day, and the top 20 TextRank
    keywords (with weights) of each day are inserted into ``topic`` through
    ``Database``.

    The walk starts from the sentinel date 2019-12-31, so a row for that
    date is written as soon as the first differing date is met.
    """
    L.info("Start update weibo topic.")
    # Keyword arguments: PyMySQL >= 1.0 no longer accepts positional
    # connection parameters.
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration.
    db_weibo = pymysql.connect(host="localhost", user="root",
                               password="root", database="weibo")
    try:
        cursor = db_weibo.cursor()
        try:
            cursor.execute(
                "select publish_time, content from weibo order by publish_time asc")
            data = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # The source connection is only needed for the read above.
        db_weibo.close()

    db = Database()
    sql = "insert into topic (date, topic) values (%s, %s)"

    def store_day(day, text):
        """Insert the keyword/weight mapping of *text* for *day*."""
        topic = {
            keyword: weight
            for keyword, weight in textrank(text, topK=20, withWeight=True)
        }
        db.execute(sql, [day.strftime("%Y-%m-%d %H:%M:%S"), str(topic)])

    now = datetime.date(2019, 12, 31)
    news = ""
    for line in data:
        if line[0].date() == now:
            news += line[1]
        else:
            store_day(now, news)
            L.info("\tNow processing {}".format(now.strftime("%Y-%m-%d")))
            now = line[0].date()
            news = line[1]
    # Flush the last day, which no date change follows.
    store_day(now, news)
    L.info("\tFinished update weibo topic.")
def jsonFileTranslate(rdd):
    """Enrich every JSON record of *rdd* with keywords and save it.

    Each element is parsed as JSON; ``KeyWord`` and ``TitleKey`` receive the
    TextRank keywords of the ``Text`` and ``Title`` fields, ``SplitText``
    the space-joined search-mode segmentation of ``Text``. The title and
    keywords are printed and the record is handed to ``saveJsonFileToHDFS``.

    An empty *rdd* is a no-op. The former ``rdd.count()`` guard was dropped:
    it triggered a separate evaluation of the RDD although iterating an
    empty ``collect()`` result already does nothing.
    """
    for raw in rdd.collect():
        record = json.loads(raw)
        text = record["Text"]
        record["KeyWord"] = analyse.textrank(text)
        record["SplitText"] = " ".join(jieba.cut_for_search(text))
        record["TitleKey"] = analyse.textrank(record["Title"])
        print(record["Title"])
        print(record["KeyWord"])
        saveJsonFileToHDFS(record)
def all_classification_weight():
    """Store keyword weights for every classification, using Redis as scratch space.

    1. Delete every key currently returned by ``redisdriver.keys_get()``.
    2. Walk ``sql_select_classification_context()``; for each non-``None``
       classification count its rows and keep the concatenated context in
       Redis under the classification name.
    3. For every Redis key, run TextRank (``topK=600``) on the accumulated
       context and insert the classification, its row count and the
       comma-terminated keyword/weight strings.

    The unused, never-closed handle on ``../data/deal/class.txt`` that used
    to be opened here has been removed.
    """
    # Clear Redis before accumulating.
    for stale_key in redisdriver.keys_get():
        redisdriver.driver().delete(stale_key)

    categorys = {}
    for classification, context in sql_select_classification_context():
        if classification is None:
            continue
        exist = redisdriver.key_exists(classification)
        if exist == 0:
            categorys[classification] = 1
            redisdriver.classification_weight_set(classification, context)
        elif exist == 1:
            categorys[classification] += 1
            # New context goes in front of what is already stored.
            context += redisdriver.value_get(classification).decode()
            redisdriver.delete_key(classification)
            redisdriver.classification_weight_set(classification, context)

    for raw_key in redisdriver.keys_get():
        one_classification = raw_key.decode()
        all_classification_context = redisdriver.value_get(raw_key).decode()
        keywords, weights = [], []
        for keyword, weight in analyse.textrank(all_classification_context,
                                                topK=600, withWeight=True):
            if keyword not in STOPWORD:
                keywords.append(keyword + ',')
                # Weights are stored truncated to 12 characters.
                weights.append(str(weight)[:12] + ',')
        sql_insert_classification_weight(one_classification,
                                         categorys[one_classification],
                                         ''.join(keywords), ''.join(weights))
    print('类别、数量、词、 权重')
def fenci(df, i):
    """Return the top-10 TextRank keywords of ``str(df[i])`` joined by '/'.

    Only words tagged with one of the POS tags listed below are considered.
    """
    allowed_pos = ('mq', 'nz', 'v', 'z', 'a', 'n', 'ns')
    top_keywords = analyse.textrank(str(df[i]), topK=10, withWeight=False,
                                    allowPOS=allowed_pos)
    return "/".join(top_keywords)
def analyse_key_word_of_post(session, url, number=10):
    """Return up to *number* TextRank keywords for a forum post (e.g. for a word cloud).

    The page count is read from the pager of the first page, every page is
    then fetched with ``&page=<n>`` and the first ``<p>`` without a
    ``class`` attribute of each message body is collected.

    Fixes relative to the previous version:
    * the character filter (keep ASCII letters/digits and CJK only) is now
      really applied; the old loop rebound its loop variable and threw the
      cleaned text away;
    * message bodies without any ``<p>`` are skipped instead of raising
      ``AttributeError``.

    :param session: object with a ``get(url, headers=...)`` method.
    :param url: URL of the post; ``&page=<n>`` is appended for paging.
    :param number: maximum number of keywords returned (TextRank itself
        yields at most its default ``topK``).
    """
    # Collect all message texts.
    response = session.get(url, headers=headers)
    soup = BeautifulSoup(response.text, features="lxml")
    pager = soup.find('div', class_='paging paging-top')
    page_count = int(pager.find_all('div')[-2].get_text().split(' ')[1])

    rule = re.compile(u"[^a-zA-Z0-9\u4e00-\u9fa5]")
    data = []
    for page_idx in range(1, page_count + 1):
        a_response = session.get(url + '&page=' + str(page_idx),
                                 headers=headers)
        a_soup = BeautifulSoup(a_response.text, features="lxml")
        for div in a_soup.find_all('div',
                                   class_='body file-read image-click-view'):
            p = div.find('p')
            if p is not None and p.get('class') is None:
                data.append(rule.sub('', p.get_text()))

    # Analyse the collected messages and return the keywords.
    data_text = '。'.join(data)
    key_words = analyse.textrank(data_text)
    return key_words[:number]
def rank_words(self, count=36, allow_pos=('ns', 'n', 'vn', 'v')):
    """Return the *count* best TextRank keywords of the instance text.

    ``self._inner_init()`` is invoked first. *allow_pos* restricts the
    candidates to the given POS tags; weights are not returned.
    """
    self._inner_init()
    ranked = als.textrank(self.__text, withWeight=False, topK=count,
                          allowPOS=allow_pos)
    return ranked
def get_wordcloud(save_path, path, song_name, author):
    """Render a word cloud of the text stored in *save_path* and save it as JPEG.

    The user is asked for the file name of the mask image. The text is
    segmented with jieba, its 400 best TextRank keywords become the
    frequencies of the cloud, and the picture is shown and written to
    ``<path>/<song_name>(<author>).jpg``.

    :param save_path: UTF-8 text file to analyse.
    :param path: directory receiving the generated image.
    :param song_name: first part of the output file name.
    :param author: converted with ``str`` and put in parentheses after
        *song_name*.
    """
    author = str(author)
    print('\n请保证使用图片的保存位置与本程序的保存位置相同!!!\n否则程序将无法正常运行!!!\n')
    picture = input('请输入使用图片名称(例:picture.jpg):\n')
    color_mask = imread(picture)

    with open(save_path, 'r', encoding='utf-8') as f:
        string = f.read()
    words = ' '.join(jieba.cut(string))
    top_words = analyse.textrank(words, topK=400, withWeight=True)
    ret_words = {word: weight for word, weight in top_words}

    wordcloud = WordCloud(background_color='white',
                          mask=color_mask,
                          max_words=100,
                          stopwords=STOPWORDS,
                          # Raw string: the plain literal relied on invalid
                          # escape sequences (\W, \F, \s); the value is the same.
                          font_path=r'C:\Windows\Fonts\simkai.ttf',
                          max_font_size=100,
                          random_state=30,
                          margin=2)
    wc = wordcloud.generate_from_frequencies(frequencies=ret_words)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    save_path = path + os.path.sep + '{file_name}.{file_suffix}'.format(
        file_name=song_name + '(' + author + ')', file_suffix='jpg')
    wc.to_file(save_path)
    plt.show()
def get_detail_content():
    """Return ``[title, keywords]`` for every entry yielded by ``getHot()``.

    Each entry is clicked, the browser switches to the next window handle,
    the text of result cards 2-6 is collected (links removed) and its top 20
    TextRank keywords are extracted. After a 6 second pause the browser
    switches back to the first window.
    """
    weibos = []
    # The n-th click is handled in window handle n (handle 0 is the hot list).
    for target, button in enumerate(getHot(), start=1):
        weibo = [button.text]
        button.click()
        browser.switch_to.window(browser.window_handles[target])

        # Drop the xmlns attribute, otherwise removing <a> tags below has
        # no effect.
        html = browser.page_source.replace(
            'xmlns="http://www.w3.org/1999/xhtml"', '')
        cards = list(pq(html)('.m-main .m-wrap .m-con-l .card-wrap').items())

        true_content = ''
        for card in cards[1:6]:
            txt = card('.card .card-feed .content .txt')
            txt('a').remove()
            true_content += txt.text().replace('\n', '') + ' '

        weibo.append(analyse.textrank(true_content, topK=20))
        time.sleep(6)
        browser.switch_to.window(browser.window_handles[0])
        weibos.append(weibo)
    return weibos
def DrawWordCloud():
    """Interactively build a word cloud image and save it as ``图云.png``.

    Prompts for the text file, the mask image, the font file and a
    background colour. The colour answer is looked up in ``dic``; an
    unknown answer is used verbatim (e.g. a ``#rrggbb`` code) and an empty
    answer means no background colour. The 300 best TextRank keywords of
    the text are used as frequencies.
    """
    textfile = input('文件路径:')
    imagefile = input('图片路径:')
    fontpath = input('字体路径(字体路径不能有中文):')
    color_num = input('选择背景颜色:\n1.黑色;2.白色;3.红色;4.蓝色;5.黄色;6.绿色\n'
                      '可直接用"#"开始的颜色编码来自定义背景色:')
    bg_color = dic.get(color_num, color_num) if color_num != '' else None

    # The context manager closes the text file; the handle used to leak.
    with open(textfile, 'r', encoding='utf-8') as handle:
        comment_text = handle.read()
    result = analyse.textrank(comment_text, topK=300, withWeight=True)
    keywords = {entry[0]: entry[1] for entry in result}

    color_mask = imread(imagefile)
    cloud = WordCloud(font_path=fontpath,
                      background_color=bg_color,
                      mode='RGBA',
                      mask=color_mask,
                      max_words=100,
                      max_font_size=300)
    word_cloud = cloud.generate_from_frequencies(keywords)
    word_cloud.to_file("图云.png")
    plt.imshow(word_cloud)
    plt.axis('off')
    plt.show()
def output(ori_datasets, seg_datasets, labels):
    """Print every cluster with its size, key words and members.

    ``labels[i]`` is the cluster of item ``i``. For each cluster the
    segmented texts are concatenated, the top 10 TextRank keywords
    (restricted to the POS tags below) are printed on a header line, and
    each member follows as ``<original> || <segmented>``.

    The Python 2 print statements were replaced by ``print()`` calls with a
    single pre-formatted string, so the function now runs on Python 3 and
    prints the same text as before.
    """
    clusters = {}
    clusters_ori_datasets = {}
    clusters_seg_datasets = {}
    for idx, label in enumerate(labels):
        clusters[label] = clusters.get(label, "") + " " + seg_datasets[idx]
        clusters_ori_datasets.setdefault(label, []).append(ori_datasets[idx])
        clusters_seg_datasets.setdefault(label, []).append(seg_datasets[idx])

    for idx, label in enumerate(clusters):
        key_words = " ".join(
            analyse.textrank(clusters[label], topK=10,
                             allowPOS=["a", "ng", "n", "nr", "ns", "nt", "nz"]))
        print("cluster: %d\tset size: %d\tkey words: %s" % (
            idx + 1, len(clusters_ori_datasets[label]), key_words))
        members = zip(clusters_ori_datasets[label],
                      clusters_seg_datasets[label])
        for original, segmented in members:
            # No space after the tabs: the Python 2 print statement did not
            # emit one after an item ending in a tab.
            print("\t\t\t\t%s || %s" % (original, segmented))
def keyExtractor(filename, k=20, thresh=0.2):
    """Return the TextRank keywords of a UTF-8 text file.

    The text is segmented with jieba (accurate mode); single-character
    tokens and stop words are dropped and the remaining words are
    re-joined without separators before TextRank runs on nouns only
    (``n``, ``nr``, ``ns``).

    :param filename: path of the UTF-8 encoded document.
    :param k: maximum number of keywords requested from TextRank.
    :param thresh: minimum weight a keyword needs to be returned.
    :return: list of keywords whose weight is ``>= thresh``.
    """
    # The context manager closes the document; the handle used to leak.
    with open(filename, encoding='utf-8') as f:
        docs = f.read()

    # A set makes the per-token stop-word test O(1).
    stopwords = set(get_stopword_list())
    # ``len(word) > 1`` also rejects the bare " " and "\n" tokens that were
    # previously tested separately.
    word_list = [
        word for word in jieba.cut(docs, cut_all=False)
        if len(word) > 1 and word not in stopwords
    ]

    # Re-assemble the filtered words into one string.
    sentence = "".join(word_list)

    # Extract the keywords with TextRank.
    keywords_map = anal.textrank(sentence, topK=k, withWeight=True,
                                 allowPOS=('n', 'nr', 'ns'))
    return [item[0] for item in keywords_map if item[1] >= thresh]
def topWord(self, top=100):
    """Print and return ``{keyword: weight}`` for the best *top* keywords.

    TextRank runs on ``self.sentense`` and only considers words tagged
    ``nz``, ``a``, ``v`` or ``z``.
    """
    ranked = textrank(self.sentense, withWeight=True, topK=top,
                      allowPOS=('nz', 'a', 'v', 'z'))
    top_dict = {keyword: weight for keyword, weight in ranked}
    print(top_dict)
    return top_dict
def get_tfidf_and_lsi(corpus, texts):
    """Build LSI feature vectors after boosting TextRank keywords in *corpus*.

    For each text its TextRank keywords are computed; every keyword that
    occurs in the matching ``corpus[i]`` token list is appended to that list
    ``weight`` more times. The documents in *corpus* are therefore modified
    in place. A dictionary, a TF-IDF model and a 500-topic LSI model are
    then built from the boosted corpus.

    Returns a tuple ``(vec, lsi_vectors, keywords)``: a list with one
    500-element array per document, the object returned by
    ``lsi[tfidf_vectors]``, and the per-text TextRank results.
    """
    # For each text get its TextRank keywords and duplicate those keywords
    # `weight` times in the corpus document, i.e. raise their weight.
    keywords = []
    for i, text in enumerate(texts):
        text_k = textrank(text, withWeight=True, allowPOS=('n', 'nr', 'ns', 'nt', 'nz', 'nrt', 'j', 'v', 'vn'))
        keywords.append(text_k)
        words = corpus[i]
        # Start with a boost equal to the number of keywords found.
        weight = len(text_k)
        for word in text_k:
            if word[0] in words:
                words.extend(weight*[word[0]])
                # NOTE(review): the source layout was lost; this decrement is
                # assumed to belong to the `if` (only matched keywords lower
                # the boost) - confirm against the original file.
                weight -= 1
    dictionary = corpora.Dictionary(corpus)
    # NOTE(review): length_of_dictionary is not used below.
    length_of_dictionary = len(dictionary)
    doc_vectors = [dictionary.doc2bow(text) for text in corpus]
    # TF-IDF features
    tfidf = models.TfidfModel(doc_vectors)
    tfidf_vectors = tfidf[doc_vectors]
    # LSI features
    lsi = models.LsiModel(tfidf_vectors, id2word=dictionary, num_topics=500)
    lsi_vectors = lsi[tfidf_vectors]
    vec = []
    # Expand each (index, value) list into a dense 500-element vector;
    # positions not listed stay zero.
    for i, ele in enumerate(lsi_vectors):
        feature = np.zeros(500)
        for idx, val in ele:
            feature[idx] = val
        vec.append(feature)
    return vec, lsi_vectors, keywords
def gwxz_anls():
    """Recompute table ``zp_word_by_zwlb`` from the job requirements in ``zp_item``.

    Rows with a non-empty ``rzyq`` and ``category`` are streamed through a
    server-side cursor; the top 10 TextRank keywords of each ``rzyq`` are
    grouped by (year, month, category) and the occurrence count of every
    keyword per group is inserted. The module-level connection ``conn`` is
    committed and closed at the end.

    Groups are keyed by a tuple. The previous ``'_'``-joined string key was
    split again on ``'_'``, which truncated any category that itself
    contained an underscore.
    """
    cursor2 = conn.cursor(cursors.SSCursor)
    cursor2.execute(
        "select rzyq,year(fbrq),month(fbrq),category from zp_item where rzyq!='' and category is not NULL and category!=''"
    )
    import_jb_library()

    grouped = {}
    for result in cursor2:
        keywords = anls.textrank(result[0], topK=10)
        if not keywords:
            # Rows without any keyword never created a group.
            continue
        key = (str(result[-3]), str(result[-2]), str(result[-1]))
        grouped.setdefault(key, []).extend(keywords)
    cursor2.close()

    cursor = conn.cursor()
    cursor.execute('truncate table zp_word_by_zwlb')
    # Insert the data.
    sql = 'insert into zp_word_by_zwlb value (NULL,%s,%s,%s,%s,%s)'
    for (year, month, category), group_words in grouped.items():
        parm_list = [(num, category, year, month, word)
                     for word, num in Counter(group_words).most_common()]
        cursor.executemany(sql, parm_list)
    conn.commit()
    cursor.close()
    conn.close()
def getKeywords(self, num=10):
    '''
    # Extract the keywords of the text

    ## Parameters

    - int num : number of keywords to return

    ## Algorithm selection

    The algorithm depends on the length of the text:

    - short text (fewer than 50 characters): TextRank
    - long text: TF-IDF

    ## Output

    ```
    # jba.textrank(data, topK=num, withWeight=True)
    [('keyword', weight), ...]
    # jba.textrank(data, topK=num)
    ['keyword', ...]
    ```

    The second form (keywords only) is the one used.
    '''
    is_short = len(self.getText()) < 50
    extractor = jba.textrank if is_short else jba.extract_tags
    return extractor(self.getText(), topK=num)
def disp(self):
    """Write the clustering result to ``ans\\cluster_ans.csv`` and return it.

    For every cluster id in ``range(self.num_clusters)`` the entries of
    ``self.Q_list`` labelled with that id in ``self.result`` are
    concatenated and their TextRank keywords computed. Each answer row is
    ``[label, cluster keywords, tt[i][0], ..., tt[i][4]]``; the rows are
    sorted by label and saved without header or index.

    Returns ``(key_temp, ans)`` where ``key_temp`` lists every distinct
    keyword once. Keywords now keep first-seen order (the order used to be
    arbitrary), and the inner loop no longer reuses the cluster index as
    its own loop variable.
    """
    res = self.result
    tt = self.tt

    unique_keywords = {}   # dict keys give an ordered, duplicate-free set
    keyword_list = []
    for cluster in range(self.num_clusters):
        text = ''.join(self.Q_list[j]
                       for j, label in enumerate(res) if label == cluster)
        keywords = analyse.textrank(text)
        unique_keywords.update(dict.fromkeys(keywords))
        keyword_list.append(",".join(keywords))
    key_temp = list(unique_keywords)

    ans = [[
        res[i], keyword_list[res[i]], tt[i][0], tt[i][1], tt[i][2], tt[i][3],
        tt[i][4]
    ] for i in range(len(res))]
    ans = sorted(ans, key=lambda row: row[0])
    dff = pd.DataFrame(ans)
    dff.to_csv('ans\\cluster_ans.csv',
               index=False,
               header=False,
               encoding='utf-8-sig')
    return key_temp, ans
def single():
    """Analyse one piece of text and render ``single.html``.

    A GET request analyses a built-in sample paragraph, a POST request the
    submitted ``content`` form field. The template receives the POS-tagged
    segmentation, the top 5 TextRank and TF-IDF keywords (with weights), a
    sentiment score mapped to a 0-100 "up"/"down" percentage pair, and a
    three-sentence SnowNLP summary.

    Fix: whitespace is stripped from submitted text with ``\\s``. The old
    pattern ``(\\s|\\n|t)`` also deleted every letter "t".
    """
    content = u""
    if request.method == 'GET':
        content = u"12日上午,国家主席习近平同美国总统特朗普通电话。两国元首就朝鲜半岛局势等共同关心的问题交换了意见。习近平强调,中方坚持实现半岛无核化目标,坚持维护半岛和平稳定,主张通过和平方式解决问题,愿同美方就半岛问题保持沟通协调。关于叙利亚问题,习近平指出,任何使用化学武器的行为都不可接受。叙利亚问题要坚持政治解决的方向。联合国安理会保持团结对解决叙利亚问题非常重要,希望安理会发出一致声音。两国元首同意通过各种方式保持密切联系。"
    if request.method == 'POST':
        # NOTE(review): the source layout was lost; the clean-up and the
        # debug print are assumed to belong to the POST branch.
        content = request.form['content']
        content = re.sub(r'\s', u'', content)
        print(content)

    seg_list = [(word, flag) for word, flag in pseg.cut(content)]
    textrank_key_list = analyse.textrank(content, topK=5, withWeight=True)
    tf_idf_key_list = analyse.tfidf(content, topK=5, withWeight=True)

    scorer = sentiment.Sentiment()
    sentiment_score = scorer.single_review_sentiment_score(content)
    # Squash the raw score into (0, 100) with arctan.
    sentiment_score_up = (math.atan(sentiment_score) * 2 / math.pi +
                          1) / 2 * 100
    sentiment_score_down = 100 - sentiment_score_up

    summary = SnowNLP(content).summary(3)
    return render_template(
        "single.html",
        seg_list=seg_list,
        textrank_key_list=textrank_key_list,
        tf_idf_key_list=tf_idf_key_list,
        sentiment_score_up=sentiment_score_up,
        sentiment_score_down=sentiment_score_down,
        summary=summary,
        content=content,
    )
def get_frequency_words(file):
    """Return ``{keyword: weight}`` for the 400 best TextRank keywords of *file*.

    *file* is the path of a text file, read with the platform default
    encoding.
    """
    with open(file, 'r') as f:
        texts = f.read()
    ranked = analyse.textrank(texts, topK=400, withWeight=True)
    return {entry[0]: entry[1] for entry in ranked}
def cut_word(sentence, tag='tfidf'):
    """Return the keywords of *sentence*, stop words removed, joined by spaces.

    ``tag == 'textrank'`` selects TextRank; any other value selects TF-IDF.
    """
    extractor = analyse.textrank if tag == 'textrank' else analyse.tfidf
    kept = []
    for term in extractor(sentence):
        if term not in stopwords:
            kept.append(term)
    return ' '.join(kept)
def jieba_textRank_extract_keyword():
    """Print the top 10 TextRank keywords of the People's Daily corpus file.

    The file holds space-separated tokens; the spaces are removed before
    the text is handed to jieba.
    """
    from jieba.analyse import textrank

    with open('../data/peoples_daily_word_cut.txt', 'r',
              encoding='utf8') as f:
        segmented = f.read()
    original_corpus = segmented.replace(' ', '')
    print(textrank(original_corpus, topK=10))
def process_item(self, item, spider):
    """Store the top 5 TextRank keywords of ``item['content']`` in ``item['keywords']``.

    The keywords are joined with single spaces; the same *item* is returned.
    """
    top_terms = analyse.textrank(item['content'], topK=5)
    item['keywords'] = ' '.join(top_terms)
    return item
def getTextRank(length, text):
    """Return a ``defaultdict(float)`` mapping the top *length* keywords to weights.

    Only words whose POS tag is in ``ALLOW_SPEECH_TAGS`` are ranked; a word
    that was not ranked reads as ``0.0``.
    """
    textRankScore = defaultdict(float)
    ranked = ayse.textrank(text, topK=length, withWeight=True,
                           allowPOS=ALLOW_SPEECH_TAGS)
    for word, score in ranked:
        textRankScore[word] = score
    return textRankScore
def rank_fit(cut_docs, labels, path):
    """Dump the TextRank word weights of every labelled document to *path* as JSON.

    Every document is ranked without a keyword limit (``topK=None``) on the
    POS tags in ``pos_set``; the pairs go through ``make_dict`` and are
    stored under the document's label (a repeated label keeps the last
    document). The mapping is also printed when the module runs as a script.
    """
    label_pairs = {}
    for cut_doc, label in zip(cut_docs, labels):
        ranked = textrank(cut_doc, topK=None, withWeight=True,
                          allowPOS=pos_set)
        label_pairs[label] = make_dict(ranked)

    with open(path, 'w') as f:
        json.dump(label_pairs, f, ensure_ascii=False, indent=4)

    if __name__ == '__main__':
        print(label_pairs)
def get_keywords(self,condition=None,topK=20):
    '''Return the TextRank keywords of the corpus.

    When *condition* is given, only ``self.texts_raw[condition]`` is used;
    otherwise all of ``self.texts_raw``. The selected texts are joined with
    single spaces before ranking.
    '''
    source = self.texts_raw if condition is None else self.texts_raw[condition]
    return analyse.textrank(' '.join(source), topK=topK)
def test_rumor(sentence):
    """Classify *sentence* and collect related rumor records.

    Returns ``None`` for an empty sentence. Otherwise returns a dict with
    ``isRumor`` (result of ``is_rumor``) and ``related``: the entries of
    ``get_rumor_data(keyword, page=1, num=2)[0]`` for each of the top 3
    TextRank keywords, in keyword order.
    """
    if not sentence:
        return None
    verdict = is_rumor(sentence)
    related = []
    for keyword, _weight in textrank(sentence, topK=3, withWeight=True):
        related.extend(get_rumor_data(keyword, page=1, num=2)[0])
    return {'isRumor': verdict, 'related': related}
def text_summary(select_func, title_holder):
    """Render the text-summarisation page.

    Shows the raw content, a summary produced by ``process_pipe`` with the
    chosen similarity metric, a summary from ``gensim_summarize``, and
    optionally keywords from one of three extractors.

    select_func:  string whose last whitespace-separated word becomes the
                  upper-cased page title.
    title_holder: object whose ``markdown`` method receives the title.

    NOTE(review): ``st`` is presumably Streamlit. The source layout was lost;
    the nesting below (keywords section inside the "content was changed"
    branch, ``bokeh_figure()`` inside the "Show Keywords" branch) is a
    reconstruction - confirm against the original file.
    """
    title_holder.markdown('# ' + select_func.split()[-1].upper())
    st.info('Raw content')
    # Placeholder that is filled (and later re-filled) with the content.
    raw_content = st.empty()
    demo_checkbox = st.sidebar.checkbox('Show news demo')
    st.sidebar.subheader('Input your texts here')
    content = st.sidebar.text_area('Click outside textarea after input', 'Texts need to be summarized.')
    st.sidebar.subheader('Method to compute similarity')
    metric_radio = st.sidebar.radio('', ('Co-occur', 'Cosine'))
    if metric_radio == 'Cosine':
        metric = cosine
        # NOTE(review): the selected embedding method is not read back here.
        st.sidebar.selectbox('Method to compute sentence embedding', ('SIF', 'Average Embedding'))
    else:
        metric = similarity_with_coocurr
    if demo_checkbox:
        # The demo text replaces whatever was typed into the text area.
        content = news_demo
    # Only run the pipeline once the placeholder text has been replaced.
    if content != 'Texts need to be summarized.':
        raw_content.markdown(content)
        summary = process_pipe(content, metric, tokens_counter, fasttext)
        st.success(f'Summarized content with {metric.__name__}')
        st.write(summary)
        st.warning(f'Summarized content with gensim API - Unstable')
        try:
            st.write(gensim_summarize(content))
        except ValueError as e:
            # Show the ValueError on the page instead of aborting it.
            st.error(e)
        st.sidebar.subheader('Keywords')
        show_kw = st.sidebar.checkbox('Show Keywords')
        if show_kw:
            kw_length = st.sidebar.slider(label='Keywords Number', min_value=3, max_value=10)
            kw_method = st.sidebar.selectbox(
                'Method to extract keywords',
                ('TextRank', 'TextRank API', 'Tf-idf API'))
            if kw_method == 'TextRank':
                # Project implementation working on a co-occurrence structure.
                keywords = extract_keyword(co_occurrence(content, window_size=2), topk=kw_length)
            elif kw_method == 'TextRank API':
                keywords = textrank(content, topK=kw_length)
            else:
                keywords = extract_tags(content, topK=kw_length)
            st.sidebar.info('\t'.join(keywords))
            # Re-render the raw content from pretty_output, as HTML.
            content = pretty_output(content, keywords)
            raw_content.markdown(content, unsafe_allow_html=True)
            bokeh_figure()
def chinese():
    """Return ``{keyword: weight}`` for the top 50 TextRank keywords of ``hotWords.txt``.

    The file is read from the current directory with GBK encoding.
    """
    with open('hotWords.txt', 'r', encoding='gbk') as f:
        string = f.read()
    ranked = analyse.textrank(string, topK=50, withWeight=True)
    return {entry[0]: entry[1] for entry in ranked}
def weibo_juhe(x, y):
    """Aggregate the texts *y* of key *x* into one keyword line.

    The texts are passed through ``valid_jsontxt``, joined with newlines and
    ranked with TextRank (top 20, restricted to the POS tags below). The
    result is ``<x>\\001<word>_<weight>\\t<word>_<weight>...`` with every
    piece passed through ``valid_jsontxt``.
    """
    user_weibo = "\n".join([valid_jsontxt(text) for text in y])
    ranked = ja.textrank(user_weibo, topK=20, withWeight=True,
                         allowPOS=('an', 'i', 'j', 'l', 'n', 'nr', 'nrfg',
                                   'ns', 'nt', 'nz', 't', 'eng'))
    words = [
        valid_jsontxt(pair[0]) + "_" + valid_jsontxt(pair[1])
        for pair in ranked
    ]
    return valid_jsontxt(x) + "\001" + "\t".join(words)
def getKeyWord(sentence, algorithm=None):
    """Print the three best keywords of *sentence* with their weights.

    :param sentence: text to extract keywords from.
    :param algorithm: ``None`` for TF-IDF or ``'TextRank'`` for TextRank.
    :raises ValueError: for any other *algorithm*. Previously this fell
        through to an unbound ``result`` and failed with a confusing
        ``UnboundLocalError``.

    The Python 2 print statements were replaced by single-argument
    ``print()`` calls, which behave the same on Python 2 and 3.
    """
    if algorithm is not None and algorithm != 'TextRank':
        raise ValueError('unsupported algorithm: %r' % (algorithm,))

    if algorithm is None:
        print("關鍵字:")
        # Positional arguments are (sentence, topK, withWeight):
        #   topK       - number of highest-weighted keywords to return
        #                (library default 20)
        #   withWeight - also return each keyword's weight (default False)
        #   allowPOS   - restrict to the given POS tags; empty by default,
        #                i.e. no filtering, e.g. ('ns', 'n', 'vn', 'v')
        result = ana.extract_tags(sentence, 3, True)
    else:
        print("關鍵字(With TextRank):")
        result = ana.textrank(sentence, 3, True)

    for keyWord, weight in result:
        print(keyWord + ' w:' + str(weight))
    print('='*50)
def handle_requests(self):
    """Answer a keyword-extraction request with JSON, cached in memcache.

    Request parameters: ``text``, ``withWeight`` (integer flag, default 0),
    ``mode`` (``"TF-IDF"`` selects TF-IDF, anything else TextRank),
    ``topK`` (default 20) and repeated ``allowPOS`` values. The MD5 of these
    values is the cache key; on a miss the keywords are computed with the
    stop-word list shipped next to this file and the JSON payload is stored
    for ``MEMCACHE_TIMEOUT``.

    NOTE: the inbound-app-id allow-list check is disabled (it was
    commented out in the source).
    """
    text = self.request.get("text", default_value="")
    withWeight = int(self.request.get("withWeight", default_value="0"))
    mode = self.request.get("mode", default_value="")
    topK = int(self.request.get("topK", default_value="20"))
    allowPOS = tuple(self.request.get_all("allowPOS"))

    fingerprint = ''.join(allowPOS) + text + mode + str(withWeight) + str(topK)
    md5sum = hashlib.md5(fingerprint.encode("utf-8")).hexdigest()
    cache_key = '{}:Analyse'.format(md5sum)

    json_data = memcache.get(cache_key)
    if json_data is None:
        here = os.path.dirname(os.path.abspath(__file__))
        stop_words_path = os.path.join(here, 'data', 'stop_words.txt')
        data = {
            'text': text,
            'mode': mode,
            'topK': topK,
            'allowPOS': allowPOS,
            'md5sum': md5sum,
        }
        if mode == "TF-IDF":
            import jieba.analyse.tfidf
            jieba.dt.tmp_dir = os.path.join(here, 'tmp')
            extractor = jieba.analyse.tfidf.TFIDF()
            extractor.set_stop_words(stop_words_path)
            data["result"] = extractor.extract_tags(
                text, topK=topK, withWeight=bool(withWeight),
                allowPOS=allowPOS)
        else:
            import jieba.analyse.textrank
            jieba.dt.tmp_dir = os.path.join(here, 'tmp')
            extractor = jieba.analyse.textrank.TextRank()
            extractor.set_stop_words(stop_words_path)
            # allowPOS is not forwarded in this mode.
            data["result"] = extractor.extract_tags(
                text, withWeight=bool(withWeight), topK=topK)
        json_data = json.dumps(data)
        memcache.add(cache_key, json_data, MEMCACHE_TIMEOUT)

    self.response.headers['Content-Type'] = 'application/json'
    self.response.out.write(json_data)
def getWordwithWeight(sentence):
    """Split a ``"<salary>,<job description>"`` line and rank the description.

    Returns ``(pairs, words, weights, salary)`` where *pairs* is the list of
    ``(word, weight)`` tuples of the 30 best TextRank keywords of the
    description (restricted to the POS tags below), *words* and *weights*
    are the two columns of *pairs*, and *salary* is the leading number as a
    float.

    Returns ``(None, None, None, None)`` when the line has no comma or the
    part before the first comma is not a number.
    """
    try:
        head, comma, jd = sentence.partition(',')
        salary = float(head)
        if not comma:
            raise ValueError('no comma in sentence')
    except Exception:
        return None, None, None, None

    ranked = textrank(jd, topK=30, withWeight=True,
                      allowPOS=['n', 'eng', 'v', 'a', 'i', 'ns', 'vn'])
    ret = [(word, score) for (word, score) in ranked]
    wordlist = [word for word, _score in ret]
    flist = [score for _word, score in ret]
    return ret, wordlist, flist, salary
def extract_words(contents, num_words=10):
    '''Return, for every comment in *contents*, its top *num_words* TextRank keywords.'''
    return [tr.textrank(content, num_words) for content in contents]
# coding=utf-8
"""Print the TextRank keywords and weights of a sample announcement."""
from __future__ import unicode_literals

from jieba.analyse import textrank

s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目 2013年,实现营业收入0万元,实现净利润-139.13万元。"

for x, w in textrank(s, withWeight=True):
    # A single pre-formatted argument prints "<word> <weight>" on both
    # Python 2 and Python 3; the old ``print x, w`` statement was a syntax
    # error on Python 3.
    print("%s %s" % (x, w))
import ftplib

# Command-line arguments: path of the text to analyse and path of the
# combined output file.
# NOTE(review): `sys`, `extract_tags` and `textrank` are used below but are
# not imported in the visible part of the script; `input` shadows the builtin.
input=sys.argv[1]
output=sys.argv[2]
print(input)
# NOTE(review): this file handle is never closed explicitly.
content=open(input,"r").read()
from math import *
# Ask each extractor for ceil(sqrt(len(content))) keywords.
length=ceil(sqrt(len(content)))
e=extract_tags(content,length)
t=textrank(content,length)
#print(r.keywords(10, False))
# Use two files as output so both extractors can be compared.
outT="C:\\Users\\Tassadar\\Desktop\\Course\\weibo\\temp\\jiebaTDIFD.txt"
outR="C:\\Users\\Tassadar\\Desktop\\Course\\weibo\\temp\\jiebaTextRank.txt"
fT=open(outT,"w")
fR=open(outR,"w")
outF=open(output,"w")
# Each result file starts with a line break followed by a blank line.
print("\n",file=fT)
# The `extract_tags` results go to their own file and to the combined file.
for x in e:
    print(x,end="\n",file=fT)
    print(x,end="\n",file=outF)
# NOTE(review): the visible part ends here; `t` has not been written and
# fT, fR and outF are still open at this point.
print("\n",file=fR)
# Extract keywords line by line: every line gets floor(sqrt(len(line)))
# keywords, capped at 500.
lines = file.readlines()
for line in lines:
    if line is None or line == '':
        print("Line is empty")
        continue
    # calculate number of keywords
    from math import sqrt, floor
    root = int(floor(sqrt(len(line))))
    if root >= 500:
        print("Text is too long. Keep only 500 keywords.")
        root = 500
    keyword_list.append(textrank(line, withWeight=False, topK=root))
file.close()

# output: one comma-separated keyword list per line, no trailing newline
print("Completed keyword extraction on ",len(keyword_list)," items")
with open(outFile,"w",encoding="utf-8") as output:
    for position, theWords in enumerate(keyword_list):
        separator = '' if position == 0 else '\n'
        output.write(separator + ','.join(theWords))
print("Output to file")
from jieba.analyse import textrank import cPickle import sys def getWordwithWeight(sentence): """ return """ try: salary = float(sentence.split(",")[0]) jd = sentence[sentence.index(",") + 1 :] except Exception, e: return None, None, None ret = [] for (w, f) in textrank(jd, topK=30, withWeight=True, allowPOS=["n", "eng", "v", "a", "i", "ns", "vn"]): ret.append((w, f)) wordlist = [r[0] for r in ret] flist = [r[1] for r in ret] return ret, wordlist, flist def getCountedDict(count_dict, wl, fl, output="CountDict.pkl"): """ word list frequence list """ for i in range(len(wl)): if count_dict.get(wl[i]) == None: count_dict[wl[i]] = fl[i] else:
def get_ranked_response(model, test_post_seg, candidate_list, similar_post_dic, test_index):
    """Score every candidate and return the candidates sorted by their last element.

    Each candidate ``c`` is a list that is modified in place: ``c[2]``/``c[5]``
    receive the sentence vectors of ``c[1]``/``c[4]``, ``c[7]`` the cosine
    similarity between those two vectors, ``c[8]`` the cosine similarity
    between the test post vector and the ``c[4]`` vector, ``c[9]`` the number
    of occurrences in ``c[4]`` of words from the expanded word list, and
    ``c[10]`` the weighted sum used as rank score.

    NOTE(review): written for Python 2 (print statements). ``test_index`` is
    not used. The source layout was lost, so the nesting of a few statements
    below is a reconstruction - confirm against the original file.
    """
    print test_post_seg
    tf_idf_in_test_post_seg = []
    # NOTE(review): test_post_seg is split on spaces further down, so it looks
    # like a str; iterating it directly here would yield single characters -
    # confirm that this is intended.
    for word in test_post_seg:
        if word in model.vocab:
            tf_idf_in_test_post_seg.append([word, tf_idf_c(word, test_post_seg, candidate_list, similar_post_dic)])
    sorted_tf_idf_in_test_post_seg = sorted(tf_idf_in_test_post_seg, key=lambda x:x[-1], reverse=True)
    similar_word_list = []
    for l in sorted_tf_idf_in_test_post_seg[:3]:
        print l[0]
        for (word,wd) in model.most_similar(l[0]):
            similar_word_list.append(word)
    for w in test_post_seg.split(' '):
        if w not in similar_word_list:
            similar_word_list.append(w)
    # Punctuation marks that must not count as matching words.
    mark_list = [u'。', u'.', u'!', u'!', u'?', u'?', u';', u';',u'~',u'~',u'(', u')', u'(', u')', u'-',u'+',u'=',u'、']
    # The list built above is discarded here; only the prints above remain
    # visible from that first pass.
    similar_word_list = []
    test_post_seg_list = test_post_seg.split(' ')
    # Expansion 1: every distinct word of the post plus its 3 nearest
    # neighbours in the embedding model.
    for w in set(test_post_seg_list):
        if w not in similar_word_list:
            similar_word_list.append(w)
        if w in model.vocab:
            for (word, wd) in model.most_similar(w, topn=3):
                similar_word_list.append(word)
    test_txt_no_seg = test_post_seg.replace(' ','')
    # Expansion 2: TextRank and TF-IDF keywords of the unsegmented post, again
    # with their 3 nearest neighbours.
    test_post_seg_keyword_list1 = analyse.textrank(test_txt_no_seg, topK=3, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
    test_post_seg_keyword_list2 = analyse.extract_tags(test_txt_no_seg, topK=3, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
    test_post_seg_keyword_list = [i for i in set(test_post_seg_keyword_list1+test_post_seg_keyword_list2)]
    for w in set(test_post_seg_keyword_list):
        if w not in similar_word_list:
            similar_word_list.append(w)
        if w in model.vocab:
            for (word, wd) in model.most_similar(w, topn=3):
                similar_word_list.append(word)
    test_post_seg_vec = get_sentence_vec(model, test_post_seg, candidate_list, similar_post_dic)
    for c in candidate_list:
        c_p_vec = get_sentence_vec(model, c[1], candidate_list, similar_post_dic)
        c_r_vec = get_sentence_vec(model, c[4], candidate_list, similar_post_dic)
        c[2] = c_p_vec
        c[5] = c_r_vec
        # Count occurrences in c[4] of expanded words, ignoring punctuation.
        similar_word_in = 0
        for w in set(c[4].split()):
            if w in similar_word_list:
                if w not in mark_list:
                    similar_word_in += c[4].split().count(w)
        s2 = float(cosine_similarity(c_p_vec, c_r_vec))
        s3 = float(cosine_similarity(test_post_seg_vec, c_r_vec))
        c[7] = s2
        c[8] = s3
        c[9] = similar_word_in
        # rank_score = 1000*c[6]*c[7]*c[8]
        # Weighted sum; c[6] is read as it arrives in the candidate.
        rank_score = c[6]*1+c[7]*1.8+c[8]*2+c[9]*0.17
        c[10] = rank_score
    # Ascending sort on the last element of each candidate.
    # NOTE(review): that is the rank score only if candidates have exactly
    # 11 elements.
    rank_candidate = sorted(candidate_list, key = lambda l: l[-1])
    return rank_candidate
#!/usr/bin/python3
# coding: utf-8
"""Demo of jieba keyword extraction (TextRank, TF-IDF) and stop-word handling.

Several sample texts are assigned to ``text`` in turn; only the last
assignment is analysed. ``keyword`` holds the reference keywords of the
preceding sample text and is not used by the code.
"""
import jieba.analyse

text = "Automatic summarization is the process of reducing a text document with a computer program in order to create a summary that retains the most important points of the original document. "
text = '本项目是针对航空器进近、着陆、起飞安全保障而研发的新型专用设备。机场起降安全是由机长和塔台管制员协同完成的。尽管现代飞机和机场都配备了各种先进的导航和监视设备(包括雷达和盲降系统),但目视仍然是机长和塔台指挥员都离不开的手段。'
text = """本项目是针对航空器进近、着陆、起飞安全保障而研发的新型专用设备。机场起降安全是由机长和塔台管制员协同完成的。尽管现代飞机和机场都配备了各种先进的导航和监视设备(包括雷达和盲降系统),但目视仍然是机长和塔台指挥员都离不开的手段。大雾天机场关闭是因为目视条件不够,机场塔台越修越高主要也是为保障目视条件。 近年来的多数空难是在降落时机载和地面设备良好而目视条件不好情况下发生的,如2002年国航韩国釜山空难,2010年波兰总统专机库尔斯克空难,2010年伊春空难等。这就提示人们不仅要改进导航和监视引导系统,还要从提高机长和塔台管制员的目视能力着眼。这就需要利用新兴的"合成视觉"技术,把计算机生成的三维虚拟场景和多种传感器采集到的真实信息实时合成和融合,达到 (1)生成各种恶劣气候和复杂空中态势下高度逼真的机场视觉环境,对塔台管制员(或机长)进行系统的、有针对性的指挥和操控能力的反复训练,以提高恶劣气候和复杂态势下的应对处理能力。 (2)通过"虚实合成"大大拓展塔台管制员和机长在低能见度下的目视能力,能从屏幕上看见当时肉眼看不见的"真实"景象,从而增强飞机起降安全。 合成视觉(Synthetic Vision)是一门新兴的交叉学科,它由虚拟现实、图像处理、机器视觉、智能控制等学科交叉形成。合成视觉将真实世界的实时信息与虚拟环境融合起来,用实时获取的真实信息去校正配准虚拟环境,从而大大提高了虚拟环境的可信度和可用性;不仅仿真模拟训练效果更好,更重要的是可以实现身临其境地进行实时监视、操作、控制、指挥。 项目获多项国家重大科技项目支持,历时七年完成,实现了国内外第一个基于虚实合成技术的"虚拟机场"系统。不仅在民航机场塔台管制训练、军机飞行指挥训练中得到重大应用,而且在军队多个重大项目中实现了基于虚实合成的监视、指挥和控制。 项目主要研究内容包括:大规模高沉浸感完整半球虚拟空间(水平视角360 度,垂直视角90度)构建关键技术;超大视场,非规则曲面多重(四重以上)重叠投影区的几何校正、色彩校正、无缝融合技术;多源图像融合的精确配准技术和实时“虚实合成”技术;通用特效三维引擎等。 项目在虚实结合、视觉合成技术,完整半球虚拟空间构建技术,异形曲面大范围多重重叠区域无缝融合技术等方面有重大创新,已获发明专利授权4项。 项目总体水平达到国际先进,部分关键技术达到国际领先。不仅满足了民航、军航训练需求,而且还能用于实时监视、控制、指挥。大大拓展了应用领域,提升了技术水平。项目已在多个民航、军队重大项目中得到应用,初步应用直接经济效益达到34,047 万元,利税11,131 万元,大大提升了我国军、民航训练和指挥水平,为军民航飞机低能见度着陆起飞安全做出贡献。 -3- 2011.000.006.993"""
keyword = '合成视觉;航空安全;虚实融合'
text = '''本项目属于光电功能材料科学研究领域。氧化物薄膜铁电体和半导体是一种新型光电功能材料,可用于制备室温工作的红外焦平面器件、室温光发射器件和高灵敏湿度传感器,是未来 20 年光电和传感器技术的重要方向。本项目对氧化物功能薄膜材料微结构调控及其特性研究取得重要结果,为功能器件设计和制备提供材料和科学的基础。 (1)发现溶胶-凝胶法外延生长铁电薄膜材料的成核机制、生长机制和微结构调控方法,发展了溶胶-凝胶法生长铁电薄膜的理论和技术。首先采用高度稀释的前驱体溶液浓度和控制单层厚度,实现溶胶-凝胶法外延生长铁电薄膜材料,首先实现硅基择优取向导电薄膜和择优取向铁电薄膜生长,首先实现硅基读出电路允许温度下铁电薄膜低温生长(400°C )。 (2)首次获得铁电薄膜 BST、PZT、SBT、PMNT、LNO、LSCO 等的红外光学常数,填补了铁电薄膜材料光学常数研究的空白。发现铁电极化电滞回线和介电常数温度谱的晶粒尺寸效应和 BST 
铁电薄膜介电常数温度谱在室温附近的增强峰,是 BST 铁电薄膜非制冷红外探测器的依据。发现了铁电极化来源于相对于立方相的有效电荷转移以及晶粒尺寸对铁电性、相结构、晶格动力学和光荧光等性质的影响规律。首次发现利用非对称电场可以使铁电材料的极化疲劳可逆现象,发现 BiFeO3 的弱磁效应。首次发现 PMNT 的极化自锁定特性,是一种可用于室温红外探测的新型铁电薄膜,建立了铁电薄膜复合结构非致冷红外探测模型,研制出室温下工作的红外焦平面器件。 (3)在国际上首次合成了二维生长的可控 ZnO 塔状纳米结构,并制备了氧化物功能材料 ZnO 的多种形貌的纳米结构和构造单元。发现 ZnO 纳米线和纳米棒的巨湿敏性现象,材料阻值随相对湿度的增加呈线性减少,改变量可达 4个数量级。为高灵敏湿度传感器制备提供科学基础。发现 ZnO/碳纳米管复合结构电子场发射性能的增强效应,多针状 ZnO 纳米结构的样品具有高达 10E5cm-2 的电子发射端,具有高达β≈8267 的场增强因子,和低的开启电场和阈值电场。在国际上首次采用 MBE 外延方法生长了 ZnO 单晶薄膜材料,解决了常温下 ZnO 激子光发射强度迅速淬灭问题,首次发现该 ZnO 薄膜材料在室温下的光泵激光发射。 本项目发表 SCI 论文 50 篇,授权发明专利 3项。论文他引 1947 次,其中 8篇论文被他人引用 1416 次,单篇最高引用 1181 次。研制的材料能够制备髙性能功能器件。美国《材料研究学会通报》 和《先进镀膜和表面技术》杂志发表专文报道了本项目的工作,认为研究结果“填补了该领域的空白”(So, they filled the gap by…)。美国《纳米科学技术百科全书》和《半导体科学聚焦》《热释电材料和传感器》邀请编写了章节。多项结果被写入美国《薄膜材料手册》等三本科学手册中。研究结果在国际上引领了该领域的研究工作。 本项目部分内容曾获 2006 年上海市自然科学一等奖。 3 2011.000.005.462'''
keyword = '氧化物;铁电薄膜;氧化锌半导体薄膜;微结构调控;材料特性'

##################################################################
## textrank
# ``import jieba.analyse`` only binds the name ``jieba``; the calls below used
# a bare ``analyse`` and failed with NameError.
print(jieba.analyse.textrank(text, topK=4))  # pure-English text is not recognised
print(jieba.analyse.textrank(text, topK=4, withWeight=True))
print(jieba.analyse.extract_tags(text, topK=4))

##################################################################
## stop word
stop_words_path = "/Users/coder352/datasets/gist/nltk/中文停用词表.txt"
jieba.analyse.set_stop_words(stop_words_path)
default_mode = list(jieba.cut(text))  # plain segmentation does not remove stop words
# The context manager closes the stop-word file; the handle used to leak.
with open(stop_words_path) as stop_words_file:
    stop_words = [line.strip() for line in stop_words_file.readlines()]
print('原文:', '/'.join(default_mode))
print('默认模式:', '/'.join(set(default_mode)))
print('搜索引擎模式:', '/'.join(set(default_mode) - set(stop_words)))
print(jieba.analyse.extract_tags(text))  # reportedly removes stop words automatically
from sklearn.pipeline import Pipeline import jieba from jieba.analyse import textrank def getWordwithWeight(sentence): """ return """ try: salary = float(sentence.split(',')[0]) jd = sentence[sentence.index(',')+1:] except Exception,e: return None,None,None,None ret = [] for (w,f) in textrank(jd,topK=30, withWeight=True, allowPOS=['n','eng','v','a','i','ns','vn']): ret.append((w,f)) wordlist = [r[0] for r in ret] flist = [r[1] for r in ret] return ret,wordlist,flist,salary def getCountedDict(count_dict,wl,fl,output='CountDict.pkl'): """ word list frequence list """ for i in range(len(wl)): if count_dict.get(wl[i]) == None: count_dict[wl[i]] = fl[i] else: