def wordCloud(text, movie_id):
    """Render a word cloud for one movie and save it as static/img/<movie_id>.jpg.

    Keeps only segmented words that appear among the top-500 TF-IDF tags and
    are not stopwords, then draws them with WordCloud over a mask image.
    """
    original_text = text
    wordList = jieba.cut(original_text)
    tags = analyse.extract_tags(original_text, topK=500, withWeight=False)
    stags = " ".join(tags)
    # Raw string keeps the Windows backslashes literal (the original relied on
    # \d, \c, \s not being escapes, which emits DeprecationWarnings on 3.6+).
    with open(BASE_DIR + r'\doubanshow\common\stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = list(f.read().split('\n'))
    # NOTE(review): `word in stags` is a substring test against the joined tag
    # string (so a 1-char word matches any tag containing it) — kept as-is.
    kept = []
    for word in wordList:
        if word in stags and word not in stopwords and word != '\t':
            kept.append(word)
    # join instead of repeated += (same result incl. trailing space per word)
    outstr = "".join(w + " " for w in kept)
    # cloud parameters: CJK font, white background, mask template,
    # at most 500 words, max font size 60
    cloud = WordCloud(font_path='C:/Windows/Fonts/msyhbd.ttc',
                      background_color='white',
                      mask=imread(BASE_DIR + '/static/img/mask.jpg'),
                      max_words=500,
                      max_font_size=60)
    word_cloud = cloud.generate(outstr)  # build the cloud data
    # (removed a bare `word_cloud` expression statement — it was a no-op)
    path = BASE_DIR + '/static/img/' + movie_id + '.jpg'
    word_cloud.to_file(path)
def countCiYun():
    """Build a word cloud from every comment in the Mongo collection and save
    it as ../static/images/wc_8_changanCS35PLUS.png."""
    comments = mongoutil.getCollection1()
    print('数据总条数count:', comments.estimated_document_count())
    # pymongo.cursor.Cursor
    cursor = comments.find()
    # concatenate the 'comment' field of every document into one string
    text = ''.join(map(lambda doc: doc.get('comment'), cursor))
    # user-defined dictionary for jieba segmentation
    jieba.load_userdict(r'../analysis/user_dict.txt')
    # stopword list for keyword extraction
    analyse.set_stop_words(r'../analysis/stopwords.txt')
    # NOTE(review): per-character Counter, never used afterwards
    m = collections.Counter(text)
    tags = analyse.extract_tags(text, topK=40, withWeight=False)
    # every word that will appear in the cloud
    new_text = ' '.join(tags)
    #countFinalWordsList(text,new_text)
    # Generate the cloud from the keyword text; a CJK-capable font must be
    # supplied or Chinese glyphs render as boxes.
    wc = WordCloud(
        max_words=200,  # maximum number of words in the cloud
        width=1099,     # image width / height
        height=724,
        # CJK font (also fixes mojibake)
        font_path=r'../example/fonts/FZXingKai-S04S.TTF').generate(new_text)
    # plot (plain rectangular layout)
    pyplot.imshow(wc, interpolation='bilinear')
    pyplot.figure()  # NOTE(review): opens a new empty figure after imshow — confirm intent
    pyplot.axis('off')
    wc.to_file(r'../static/images/wc_8_changanCS35PLUS.png')
def key_word_extract(texts):
    """Return the top-50 TF-IDF keywords of *texts*, concatenated into one string.

    Fixes two bugs in the original: ``topK`` was misspelled ``topk`` (jieba
    raises TypeError on the unknown kwarg), and ``withWeight``/``allowPOS``
    were passed to ``str.join`` instead of ``extract_tags`` (also TypeError).
    """
    return "".join(analyse.extract_tags(
        texts,
        topK=50,
        withWeight=False,
        allowPOS=()))  # extract 50 keywords
def Cast(self, name, method=None, message=None, max_bin=100):
    '''
    Casts one column's text into (word, count) pairs for a word cloud,
    filtering the extracted tags with a stopword list.
    :param name: colname
    :param method: decide whether the func returns a dict or Geo.cast data
    :param message: optional comma-separated extra stopwords from the user
    :param max_bin: the max number of words on wordcloud
    :return: Geo.cast(pairs) when method is None, a dict when method == "dict",
             otherwise None
    '''
    string = "".join(self.GetOneCol(name))
    # Close the stopword file deterministically (the original leaked the handle).
    with open('./config/stopwords/stopwords.txt', "r", encoding="utf-8") as fh:
        brokewords = [line.strip() for line in fh]
    if message:
        brokewords.extend(message.split(","))
    # Exact word membership via a set.  The original joined every stopword
    # into one big string, so any tag that happened to be a SUBSTRING of the
    # concatenation was dropped too — a filtering bug (and O(n) per lookup).
    stopwords = set(brokewords)
    lis = dict(Counter(
        tag.strip() for tag in analyse.extract_tags(string, max_bin)
        if tag.strip() not in stopwords))
    lis = sorted(lis.items(), key=lambda x: x[1], reverse=True)
    if method is None:
        return Geo.cast(lis)
    elif method == "dict":
        return {k[0]: k[1] for k in lis}
def project_detail(request, name):
    """Django view: render a GitHub project page with README, derived tags,
    a per-session short id and its comments."""
    r = requests.get("https://api.github.com/repos/newpanjing/{}".format(name))
    rs = None
    readme = None
    tags = None
    if r.status_code == 200:
        rs = r.json()
        # read the readme
        # derive up to five keyword tags from the repository description
        if rs["description"]:
            arry = analyse.extract_tags(rs["description"], topK=5)
            tags = ','.join(arry)
        # fetch the raw README; the random query arg busts CDN caches
        r = requests.get(
            "https://raw.githubusercontent.com/newpanjing/{}/master/README.md?_={}"
            .format(name, random.uniform(100, 999)))
        if r.status_code == 200:
            readme = markdown.markdown(r.text)
    sid = short_id.get_short_id()
    request.session['sid'] = sid
    comment = get_comment(3, name)
    return render(
        request, 'project_detail.html', {
            'item': rs,
            'readme': readme,
            'name': name,
            'tags': tags,
            "sid": sid,
            "comment": comment
        })
def jiebase(readPath, JiebaPath):
    """Segment titles from *readPath* ("class:::title ..." lines) via jieba
    TF-IDF extraction and write "class:::word,word,..." lines to *JiebaPath*.

    Python 2 code: str/unicode are mixed via .encode('utf-8').
    """
    fstop = "F:/englisgpaper2/text/words/stopWords.txt"
    stopwords = [line.strip("\r\n") for line in open(fstop, 'r').readlines()]
    fstop_sports = "F:/englisgpaper2/text/words/stopWords_sports.txt"
    stopwords_sports = [
        line.strip("\r\n") for line in open(fstop_sports, 'r').readlines()
    ]
    stopwordsSet_all = set(stopwords + stopwords_sports)
    # NOTE(review): f and fwrite are never closed; relies on GC.
    f = open(readPath, 'r')
    fwrite = open(JiebaPath, 'w+')
    lineNums = 0
    analyse.set_stop_words("F:/englisgpaper2/text/words/stopWords.txt")  # load stopwords
    for line in f.readlines():
        lineNums += 1
        lineList = line.strip("\r\n").split(":::")
        lineClass = lineList[0]
        lineTitle = lineList[1].split(" title ")[0]
        # top-7 keywords restricted to the POS set in module-level `usenature`
        seg_title = analyse.extract_tags(lineTitle, topK=7, allowPOS=usenature)
        # seg_title = list(jieba.cut(lineTitle, cut_all=False))
        fwrite.write(lineClass.encode('utf-8') + ":::")
        for w in seg_title:
            tmpval = w.encode('utf-8')
            # for key, values in synonymdic.items():
            #     if(tmpval in values):
            #         tmpval = key
            if (tmpval not in stopwordsSet_all):
                fwrite.write(tmpval + ",")
        fwrite.write("\n")
def draw_wordcloud(self):
    """Read date.txt, extract its top-100 keywords, render a word cloud to
    pjl_cloud4.jpg and display it with matplotlib."""
    # read the source text
    fo = open('date.txt', 'r', encoding='UTF-8')
    comment_text = fo.read()
    fo.close()
    # jieba segmentation — without it a correct Chinese cloud can't be built
    # cut_text = " ".join(jieba.cut(comment_text))
    tag = analyse.extract_tags(comment_text,
                               topK=100,
                               withWeight=False,
                               allowPOS=())  # top 100 keywords
    # NOTE(review): the keywords are joined with NO separator and re-cut, so
    # word boundaries may shift — confirm this is intended.
    cut_text = " ".join(jieba.cut("".join(tag)))  # segment
    d = path.dirname(__file__)  # directory containing this file
    # color_mask = imread("Anne_Hathaway.png")  # background/mask image
    cloud = WordCloud(
        # a CJK font is required, otherwise glyphs come out garbled
        font_path="HYQiHei-25J.ttf",
        # font_path=path.join(d,'simsun.ttc'),
        # background colour
        background_color='white',
        # cloud shape
        # mask=color_mask,
        # maximum word count
        max_words=500,
        # largest font size
        max_font_size=40)
    word_cloud = cloud.generate(cut_text)  # build the cloud
    word_cloud.to_file("pjl_cloud4.jpg")  # save image
    # display the cloud
    plt.imshow(word_cloud)
    plt.axis('off')
    plt.show()
def save_model(self, request, obj, form, change):
    """Django admin hook: fill derived Article fields (subject, sid, tags,
    cover image) before saving, then invalidate the home-page cache."""
    obj.user = request.user
    subject = get_subject(obj.content)
    # oss.put_object(obj.image.file.file)
    # cap the subject at 200 characters
    if len(subject) > 200:
        subject = subject[0:200]
    # short id
    if not obj.sid:
        obj.sid = short_id.get_short_id()
    obj.subject = subject
    # tags: auto-generate from the subject when none were supplied
    tags = obj.tags
    if not tags:
        r = analyse.extract_tags(subject, topK=5)
        tags = ",".join(r)
        obj.tags = tags
    # generate a cover when no image is set
    if obj.image.name == '' or not obj.image.name:
        total = Cover.objects.count()
        # pick a random cover template
        c = Cover.objects.all()[random.randint(0, total - 1)]
        url = draw.draw(text=obj.title,
                        url=c.image.url,
                        font_size=c.font_size,
                        color=c.color,
                        x=c.x,
                        y=c.y)
        obj.image.name = url
    super(ArticleAdmin, self).save_model(request, obj, form, change)
    cache.delete(cache.CACHE_HOME_KEY)
def fenci(self):
    """Segment every comment text of self._city and write the top-100
    weighted keywords (weight scaled x1000, truncated) to '<city>_分词结果.csv'."""
    print('开始分词...')
    fenciFileName = os.path.join(sys.path[0], self._city + '_分词结果.csv')
    CommentRecord = namedtuple(
        'CommentRecord', ['user', 'date', 'eval', 'star', 'votes', 'content'])
    analyse.set_stop_words(os.path.join(sys.path[0], '中文停用词表.txt'))
    content = []
    csvName = os.path.join(sys.path[0], self._city + '.csv')
    # NOTE(review): this csv file handle is never closed explicitly.
    for emp in map(
            CommentRecord._make,
            csv.reader(open(csvName, mode='r', encoding='utf-8-sig'))):
        content.append(emp.content)
    tags = analyse.extract_tags(' '.join(content), topK=100, withWeight=True)
    with open(fenciFileName, 'w', encoding='utf-8-sig',
              newline='') as csvfile:
        writer = csv.writer(csvfile)
        # one row per keyword: [word, int(weight * 1000)]
        [
            writer.writerow([item[0], str(int(item[1] * 1000))])
            for item in tags
        ]
    print('分词结束,保存结果在"%s"中...' % fenciFileName)
def countCiYun():
    """Build a word cloud from the qichezhijia comment collection and save it
    as ../static/images/wc.png."""
    text = None
    with pymongo.MongoClient(host='127.0.0.1', port=27017) as client:
        comments = client.qichezhijia.qichezhijia1
        print('数据总条数count:', comments.estimated_document_count())
        # pymongo.cursor.Cursor
        cursor = comments.find()
        # only the short comments are traversed (the data set is small)
        text = ''.join(map(lambda doc: doc.get('comment'), cursor))
    # user-defined dictionary for jieba segmentation
    jieba.load_userdict(r'../analysis/user_dict.txt')
    # stopword list for keyword extraction
    analyse.set_stop_words(r'../analysis/stopwords.txt')
    # NOTE(review): per-character Counter; only consumed by countFinalWordsList via text?
    m = collections.Counter(text)
    tags = analyse.extract_tags(text, topK=30, withWeight=False)
    # every word that will appear in the cloud
    new_text = ' '.join(tags)
    countFinalWordsList(text, new_text)
    # Generate the cloud; a CJK-capable font is required for Chinese.
    wc = WordCloud(
        max_words=200,  # maximum number of words in the cloud
        width=1099,     # image width / height
        height=724,
        # CJK font (also fixes mojibake)
        font_path=r'../example/fonts/FZXingKai-S04S.TTF').generate(new_text)
    # plot (plain rectangular layout)
    pyplot.imshow(wc, interpolation='bilinear')
    pyplot.figure()  # NOTE(review): opens a new empty figure after imshow — confirm intent
    pyplot.axis('off')
    wc.to_file(r'../static/images/wc.png')
def test_tfidf():
    """Demo of jieba keyword extraction (TF-IDF and TextRank), POS tagging
    and tokenization over a sample NBA text file."""
    lines = open('D:\\Python\\Data\\NBA.txt', encoding='utf-8').read()
    print(type(lines))
    # keyword extraction with the TF-IDF algorithm
    words = analyse.extract_tags(lines, topK=20, withWeight=True, allowPOS=())
    print(words)
    # keyword extraction with the TextRank algorithm
    words = analyse.textrank(lines,
                             topK=20,
                             withWeight=False,
                             allowPOS=('ns', 'n', 'vn', 'v'))
    print(words)
    # TextRank again, restricted to place names and plain nouns
    words = analyse.textrank(lines,
                             topK=20,
                             withWeight=False,
                             allowPOS=('ns', 'n'))
    print(words)
    # POS tagging
    words = pseg.cut('我爱自然语言处理')
    # print(list(words))
    for word, flag in words:
        print(word, flag)
    # Tokenize: report each word's start/end offsets in the original text
    result = jieba.tokenize('我爱自然语言处理')
    print(list(result))
def analyseKeyWord2Dict(path):
    """Extract the top-100 weighted keywords from the text file at *path*.

    Loads a hand-maintained stopword file and user dictionary first.
    Returns a list of (word, weight) tuples.
    """
    # load custom stopwords (filler words and irrelevant jargon)
    jieba.analyse.set_stop_words(
        stop_words_path=
        'F:\\PythonDemo\\词云\\venv\\CiYunToFriendsCircle\\停止词.txt')
    # load the user dictionary
    jieba.load_userdict(
        "F:\\PythonDemo\\词云\\venv\\CiYunToFriendsCircle\\词典.txt")
    # extra vocabulary (could be file-driven for more flexibility)
    # jieba.add_word("孔乙己")
    # jieba.add_word("鲁镇")
    jieba.add_word("死侍")
    # read the input file
    f = open(path, 'r', encoding='utf-8')
    text = f.read()
    f.close()
    # keywords come back as a list of (word, weight) tuples
    keyword = extract_tags(text, topK=100, withWeight=True, allowPOS=())
    # debugging helper that dumps the keywords, useful for curating stopwords
    # with open("F:\\PythonDemo\\词云\\venv\\CiYunToFriendsCircle\\dict.txt", 'w', encoding='utf-8') as file:
    #     for word,weight in keyword:
    #         file.write(word+'\n')
    #         print(word)
    # print(dict(keyword))
    return keyword
def login():
    """Flask view: run the analysis chosen in the form on the posted text.

    Supports sentiment scoring and noun-phrase/POS work via TextBlob, and
    keyword extraction via jieba.  GET requests render the empty form.
    """
    if request.method == 'POST':
        text = request.form['text']
        option = request.form['option']
        # Default keeps the render call safe for an unexpected option value
        # (the original raised NameError on `result` in that case).
        result = ''
        if option == 'sentiment analyse':
            blob_obj = TextBlob(text)
            sentiment_score = blob_obj.sentiment.polarity
            result = 'sentiment analyse: %.2f (-1.0 negative,1.0positive)' % sentiment_score
        elif option == 'keyword extraction':
            keywords = analyse.extract_tags(text)
            result = 'Top3 keyword: %s' % (' / '.join(keywords[:3]))
        elif option == 'Part-of-speech':
            blob_obj = TextBlob(text)
            tags = [v + '/' + t for v, t in blob_obj.tags]
            result = 'Part-of-speech Result:\n %s' % ('\n'.join(tags))
        elif option == 'Noun Phrase Extraction':
            blob_obj = TextBlob(text)
            result = 'Noun Phrase:\n %s' % (' / '.join(blob_obj.noun_phrases))
        return render_template('index.html', result=result, oldtext=text)
    return render_template('index.html', name=0)
def match(self, query):
    """Find the stored question most similar to *query*.

    Short queries (<= 5 chars) are matched via full segmentation, longer
    ones via TF-IDF keywords, both against self.inverted_word_dict.
    Returns the best-matching title, or None (after printing a fallback
    message) when nothing scores >= 80 with fuzz.ratio.
    """
    stopwords = ['你', '我', '她', '啊', '呢', '的', ',']
    result = set()  # indices of candidate titles from the inverted index
    if len(query) <= 5:
        for i in jieba.lcut(query):
            if i in self.inverted_word_dict.keys() and i not in stopwords:
                result = result.union(self.inverted_word_dict[i])
    else:
        for i in extract_tags(query):
            if i in self.inverted_word_dict.keys():
                result = result.union(self.inverted_word_dict[i])
    # list-of-list format; to be revised later
    questions = [self.titles[i] for i in result]
    #print(questions)
    scores = [fuzz.ratio(query, i) for i in questions]
    #print('len_socre',len(scores))
    if len(scores) == 0:
        # no candidates at all — print a playful fallback (returns None)
        print(
            random.choice(['你在说什么啊老弟...', '你说什么我听不见', 'what are you 说啥呢']))
    else:
        max_score = max(scores)
        if max_score < 80:
            # best candidate too dissimilar — same fallback (returns None)
            print(
                random.choice(
                    ['你在说什么啊老弟...', '你说什么我听不见', 'what are you 说啥呢']))
            #print(questions[scores.index(max_score)])
            #print(questions[scores.index(max_score)])
        else:
            return questions[scores.index(max_score)]
def rank():
    """Concatenate four sample sentences and print their top-4 weighted tags."""
    corpus = [
        'This is the first document.',
        'This is the second document.',
        'And the third one',
        'Is this the first document?'
    ]
    merged = ''.join(corpus)
    weighted_tags = extract_tags(merged, topK=4, withWeight=True)
    print(weighted_tags)
def is_get_astronomy(self, dict_data, string, top_num):
    """Decide whether *string* is astronomy-related.

    Extracts the top-*top_num* noun-like keywords and checks them against
    *dict_data*; returns (flag, matched_keywords) where flag is True when at
    least two keywords hit the dictionary with a truthy value.
    """
    keywords = analyse.extract_tags(string,
                                    topK=top_num,
                                    withWeight=True,
                                    allowPOS=('n', 'nr', 'ns'))
    astro_flag = 0
    count = 0
    keywords_astronomy = []
    # walk the top keywords and test dictionary membership
    for item in keywords:
        print(item[0], item[1])
        try:
            if dict_data[item[0]]:  # only truthy dict values count as a hit
                print(count, ' : ')
                keywords_astronomy.append(item[0])
                astro_flag = 1
                count += 1
        except KeyError:
            # narrowed from a bare `except:` — only "word not in dict" is an
            # expected failure here; anything else should surface.
            print('Not find!')
    if astro_flag == 1 and count >= 2:
        return True, keywords_astronomy
    else:
        return False, keywords_astronomy
def NLP(self, context, score, cla_id):
    """Score one comment against the course tag list and fold the result
    into the course's stored feature vector.

    Uses SnowNLP for sentiment, jieba for keyword extraction and synonyms
    for tag-similarity comparison.
    """
    tag_words = jae.extract_tags(context, topK=5, withWeight=True, allowPOS=())
    Predict_Score = SnowNLP(context).sentiments
    Final_Tag_Score = {}  # the comment's five keywords with their scores
    for word, weight in tag_words:
        Rank_Dict = dict()
        for origin_tag in self.Taglist:
            Rank_Dict[origin_tag] = synonyms.compare(word, origin_tag, seg=True)
        # BUG FIX: the original sorted `Rank_Dict.items` (the bound method,
        # not its result), which raises TypeError at runtime.
        Sorted_Rank_Dict = sorted(Rank_Dict.items(), key=lambda x: x[1], reverse=True)
        Most_Similiar_Tag = Sorted_Rank_Dict[0][0]
        Similiar_Score = Sorted_Rank_Dict[0][1]
        Final_Tag_Score[Most_Similiar_Tag] = score * Predict_Score * Similiar_Score * weight
    # add the extracted tag scores to the course's existing features
    origin_feat_vector = self.DataGetter.Get_Cla_Feats({'cla_id': cla_id})
    # loop var renamed so it no longer shadows the `score` parameter
    for tag, tag_score in Final_Tag_Score.items():
        index = self.Taglist.index(tag)
        origin_feat_vector[index] += 0.2 * tag_score
    new_info = {'cla_id': cla_id, 'feats': origin_feat_vector}
    self.DataGetter.Update_ClaFeats(new_info)
def halfcut(content):
    """Keyword-cut *content* with a computer-domain user dictionary, retrying
    with a larger topK until at least GrobalParament.n usable words survive
    the filters (gives up once k exceeds n by more than 5)."""
    word_list = []
    k = GrobalParament.n
    # load the extra computer-vocabulary dictionary shipped with jieba
    f = jieba._get_abs_path(os.path.join('extra_dict', 'computer.dict.txt'))
    jieba.load_userdict(f)
    #jieba.analyse.set_stop_words("stop_words.txt")
    while True:
        cut_content = extract_tags(content, k)
        word_list_temp = cut_content
        if not GrobalParament.ruler_list:
            # keep runs of 2+ chars containing neither a digit nor '/'
            r = r'[^/\d]{2,}'
            temp = '/'.join(word_list_temp)
            word_list = re.findall(r, temp)
        else:
            # otherwise filter with the explicit exclusion list
            for word in word_list_temp:
                if word not in GrobalParament.ruler_list:
                    word_list.append(word)
        #print len(word_list)
        if (len(word_list) >= GrobalParament.n):
            break
        else:
            # cap the retries: stop once k has grown 5 past the target n
            if k - GrobalParament.n > 5:
                break
            word_list = []
            k += 1
    return word_list
def crab(class_):
    """Scrape one Storm Media politics card: follow the article link, collect
    body text, view count and Facebook share count, then print a summary
    including the top-10 keywords of the article."""
    news_link = class_.find("a", class_="card_link")["href"]
    artical_number = news_link.split("/")[-1]
    id = "storm-politic-" + artical_number
    news_title = class_.find("h3", class_="card_title").text
    news_create_time = class_.find("span", class_="info_time").text
    # fetch and parse the article page
    response2 = urlopen(news_link)
    html2 = BeautifulSoup(response2)
    artical = html2.find("div", class_="article_content_inner")
    content = []
    for p in artical.find_all("p"):
        content.append(p.text)
    news_content = "".join(content)
    # page-view API for this article
    response3 = urlopen("https://service-pvapi.storm.mg/pvapi/get_pv/" +
                        artical_number)
    html3 = BeautifulSoup(response3)
    #news_view = html3.text.split(":")[-1].replace("}", "")
    #print(news_view)
    news_tag = "politic"
    news_view = json.loads(html3.text)["total_count"]
    # Facebook share count via the Graph API
    fb_msg = requests.get('https://graph.facebook.com/?id={}'.format(news_link)).json()['share']
    print("=============================")
    # NOTE(review): the trailing commas build throwaway tuples of None —
    # harmless but probably left over from Python 2 print statements.
    print(id),
    print(news_link),
    print(news_title),
    print(news_create_time),
    print(news_content), print("關鍵詞:", extract_tags(news_content, 10)),
    print(news_tag),
    print(news_view),
    print(fb_msg)
def get_feature(no, level):
    """Return noun/verb keywords of the record's content, or [] when no == 0."""
    if no == 0:
        return []
    content = level[u"内容"]
    return ana.extract_tags(content, allowPOS=('n', 'v'))
def getKeywords(self, num=10):
    '''
    Extract keywords from the text.

    ## Parameters
    - int num : number of keywords to return

    ## Algorithm choice
    Based on text length:
    - short text -> TextRank
    - long text  -> TF-IDF

    ## Output
    ```
    # jba.textrank(data, topK=num, withWeight=True)
    [('keyword', weight), ...]
    # jba.textrank(data, topK=num)
    ['keyword', ...]
    ```
    The second (weightless) form is used here.
    '''
    length = len(self.getText())
    if length < 50:
        # short text: TextRank
        res = jba.textrank(self.getText(), topK=num)
    else:
        # long text: TF-IDF
        res = jba.extract_tags(self.getText(), topK=num)
    return res
def get_tags(text):
    """Return up to 7 tags for *text*, drawn from its top-20 TF-IDF keywords."""
    from jieba.analyse import extract_tags
    num = 7
    tags = []
    if text:
        # NOTE(review): with withWeight unset, extract_tags yields plain
        # strings, so tag[0] is the FIRST CHARACTER of each tag — confirm
        # that STOP_WORDS is really meant to be matched per-character.
        tags = [tag for tag in extract_tags(text, 20)
                if tag[0] not in STOP_WORDS][:num]
    return tags
def getCleanSubjiects(listSujects): objUtilString = StringUtil() #TfIdf的停用词 jiebaAns.set_stop_words(PATH_JIEBA_STOP_WORDS) listRetSubjects = [] for dictSubject in listSujects: #开始清洗数据 oriContent = dictSubject['txt_content'] if oriContent: #全角转半角 cleanContent = objUtilString.tfQ2B(oriContent) #去掉符号 cleanContent = objUtilString.replaceSubjectTag(cleanContent) #存回到集合中 dictSubject['txt_content_par'] = cleanContent #添加关键词字段 listContentKeywords = jiebaAns.extract_tags(cleanContent.lower(), topK=20, withWeight=False) dictSubject['txt_content_keywords'] = ','.join(listContentKeywords) #添加关键词词数字段 dictSubject['txt_content_keywords_len'] = len(listContentKeywords) listRetSubjects.append(dictSubject) else: dictSubject['txt_content_par'] = '' dictSubject['txt_content_keywords'] = '' dictSubject['txt_content_keywords_len'] = '' listRetSubjects.append(dictSubject) return listRetSubjects
def keyWord(text): exitWords = [] #退出词汇列表 noUseWords = [] #无效词汇列表 wordBook = xlrd.open_workbook("AiKeywords.xlsx") table = wordBook.sheet_by_name('CnWords') # nrows = table.nrows cell1 = table.cell(2, 3).value.split(';') # 无效词汇 .split(';')变成列表 cell2 = table.cell(3, 3).value.split(';') # 退出词汇 .split(';')变成列表 cell3 = table.cell(1, 3).value.split(';') # 专业词汇 .split(';')变成列表 for ProfessionalVocabulary in cell3: jieba.add_word(ProfessionalVocabulary) #往jieba库里添加专业词汇 keywords = analyse.extract_tags(text,topK=9,withWeight=False,allowPOS=()) # print(keywords) for words in keywords: # 注意:for遍历循环中的remove操作出现奇怪现象 if words in cell1: noUseWords.append(words) elif words in cell2: exitWords.append(words) for noUse in noUseWords: #解决办法-不在for操作keywords的同时又remove keywords.remove(noUse) #如此 # print(exitWords) if exitWords and keywords is not None: #判断退出词汇的占比 result = len(exitWords)/len(keywords) # print(result) if result >= 1/2: keywords = ['exit'] else: for no in exitWords: keywords.remove(no) print(keywords) return keywords
def main():
    """Build a masked, weight-driven word cloud from the project text and
    save it as result.png."""
    # 0. mask/background image
    back_color = imageio.imread(back_img)  # decode the picture
    # 1. load the raw text
    text = get_text()
    # 2. jieba segmentation (precise mode by default).  A user dictionary can
    # be added via jieba.load_userdict(file_name); its format matches the
    # built-in dict.txt — one entry per line: word [freq] [POS], space-separated.
    # cut_text = jieba.cut(text)
    # result = "/".join(cut_text)  # a separator is needed or no cloud can be drawn
    analyse.set_stop_words("stopwords.txt")
    # withWeight=True returns [('word', weight), ...];
    # omitted/False returns ['word', ...]
    result = analyse.extract_tags(text, topK=100, withWeight=True)
    # 3. build the cloud.  WordCloud has no Chinese support by default, so a
    # downloaded CJK font must be given.
    wc = WordCloud(
        font_path=r"msyh.ttc",
        background_color='white',
        width=800,
        height=600,
        max_font_size=50,
        mask=back_color,  # when set, width/height above are ignored
        max_words=200)  # ,min_font_size=10)#,mode='RGBA',colormap='pink')
    # wc.generate(result)
    wc.fit_words(dict(result))  # feed the (word -> weight) frequencies
    # colour words from the mask image's palette
    image_colors = ImageColorGenerator(back_color)
    # 4. display
    plt.imshow(wc)  # show the cloud as an image
    plt.figure("词云图")  # named figure
    plt.axis("off")  # hide the axes
    wc.to_file("result.png")  # save at the configured pixel size
def creatWordDict(self):
    """
    creat self.word_dict && self.word_list according to self.infolist,
    select top 10% of the keywords
    :return: Null
    """
    temp_dict = {}  # word -> accumulated score
    word_num = 0
    for info_item in self.infolist:
        # print(info_item.title)
        # print(info_item.text)
        # engagement weight — tuning still pending
        info_weight = info_item.attitudecount + 2 * info_item.commentcount
        # print(info_weight)
        # title keywords: roughly one per 5 characters, boosted by weight
        keynum = math.ceil(len(info_item.title) / 5)
        keywords = analyse.extract_tags(info_item.title, topK=keynum)
        for word_item in keywords:
            if word_item in temp_dict:
                temp_dict[word_item] += 5
            else:
                temp_dict[word_item] = info_weight + 5
        # body keywords: per sentence, roughly one keyword per 10 characters
        text_list = re.split(',|。|?|!', info_item.text)
        for sentence in text_list:
            # print(sentence)
            keynum = math.ceil(len(sentence) / 10)
            keywords = analyse.extract_tags(sentence, topK=keynum)
            # print(keywords)
            for word_item in keywords:
                if word_item in temp_dict:
                    temp_dict[word_item] += 1
                else:
                    temp_dict[word_item] = 1
    # sort by score; result is a list of (word, score) tuples
    word_temp_list = sorted(temp_dict.items(),
                            key=lambda item: item[1],
                            reverse=True)
    topnum = math.ceil(len(word_temp_list) / 10)  # keep the top 10%
    # creat self.word_dict
    i = 0
    # NOTE(review): if too few words pass the digit/stopword filter this loop
    # runs past the end of word_temp_list (IndexError) — confirm inputs.
    while word_num < topnum:
        w = word_temp_list[i][0]
        if not isdigit(w) and w not in stopwords:
            self.word_dict[w] = word_num
            self.word_list.append(w)
            word_num += 1
            # print(word_temp_list[i])
        i += 1
def getTextKeywords(self, n=10):
    """
    Extract keywords from the free-text answers to this question.
    """
    samples = SampleItem.objects.filter(question=self)
    joined = ' '.join(rec['content'] for rec in samples.values('content'))
    return extract_tags(joined, topK=n)
def analisysUserContent(self): contents = session.query(TitleDetail).filter(TitleDetail.user_url==self.user_url).all() # @UndefinedVariable if len(contents) == 0: return [] contents = map(lambda c : c.content ,contents) sentence = reduce(lambda c1,c2:c1 + c2,contents) tags = analyse.extract_tags(sentence=sentence ,topK=200,allowPOS=('n','ns','vn'),withWeight=True,withFlag=True) return tags
def extractTags(self, sentence):
    """
    Return the top-N keywords of *sentence* using jieba's default TF-IDF
    ranking; N comes from the "keywords_num" entry of the json config.
    :param sentence:
    :return:
    """
    num = configutil.config.getint("json", "keywords_num")
    return jiebays.extract_tags(sentence, topK=num)
def participle_content(content, n=30):
    'participle key words in content'
    # Python 2 code: keywords are unicode, output is a '+'-joined utf-8 str.
    # Returns None when content is empty.
    if not content:
        return
    content_words = ''
    words = topk.extract_tags(content, topK=n)
    # drop stopwords (module-level collection)
    words = [w for w in words if w not in stopwords]
    for word in words:
        content_words += word.encode('utf-8') + '+'
    # strip the trailing '+'
    return content_words[:-1]
def cut_words(file_path):
    """Read *file_path* (utf-8) and return its top-50 keywords as
    (word, weight) tuples."""
    with open(file_path, encoding="utf-8") as src:
        document = src.read()
    return list(analyse.extract_tags(document, 50, withWeight=True))
def extract_keyword(e): text1.delete(1.0, END) # 删除关键词文本框内的内容 content = ts.get(1.0, END) # 获取新闻文本框中的文本 segment = split_word(content) clean_segment = drop_stopwords(segment) keywords = analyse.extract_tags(' '.join(clean_segment), topK=3, allowPOS=('ns','nr','nt','nz','vn')) # 提取5个特征词 # keywords = analyse.textrank(' '.join(clean_segment), topK=3, allowPOS=('ns','n')) text1.insert(0.0, ' '.join(keywords))
def analysisContent(content):
    """Print the top-10 weighted keywords of *content* (selected POS only),
    each as a ((word, flag), weight) pair."""
    keyword_pairs = analyse.extract_tags(sentence=content,
                                         topK=10,
                                         withWeight=True,
                                         allowPOS=('ns', 'n', 'vn', 'v', 'nr'),
                                         withFlag=True)
    for pair in keyword_pairs:
        print(pair)
def get_keywords(articles):
    """Join *articles* with spaces and return their top-10 noun/verb keywords."""
    joined = " ".join(articles)
    return analyse.extract_tags(joined,
                                topK=10,
                                withWeight=False,
                                allowPOS=('n', 'ns', 'vn', 'v', 'nz'))
def loadDataFromCutFile(self, totalnum):
    """Load up to *totalnum* cut-news rows and return a list of documents,
    each being the row's top-20 keywords joined by spaces.

    Python 2 code (print statement).  Stops early at the first empty row.
    """
    doc = []
    cut = Cut()
    for i in range(1, totalnum):
        line = cut.getRow(i, Global.cutnews_dir, Global.filesize)
        if not line:
            break
        data = json.loads(line)
        keyword = analyse.extract_tags(data['content'], topK=20)
        seg = " ".join(keyword)
        print seg
        doc.append(seg)
    return doc
def segment_word(word): """ 文本切词排序 :param word: :return: """ seg_list = posse.cut(''.join(analyse.extract_tags(word, max(len(word) / 7, 5)))) key_words = jieba.cut_for_search(''.join(set(map(lambda x: '' if x.flag[0] in ['q', 'd', 'p', 'u'] else x.word, seg_list)))) for item in key_words: print item return list(key_words)
def getKeyWord(sentence, algorithm=None):
    """Print the top-3 weighted keywords of *sentence*: TF-IDF by default,
    TextRank when algorithm == 'TextRank'.  Python 2 code."""
    if algorithm is None:
        print "關鍵字:"
        result = ana.extract_tags(sentence, 3, True)
        # sentence: the text to analyse
        # topK: how many of the highest-TF/IDF words to return (default 20)
        # withWeight: also return each keyword's weight (default False)
        # allowPOS: restrict to the given POS tags; default empty = no filter,
        #           e.g. allowPOS=('ns', 'n', 'vn', 'v')
    elif algorithm == 'TextRank':
        print "關鍵字(With TextRank):"
        result = ana.textrank(sentence, 3, True)
    # NOTE(review): result is unbound for any other algorithm value.
    for keyWord, weight in result:
        print keyWord + ' w:' + str(weight)
    print('='*50)
def segment_user_input(word): """ 用户输入切词 :param word: :return: """ # lexical = ['n', 'v', 'a', 't'] # seg_words = posse.cut(word) # key_words = map(lambda x: x.word, sorted(filter(lambda x: x.flag[0] in lexical, seg_words), # key=lambda x: lexical.index(x.flag[0]))) key_words = analyse.extract_tags(word, max(len(word) / 7, 5)) if not key_words: return [word.decode('utf-8')] return key_words
def key_wordsf(uid, time_choise):
    '''
    根据用户uid和选定的时间,可以求出临近时间的微博
    dict = {user1:[date:"",date:"",date:"",date:"",date:""],
            user2:[key1,key2,key3,key4,key5],
            ...
            }
    Python 2 code (string.maketrans, print statement).
    For each of the 5 days up to time_choise, collects that day's weibo
    content and stores the top-3 keywords per user per day.
    '''
    identify = string.maketrans('', '')
    # ASCII punctuation, whitespace, digits and letters to strip
    delEStr = string.punctuation + ' ' + string.digits + ' ' + string.letters
    delCStr = '#《》()&%¥#@!{}【】'  # Chinese punctuation to strip
    # set-operation reminders (no effect on the result)
    a=[1,2,3]
    b=[2,3]
    set(a).difference(set(b))  # difference: 1
    set(a).intersection(set(b))  # intersection: 2 3
    set(a).union(set(b))  # union: 1 2 3
    users_keys = {}
    for i in range(5):
        all_weibo = []
        tm = int(time_choise)-i
        contents=WeiboContent.objects.filter(date__contains=tm)
        for d in contents:
            # print type(d.content.encode('utf-8'))
            all_weibo.append(d.content)
            userid = d.userid
            date1 = d.date
            if i==0:
                users_keys.setdefault(userid, {})  # nested dict per user
        stop = [line.strip().decode('utf-8') for line in open('stopword.txt').readlines()]
        # drop pure-ASCII-alpha entries, join, segment, remove stopwords
        all_weibo = [v for v in all_weibo if not str(v).isalpha()]
        all_weibo = ','.join(all_weibo)
        segs = jieba.cut(all_weibo)
        all_weibo = ','.join(list(set(segs)-set(stop)))
        # print type(str(all_weibo))
        # all_weibo = all_weibo.translate(identify, delEStr)
        # all_weibo = all_weibo.translate(identify, delCStr)
        keywords = analyse.extract_tags(all_weibo,topK=3)
        # NOTE(review): userid is unbound if the queryset was empty, and
        # users_keys[userid] can KeyError for users first seen when i > 0.
        users_keys[userid][tm] = ','.join(keywords)
    for k,v in users_keys.items():
        for x,y in v.items():
            print ("uid:%s,datetime:%s,keyword:%s") % (k,x,y)
def word_count():
    """Read the raw word file, extract its top-30 keywords and write them to
    words.csv with quadratically decaying sizes.  Python 2 code (bytes+str)."""
    text = ''
    with open(STATIC_PATH + '/vis_data/word_raw.txt', 'r') as fr:
        # NOTE(review): fr.read() returns the whole file, so this loop walks
        # it CHARACTER by character — it merely copies the content into text.
        for line in fr.read():
            if line:
                text += line
    words = ja.extract_tags(text, 30)
    with open(STATIC_PATH + '/vis_data/words.csv', 'w') as fw:
        fw.write('text,size\n')
        for idx, w in enumerate(words):
            # size decays with rank: (30 - idx)^2
            fw.write(w.encode('utf-8') + ',' + str((30 - idx) * (30 - idx)) + '\n')
def treemap(username, most_common=50):
    """Generate json data needed by treemap for d3.js.

    Returns a JSON string of the form
    {"name": "kw", "children": [{"name": word, "size": weight}, ...]}.
    """
    kw_str = generate_data(username)
    #jieba.load_userdict(app.config['USERDICT'])
    # withWeight=True is required: without it extract_tags returns bare
    # strings, and the original's dict(tuple(reversed(word))) step raised
    # ValueError for any keyword not exactly two characters long.
    # Keying on the word (the author's stated intent) also removes the
    # collision problem equal weights caused as dict keys.
    #TODO drop purely numeric keywords
    kw = dict(analyse.extract_tags(kw_str, most_common, withWeight=True))
    j = json.dumps({"name": "kw",
                    "children": [{"name": key, "size": value}
                                 for key, value in kw.items()]}, indent=2)
    return j
def key_words(uid, time_choise):
    '''
    根据用户uid和选定的时间,可以求出临近时间的微博
    dict = {user1:[date:"",date:"",date:"",date:"",date:""],
            user2:[key1,key2,key3,key4,key5],
            ...
            }
    Python 2 code; relies on module-level identify/delEStr/delCStr tables.
    For each of the 5 days up to time_choise, collects this user's weibo
    content and stores the top-3 keywords per date.
    '''
    # set-operation reminders (no effect on the result)
    a=[1,2,3]
    b=[2,3]
    set(a).difference(set(b))  # difference: 1
    set(a).intersection(set(b))  # intersection: 2 3
    set(a).union(set(b))  # union: 1 2 3
    users_keys = {}
    for i in range(5):
        all_weibo = ""
        tm = int(time_choise)-i
        contents=WeiboContent.objects.filter(userid=uid, date__contains=tm)
        for d in contents:
            print type(d.content.encode('utf-8'))
            all_weibo = all_weibo +','+ d.content
            userid = d.userid
            date1 = d.date
            if i==0:
                users_keys.setdefault(userid, {})  # nested dict per user
        # strip ASCII then Chinese punctuation, segment, remove stopwords
        all_weibo = all_weibo.translate(identify, delEStr)
        all_weibo = all_weibo.translate(identify, delCStr)
        stop = [line.strip().decode('utf-8') for line in open('stopword.txt').readlines()]
        segs = jieba.cut(all_weibo)
        all_weibo = ','.join(list(set(segs)-set(stop)))
        print all_weibo
        keywords = analyse.extract_tags(all_weibo,topK=3)
        # NOTE(review): userid/date1 are unbound if the queryset was empty,
        # and users_keys[userid] can KeyError when i > 0 — confirm inputs.
        users_keys[userid][date1] = keywords
    for k,v in users_keys.items():
        for x,y in v.items():
            print k,x,y,'\n'
            for z in y:
                print z
def get_content(url): raw_html =http_request(url) # detect the language of the content,if chinese we apply StopWordsChinese lan = language_id(content=raw_html) if lan: if lan[0] == 'zh': g = Goose({'stopwords_class': StopWordsChinese}) else: g = Goose() else: g = Goose() article = g.extract(raw_html=raw_html) keywords = ja.extract_tags(sentence=raw_html,allowPOS=['nm','n','x','eng','vn'],withWeight=False) words = jieba.cut_for_search(sentence=raw_html) return article,keywords,words
def tf_if_parse(content, keywords=None, topK=50):
    """
    Extract up to *topK* TF-IDF keywords from *content*, using the project's
    custom IDF table, user dictionary and stopword list.

    NOTE(review): the *keywords* parameter ("must be include") is accepted
    but never used — confirm whether forced inclusion was ever implemented.
    """
    import jieba.analyse as analyse
    import jieba
    tfidf_path = os.path.join(resource_dir, 'resources', 'text', 'tf-idf.txt')
    user_dict_path = os.path.join(resource_dir, 'resources', 'text', 'user_dict.txt')
    stopwords_path = os.path.join(resource_dir, 'resources', 'text', 'stopwords-zh.txt')
    jieba.load_userdict(user_dict_path)
    analyse.set_stop_words(stopwords_path)
    analyse.set_idf_path(tfidf_path)  # swap in the project's IDF corpus
    tags = analyse.extract_tags(content, topK=topK)
    return tags
def jieba():
    """Web handler: POS-tag the posted 'content' and return the segmentation,
    TF-IDF keywords and word frequencies as a response dict.

    NOTE(review): `self` is referenced below but this is a plain function with
    no self parameter — as written it raises NameError; it was probably lifted
    out of a class.  The function name also shadows the jieba package, hence
    the local `import jieba as Jieba` alias.
    """
    import jieba as Jieba
    Jieba.load_userdict('userdict.txt')
    import jieba.analyse as KeywordsAnalyse
    import jieba.posseg as pseg
    content = request.POST.get('content')
    words = pseg.cut(content)
    word_array = []
    # keep non-blank tokens as "word/POS" strings
    for w in words:
        if len(w.word.strip()) > 0:
            word_array.append(w.word + '/' + w.flag)
    response = {
        'result' : ' '.join(word_array),
        'keywords' : KeywordsAnalyse.extract_tags(content),
        'word_frequency' : self._word_frequency(word_array)
    }
    return self._response(response)
def post(uid):
    """Flask view: render one blog post plus its top-20 weighted keywords
    (script content stripped before extraction)."""
    post = Post.query.get_or_404(uid)
    if post.is_duplicate:
        abort(404)
    if post.blog.is_protected:
        abort(404)
    if post.content:
        content_tree = html.fromstring(post.content)
        # drop <script> elements before taking the text content
        # NOTE(review): remove() only detaches direct children of the root —
        # nested scripts would survive; confirm post.content is flat enough.
        scripts = content_tree.cssselect('script')
        for script in scripts:
            content_tree.remove(script)
        content = content_tree.text_content()
        content = content.replace('.', '')
        tags = analyse.extract_tags(content, topK=20, withWeight=True)
        tags_ = [{'text': tag, 'weight': weight} for tag, weight in tags]
    else:
        tags_ = []
    return render_template('blog/post.html', post=post, tags=json.dumps(tags_))
def analyse(self):
    """Extract the top-50 keywords from the collected job descriptions and
    export them as an HTML list.  Python 2 code (print statement).

    The method name only shadows the jieba `analyse` module as an attribute
    of self; the global module is still reachable below.
    """
    print u'开始分析职位需求---->'
    strx = open(self.config.file, 'r').read()
    strx = strx.upper()  # normalise case so e.g. 'java'/'JAVA' merge
    tags = analyse.extract_tags(strx, topK=50,withWeight=False)
    #export to html file
    with(open(self.config.result_file, 'w')) as f:
        f.writelines('<html><head>')
        f.writelines('<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>')
        f.writelines('<title>Job Crawer Result</title></head><body>')
        f.writelines('<table rules=all>')
        f.writelines('<h1>' + prog_info + '</h1>')
        f.writelines('<ul>')
        for tag in tags:
            f.writelines('<li>{0}</li>'.format(tag.capitalize()))
        f.writelines('</ul>')
        f.writelines('</body></html>')
def halfcut(content):
    """Keyword-cut *content*, growing topK until at least GrobalParament.n
    usable words survive the filters.

    NOTE(review): unlike the sibling halfcut that loads the computer
    dictionary, this loop has NO upper bound on k — if the filters never let
    n words through it spins forever; confirm inputs guarantee termination.
    """
    word_list = []
    k = GrobalParament.n
    while True:
        cut_content = extract_tags(content, k)
        word_list_temp = cut_content
        if not GrobalParament.ruler_list:
            # keep runs of 2+ chars containing neither a digit nor '/'
            r = r'[^/\d]{2,}'
            temp = '/'.join(word_list_temp)
            word_list = re.findall(r, temp)
        else:
            # otherwise filter with the explicit exclusion list
            for word in word_list_temp:
                if word not in GrobalParament.ruler_list:
                    word_list.append(word)
        #print len(word_list)
        if (len(word_list) >= GrobalParament.n):
            break
        else:
            word_list = []
            k += 1
    return word_list
import sys
import ftplib

# CLI: argv[1] = input text file, argv[2] = output keyword file.
# NOTE(review): `input` shadows the builtin of the same name.
input = sys.argv[1]
output = sys.argv[2]
print(input)
content = open(input, "r").read()
from math import *

# Extract ceil(sqrt(len(content))) keywords with both TF-IDF and TextRank.
length = ceil(sqrt(len(content)))
e = extract_tags(content, length)
t = textrank(content, length)
#print(r.keywords(10, False))
#use 2 files as output for comparison
outT = "C:\\Users\\Tassadar\\Desktop\\Course\\weibo\\temp\\jiebaTDIFD.txt"
outR = "C:\\Users\\Tassadar\\Desktop\\Course\\weibo\\temp\\jiebaTextRank.txt"
fT = open(outT, "w")
fR = open(outR, "w")  # NOTE(review): opened but never written; nothing is closed
outF = open(output, "w")
print("\n", file=fT)
# write the TF-IDF keywords to both the comparison file and the CLI output
for x in e:
    print(x, end="\n", file=fT)
    print(x, end="\n", file=outF)
def tf_idf(content): print "TF-IDF:"+"|".join(analyser.extract_tags(content, 5)) return list(analyser.extract_tags(content, 5))
def get_ranked_response(model, test_post_seg, candidate_list, similar_post_dic, test_index):
    # Rank candidate (post, response) records against a space-segmented test
    # post.  The score mixes sentence-vector cosine similarities with a count
    # of shared/expanded keywords.
    # NOTE(review): Python 2 code; `model` appears to be a gensim word2vec
    # model (`.vocab`, `.most_similar`) — confirm before porting.
    print test_post_seg
    # TF-IDF weight for every in-vocabulary token of the test post.
    tf_idf_in_test_post_seg = []
    for word in test_post_seg:
        if word in model.vocab:
            tf_idf_in_test_post_seg.append([word, tf_idf_c(word, test_post_seg, candidate_list, similar_post_dic)])
    sorted_tf_idf_in_test_post_seg = sorted(tf_idf_in_test_post_seg, key=lambda x: x[-1], reverse=True)
    # First expansion pass: neighbours of the 3 highest-TF-IDF words plus the
    # post's own tokens.  NOTE(review): this list is discarded — it is rebuilt
    # from scratch below; only the prints survive.
    similar_word_list = []
    for l in sorted_tf_idf_in_test_post_seg[:3]:
        print l[0]
        for (word, wd) in model.most_similar(l[0]):
            similar_word_list.append(word)
    for w in test_post_seg.split(' '):
        if w not in similar_word_list:
            similar_word_list.append(w)
    # Punctuation tokens that must never count as shared keywords.
    mark_list = [u'。', u'.', u'!', u'!', u'?', u'?', u';', u';', u'~', u'~', u'(', u')', u'(', u')', u'-', u'+', u'=', u'、']
    # Second (effective) expansion pass: every distinct token of the post,
    # each followed by its top-3 word2vec neighbours.
    similar_word_list = []
    test_post_seg_list = test_post_seg.split(' ')
    for w in set(test_post_seg_list):
        if w not in similar_word_list:
            similar_word_list.append(w)
            if w in model.vocab:
                for (word, wd) in model.most_similar(w, topn=3):
                    similar_word_list.append(word)
    # Keyword-based expansion: TextRank ∪ TF-IDF keywords of the unsegmented
    # text, again expanded with top-3 neighbours each.
    test_txt_no_seg = test_post_seg.replace(' ', '')
    test_post_seg_keyword_list1 = analyse.textrank(test_txt_no_seg, topK=3, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
    test_post_seg_keyword_list2 = analyse.extract_tags(test_txt_no_seg, topK=3, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
    test_post_seg_keyword_list = [i for i in set(test_post_seg_keyword_list1 + test_post_seg_keyword_list2)]
    for w in set(test_post_seg_keyword_list):
        if w not in similar_word_list:
            similar_word_list.append(w)
            if w in model.vocab:
                for (word, wd) in model.most_similar(w, topn=3):
                    similar_word_list.append(word)
    # Sentence vector of the test post, then score every candidate in place.
    test_post_seg_vec = get_sentence_vec(model, test_post_seg, candidate_list, similar_post_dic)
    for c in candidate_list:
        # Candidate record layout (presumed from the indexing — verify):
        # c[1]=post text, c[4]=response text, c[2]/c[5]=their vectors,
        # c[6]=precomputed score, c[7..10] filled below.
        c_p_vec = get_sentence_vec(model, c[1], candidate_list, similar_post_dic)
        c_r_vec = get_sentence_vec(model, c[4], candidate_list, similar_post_dic)
        c[2] = c_p_vec
        c[5] = c_r_vec
        # Occurrences (not just presence) of expanded, non-punctuation
        # keywords inside the candidate response.
        similar_word_in = 0
        for w in set(c[4].split()):
            if w in similar_word_list:
                if w not in mark_list:
                    similar_word_in += c[4].split().count(w)
        s2 = float(cosine_similarity(c_p_vec, c_r_vec))      # post vs response
        s3 = float(cosine_similarity(test_post_seg_vec, c_r_vec))  # query vs response
        c[7] = s2
        c[8] = s3
        c[9] = similar_word_in
        # rank_score = 1000*c[6]*c[7]*c[8]
        # Hand-tuned linear blend of the component scores.
        rank_score = c[6]*1 + c[7]*1.8 + c[8]*2 + c[9]*0.17
        c[10] = rank_score
    # Ascending sort on the final score (last element) — best candidate last.
    rank_candidate = sorted(candidate_list, key=lambda l: l[-1])
    return rank_candidate
def Extract_Tags(sentence): words = jalz.extract_tags(sentence,20) print ",".join(words) """
def extract_topic(text):
    """Return the TAG_COUNT highest-ranked keywords of *text*."""
    topics = extract_tags(text, TAG_COUNT)
    return topics
hbase(main):021:0> scan 'test'
ROW            COLUMN+CELL
 row1          column=f1:a, timestamp=1401883411986, value=value1
 row1          column=f1:b, timestamp=1401883415212, value=value2
 row2          column=f1:, timestamp=1401883417858, value=value3
 row3          column=f1:, timestamp=1401883420805, value=value4
4 row(s) in 0.0240 seconds
"""


def wordcut(v):
    # Tokenise an HBase cell value into its top-10 TF-IDF keywords, joined
    # with '|'.  NOTE(review): eval() on row data is dangerous if the table
    # content is untrusted — confirm the data source before reusing this.
    try:
        x = eval("'%s'" % v['value'])
    except Exception, ex:  # Python 2 syntax; any failure falls back to a marker
        x = 'invalid'
    # seglist=jieba.cut(x)
    seglist = analyse.extract_tags(x, 10)
    myvalue = '|'.join(seglist)
    return myvalue


def content_analyse(v):
    # Identical to wordcut() — presumably kept separate for pipeline naming;
    # consider delegating to wordcut() to remove the duplication.
    try:
        x = eval("'%s'" % v['value'])
    except Exception, ex:
        x = 'invalid'
    # seglist=jieba.cut(x)
    seglist = analyse.extract_tags(x, 10)
    myvalue = '|'.join(seglist)
    return myvalue


def inverted(v):
    # (Definition truncated in this excerpt — body continues beyond view.)
    url = v[0]
def Extract_Tags(sentence,
                 user_dict="/home/cylin/Code/user_dict",
                 stop_words="/home/cylin/Code/stopwords"):
    """Extract up to 15 noun keywords from *sentence* via jieba TF-IDF.

    Generalized: the user-dictionary and stop-word file paths, previously
    hard-coded, are now overridable keyword parameters whose defaults
    preserve the original behaviour.
    """
    jieba.load_userdict(user_dict)            # custom vocabulary file
    jieba.analyse.set_stop_words(stop_words)  # words to ignore
    words = jalz.extract_tags(sentence, 15, allowPOS=['n'])
    #print ",".join(words)
    return words
def getKeyWords(sentence, num=20):
    """Return the top *num* jieba keywords for *sentence*."""
    extracted = jieba.extract_tags(sentence, num)
    return extracted
#!/usr/bin/python3
# coding: utf-8
# Demo script: jieba keyword extraction (TextRank vs TF-IDF) and stop-word
# handling on sample Chinese project abstracts.  Each `text = ...` below
# overwrites the previous one — only the LAST assignment is actually analysed.
import jieba.analyse
text = "Automatic summarization is the process of reducing a text document with a computer program in order to create a summary that retains the most important points of the original document. "
text = '本项目是针对航空器进近、着陆、起飞安全保障而研发的新型专用设备。机场起降安全是由机长和塔台管制员协同完成的。尽管现代飞机和机场都配备了各种先进的导航和监视设备(包括雷达和盲降系统),但目视仍然是机长和塔台指挥员都离不开的手段。'
text = """本项目是针对航空器进近、着陆、起飞安全保障而研发的新型专用设备。机场起降安全是由机长和塔台管制员协同完成的。尽管现代飞机和机场都配备了各种先进的导航和监视设备(包括雷达和盲降系统),但目视仍然是机长和塔台指挥员都离不开的手段。大雾天机场关闭是因为目视条件不够,机场塔台越修越高主要也是为保障目视条件。
近年来的多数空难是在降落时机载和地面设备良好而目视条件不好情况下发生的,如2002年国航韩国釜山空难,2010年波兰总统专机库尔斯克空难,2010年伊春空难等。这就提示人们不仅要改进导航和监视引导系统,还要从提高机长和塔台管制员的目视能力着眼。这就需要利用新兴的"合成视觉"技术,把计算机生成的三维虚拟场景和多种传感器采集到的真实信息实时合成和融合,达到
(1)生成各种恶劣气候和复杂空中态势下高度逼真的机场视觉环境,对塔台管制员(或机长)进行系统的、有针对性的指挥和操控能力的反复训练,以提高恶劣气候和复杂态势下的应对处理能力。
(2)通过"虚实合成"大大拓展塔台管制员和机长在低能见度下的目视能力,能从屏幕上看见当时肉眼看不见的"真实"景象,从而增强飞机起降安全。
合成视觉(Synthetic Vision)是一门新兴的交叉学科,它由虚拟现实、图像处理、机器视觉、智能控制等学科交叉形成。合成视觉将真实世界的实时信息与虚拟环境融合起来,用实时获取的真实信息去校正配准虚拟环境,从而大大提高了虚拟环境的可信度和可用性;不仅仿真模拟训练效果更好,更重要的是可以实现身临其境地进行实时监视、操作、控制、指挥。
项目获多项国家重大科技项目支持,历时七年完成,实现了国内外第一个基于虚实合成技术的"虚拟机场"系统。不仅在民航机场塔台管制训练、军机飞行指挥训练中得到重大应用,而且在军队多个重大项目中实现了基于虚实合成的监视、指挥和控制。
项目主要研究内容包括:大规模高沉浸感完整半球虚拟空间(水平视角360 度,垂直视角90度)构建关键技术;超大视场,非规则曲面多重(四重以上)重叠投影区的几何校正、色彩校正、无缝融合技术;多源图像融合的精确配准技术和实时“虚实合成”技术;通用特效三维引擎等。
项目在虚实结合、视觉合成技术,完整半球虚拟空间构建技术,异形曲面大范围多重重叠区域无缝融合技术等方面有重大创新,已获发明专利授权4项。
项目总体水平达到国际先进,部分关键技术达到国际领先。不仅满足了民航、军航训练需求,而且还能用于实时监视、控制、指挥。大大拓展了应用领域,提升了技术水平。项目已在多个民航、军队重大项目中得到应用,初步应用直接经济效益达到34,047 万元,利税11,131 万元,大大提升了我国军、民航训练和指挥水平,为军民航飞机低能见度着陆起飞安全做出贡献。 -3- 2011.000.006.993"""
keyword = '合成视觉;航空安全;虚实融合'
text = '''本项目属于光电功能材料科学研究领域。氧化物薄膜铁电体和半导体是一种新型光电功能材料,可用于制备室温工作的红外焦平面器件、室温光发射器件和高灵敏湿度传感器,是未来 20 年光电和传感器技术的重要方向。本项目对氧化物功能薄膜材料微结构调控及其特性研究取得重要结果,为功能器件设计和制备提供材料和科学的基础。
(1)发现溶胶-凝胶法外延生长铁电薄膜材料的成核机制、生长机制和微结构调控方法,发展了溶胶-凝胶法生长铁电薄膜的理论和技术。首先采用高度稀释的前驱体溶液浓度和控制单层厚度,实现溶胶-凝胶法外延生长铁电薄膜材料,首先实现硅基择优取向导电薄膜和择优取向铁电薄膜生长,首先实现硅基读出电路允许温度下铁电薄膜低温生长(400°C )。
(2)首次获得铁电薄膜 BST、PZT、SBT、PMNT、LNO、LSCO 等的红外光学常数,填补了铁电薄膜材料光学常数研究的空白。发现铁电极化电滞回线和介电常数温度谱的晶粒尺寸效应和 BST
铁电薄膜介电常数温度谱在室温附近的增强峰,是 BST 铁电薄膜非制冷红外探测器的依据。发现了铁电极化来源于相对于立方相的有效电荷转移以及晶粒尺寸对铁电性、相结构、晶格动力学和光荧光等性质的影响规律。首次发现利用非对称电场可以使铁电材料的极化疲劳可逆现象,发现 BiFeO3 的弱磁效应。首次发现 PMNT 的极化自锁定特性,是一种可用于室温红外探测的新型铁电薄膜,建立了铁电薄膜复合结构非致冷红外探测模型,研制出室温下工作的红外焦平面器件。
(3)在国际上首次合成了二维生长的可控 ZnO 塔状纳米结构,并制备了氧化物功能材料 ZnO 的多种形貌的纳米结构和构造单元。发现 ZnO 纳米线和纳米棒的巨湿敏性现象,材料阻值随相对湿度的增加呈线性减少,改变量可达 4个数量级。为高灵敏湿度传感器制备提供科学基础。发现 ZnO/碳纳米管复合结构电子场发射性能的增强效应,多针状 ZnO 纳米结构的样品具有高达 10E5cm-2 的电子发射端,具有高达β≈8267 的场增强因子,和低的开启电场和阈值电场。在国际上首次采用 MBE 外延方法生长了 ZnO 单晶薄膜材料,解决了常温下 ZnO 激子光发射强度迅速淬灭问题,首次发现该 ZnO 薄膜材料在室温下的光泵激光发射。
本项目发表 SCI 论文 50 篇,授权发明专利 3项。论文他引 1947 次,其中 8篇论文被他人引用 1416 次,单篇最高引用 1181 次。研制的材料能够制备髙性能功能器件。美国《材料研究学会通报》 和《先进镀膜和表面技术》杂志发表专文报道了本项目的工作,认为研究结果“填补了该领域的空白”(So, they filled the gap by…)。美国《纳米科学技术百科全书》和《半导体科学聚焦》《热释电材料和传感器》邀请编写了章节。多项结果被写入美国《薄膜材料手册》等三本科学手册中。研究结果在国际上引领了该领域的研究工作。
本项目部分内容曾获 2006 年上海市自然科学一等奖。 3 2011.000.005.462'''
keyword = '氧化物;铁电薄膜;氧化锌半导体薄膜;微结构调控;材料特性'
##################################################################
## textrank
# NOTE(review): the bare name `analyse` presumably comes from a
# `from jieba import analyse` elsewhere — `import jieba.analyse` alone does
# not bind it.  Verify before running standalone.
print(analyse.textrank(text, topK=4))  # TextRank cannot handle purely-English text here
print(analyse.textrank(text, topK=4, withWeight=True))
print(analyse.extract_tags(text, topK=4))
##################################################################
## stop word
jieba.analyse.set_stop_words("/Users/coder352/datasets/gist/nltk/中文停用词表.txt")
default_mode = list(jieba.cut(text))  # jieba.cut does NOT drop stop words
stop_words = [line.strip() for line in open('/Users/coder352/datasets/gist/nltk/中文停用词表.txt').readlines()]
print('原文:', '/'.join(default_mode))
print('默认模式:', '/'.join(set(default_mode)))
print('搜索引擎模式:', '/'.join(set(default_mode) - set(stop_words)))
print(jieba.analyse.extract_tags(text))  # reportedly drops stop words automatically
# Extract keywords for every line of the input file.  The per-line keyword
# budget grows with the square root of the line length, capped at 500.
# Improvements: math import hoisted out of the loop, exact integer sqrt via
# isqrt, direct line iteration instead of index loops, no builtin shadowing
# (`file`), and no redundant close() after the `with` block.
from math import isqrt

with open(inFile, "r", encoding="utf-8") as src:
    for line in src.readlines():
        if not line:
            # Effectively unreachable for readlines() output; kept for parity.
            print("Line is empty")
            continue
        root = isqrt(len(line))
        if root >= 500:
            print("Text is too long. Keep only 500 keywords.")
            root = 500
        keywords = extract_tags(line, topK=root, withWeight=False, allowPOS=())
        keyword_list.append(keywords)

# output
print("Completed keyword extraction on ", len(keyword_list), " items")
with open(outFile, "w", encoding="utf-8") as dst:
    # One comma-joined row per item, newline-separated (no trailing newline).
    dst.write('\n'.join(','.join(words) for words in keyword_list))
print("Output to file")
from jieba import analyse

# Public alias kept for importers of this module.
tfidf = analyse.extract_tags

if __name__ == '__main__':
    text = """
    据半岛电视台援引叙利亚国家电视台称,叙利亚已经对美国、英国、法国的空袭进行了反击。据介绍,在叙军武器库中,对西方最具威慑力的当属各型战术地对地弹道导弹。
    尽管美英法是利用巡航导弹等武器发动远程空袭,但叙军要对等还击却几乎是“不可能完成的任务”。目前叙军仍能作战的战机仍是老旧的苏制米格-29、米格-23、米格-21战斗机和苏-22、苏-24轰炸机,它们在现代化的西方空军面前难有自保之力,因此叙军的远程反击只能依靠另一个撒手锏——地对地战术弹道导弹。
    """
    # Stop-word corpus for keyword extraction; path may point at a custom list.
    analyse.set_stop_words("stopwords.txt")
    # TextRank keyword-extraction entry point.
    textrank = analyse.textrank
    # Extract keywords with the TextRank algorithm.
    keywords_textrank = textrank(text, topK=10, withWeight=False, allowPOS=('n', 'ns', 'vn', 'v', 'nz'))
    # Bug fix: the original used Python 2 `print` statements, a syntax error
    # on Python 3.  Single-argument print(...) emits identical output on both.
    for keyword in keywords_textrank:
        print(keyword + "/")
    print("TFIDF")
    # TFIDF
    keywords_tfidf = analyse.extract_tags(text, topK=10, withWeight=False, allowPOS=('n', 'ns', 'vn', 'v', 'nz'))
    for keyword in keywords_tfidf:
        print(keyword + "/")
def __call__(self):
    # Crawl the government-procurement notice index, scrape every new notice
    # page, create one Plone content item per notice, compute display scores,
    # tag it with jieba keywords and a QR code, then mail an import summary.
    # NOTE(review): Python 2 code (urllib2, implicit str/unicode mixing, and
    # the comprehension-variable leak exploited below).
    logger = logging.getLogger(".getgovnotice.GetGovNotice")
    titleLogger = logging.getLogger("分詞案名")
    segLogger = logging.getLogger("分詞結果")
    # Fetch the notice index page.
    try:
        getHtml = urllib2.urlopen(GOV_NOTICE_URL)
    except:
        raise IOError('web site NO Response')
    hrefList = list()
    for line in getHtml:
        if TEST_STRING in line:
            href = line.split('href="')[1].split('">')[0]
            hrefList.append('%s%s' % (PCC_DOMAIN, href))
    hrefList.reverse()  # process oldest-first
    # Visit each linked notice page.
    portal = api.portal.get()
    catalog = api.portal.get_tool(name='portal_catalog')
    add_count = 0
    for link in hrefList:
        # Skip notices already imported into the catalog.
        if len(catalog(noticeUrl=link)) > 0:
            continue
        try:
            getNoticeHtml = urllib2.urlopen(link)
        except:
            continue  # best-effort: unreachable pages are silently skipped
        doc = getNoticeHtml.read()
        soup = BeautifulSoup(doc.decode('utf-8'))
        findT11bTags = soup.findAll('th', 'T11b')
        #get value
        teststring = list()
        for T11b in findT11bTags:
            if hasattr(T11b.string, 'strip'):
                T11bString = T11b.string.strip()
            else:
                # NOTE(review): `==` here is a comparison, not an assignment —
                # T11bString keeps its previous value when the cell is empty.
                T11bString == 'no string in here'
            # teststring.append(T11bString)
            # return str(teststring)
            # if True:
            # Each branch below runs a throwaway list comprehension purely so
            # that the loop variable `text` leaks (Python 2 behaviour) and
            # holds the LAST stripped string of the sibling <td>.
            if T11bString == NOTICE_KEYWORDS[0]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                govDepartment = text
            elif T11bString == NOTICE_KEYWORDS[1]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                govBranch = text
            elif T11bString == NOTICE_KEYWORDS[2]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                govAddress = text
            elif T11bString == NOTICE_KEYWORDS[3]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                contact = text
            elif T11bString == NOTICE_KEYWORDS[4]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                telNo = text
            elif T11bString == NOTICE_KEYWORDS[5]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                faxNo = text
            elif T11bString == NOTICE_KEYWORDS[6]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                emailAddress = text
            elif T11bString == NOTICE_KEYWORDS[7]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                noticeId = text
            elif T11bString == NOTICE_KEYWORDS[8]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                noticeName = text
            elif T11bString == NOTICE_KEYWORDS[9]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                budget = text
            elif T11bString == NOTICE_KEYWORDS[10]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                bidWay = text
            elif T11bString == NOTICE_KEYWORDS[11]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                decideWay = text
            elif T11bString == NOTICE_KEYWORDS[12]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                noticeTimes = text
            elif T11bString == NOTICE_KEYWORDS[13]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                noticeState = text
            elif T11bString == NOTICE_KEYWORDS[14]:
                # ROC (Minguo) date -> Gregorian: year + 1911.
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                splitText = text.split('/')
                startDate = (int(splitText[0])+1911, int(splitText[1]), int(splitText[2]),)
            elif T11bString == NOTICE_KEYWORDS[15]:
                # "YYY/MM/DD HH:MM" in Minguo years.
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                splitText = text.split()
                splitTextToDate = splitText[0].split('/')
                splitTextToTime = splitText[1].split(':')
                endDate = (int(splitTextToDate[0])+1911, int(splitTextToDate[1]), int(splitTextToDate[2]), int(splitTextToTime[0]), int(splitTextToTime[1]),)
            elif T11bString == NOTICE_KEYWORDS[16]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                splitText = text.split()
                splitTextToDate = splitText[0].split('/')
                splitTextToTime = splitText[1].split(':')
                bidDate = (int(splitTextToDate[0])+1911, int(splitTextToDate[1]), int(splitTextToDate[2]), int(splitTextToTime[0]), int(splitTextToTime[1]),)
            elif T11bString == NOTICE_KEYWORDS[17]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                bidAddress = text
            elif T11bString == NOTICE_KEYWORDS[18]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                bidDeposit = text
            elif T11bString == NOTICE_KEYWORDS[19]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                documentSendTo = text
            elif T11bString == NOTICE_KEYWORDS[20]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                companyQualification = text
            elif T11bString == NOTICE_KEYWORDS[21]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                companyAbility = text
            elif T11bString == NOTICE_KEYWORDS[22]:
                [text for text in T11b.find_next_siblings("td")[0].stripped_strings]
                organizationCode = text
        # return '%s\n%s\n' % (noticeName,str(endDate))
        #assign value
        # Unique content id: timestamp-to-the-minute plus an 8-digit random.
        contentId = '%s%s' % (str(datetime.now().strftime('%Y%m%d%H%M')), str(randrange(10000000,100000000)))
        try:
            api.content.create(container=portal['gov_notice'], type='twgov.content.govnotice', title=noticeName, id=contentId, endDate=datetime(endDate[0], endDate[1], endDate[2], endDate[3], endDate[4]))
        except:
            logger.error(' : create content fail, %s' % link)
            continue
        # raise TypeError('endDate is %s ' % str(endDate))
        brain = catalog(id=contentId)
        item = brain[0].getObject()
        try:
            # Copy every scraped field onto the new content item; on any
            # failure the half-built item is deleted and the link skipped.
            item.govDepartment = govDepartment
            item.govBranch = govBranch
            item.govAddress = govAddress
            item.contact = contact
            item.telNo = telNo
            item.faxNo = faxNo
            item.emailAddress = emailAddress
            item.noticeId = noticeId
            item.noticeName = noticeName
            item.budget = budget
            item.bidWay = bidWay
            item.decideWay = decideWay
            item.noticeTimes = noticeTimes
            item.noticeState = noticeState
            item.startDate = datetime(startDate[0], startDate[1], startDate[2])
            item.bidDate = datetime(bidDate[0], bidDate[1], bidDate[2], bidDate[3], bidDate[4])
            item.bidAddress = bidAddress
            item.bidDeposit = bidDeposit
            item.documentSendTo = documentSendTo
            item.companyQualification = companyQualification
            item.companyAbility = companyAbility
            item.organizationCode = organizationCode
            item.noticeUrl = link
        except:
            logger.error(' url: %s' % link)
            api.content.delete(item)
            continue
        # setting hotPoint, viewPoint, budgetPoint and importantPoint
        # viewPoint: a base score by organisation-code depth, jittered ±10.
        if len(organizationCode.split('.')) == 1 and len(organizationCode) < 3:
            item.viewPoint = 90 + (10 * random() * int(choice(['-1' ,'1'])))
        elif len(organizationCode.split('.')) == 2:
            item.viewPoint = 75 + (10 * random() * int(choice(['-1' ,'1'])))
        elif len(organizationCode.split('.')) == 3:
            item.viewPoint = 60 + (10 * random() * int(choice(['-1' ,'1'])))
        elif len(organizationCode.split('.')) == 4:
            item.viewPoint = 52 + (10 * random() * int(choice(['-1' ,'1'])))
        elif len(organizationCode.split('.')) >= 5:
            item.viewPoint = 45 + (10 * random() * int(choice(['-1' ,'1'])))
        else:
            item.viewPoint = 45 + (10 * random() * int(choice(['-1' ,'1'])))
        # budgetPoint: scored off the *string length* of the budget figure
        # (i.e. order of magnitude), jittered the same way.
        if len(budget) >= 16:
            item.budgetPoint = 45 + (10 * random() * int(choice(['-1' ,'1'])))
        elif len(budget) > 12 and len(budget) < 16:
            item.budgetPoint = 90 + (10 * random() * int(choice(['-1' ,'1'])))
        elif len(budget) == 12 and int(budget[0]) > 2:
            item.budgetPoint = 80 + (10 * random() * int(choice(['-1' ,'1'])))
        elif len(budget) == 12 and int(budget[0]) <= 2:
            item.budgetPoint = 70 + (10 * random() * int(choice(['-1' ,'1'])))
        elif len(budget) == 11 and int(budget[0]) > 5:
            item.budgetPoint = 60 + (10 * random() * int(choice(['-1' ,'1'])))
        elif len(budget) == 11 and int(budget[0]) <= 5:
            item.budgetPoint = 52 + (10 * random() * int(choice(['-1' ,'1'])))
        elif len(budget) <= 10:
            item.budgetPoint = 45 + (10 * random() * int(choice(['-1' ,'1'])))
        else:
            item.budgetPoint = 45 + (10 * random() * int(choice(['-1' ,'1'])))
        item.hotPoint = (item.viewPoint * 0.5) + (item.budgetPoint * 0.5)
        item.importantPoint = (item.viewPoint + item.budgetPoint + item.hotPoint) / 3
        # setup metadate
        resultsFromNoticeName, subjectFromNoticeName = str(), list()
        # for seg in scseg.seg_keywords(safe_unicode(item.noticeName)):
        for seg in extract_tags(safe_unicode(item.noticeName), 5):
            # Drop segments that contain digits.
            if re.search('\d', seg):
                continue
            resultsFromNoticeName += "'%s'," % seg
            if len(safe_unicode(seg)) > 2:
                subjectFromNoticeName.append(seg)
        # Log the segmentation result for manual review.
        titleLogger.info(item.noticeName.encode('utf-8'))
        segLogger.info(resultsFromNoticeName.encode('utf-8'))
        randomKeyword = choice(METADATA_KEYWORD_LIST)
        item.setSubject([item.noticeName, randomKeyword] + subjectFromNoticeName)
        item.setDescription(u'%s公告,本案採購名稱:「%s」,招標方式為%s,並以%s決標' % (item.govDepartment, item.noticeName, item.bidWay, item.decideWay))
        # Build and attach the QR code image (written via a temp file).
        itemPhysicalPath = item.getPhysicalPath()
        qrCodeContent = '%s, %s/%s/%s' % (item.noticeName, WEB_SITE_URL, itemPhysicalPath[-2], itemPhysicalPath[-1])
        qrCodeImage = qrcode.make(qrCodeContent)
        qrCodeImage.save('/tmp/tmpQrCodeImage.png')
        with open('/tmp/tmpQrCodeImage.png') as tmpQrCodeImage:
            qrCodeImage = tmpQrCodeImage.read()
        item.qrCodeImage = namedfile.NamedBlobImage(data=qrCodeImage, filename=safe_unicode('%s.png' % item.id))
        # exclude from nav and reindex object
        item.exclude_from_nav = True
        item.reindexObject()
        add_count += 1
    # Summary email + log; logger.info() returns None, so this returns None.
    api.portal.send_email(recipient=LOG_MAIL_RECIPIENT, sender=LOG_MAIL_SENDER, subject="Play公社回報, govnotice 取得:%s" % add_count, body="Done!",)
    return logger.info(' : OK,this time additional content: %s' % add_count)