def main():
    vocab = calcu_voacb()
    # print(vocab)
    sents1 = load_sents()
    sents2 = load_corpus()
    out = []
    for i in range(len(sents1)):
        token1 = segment(sents1[i])
        token2 = segment(sents2[i])
        # Bag-of-words count vectors; the last slot counts out-of-vocabulary words.
        vec1 = [0] * len(vocab)
        vec2 = [0] * len(vocab)
        for word in token1:
            if word in vocab:
                vec1[vocab[word]] += 1
            else:
                vec1[-1] += 1
        for word in token2:
            if word in vocab:
                vec2[vocab[word]] += 1
            else:
                vec2[-1] += 1
        # Cosine similarity over the in-vocabulary counts only.
        out.append(cos_sim(vec1[:-1], vec2[:-1]))
    with open("res.txt", "w") as f:
        for score in out:
            f.write(str(score) + '\n')
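# main() above calls cos_sim(), which is not defined in this snippet. A minimal
# sketch of what such a helper could look like (hypothetical; the repository's
# own implementation may differ):
def cos_sim(vec1, vec2):
    # Cosine similarity between two equal-length count vectors.
    dot = sum(a * b for a, b in zip(vec1, vec2))
    norm1 = sum(a * a for a in vec1) ** 0.5
    norm2 = sum(b * b for b in vec2) ** 0.5
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)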
def get_similarity_score():
    params = request.get_json(force=True)
    wiki_content = params['wiki']
    exp_content = params['exp']
    # Expand the wiki keywords with their top-15 word2vec neighbours.
    wiki_keyword = tfidf.get_keyword(wiki_content)
    wiki_keyword_simi = []
    for keyword in wiki_keyword.split():
        try:
            key_simi = model.most_similar([keyword], topn=15)
            for elem in key_simi:
                wiki_keyword_simi.append(elem[0])
        except KeyError:
            # Keyword not in the word2vec vocabulary; skip it.
            continue
    wiki_cut = segmenter.segment(wiki_content).split()
    wiki_cut = wiki_cut + wiki_keyword_simi
    wiki_keyword_set = set(wiki_cut)
    exp_cut = segmenter.segment(exp_content).split()
    exp_set = set(exp_cut)
    # Score: fraction of the experiment's words covered by the expanded wiki set.
    intersection = len(wiki_keyword_set & exp_set)
    if len(exp_set) == 0:
        return str(0)
    else:
        return str(intersection / len(exp_set))
def __iter__(self):
    with open(self.filepath, 'r') as input_file:
        for line in input_file:
            # The raw sentence is the second tab-separated field of each line.
            raw_sentence = line.strip().split('\t')[1]
            sentence = segmenter.segment(raw_sentence)
            yield sentence.split()  # ['word_1', 'word_2', 'word_3']
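# An iterable corpus like the one above is what gensim's Word2Vec expects for
# streaming training: it can re-read the file on every pass instead of loading
# everything into memory. A minimal usage sketch; the class name SentenceCorpus
# and the file path are assumptions, not from the source:
from gensim.models import Word2Vec

corpus = SentenceCorpus('corpus.txt')  # hypothetical wrapper exposing the __iter__ above
model = Word2Vec(sentences=corpus, min_count=5, workers=4)
model.save('word2vec.model')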
def submit():
    text = request.form.get('text')
    # Segment the same text with THULAC, jieba, and the project's own segmenter.
    thu_segmenter = thulac.thulac(seg_only=True)
    thu_list = thu_segmenter.cut(text)
    thu_array = np.array(thu_list)
    seg_list, keyword = segmenter.segment(text, keywords)
    jieba_list = jieba.cut(text)
    seg_result = "/".join(seg_list)
    jieba_result = "/".join(jieba_list)
    thu_result = "/".join(thu_array[:, 0])
    # Compare the project's segmentation against the jieba and THULAC results.
    jieba_similarity, jieba_result = test.similarity(seg_list, jieba_result.split("/"))
    thulac_similarity, thu_result = test.similarity(seg_list, thu_result.split("/"))
    jieba_similarity = util.float2percentage(jieba_similarity)
    thulac_similarity = util.float2percentage(thulac_similarity)
    keyword = list(keyword.keys())
    return render_template("submit.html",
                           seg=seg_list,
                           jieba=jieba_result,
                           thulac=thu_result,
                           jieba_similarity=jieba_similarity,
                           thulac_similarity=thulac_similarity,
                           keyword_first=keyword[0:10],
                           keyword_second=keyword[10:20])
def get_idf(inputfile, idffile):
    # idf generator
    outputfile = idffile
    doc = []
    with open(inputfile, 'r') as ins:
        for line in ins:
            # The document text is the second tab-separated field of each line.
            line = line.strip().split('\t')[1]
            doc.append(segment(line))
    id_freq = {}
    i = 0
    for cut_doc in doc:
        # print("doc: ", doc)
        for x in cut_doc.split():
            id_freq[x] = id_freq.get(x, 0) + 1
        if i % 1000 == 0:
            print('Documents processed: ', i, ', time: ', datetime.datetime.now())
        i += 1
    with open(outputfile, 'w') as f:
        for key, value in id_freq.items():
            # key is already a str in Python 3, so no encode() is needed here.
            f.write(key + '\t' + str(math.log(i / value + 1, 2)) + '\n')
def __iter__(self):
    for dirfile in os.walk(self.dirname):
        for fname in dirfile[2]:
            text = open(os.path.join(dirfile[0], fname),
                        'r', encoding='utf-8', errors='ignore').read()
            yield segment(text)  # time consuming
def __iter__(self):
    # os.walk() recursively yields every directory and file name in a folder,
    # e.g. ('idf', [], ['1.txt', '2.txt']); dirfile[2] => the file names
    for fname in self.fname:
        # document bodies
        docs = self.read_file(self.folderName + '/' + fname, 'json')
        for i in range(len(docs)):
            yield segment(docs[i]['content'])
def generate(source='articles/cnbeta'):
    'combines cleaner and segmenter'
    import cleaner, segmenter
    items = cleaner.clean(source)
    documents = segmenter.segment(items)
    publish(documents, source)
    return documents
def __iter__(self):
    for dirfile in os.walk(self.dirname):
        for fname in dirfile[2]:
            # text = open(os.path.join(dirfile[0], fname),
            #             'r', encoding='utf-8', errors='ignore').read()
            with open(os.path.join(dirfile[0], fname),
                      'r', encoding='utf-8', errors='ignore') as f:
                text = ''
                for i, line in enumerate(f):
                    text = text + line
                    if i % 1000 == 0 and i != 0:
                        yield segment(text)  # time consuming
                        text = ''
                # flush whatever remains after the last full 1000-line chunk
                if text:
                    yield segment(text)
def get_similarity_score(wiki, exp):
    wiki_content = wiki
    exp_content = exp
    # Expand the wiki keywords with their top-15 word2vec neighbours.
    wiki_keyword = tfidf.get_keyword(wiki_content)
    wiki_keyword_simi = []
    for keyword in wiki_keyword.split():
        try:
            key_simi = model.most_similar([keyword], topn=15)
            for elem in key_simi:
                wiki_keyword_simi.append(elem[0])
        except KeyError:
            # Keyword not in the word2vec vocabulary; skip it.
            continue
    wiki_cut = segmenter.segment(wiki_content).split()
    wiki_cut = wiki_cut + wiki_keyword_simi
    wiki_keyword_set = set(wiki_cut)
    exp_cut = segmenter.segment(exp_content).split()
    exp_set = set(exp_cut)
    # Score: fraction of the experiment's words covered by the expanded wiki set.
    intersection = len(wiki_keyword_set & exp_set)
    if len(exp_set) == 0:
        return 0
    else:
        return intersection / len(exp_set)
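# get_similarity_score() relies on module-level objects named model, tfidf and
# segmenter. A minimal sketch of the word2vec side only, using gensim's
# KeyedVectors; the file name and binary flag are assumptions, not from the
# source:
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('word2vec.bin', binary=True)
wiki_text = '...'  # wiki article text
exp_text = '...'   # experiment description text
score = get_similarity_score(wiki_text, exp_text)  # float between 0 and 1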
def extract_keywords(self, sentence, topK=20):  # extract keywords
    # filter
    seg_list = segment(sentence)
    freq = {}
    for w in seg_list:
        freq[w] = freq.get(w, 0.0) + 1.0
    total = sum(freq.values())
    for k in freq:
        # compute TF-IDF
        freq[k] *= self.idf_freq.get(k, self.mean_idf) / total
    tags = sorted(freq, key=freq.__getitem__, reverse=True)  # sort by score
    if topK:
        return tags[:topK]
    else:
        return tags
def extract_keywords(self, sentence, topK=20):  # extract keywords
    # filter
    seg_list = segment(sentence)
    freq = {}
    # count the term frequency (TF) of each word in the sentence
    for w in seg_list:
        freq[w] = freq.get(w, 0.0) + 1.0
    total = sum(freq.values())  # total number of words in the document
    for k in freq:
        # compute TF-IDF = TF * IDF
        freq[k] *= self.idf_freq.get(k, self.mean_idf) / total
    tags = sorted(freq, key=freq.__getitem__, reverse=True)  # descending order: the larger the TF-IDF, the more important the word
    if topK:
        return tags[:topK]  # keep the top topK (default 20) keywords
    else:
        return tags
def main():
    # idf generator
    sents = load_sents()
    sents = [segment(x) for x in sents]
    ignored = {'', ' ', '', '。', ':', ',', ')', '(', '!', '?', '”', '“'}
    id_freq = {}
    i = 0
    for doc in sents:
        # count each word at most once per document (document frequency)
        doc = set(x for x in doc if x not in ignored)
        for x in doc:
            id_freq[x] = id_freq.get(x, 0) + 1
        if i % 1000 == 0:
            print('Documents processed: ', i, ', time: ', datetime.datetime.now())
        i += 1
    with open("idf.txt", 'w', encoding='utf-8') as f:
        for key, value in id_freq.items():
            # idf = log2(N / df), where N is the number of documents processed
            f.write(key + ' ' + str(math.log(i / value, 2)) + '\n')
def extract_keywords(self, sentence, topK=15):  # extract keywords
    # filter
    seg_list = segment(sentence)
    freq = {}
    for w in seg_list.split():
        freq[w] = freq.get(w, 0.0) + 1.0
    if '' in freq:
        del freq['']
    total = sum(freq.values())
    for k in freq:
        # compute TF-IDF; no KeyError can occur here because the mean IDF
        # is used as a fallback when a word has no IDF entry
        freq[k] *= self.idf_freq.get(k, self.mean_idf) / total
    tags = sorted(freq, key=freq.__getitem__, reverse=True)  # sort by score
    if topK:
        return tags[:topK]
    else:
        return tags
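# The extract_keywords() variants above all assume an object carrying an
# idf_freq dict and a mean_idf fallback. A minimal sketch of how those fields
# might be loaded from the space-delimited idf.txt written by the generator
# above; the class name TfIdf and this wiring are assumptions, not from the
# source (extract_keywords would then be a method of this class):
class TfIdf:
    def __init__(self, idf_path='idf.txt'):
        self.idf_freq = {}
        with open(idf_path, 'r', encoding='utf-8') as f:
            for line in f:
                word, idf = line.strip().split()
                self.idf_freq[word] = float(idf)
        # mean IDF, used as the fallback for unseen words
        self.mean_idf = sum(self.idf_freq.values()) / len(self.idf_freq)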
def __iter__(self):
    for i in range(len(self.docs)):
        yield segment(self.docs[i]['title'])
        yield segment(self.docs[i]['content'])
# -*- coding: utf-8 -*-
# @Author : lin.xiong
from __future__ import division
from functools import reduce  # reduce is not a builtin in Python 3

import segmenter

out_file = open('C:/Users/lin/Desktop/out_uid_content.data', 'w')
with open('C:/Users/lin/Desktop/uid_content.data', 'r') as uid_content_file:
    for line in uid_content_file:
        line = line.strip()
        uid = line.split('\t')[0]
        comment_list = line.split('\t')[1].split('|')
        comment_arr = []
        union_comment = set()
        if len(comment_list) < 2:
            # fewer than two comments: similarity is undefined, write -1
            out_file.write(uid + '\t' + '-1' + '\t' + '1' + '\n')
        else:
            for comment in comment_list:
                comment_set = set(segmenter.segment(comment).split())
                comment_arr.append(comment_set)
                union_comment |= comment_set
            # words shared by every comment of this user
            inter_comment = reduce(lambda x, y: x & y, comment_arr)
            if len(union_comment) == 0:
                simi = 1
            else:
                simi = len(inter_comment) / len(union_comment)
            out_file.write(uid + '\t' + str(simi) + '\t' + str(len(comment_list)) + '\n')
out_file.close()
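# A tiny self-contained check of the intersection-over-union score used in the
# script above; the toy word sets stand in for segmenter output and are not
# from the source:
from functools import reduce

sets = [{'今天', '天气', '好'}, {'今天', '下雨'}]
inter = reduce(lambda x, y: x & y, sets)  # words shared by every comment: {'今天'}
union = set().union(*sets)                # 4 distinct words in total
print(len(inter) / len(union))            # 0.25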
def __iter__(self):
    text = open(self.inputfile, 'r').readlines()
    for line in text:
        line = line.strip().split('\t')[1]
        yield segment(line)  # time consuming