# Shared imports for these snippets. `tools`, `ftools`, `tls`, `Dir`, `HITS`,
# `data_filter`, and `get_abstract_index` are project-internal modules and
# helpers referenced throughout.
import os
import shutil
import numpy as np
import networkx as nx


def craw_result_process(root=Dir.res + "/data/"):
    """Read crawled (abstract, news) pairs and keep the ones that pass data_filter."""
    files = ftools.get_files(root)
    data = []
    for filename in files:
        if len(data) > 10:
            break
        lines = ftools.read_lines(root + filename)
        for line in lines:
            tmp = line.split(",")
            abstract = tools.seperate_sentences(tmp[1])
            news = tools.seperate_sentences(tmp[2])
            judge = data_filter(news, abstract)
            if judge > 0.5:
                data.append(['\n'.join(abstract), '\n'.join(news)])
    return data

def textrank(self, text):
    """Score words by PageRank over a sentence-level word co-occurrence graph."""
    sentences = tools.seperate_sentences(text)
    words = {}       # word -> index into the co-occurrence matrix
    words_list = []  # index -> word
    res = {}
    sen_words = []
    for sen in sentences:
        ws = tools.seperate(sen)
        sen_words.append(ws)
        for w in ws:
            if w not in words:
                words_list.append(w)
                words[w] = len(words)
    matrix = np.zeros((len(words), len(words)))
    for sen_w in sen_words:
        for i in range(len(sen_w)):
            # Start at i + 1 so a word does not co-occur with itself
            # (the original started at i, adding self-loops).
            for j in range(i + 1, len(sen_w)):
                matrix[words[sen_w[i]], words[sen_w[j]]] += 1
                matrix[words[sen_w[j]], words[sen_w[i]]] += 1
    # from_numpy_matrix was renamed to from_numpy_array in NetworkX 3.0.
    nx_graph = nx.from_numpy_matrix(matrix)
    score = nx.pagerank(nx_graph, alpha=0.85)
    for index, value in sorted(score.items(), key=lambda item: item[1], reverse=True):
        res[words_list[index]] = value
    return res

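# Standalone sketch of the ranking step above: PageRank over a symmetric
# co-occurrence matrix. The vocabulary and counts here are made-up
# illustration data, not part of the project.
def _textrank_demo():
    import numpy as np
    import networkx as nx
    vocab = ["cat", "sat", "mat"]
    m = np.array([[0, 2, 1],
                  [2, 0, 1],
                  [1, 1, 0]], dtype=float)
    g = nx.from_numpy_array(m)  # from_numpy_matrix before NetworkX 3.0
    scores = nx.pagerank(g, alpha=0.85)
    return {vocab[i]: s for i, s in scores.items()}
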
def weighted_vectorize(self, text):
    """Sentence vectors as TextRank-weighted averages of word vectors."""
    res = []
    sentences = tools.seperate_sentences(text)
    tr_text = self.tr.textrank(text)
    for sen in sentences:
        tmp = []
        tmp_weight = []
        sen_words = tools.seperate(sen)
        for w in sen_words:
            if w in self.model.wv.vocab:
                tmp.append(self.model[w])
                # Fall back to a uniform weight for words TextRank missed.
                tmp_weight.append(tr_text.get(w, 1 / len(sen_words)))
            else:
                # Out-of-vocabulary words contribute a zero vector.
                tmp.append([0] * self.vec_length)
                tmp_weight.append(1 / len(sen_words))
        for i in range(len(tmp)):
            tmp[i] = tools.vector_multi(tmp[i], tmp_weight[i] / sum(tmp_weight))
        sen_vec = tools.vector_add_multi(tmp)
        if len(sen_vec) == 0:
            print(sen)  # flag sentences that produced an empty vector
        res.append(sen_vec)
    return res

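# Equivalent weighting with plain numpy, standing in for the project's
# tools.vector_multi / tools.vector_add_multi helpers (an assumed mapping):
def _weighted_sentence_vector_demo(word_vecs, weights):
    import numpy as np
    w = np.asarray(weights, dtype=float)
    w = w / w.sum()  # normalize so the weights sum to 1, as above
    return (np.asarray(word_vecs, dtype=float) * w[:, None]).sum(axis=0)
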
def generate_data(self, text):
    """Build a binary word-presence matrix over nouns, verbs, and numerals."""
    if isinstance(text, str):
        sens = tools.seperate_sentences(text)
    else:
        sens = text
    words = []
    sen_words = []
    for sen in sens:
        wp = tools.sen_pog(sen)
        tmp = []
        for w, p in wp:
            # Keep nouns ("n"), verbs ("v"), and numerals ("m") only.
            if "n" in p or "v" in p or "m" in p:
                tmp.append(w)
                if w not in words:
                    words.append(w)
        sen_words.append(tmp)
    vector = []
    for sen_w in sen_words:
        tmp = [0] * len(words)
        for i in range(len(words)):
            if words[i] in sen_w:
                tmp[i] = 1
        vector.append(tmp)
    return words, vector

def summarize(self, essay, num=3):
    """Pick the `num` sentences with the highest HITS authority scores."""
    sentences = tools.seperate_sentences(essay)
    if len(sentences) <= num:
        return sentences
    mid_graph = self.build_graph(sentences)
    graph = self.generate_normal_graph(mid_graph[1])
    if len(graph) == 0:
        return sentences[:num]
    # Only the authority scores are used for ranking; hub scores are unused.
    au, hub = HITS.HITS(graph)
    sorted_au = sorted(au.items(), key=lambda item: item[1], reverse=True)
    result = sorted(int(res[0]) for res in sorted_au[:num])  # restore document order
    return [sentences[i] for i in result]

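# Sketch of the same authority-ranking idea using networkx's HITS in place of
# the project's HITS module (an assumption made for illustration):
def _hits_summary_demo(sentences, edges, num=3):
    import networkx as nx
    g = nx.DiGraph(edges)  # edges between sentence indices, e.g. [(0, 1), (1, 2)]
    hubs, authorities = nx.hits(g)
    top = sorted(authorities, key=authorities.get, reverse=True)[:num]
    return [sentences[i] for i in sorted(top)]  # keep document order
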
def analyze(self, text):
    sens_words, sens_tag = [], []
    sens = tools.seperate_sentences(text)
    for sen in sens:
        tmp_words, tmp_tag = tools.seperate_pog(sen)
        sens_words.append(tmp_words)
        sens_tag.append(tmp_tag)
    return sens, sens_words, sens_tag

def preprocess(self, text):
    sens_words, sens_tag = [], []
    sens = tools.seperate_sentences(text)
    for sen in sens:
        tmp_words, tmp_tag = [], []
        for w, t in tools.sen_pog(sen):
            tmp_words.append(w)
            tmp_tag.append(t)
        sens_words.append(tmp_words)
        sens_tag.append(tmp_tag)
    return sens, sens_words, sens_tag

def get_sens_words(self, text):
    sens = tools.seperate_sentences(text)
    sens_words = []
    for line in sens:
        words, tags = tools.seperate_pog(line)
        for i in range(len(words)):
            # Remember the first POS tag seen for each word.
            if words[i] not in self.words_tags_dict:
                self.words_tags_dict[words[i]] = tags[i]
        sens_words.append(words)
    return sens_words

def vectorize(self, text):
    sens = tools.seperate_sentences(text)
    matrix = []
    for sen in sens:
        pog_tmp = []
        for w, p in tools.sen_pog(sen):
            # Keep pure nouns and anything verb-like.
            if p == "n" or "v" in p:
                pog_tmp.append(w)
        matrix.append(pog_tmp)
    tr_res = self.tr.textrank_matrix(matrix)
    # The original snippet ends without a return; returning the TextRank
    # result is assumed to be the intent.
    return tr_res

def analyze(self, text):
    sens_words, sens_tag = [], []
    sens = tools.seperate_sentences(text)
    tmp = []
    for sen in sens:
        # Drop boilerplate sentences containing "原标题" ("original title").
        if "原标题" in sen:
            continue
        tmp.append(sen)
        tmp_words, tmp_tag = tools.seperate_pog(sen)
        sens_words.append(tmp_words)
        sens_tag.append(tmp_tag)
    return tmp, sens_words, sens_tag

def text2pic(text):
    """Build a symmetric co-occurrence matrix over nouns, verbs, and numerals."""
    sens = tools.seperate_sentences(text)
    nodes = []
    sen_words = []
    sen_noun_words = []
    for sen in sens:
        wp = tools.sen_pog(sen)
        tmp_w, tmp_p, tmp_noun = [], [], []
        for w, p in wp:
            if "n" in p or "v" in p or "m" in p:
                if w not in nodes:
                    nodes.append(w)
                tmp_noun.append(w)
            tmp_w.append(w)
            tmp_p.append(p)
        sen_noun_words.append(tmp_noun)
        sen_words.append([tmp_w, tmp_p])
    matrix = [[0] * len(nodes) for _ in range(len(nodes))]
    for var in sen_noun_words:
        for i in range(len(var) - 1):
            for j in range(i + 1, len(var)):
                # Increment both directions so the matrix stays symmetric
                # (the original only incremented one side).
                matrix[nodes.index(var[i])][nodes.index(var[j])] += 1
                matrix[nodes.index(var[j])][nodes.index(var[i])] += 1
    return matrix, nodes

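# Sketch for inspecting the (matrix, nodes) pair: relabel the numeric graph
# nodes back to words. Treating the matrix as an undirected weighted graph is
# an assumption consistent with the symmetric counting above.
def _matrix_to_graph(matrix, nodes):
    import numpy as np
    import networkx as nx
    g = nx.from_numpy_array(np.asarray(matrix, dtype=float))
    return nx.relabel_nodes(g, dict(enumerate(nodes)))
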
def preprocess(self, text):
    sens_words, sens_tag = [], []
    sens = tools.seperate_sentences(text)
    tmp = []
    # Skip the first sentence (likely the headline) and any "原标题"
    # ("original title") boilerplate.
    for sen in sens[1:]:
        if "原标题" in sen:
            continue
        tmp.append(sen)
        tmp_words, tmp_tag = tools.seperate_pog(sen)
        sens_words.append(tmp_words)
        sens_tag.append(tmp_tag)
    return tmp, sens_words, sens_tag

def unweighted_vectorize(self, text):
    """Represent each sentence as the mean of its word vectors."""
    res = []
    sentences = tools.seperate_sentences(text)
    for line in sentences:
        word_vecs = []
        for word in tools.seperate(line):
            if word in self.model.wv.vocab:
                word_vecs.append(self.model[word])
            else:
                word_vecs.append([0] * self.vec_length)
        sen_vec = tools.vector_add_multi(word_vecs)
        # Average over the number of words; the original divided by the
        # length of the summed vector (the embedding dimension) instead.
        sen_vec = tools.vector_multi(sen_vec, 1 / len(word_vecs))
        res.append(sen_vec)
    return res

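# The vocabulary test above uses the gensim < 4.0 API (`model.wv.vocab`);
# a version-tolerant lookup might look like this sketch:
def _safe_word_vector(model, word, vec_length):
    vocab = getattr(model.wv, "key_to_index", None) or model.wv.vocab  # gensim >= 4 / < 4
    if word in vocab:
        return model.wv[word]
    return [0] * vec_length  # zero vector for out-of-vocabulary words
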
def sum2pic(text, nodes):
    """Build a co-occurrence matrix for a summary, restricted to `nodes`."""
    sens = tools.seperate_sentences(text)
    sen_n = []
    for sen in sens:
        wp = tools.sen_pog(sen)
        tmp_sen_n = []
        for w, p in wp:
            # Only nouns/verbs/numerals that already appear in `nodes` count.
            if ("n" in p or "v" in p or "m" in p) and w in nodes:
                tmp_sen_n.append(w)
        sen_n.append(tmp_sen_n)
    matrix = [[0] * len(nodes) for _ in range(len(nodes))]
    for words in sen_n:
        for j in range(len(words)):
            for k in range(j + 1, len(words)):
                matrix[nodes.index(words[j])][nodes.index(words[k])] += 1
                matrix[nodes.index(words[k])][nodes.index(words[j])] += 1
    return matrix

def loaddata(path):
    """Read a crawl file and return tokenized sentences for word2vec training."""
    trainformat_sentences = []
    content = ftools.read_lines(path)
    for line in content:
        # The article body sits after the last comma of each line.
        article = line[line.rindex(",") + 1:]
        for sen in tools.seperate_sentences(article):
            trainformat_sentences.append(tools.seperate(sen))
    return trainformat_sentences

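# The returned list of token lists is the input format gensim's Word2Vec
# expects; a minimal training sketch with assumed hyperparameters
# (`size` is the gensim < 4.0 name; gensim >= 4.0 calls it `vector_size`):
def _train_word2vec_demo(path):
    from gensim.models import Word2Vec
    return Word2Vec(loaddata(path), size=100, window=5, min_count=1)
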
def vectorize(self, text):
    sens = tools.seperate_sentences(text)
    short_text = []
    for sen in sens:
        short_text.append(self.ltp.short_sentences(sen))
    s_w_tr = self.tr.textrank_matrix(short_text)
    sen_vs = []
    for sen in sens:
        # Feature vector per sentence: the TextRank score of each ranked
        # word the sentence contains (substring test), 0.0 otherwise.
        tmp = []
        for w in s_w_tr.keys():
            tmp.append(s_w_tr[w] if w in sen else 0.0)
        sen_vs.append(tmp)
    return sen_vs

def vectorize(self, text):
    """Bag-of-words count vectors over the document's own vocabulary."""
    sentences = tools.seperate_sentences(text)
    words = {}
    sen_w = []
    for sentence in sentences:
        sen_words = tools.seperate(sentence)
        sen_w.append(sen_words)
        for w in sen_words:
            if w not in words:
                words[w] = len(words)
    res = []
    for ws in sen_w:
        tmp = [0] * len(words)
        for var in ws:
            tmp[words[var]] += 1
        res.append(tmp)
    return res

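# Standalone sketch of the same counting scheme, with whitespace tokenization
# standing in for tools.seperate (assumed purely for illustration):
def _bow_demo(sentences):
    vocab = {}
    tokenized = [s.split() for s in sentences]
    for toks in tokenized:
        for w in toks:
            vocab.setdefault(w, len(vocab))
    rows = []
    for toks in tokenized:
        row = [0] * len(vocab)
        for w in toks:
            row[vocab[w]] += 1
        rows.append(row)
    return rows  # _bow_demo(["a b a", "b c d"]) -> [[2, 1, 0, 0], [0, 1, 1, 1]]
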
def summarize(self, essay, num=3, fname=None):
    sentences = tools.seperate_sentences(essay)
    if len(sentences) <= num:
        return sentences
    mid_graph = self.build_graph(sentences)
    bigraph = self.generate_bigraph(mid_graph)
    graph = self.generate_normal_graph(mid_graph[1])
    au, hub = HITS.HITS(bigraph)
    # Out-degree of each sentence node, damped by its position in the text.
    od = {}
    for node in graph.keys():
        od[node] = len(graph[node]) / (node + 1)
    # Number of edges attached to each node in the intermediate graph.
    e = {}
    for node in mid_graph[1].keys():
        e[node] = len(mid_graph[1][node])
    options = self.optimization(au, od, e)
    return [sentences[var] for var in options]

def filter_craw_data(data_dir=Dir.res + "/craw_data/data/", save_dir=Dir.res + "/cleandata_none"):
    """Split raw crawl lines into (abstract, news) pairs and keep the clean ones."""
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)
    files = tools.get_files(data_dir)
    cleandata = []
    count = 0
    bad_sample = []
    for i in range(len(files)):
        print(i, len(files), len(cleandata))
        path = data_dir + files[i]
        for line in tools.read_lines(path):
            line = line.strip()
            try:
                last_ = line.rindex(",")
                first_ = line.index(",")
                if first_ == last_:
                    continue
                tmp = [line[:first_], line[first_ + 1:last_], line[last_ + 1:]]
                abstracts = tls.seperate_sentences(tmp[1])
                news = tls.seperate_sentences(tmp[2])
                tmp = get_abstract_index(news, abstracts)
                count += 1
                # Every abstract sentence must be matched to a news sentence.
                if len(tmp) != len(abstracts):
                    continue
                w_count = 0
                for li in news:
                    w_count += len(tls.seperate(li))
                # Drop short articles, and pairs whose first three abstract
                # sentences all map to the very start of the article.
                if w_count < 520:
                    continue
                if sum(tmp[:3]) <= 3:
                    continue
                cleandata.append([abstracts, news])
                tools.write(save_dir + "/abstract/trainning_" + str(len(cleandata)) + ".txt",
                            '\n'.join(abstracts))
                tools.write(save_dir + "/news/trainning_" + str(len(cleandata)) + ".txt",
                            '\n'.join(news))
            except Exception:
                # Keep malformed lines for later inspection.
                bad_sample.append(line)
    print(count, len(bad_sample), len(cleandata))

def ed_sentence(self, essay):
    self.sentence = tools.seperate_sentences(essay)

def summarize(self, essay, num=3, fname=None):
    """Lead baseline: return the first `num` sentences."""
    sentences = tools.seperate_sentences(essay)
    return sentences[:num]