# -*- coding: utf-8 -*-
import math

# Project helpers (sentence_evaluate, is_chinese, contain_redundant,
# removePrefix, distattr2, top50words, similar_check_higher, flatten,
# word2vec_evaluate, textrankgetter, model_abstract, ...) are assumed to be
# imported from the repository's own utility modules.


def stdOut(sentence_result_lists, dicts, scope):
    lists = list()
    j = 0
    sentence_nbr = max(len(dicts), 2)  # guard: math.log(1) == 0 below
    # Damp each sentence score by its position so earlier sentences rank higher.
    sentence_result_lists_tmp = map(
        lambda (key, score): (key, score * (
            1 - math.log(key + 1) / math.log(sentence_nbr))),
        sentence_result_lists)
    sentence_result_lists2 = sorted(sentence_result_lists_tmp,
                                    key=lambda (key, score): score,
                                    reverse=True)
    try:
        for distattr3 in sentence_result_lists2:
            sentence_id = distattr3[0]
            tmp = dicts[sentence_id]
            # Keep only sentences with at least eight Chinese characters and
            # no blacklisted boilerplate phrases.
            tmp2 = filter(lambda x: is_chinese(x), tmp)
            if (len(tmp2) < 8 or contain_redundant(
                    redundant_dict='../resource/redundant_dict.txt',
                    string_with_redundant=tmp)):
                continue
            j += 1
            result_str = removePrefix(tmp.strip(" "), "”".decode("utf8"))
            lists.append(distattr2(sentence_id, result_str))
            if j >= scope:
                break
        # Rerank the survivors: favor longer sentences that appear earlier.
        std = sorted(lists,
                     key=lambda x: 0.5 * len(x.strs) / (x.ids + 1),
                     reverse=True)
    except Exception:
        std = lists
    return std
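# Illustration (not part of the original module): the damping factor
# 1 - log(i + 1) / log(N) above scales a score by sentence position i in a
# document of N sentences, so leading sentences keep more of their weight.
# For N = 10:
#
#   >>> [round(1 - math.log(i + 1) / math.log(10), 2) for i in range(10)]
#   [1.0, 0.7, 0.52, 0.4, 0.3, 0.22, 0.15, 0.1, 0.05, 0.0]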
def evaluate_words(dictssentence, keyword, new=True):
    # Word-level importance. Tokenize each sentence once and drop empty
    # tokenizations.
    tokenized = {key: sentence_evaluate(value)
                 for key, value in dictssentence.items()}
    dictsword_tmp = {key: words for key, words in tokenized.items()
                     if len(words) > 0}
    dictsword_tmp2 = dictsword_tmp.values()
    corpus = map(lambda z: " ".join(z), dictsword_tmp2)
    wordlists = top50words(corpus)  # (word, importance) pairs over the corpus
    if new:
        if keyword[0] is None:
            # No keyword supplied: keep the top corpus words as-is.
            words_final = map(lambda (word, importance): word, wordlists)[:100]
        else:
            # Keyword supplied: rank the vocabulary by similarity to the
            # keyword. dictsword_tmp3 is already flat, so it is scored
            # directly; the original flattened it a second time, which would
            # iterate over single characters.
            dictsword_tmp3 = list(set(flatten(dictsword_tmp2)))
            word_potential = map(
                lambda t: (t, similar_check_higher(t, keyword)),
                dictsword_tmp3)
            words_2 = sorted(word_potential,
                             key=lambda (word, score): score,
                             reverse=True)
            words_3 = filter(lambda (key, score): score != -1, words_2)
            words_final = map(lambda (key, score): key, words_3)[:100]
        # Word importance based on word2vec:
        # ids carries the word, score carries its vector.
        word_vector_dicts = [distattr2(word, word2vec_evaluate(word))
                             for word in words_final
                             if len(word2vec_evaluate(word)) != 0]
        try:
            return map(lambda x: (x.ids, x.score),
                       textrankgetter(word_vector_dicts, False))
        except Exception:
            return wordlists
    else:
        return wordlists
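# Usage sketch (hypothetical inputs; assumes the project helpers are on the
# path). dictssentence maps sentence index -> sentence string, and keyword is
# a one-element list where [None] means "no keyword":
#
#   dictssentence = {0: u"第一句。", 1: u"第二句。", 2: u"第三句。"}
#   pairs = evaluate_words(dictssentence, [None])     # corpus-driven ranking
#   pairs = evaluate_words(dictssentence, [u"体育"])   # keyword-guided ranking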
def evaluate_sentence_tradition(dictssentence, words_importance):
    result = list()
    words_lookup = dict(words_importance)
    sentence_nbr = max(len(dictssentence), 2)  # guard: math.log(1) == 0 below
    for key, value in dictssentence.items():
        cut_sentence = sentence_evaluate(value)
        if len(cut_sentence) == 0:
            continue
        sentence_id = key
        importance_list = map(lambda x: words_lookup.get(x, 0), cut_sentence)
        # Sum of word importances, damped by sentence position so earlier
        # sentences score higher.
        score = sum(importance_list) * (
            1 - math.log(key + 1) / math.log(sentence_nbr))
        result.append(distattr2(sentence_id, score))
    # Sort by score, descending. The original key referenced the loop variable
    # sentence_id (a constant at this point) and called len() on the numeric
    # score, which raises TypeError; sorting by the score (held in the second
    # distattr2 field) is the evident intent.
    result_final = sorted(result, key=lambda x: x.strs, reverse=True)
    return result_final
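# Sketch of the keyword-based path (assumed wiring, not original code): word
# importances feed the traditional scorer, whose output is then filtered and
# reranked by stdOut. Note that distattr2 must unpack like a (key, score)
# pair (e.g. a namedtuple) for stdOut to consume it directly.
#
#   words_importance = evaluate_words(dictssentence, [None])
#   sentence_scores = evaluate_sentence_tradition(dictssentence, words_importance)
#   summary = stdOut(sentence_scores, dictssentence, scope=3)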
def evaluate_sentence(dictssentence):
    # Tokenize each sentence once; keep only sentences longer than six tokens.
    tokenized = {key: sentence_evaluate(value)
                 for key, value in dictssentence.items()}
    dictsword = {key: words for key, words in tokenized.items()
                 if len(words) > 6}
    if len(dictsword) == 0:
        return []
    # Infer a doc2vec vector per sentence and rank the sentences by textrank.
    sentence_vector_dicts = [distattr2(key, model_abstract.infer_vector(value))
                             for key, value in dictsword.items()]
    try:
        result_list = textrankgetter(sentence_vector_dicts)
        return map(lambda x: (x.ids, x.score), result_list)
    except Exception:
        return []
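# Sketch of the doc2vec path (assumed wiring): evaluate_sentence returns
# (sentence_id, score) pairs, which is exactly the shape the module-level
# stdOut above consumes.
#
#   scores = evaluate_sentence(dictssentence)
#   for item in stdOut(scores, dictssentence, scope=3):
#       print str(item.ids) + ": " + item.strs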
def stdOut(self, rank, dicts, top):
    lists = list()
    j = 0
    try:
        for sentence_id in rank:
            tmp = dicts[sentence_id]
            # Keep only sentences with at least eight Chinese characters and
            # no blacklisted boilerplate phrases.
            tmp2 = filter(lambda x: is_chinese(x), tmp)
            if (len(tmp2) < 8 or contain_redundant(
                    redundant_dict='../resource/redundant_dict.txt',
                    string_with_redundant=tmp)):
                continue
            j += 1
            result_str = removePrefix(tmp.replace(" ", ""), "”".decode("utf8"))
            lists.append(distattr2(sentence_id, result_str))
            if j >= top:
                break
        # Emit the selected sentences in their original document order.
        std = sorted(lists, key=lambda x: x.ids)
    except Exception:
        std = lists
    return std
def main():
    # Load raw documents from MySQL. args is assumed to be an argparse
    # namespace defined elsewhere in the module.
    conn = MySQLdb.connect(host=args.host, user=args.user, passwd=args.passwd,
                           db=args.db, charset='utf8')
    cur = conn.cursor()
    # Parameterized query instead of the original string concatenation.
    cur.execute('select id, content_html from t_crawler_obj limit %s, %s',
                (int(args.file[0]), int(args.file[1])))
    data = cur.fetchall()

    # Load the doc2vec model.
    model = doc2vec.Doc2Vec.load(args.model)

    # Strip HTML with BeautifulSoup.
    dicts1 = dict()
    for line in data:
        ids, content_html = line
        content = BeautifulSoup(content_html, "html.parser")
        dicts1[ids] = content.get_text()

    # Split into sentences.
    # dicts2 -> key: paper id, value: {sentence id: sentence string}
    dicts2 = defaultdict(dict)
    for key, value in dicts1.items():
        lists = cut_sentence_new(value)
        dicttmp = dict()
        for key2, value2 in enumerate(lists):
            dicttmp[key2] = value2
        dicts2[key] = dicttmp

    # Split into words.
    # dicts3 -> key: paper id, value: {sentence id: token list}
    dicts3 = defaultdict(dict)
    analyse.set_stop_words('../resource/stop_words.txt')
    for key, value in dicts2.items():
        dicttmp = dict()
        for key2, value2 in value.items():
            seg_list = jieba.cut(
                string_parser(punc_file='../resource/punc_file.txt',
                              string_with_punc=value2))
            seg_list = filter(lambda x: x != " ", seg_list)
            lists = list(seg_list)
            if len(lists) >= 3:  # keep sentences with at least three tokens
                dicttmp[key2] = lists
        dicts3[key] = dicttmp

    # Vectorize each sentence and rank with textrank.
    for key, value in dicts3.items():
        dictrember = dict()
        X = list()
        i = 0
        for key2, value2 in value.items():
            dictrember[i] = key2  # i: row index in X; key2: sentence order
            X.append(model.infer_vector(value2))
            i += 1
        X = np.array(X, dtype='float32')
        distance_matrix = pairwise_distances(X, metric='cosine')
        rank = rankgetter(distance_matrix=distance_matrix,
                          dictrember=dictrember)
        j = 0
        try:
            lists = list()
            for info in rank:
                ind = info.ids  # sentence order
                tmp = dicts2[key][ind]
                tmp2 = filter(lambda x: is_chinese(x), tmp)
                if (len(tmp2) < 8 or contain_redundant(
                        redundant_dict='../resource/redundant_dict.txt',
                        string_with_redundant=tmp)):
                    continue
                j += 1
                result_str = removePrefix(tmp.replace(" ", ""),
                                          "”".decode("utf8"))
                lists.append(distattr2(ind, result_str))
                if j >= args.top:
                    break
            # Print the kept sentences in their original order. (Renamed from
            # stdOut to avoid shadowing the functions above.)
            ordered = sorted(lists, key=lambda x: x.ids)
            for key3, sentence3 in enumerate(ordered):
                print str(key) + " " + str(key3 + 1) + ": " + sentence3.strs
        except Exception:
            print("No More Qualified Sentence!")
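# Entry-point guard (added; this excerpt defines main() but never calls it).
if __name__ == '__main__':
    main()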