def _rank_all_words():
    segmenter = Segmenter()          # sentence segmenter
    stopwords = get_stopwords()      # stopword list
    print "Start TextRank over the selected quatrains ..."
    quatrains = get_quatrains()      # collection of quatrains
    adjlist = dict()
    for idx, poem in enumerate(quatrains):        # for each poem
        if 0 == (idx + 1) % 10000:
            print "[TextRank] Scanning %d/%d poems ..." % (idx + 1, len(quatrains))
        for sentence in poem['sentences']:        # for each line of the poem
            segs = filter(lambda word: word not in stopwords,
                          segmenter.segment(sentence))   # keep segments that are not stopwords
            for seg in segs:                      # for each segment
                if seg not in adjlist:
                    adjlist[seg] = dict()         # create a dict for every new segment
            for i, seg in enumerate(segs):        # for each segment
                for _, other in enumerate(segs[i + 1:]):
                    # Compare with every later segment; this builds the word graph TextRank needs.
                    if seg != other:
                        adjlist[seg][other] = adjlist[seg][other] + 1 \
                            if other in adjlist[seg] else 1.0
                        adjlist[other][seg] = adjlist[other][seg] + 1 \
                            if seg in adjlist[other] else 1.0
    for word in adjlist:
        w_sum = sum(weight for other, weight in adjlist[word].items())  # total weight of this word's neighbors
        for other in adjlist[word]:
            adjlist[word][other] /= w_sum         # normalize each neighbor weight
    print "[TextRank] Weighted graph has been built."
    _text_rank(adjlist)
def _get_adjlists(self):
    print("[TextRank] Generating word graph ...")
    segmenter = Segmenter()
    poems = Poems()
    adjlists = dict()
    # Count number of co-occurrences.
    for poem in poems:
        for sentence in poem:
            words = []
            for word in segmenter.segment(sentence):
                if word not in self.stopwords:
                    words.append(word)
            for word in words:
                if word not in adjlists:
                    adjlists[word] = dict()
            for i in range(len(words)):
                for j in range(i + 1, len(words)):
                    if words[j] not in adjlists[words[i]]:
                        adjlists[words[i]][words[j]] = 1.0
                    else:
                        adjlists[words[i]][words[j]] += 1.0
                    if words[i] not in adjlists[words[j]]:
                        adjlists[words[j]][words[i]] = 1.0
                    else:
                        adjlists[words[j]][words[i]] += 1.0
    # Normalize weights.
    for a in adjlists:
        sum_w = sum(w for _, w in adjlists[a].items())
        for b in adjlists[a]:
            adjlists[a][b] /= sum_w
    return adjlists
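# The _text_rank step that consumes these normalized adjacency lists is not shown in
# this snippet. Below is a minimal sketch of the usual damping-factor power iteration
# over such a dict-of-dicts graph; the function name and the damping/max_iter/tol
# parameters are illustrative assumptions, not part of the code above.
def _text_rank(adjlists, damping=0.85, max_iter=100, tol=1e-6):
    if not adjlists:
        return {}
    # Start every word with an equal score.
    scores = {word: 1.0 for word in adjlists}
    for _ in range(max_iter):
        new_scores = {}
        for word in adjlists:
            # A word's new score is the damped sum of the scores flowing in from its
            # neighbors along the normalized edge weights adjlists[other][word].
            rank = sum(scores[other] * adjlists[other][word]
                       for other in adjlists[word])
            new_scores[word] = (1.0 - damping) + damping * rank
        diff = max(abs(new_scores[w] - scores[w]) for w in scores)
        scores = new_scores
        if diff < tol:
            break
    # A smaller rank number means a more important word.
    return {w: i for i, (w, _) in
            enumerate(sorted(scores.items(), key=lambda kv: -kv[1]))}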
def _gen_train_data():
    segmenter = Segmenter()
    poems = get_pop_quatrains()
    random.shuffle(poems)
    ranks = get_word_ranks()
    print("Generating training data ...")
    data = []
    kw_data = []
    for idx, poem in enumerate(poems):
        sentences = poem['sentences']
        if len(sentences) == 4:
            flag = True
            rows = []
            kw_row = []
            for sentence in sentences:
                rows.append([sentence])
                segs = list(filter(lambda seg: seg in ranks,
                                   segmenter.segment(sentence)))
                if 0 == len(segs):
                    flag = False
                    break
                keyword = reduce(lambda x, y: x if ranks[x] < ranks[y] else y, segs)
                kw_row.append(keyword)
                rows[-1].append(keyword)
            if flag:
                data.extend(rows)
                kw_data.append(kw_row)
        if 0 == (idx + 1) % 2000:
            print("[Training Data] %d/%d poems are processed." % (idx + 1, len(poems)))
    with codecs.open(train_path, 'w', 'utf-8') as fout:
        for row in data:
            fout.write('\t'.join(row) + '\n')
    with codecs.open(kw_train_path, 'w', 'utf-8') as fout:
        for kw_row in kw_data:
            fout.write('\t'.join(kw_row) + '\n')
    print("Training data is generated.")
def _rank_all_words():
    segmenter = Segmenter()      # generates the sxhy dict
    stopwords = get_stopwords()
    print("Start TextRank over the selected quatrains ...")
    quatrains = get_quatrains()
    adjlist = dict()
    for idx, poem in enumerate(quatrains):
        if 0 == (idx + 1) % 10000:
            print("[TextRank] Scanning %d/%d poems ..." % (idx + 1, len(quatrains)))
        for sentence in poem['sentences']:
            # Segmentation result without stopwords; materialize the filter so it can
            # be iterated and sliced more than once under Python 3.
            segs = list(filter(lambda word: word not in stopwords,
                               segmenter.segment(sentence)))
            for seg in segs:
                if seg not in adjlist:
                    adjlist[seg] = dict()
            for i, seg in enumerate(segs):
                for _, other in enumerate(segs[i + 1:]):
                    if seg != other:
                        adjlist[seg][other] = adjlist[seg][other] + 1 \
                            if other in adjlist[seg] else 1.0
                        adjlist[other][seg] = adjlist[other][seg] + 1 \
                            if seg in adjlist[other] else 1.0
    for word in adjlist:
        w_sum = sum(weight for other, weight in adjlist[word].items())  # total neighbor weight
        for other in adjlist[word]:
            adjlist[word][other] /= w_sum
    print("[TextRank] Weighted graph has been built.")
    _text_rank(adjlist)
def _gen_train_data():
    sampled_poems = np.array(random_int_list(1, 70000, 4000))
    segmenter = Segmenter()              # generates the sxhy dict
    poems = get_pop_quatrains()          # get the most popular quatrains
    random.shuffle(poems)                # shuffle their order
    ranks = get_word_ranks()             # TextRank result: word -> rank number
    print("Generating training data ...")
    data = []
    kw_data = []
    test_data = []
    for idx, poem in enumerate(poems):
        sentences = poem['sentences']
        if len(sentences) == 4:
            flag = True
            test_flag = True
            rows = []
            kw_row = []
            test_row = []
            if idx in sampled_poems:
                test_flag = False
            for sentence in sentences:
                rows.append([sentence])
                test_row.append([sentence])
                segs = list(filter(lambda seg: seg in ranks,
                                   segmenter.segment(sentence)))
                if 0 == len(segs):
                    flag = False
                    break
                keyword = reduce(lambda x, y: x if ranks[x] < ranks[y] else y,
                                 segs)   # pick the highest-ranked keyword (smallest rank number)
                kw_row.append(keyword)
                rows[-1].append(keyword)
            if flag and test_flag:
                data.extend(rows)
                kw_data.append(kw_row)
            if flag and test_flag is False:
                test_data.extend(test_row)
        if 0 == (idx + 1) % 2000:
            print("[Training Data] %d/%d poems are processed." % (idx + 1, len(poems)))
    print(test_data)
    with codecs.open(train_path, 'w', 'utf-8') as fout:
        for row in data:
            fout.write('\t'.join(row) + '\n')
    with codecs.open(kw_train_path, 'w', 'utf-8') as fout:
        for kw_row in kw_data:
            fout.write('\t'.join(kw_row) + '\n')
    with codecs.open(test_path, 'w', 'utf-8') as fout:
        for test_row in test_data:
            fout.write('\t'.join(test_row) + '\n')
    print("Training data is generated.")
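# random_int_list is not defined anywhere in this snippet. The sketch below is one
# plausible implementation, assumed purely for illustration: it draws `length` random
# indices in [start, stop] that mark which poems are held out as test data.
def random_int_list(start, stop, length):
    return [random.randint(start, stop) for _ in range(int(length))]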
def gen_train_data(): """获取每一句的keywords,拼起来写入文件""" print("Generating training data ...") segmenter = Segmenter() poems = Poems() ranked_words = RankedWords() gen_data = list() plan_data = list() valid = True counter_line = 0 print('len(poems)==>', len(poems)) for poem in poems: # print(len(poem)) if len(poem) != 4: # print(poem) valid = False continue context = start_of_sentence() keywords = list() for sentence in poem: counter_line += 1 keyword = '' if len(sentence) != 7: valid = False break filterwords = list( filter(lambda x: x in ranked_words, segmenter.segment(sentence))) if filterwords: keyword = filterwords[0] for word in filterwords: # print('word==>',word) if ranked_words.get_rank(word) < ranked_words.get_rank( keyword): keyword = word if keyword: gen_line = sentence + end_of_sentence() + \ '\t' + keyword + '\t' + context + '\n' keywords.append(keyword) gen_data.append(gen_line) context += sentence + end_of_sentence() plan_data.append(' '.join(keywords)) with open(plan_data_path, 'w') as fw: for data_iter in gen_data: fw.write(data_iter + '\n') with open(gen_data_path, 'w') as fw: for data_iter in gen_data: fw.write(data_iter) print('counter_line==>', counter_line) del segmenter, poems, ranked_words
def _gen_word_cnts():
    counters = dict()
    segmenter = Segmenter()
    quatrains = get_quatrains()
    for idx, poem in enumerate(quatrains):
        for sentence in poem['sentences']:
            segs = segmenter.segment(sentence)
            for seg in segs:
                counters[seg] = counters[seg] + 1 if seg in counters else 1
        if 0 == (idx + 1) % 10000:
            print "[Word Count] %d/%d quatrains has been processed." % (idx + 1, len(quatrains))
    with codecs.open(_wc_path, 'w', 'utf-8') as fout:
        json.dump(counters, fout)
def gen_train_data():
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        if len(poem) != 4:
            continue  # Only consider quatrains.
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            if len(sentence) != 7:  # Only consider seven-character lines.
                valid = False
                break
            # Get the words of this sentence that appear in the ranked words list;
            # ignore all words that are not ranked.
            words = list(filter(lambda seg: seg in ranked_words,
                                segmenter.segment(sentence)))
            if len(words) == 0:
                valid = False
                break
            keyword = words[0]
            # From all words in this sentence, keep the one with the highest TextRank score.
            for word in words[1:]:
                if ranked_words.get_rank(word) < ranked_words.get_rank(keyword):
                    keyword = word
            gen_line = sentence + end_of_sentence() + \
                '\t' + keyword + '\t' + context + '\n'
            gen_lines.append(gen_line)
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        if valid:
            # Plan data: each line holds the four keywords of one quatrain.
            plan_data.append('\t'.join(keywords) + '\n')
            gen_data.extend(gen_lines)
    with open(plan_data_path, 'w') as fout:
        for line in plan_data:
            fout.write(line)
    with open(gen_data_path, 'w') as fout:
        for line in gen_data:
            fout.write(line)
def _train(self):
    print("Start training Word2Vec for planner ...")
    quatrains = get_quatrains()
    segmenter = Segmenter()
    seg_lists = []
    for idx, quatrain in enumerate(quatrains):
        seg_list = []
        for sentence in quatrain['sentences']:
            seg_list.extend([seg for seg in segmenter.segment(sentence)
                             if seg in self.ranks])
        seg_lists.append(seg_list)
        if 0 == (idx + 1) % 10000:
            print("[Plan Word2Vec] %d/%d quatrains has been processed." % (idx + 1, len(quatrains)))
    print("Hold on. This may take some time ...")
    self.model = models.Word2Vec(seg_lists, size=512, min_count=5)
    self.model.save(_model_path)
def gen_train_data():
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        # Only keep quatrains of four seven-character lines.
        if len(poem) != 4:
            continue
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            if len(sentence) != 7:
                valid = False
                break
            words = list(filter(lambda seg: seg in ranked_words,
                                segmenter.segment(sentence)))
            if len(words) == 0:
                valid = False
                break
            keyword = words[0]
            for word in words[1:]:
                if ranked_words.get_rank(word) < ranked_words.get_rank(keyword):
                    keyword = word
            gen_line = sentence + end_of_sentence() + \
                '\t' + keyword + '\t' + context + '\n'
            gen_lines.append(gen_line)
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        if valid:
            plan_data.append('\t'.join(keywords) + '\n')
            gen_data.extend(gen_lines)
    with open(plan_data_path, 'w') as fout:
        for line in plan_data:
            fout.write(line)
    with open(gen_data_path, 'w') as fout:
        for line in gen_data:
            fout.write(line)
def _get_adjlists(self):
    poems = Poems()
    segmenter = Segmenter()
    adjlists = collections.defaultdict(dict)
    for poem_set in poems:
        for poem in poem_set:
            words = segmenter.segment(poem)
            for i in range(len(words) - 1):
                for j in range(i + 1, len(words)):
                    if words[j] not in adjlists[words[i]]:
                        adjlists[words[i]][words[j]] = 1.0
                    else:
                        adjlists[words[i]][words[j]] += 1.0
                    if words[i] not in adjlists[words[j]]:
                        adjlists[words[j]][words[i]] = 1.0
                    else:
                        adjlists[words[j]][words[i]] += 1.0
    return adjlists
def _build_adjlists_from_tencent_embeddings(self):
    print("[TextRank] Generating word graph ...")
    segmenter = Segmenter()
    poems = Poems()
    adjlists = dict()  # 2D dict: adjlists[word1][word2] = prob(going from word1 to word2)
    wv = get_tencent_embedding_keyedVectors(_tencent_embedding_path)
    ######################## get a 2D cos sim matrix for all words ###################
    words = set()
    for poem in poems:
        for sentence in poem:
            for word in segmenter.segment(sentence):
                # For each word selected from the sentence, keep only non-stopwords.
                if word not in self.stopwords:
                    words.add(word)
    for word in words:
        if word not in adjlists:  # initialize every word with a fresh dict()
            adjlists[word] = dict()
    for word in words:
        for other in words:
            if word == other:
                continue
            if other in adjlists[word] or word in adjlists[other]:
                continue
            sim = wv.similarity(word, other)
            adjlists[word][other] = sim
            adjlists[other][word] = sim
    # Normalize weights.
    for a in adjlists:
        sum_w = sum(w for _, w in adjlists[a].items())
        for b in adjlists[a]:
            adjlists[a][b] /= sum_w
    return adjlists
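# get_tencent_embedding_keyedVectors is not defined in this snippet. Assuming the
# Tencent AI Lab embeddings are stored in the standard word2vec text format, a
# minimal loader sketch using gensim's public API could look like the following.
# Note that wv.similarity raises KeyError for out-of-vocabulary words, so the word
# set above may need to be filtered against the loaded vocabulary first.
from gensim.models import KeyedVectors

def get_tencent_embedding_keyedVectors(path):
    # Load pre-trained embeddings; binary=False for the plain-text word2vec format.
    return KeyedVectors.load_word2vec_format(path, binary=False)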
def _train(self):
    print "Start training Word2Vec for planner ..."
    quatrains = get_quatrains()
    segmenter = Segmenter()  # segmenting a line is not the same as taking every single character in it
    seg_lists = []
    for idx, quatrain in enumerate(quatrains):
        seg_list = []
        for sentence in quatrain['sentences']:
            seg_list.extend(filter(lambda seg: seg in self.ranks,
                                   segmenter.segment(sentence)))
        seg_lists.append(seg_list)
        if 0 == (idx + 1) % 10000:
            print "[Plan Word2Vec] %d/%d quatrains has been processed." % (idx + 1, len(quatrains))
    print "Hold on. This may take some time ..."
    self.model = models.Word2Vec(seg_lists, size=512, min_count=5)  # train the word-embedding model
    self.model.save(_model_path)
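# Once saved, the planner can load the model back and query it for related keywords.
# The sketch below uses gensim's standard Word2Vec API; the expand_keyword name and
# the topn value are illustrative assumptions, not part of the snippet above.
def expand_keyword(keyword, topn=10):
    # Return the most similar in-vocabulary words as candidate sub-keywords.
    model = models.Word2Vec.load(_model_path)
    if keyword not in model.wv:
        return []
    return [word for word, _ in model.wv.most_similar(keyword, topn=topn)]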
def _gen_train_data():
    segmenter = Segmenter()
    poems = get_pop_quatrains()
    random.shuffle(poems)
    ranks = get_word_ranks()
    print "Generating training data ..."
    data = []
    kw_data = []
    for idx, poem in enumerate(poems):
        sentences = poem['sentences']
        if len(sentences) == 4:
            flag = True
            lines = u''
            rows = []
            kw_row = []
            for sentence in sentences:
                rows.append([sentence])
                segs = filter(lambda seg: seg in ranks,
                              segmenter.segment(sentence))
                if 0 == len(segs):
                    # If none of this line's words are in ranks, the whole poem is discarded.
                    flag = False
                    break
                keyword = reduce(lambda x, y: x if ranks[x] < ranks[y] else y, segs)
                kw_row.append(keyword)
                rows[-1].append(keyword)  # each element of rows is a poem line plus its keyword
            if flag:
                data.extend(rows)       # extend: each element of data equals an element of rows
                kw_data.append(kw_row)  # append: one keyword row per poem
        if 0 == (idx + 1) % 2000:
            print "[Training Data] %d/%d poems are processed." % (idx + 1, len(poems))
    with codecs.open(train_path, 'w', 'utf-8') as fout:
        for row in data:
            fout.write('\t'.join(row) + '\n')  # each output line: a poem line and its keyword, tab-separated
    with codecs.open(kw_train_path, 'w', 'utf-8') as fout:
        for kw_row in kw_data:
            fout.write('\t'.join(kw_row) + '\n')
    print "Training data is generated."
class WordFeature(object):

    def __init__(self, punct_file=None, stop_file=None, once_file=None,
                 reserve_file=None, area_file=None, color_file=None,
                 quantifier_file=None, num_file=None):
        cur_dir = os.path.dirname(os.path.abspath(__file__))
        if not punct_file:
            punct_file = cur_dir + '/dict/punct.txt'
        if not stop_file:
            stop_file = cur_dir + '/dict/stop_words.txt'
        if not once_file:
            once_file = cur_dir + '/dict/once.words'
        if not reserve_file:
            reserve_file = cur_dir + '/dict/reserve_words.txt'
        self.segmenter = Segmenter()
        self.punct = set()
        self.load_punct(punct_file)
        self.stop_words = set()
        self.load_stop_words(stop_file)
        self.remove_words = set()
        self.load_remove_words(once_file)
        self.reserve_words = set()
        self.load_reserve_words(reserve_file)
        self.replace_lst = [(u'斜跨包', u'斜挎包'), (u'!', u','), (u'。', u','),
                            (u',', u','), (u'市场价', u''), (u'全国包邮', u''),
                            (u'包邮', u''), (u'【', u''), (u'】', u''),
                            (u'[', u''), (u']', u''), (u'《', u''), (u'》', u'')]
        self.word_label = WordLabel(area_file=area_file, color_file=None,
                                    quantifier_file=None, num_file=None)

    def _add_char_to_set(self, myset, filename):
        with open(filename, 'r') as f:
            for l in f.readlines():
                line = l.rstrip('\n').decode('utf-8')
                for c in line:
                    myset.add(c)

    def load_punct(self, filename):
        self._add_char_to_set(self.punct, filename)

    def load_stop_words(self, filename):
        with open(filename, 'r') as f:
            for line in f:
                self.stop_words.add(line.rstrip('\n').decode('utf-8'))

    def load_remove_words(self, filename):
        with open(filename, 'r') as f:
            for line in f:
                self.remove_words.add(line.rstrip('\n').decode('utf-8'))

    def load_reserve_words(self, filename):
        with open(filename, 'r') as f:
            for line in f:
                self.reserve_words.add(line.rstrip('\n').decode('utf-8').lower())

    def check_is_mode(self, word):
        # A "model number" style token: lowercase letters and digits joined by a hyphen.
        has_hyphen = False
        for c in word:
            if c == u'-':
                has_hyphen = True
                continue
            if not (u'a' <= c <= u'z' or u'0' <= c <= u'9'):
                return False
        return has_hyphen

    def check_valid_new(self, word):
        if word in self.reserve_words:
            return True
        if not word:
            return False
        if word.isnumeric():
            return False
        # isalnum() on the unicode string would also match CJK characters,
        # so check the UTF-8 bytes instead.
        if word.encode("u8").isalnum() and len(word) <= 3:
            return False
        # if len(word) == 1 and ord(word) < 256:
        if len(word) == 1:
            return False
        if word in self.punct:
            return False
        if word in self.stop_words:
            return False
        if word in self.remove_words:
            return False
        if self.check_is_mode(word):
            return False
        try:
            float(word)
            return False
        except ValueError:
            pass
        return True

    def check_valid(self, word):
        if not word:
            return False
        if word.isnumeric():
            return False
        if word in self.punct:
            return False
        if len(word) == 1 and ord(word) < 256:
            return False
        if word[0].isdigit():
            return False
        if word in self.stop_words:
            return False
        if word in self.remove_words:
            return False
        if self.check_is_mode(word):
            return False
        return True

    def convert_word_features(self, text):
        words = self.segmenter.segment(text.lower().strip())
        features = {}
        word0 = ""
        for word in words:
            word = word.strip().replace(u'(', u'').replace(u')', u'') \
                       .replace(u'(', u'').replace(u')', u'')
            if not word:
                continue
            word = self.word_label.word_label(word, word0)
            word0 = word
            if not self.check_valid(word):
                continue
            features[word] = 1
        return features

    def convert_all(self, cid, name, cat, brand, price):
        remove_cat_count = 0
        try:
            config = zk_conf.get_client(cid)
            if config and "category_remove" in config:
                remove_cat_count = config["category_remove"]
        except Exception, e:
            logging.error("category_remove: %s", e)
        try:
            cat = json.dumps(json.loads(cat)[remove_cat_count:],
                             separators=(',', ':'), ensure_ascii=False)
        except:
            cat = u'[]'
        if brand.endswith(u'公司'):
            brand = u''
        name = self.extract_sentence(name)
        sample = self.convert_features_with_all(name, cat, brand, price)
        return (cid, name, cat, brand, price, sample)
# -*- coding: utf-8 -*-
from codecs import open
from itertools import imap
from math import log

from lexicon import Lexicon
from segment import Segmenter


def wrap(line):
    w, f = line.strip().split(' ')
    f = log(float(f) + 1.0)
    return (w, f)


with open('dict.txt', 'r', 'utf-8') as fin:
    tf = dict(imap(wrap, fin))

lex = Lexicon(tf)
seg = Segmenter(lex)

result = seg.segment(u'這是一隻可愛的小花貓')
print('/'.join(result).encode('utf-8'))
# -*- coding: utf-8 -*-
from codecs import open
from itertools import imap
from math import log

from lexicon import Lexicon
from segment import Segmenter


def wrap(line):
    w, f = line.strip().split(" ")
    f = log(float(f) + 1.0)
    return (w, f)


with open("dict.txt", "r", "utf-8") as fin:
    tf = dict(imap(wrap, fin))

lex = Lexicon(tf)
seg = Segmenter(lex)

result = seg.segment(u"這是一隻可愛的小花貓")
print "/".join(result).encode("utf-8")
def _get_adjlists(self):
    print("[TextRank] Generating word graph ...")
    segmenter = Segmenter()
    poems = Poems()
    adjlists = dict()  # 2D dict: adjlists[word1][word2] = prob(going from word1 to word2)
    # Count number of co-occurrences.
    """
    ######################## count relationship per sentence ###################
    for poem in poems:
        for sentence in poem:
            words = []
            for word in segmenter.segment(sentence):  # for each word selected from the sentence
                if word not in self.stopwords:  # keep only non-stopword words
                    words.append(word)
            for word in words:
                if word not in adjlists:  # initialize every word with a fresh dict()
                    adjlists[word] = dict()
            for i in range(len(words)):
                for j in range(i + 1, len(words)):
                    # If two words appear in the same sentence, their edge weight += 1.
                    if words[j] not in adjlists[words[i]]:
                        adjlists[words[i]][words[j]] = 1.0
                    else:
                        adjlists[words[i]][words[j]] += 1.0
                    if words[i] not in adjlists[words[j]]:
                        adjlists[words[j]][words[i]] = 1.0
                    else:
                        adjlists[words[j]][words[i]] += 1.0
    ######################## end count relationship per sentence ###################
    """
    ######################## count relationship per poem ###################
    for poem in poems:
        for sentence in poem:
            words = []
            for word in segmenter.segment(sentence):  # for each word selected from the sentence
                if word not in self.stopwords:  # keep only non-stopword words
                    words.append(word)
            for word in words:
                if word not in adjlists:  # initialize every word with a fresh dict()
                    adjlists[word] = dict()
            for i in range(len(words)):
                for j in range(i + 1, len(words)):
                    # If two words appear in the same sentence, their edge weight += 1.
                    if words[j] not in adjlists[words[i]]:
                        adjlists[words[i]][words[j]] = 1.0
                    else:
                        adjlists[words[i]][words[j]] += 1.0
                    if words[i] not in adjlists[words[j]]:
                        adjlists[words[j]][words[i]] = 1.0
                    else:
                        adjlists[words[j]][words[i]] += 1.0
    ######################## end count relationship per poem ###################
    # Normalize weights.
    for a in adjlists:
        sum_w = sum(w for _, w in adjlists[a].items())
        for b in adjlists[a]:
            adjlists[a][b] /= sum_w
    return adjlists