def split_into_sentences(line):
    """Split a line into sentences: each Chinese character is its own token,
    consecutive non-Chinese characters are grouped into one token, and a
    terminator character closes the current sentence."""
    tokens = []
    en_token = []

    def close_token(token):
        # Flush the pending non-Chinese token, if any, into the sentence.
        if token:
            tokens.append(''.join(token))
            del token[:]

    for c in line:
        if is_terminator(c):
            # Close the current token first so a trailing word such as
            # "hello." is not lost, then skip runs of bare terminators.
            close_token(en_token)
            if not tokens:
                continue
            tokens.append(c)
            yield tokens
            tokens = []
        elif is_punct(c):
            close_token(en_token)
            tokens.append(c)
        elif is_zh(c):
            close_token(en_token)
            tokens.append(c)
        elif c == u' ' or c == u'\t':
            close_token(en_token)
        else:
            en_token.append(c)

    close_token(en_token)  # flush any trailing token before the final yield
    if tokens:
        yield tokens
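# A minimal usage sketch for split_into_sentences, assuming the character
# predicates come from hanzi_util. The toy is_terminator/is_punct/is_zh below
# are hypothetical stand-ins for illustration, not the project's real helpers.
def is_terminator(c):
    return c in u'。！？.!?'

def is_punct(c):
    return c in u'，、；：,;:'

def is_zh(c):
    return u'\u4e00' <= c <= u'\u9fff'

for sent in split_into_sentences(u'今天天气不错hello world。明天呢？'):
    print(sent)
# With the toy helpers this prints:
# ['今', '天', '天', '气', '不', '错', 'hello', 'world', '。']
# ['明', '天', '呢', '？']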
def predict_class(model, data_str):
    """Predict the tag index for a piece of text, or None for empty input.
    Relies on the module-level word2index built by build_train_data."""
    best_n = len(word2index)
    if not data_str or not data_str.strip():
        return None
    line = data_str.strip()
    objs = []
    for item in jieba.cut(line, cut_all=False):
        if hanzi_util.is_zh(item[0]):
            if item not in word2index:
                # Single-character words were already dropped from the
                # vocabulary, so map unknown words to the 'U' token.
                objs.append(word2index['U'])
            else:
                objs.append(word2index[item])
        else:
            objs.append(word2index['U'])
    objs = list(set(objs))
    # Bag-of-words vector: 1.0 at every vocabulary index seen in the text.
    objs_vector = np.zeros((1, best_n))
    for item in objs:
        objs_vector[0, item] = 1.0
    predict_prob = model.predict_proba(objs_vector, verbose=False)
    return predict_prob.argmax()
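# A self-contained sketch of the bag-of-words step inside predict_class:
# tokens are mapped to vocabulary indices (unknowns fall back to 'U'),
# deduplicated, and turned into a 1 x vocab_size 0/1 vector. The tiny
# vocabulary below is hypothetical, for illustration only.
import numpy as np

toy_word2index = {u'天气': 0, u'开心': 1, u'糟糕': 2, 'U': 3}
tokens = [u'天气', u'很棒', u'开心']   # u'很棒' is out-of-vocabulary
indices = {toy_word2index.get(t, toy_word2index['U']) for t in tokens}
vec = np.zeros((1, len(toy_word2index)))
vec[0, list(indices)] = 1.0
print(vec)   # [[1. 1. 0. 1.]]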
def is_not_chinese_word(self, word):
    # True when the word is empty or its first character is not Chinese.
    return not (word and is_zh(word[0]))
import os
from random import shuffle

import numpy as np
import jieba
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.metrics import BigramAssocMeasures
from gensim.models import word2vec

import hanzi_util


def build_train_data(best_n):
    global word2index
    global index2word
    global train_vector
    global train_vtag
    global train_tags
    global wordvector
    word2index = {}
    index2word = {}
    train_tags = []
    wordvector = []
    t_word2index = {}
    t_index2word = {}
    t_train_info = {}
    word2vec_str = []
    print("BUILD TRAIN DATA...")
    word_fd = FreqDist()                  # word frequency over all texts
    cond_word_fd = ConditionalFreqDist()  # word frequency per tag
    for parent, dirname, filenames in os.walk(DATA_DIR):
        for filename in filenames:
            if filename[-4:] != ".txt":
                continue
            tag_name = filename[:-4]
            print("Processing: %s" % tag_name)
            train_tags.append(tag_name)
            tag_id = train_tags.index(tag_name)
            t_train_info[tag_id] = []
            line_num = 0
            with open(DATA_DIR + "/" + filename, "r") as fin:
                while True:
                    try:
                        line = fin.readline()
                    except UnicodeDecodeError:
                        print("Unicode error! filename=%s, line_num=%d" % (filename, line_num))
                        continue
                    if not line:
                        print("File done! filename=%s, line_num=%d" % (filename, line_num))
                        break
                    line_num += 1
                    if not line_num % 500:
                        print("LINE:%d" % line_num)
                    line = line.strip()
                    objs = []      # word ids seen in this line (deduplicated)
                    objs_str = []  # the words themselves, for word2vec
                    for item in jieba.cut(line, cut_all=False):
                        if hanzi_util.is_zh(item[0]):
                            if item not in t_word2index:
                                item_id = len(t_word2index)
                                t_word2index[item] = item_id
                                t_index2word[item_id] = item
                            else:
                                item_id = t_word2index[item]
                            if item_id not in objs:
                                word_fd[item_id] += 1
                                cond_word_fd[tag_id][item_id] += 1
                                objs.append(item_id)
                                objs_str.append(item)
                    if objs:
                        t_train_info[tag_id].append(objs)
                    if objs_str:
                        word2vec_str.append(objs_str)
    # Train word vectors on the collected sentences (gensim < 4.0 API).
    w2v_model = word2vec.Word2Vec(sentences=word2vec_str, size=100, window=4,
                                  min_count=3, workers=4, sorted_vocab=1, iter=10)
    del word2vec_str
    print("word2vec DONE.")
    print("Randomize>>>")
    cond_word_sum = {}
    for tag in train_tags:
        tag_id = train_tags.index(tag)
        shuffle(t_train_info[tag_id])
        cond_word_sum[tag_id] = cond_word_fd[tag_id].N()
        print("SUM:%s->%d" % (tag, cond_word_sum[tag_id]))
    total_w_count = word_fd.N()
    print("TOTAL:%d" % total_w_count)
    global sorted_word_scores
    sorted_word_scores = {}
    word_scores = {}
    print("CALC CHI-SQUARE...")
    # Chi-square feature selection: score each word by how unevenly it is
    # distributed across the tags, then keep the best_n highest-scoring words.
    for word, freq in word_fd.items():
        word_scores[word] = 0
        for tag in train_tags:
            tag_id = train_tags.index(tag)
            word_scores[word] += BigramAssocMeasures.chi_sq(
                cond_word_fd[tag_id][word],
                (freq, cond_word_sum[tag_id]),
                total_w_count)
    sorted_word_scores = sorted(word_scores.items(), key=lambda e: e[1], reverse=True)
    del cond_word_sum
    del word_fd
    del cond_word_fd
    if best_n < len(sorted_word_scores):
        sorted_word_scores = sorted_word_scores[:best_n]
    else:
        best_n = len(sorted_word_scores)
    # Build the real word2index / index2word over the selected words.
    for index in range(best_n):
        word2index[t_index2word[sorted_word_scores[index][0]]] = index
        index2word[index] = t_index2word[sorted_word_scores[index][0]]
    for i in range(len(word2index)):
        if index2word[i] in w2v_model:
            wordvector.append(w2v_model[index2word[i]])
        else:
            print("RANDOM VECTOR FOR: %s" % index2word[i])
            wordvector.append(np.random.randn(100))
    # 'U' is the catch-all token for out-of-vocabulary words.
    word2index['U'] = best_n
    index2word[best_n] = 'U'
    wordvector.append(np.random.randn(100))
    train_vector = []
    train_vtag = []
    # Re-encode every training line with the final vocabulary.
    for tag_id in t_train_info:
        for l_index in range(len(t_train_info[tag_id])):
            objs = []
            for item in t_train_info[tag_id][l_index]:
                if t_index2word[item] in word2index:
                    objs.append(word2index[t_index2word[item]])
                else:
                    objs.append(word2index['U'])
            objs = list(set(objs))
            if objs:
                train_vector.append(objs)
                train_vtag.append(tag_id)
    del t_index2word
    del t_word2index
    del sorted_word_scores
    del t_train_info
    print("build_train_data finished!")
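# A hedged sketch of how build_train_data's outputs might feed a classifier:
# expand the per-line index lists in train_vector into a dense 0/1 matrix and
# one-hot encode train_vtag. The training code itself is not shown in this
# snippet, so the function below is an assumption, not the original pipeline.
import numpy as np

def vectorize_training_set(train_vector, train_vtag, vocab_size, n_tags):
    # One row per training line; columns are vocabulary indices / tag ids.
    X = np.zeros((len(train_vector), vocab_size))
    y = np.zeros((len(train_vector), n_tags))
    for row, objs in enumerate(train_vector):
        X[row, objs] = 1.0
        y[row, train_vtag[row]] = 1.0
    return X, y

# e.g. X, y = vectorize_training_set(train_vector, train_vtag,
#                                    len(word2index), len(train_tags))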