from collections import Counter

from numpy import argmax
from pandas import DataFrame
from sklearn.feature_extraction.text import TfidfVectorizer

# Repo-internal helpers (tk, clean, corpus, clear, modeling, models, N,
# STOP_WORDS, flag_filter, discarded_flags) are assumed to be imported
# elsewhere in this module.


def synonym_bigram(texts, center_word, filtrate=get_flag):
    """Compound words: bigrams on either side of a center word."""
    tk.add_word(center_word, 2000, 'CENTER')  # force the tokenizer to keep the center word whole
    left, right = Counter(), Counter()
    for text in texts:
        if center_word in text:
            for sentence in clean.ngram(text):
                words = [w for w in tk.cut(sentence) if filtrate(w)]
                for i in range(len(words) - 1):
                    if words[i] == center_word:  # center word on the left of the bigram
                        word = ' '.join(words[i:i + 2])
                        flag = ' '.join(tk.get_flag(w) for w in words[i:i + 2])
                        left[(word, flag)] += 1
                    if words[i + 1] == center_word:  # center word on the right
                        word = ' '.join(words[i:i + 2])
                        flag = ' '.join(tk.get_flag(w) for w in words[i:i + 2])
                        right[(word, flag)] += 1
    u = max(left.most_common()[0][1], right.most_common()[0][1])  # top frequency, for bar scaling
    left = corpus.ls2df([(i, j, k, tk.bar(k, u)) for (i, j), k in left.most_common()],
                        ['word', 'flag', 'freq', 'bar'])
    right = corpus.ls2df([(i, j, k, tk.bar(k, u)) for (i, j), k in right.most_common()],
                         ['word', 'flag', 'freq', 'bar'])
    corpus.df2sheets([left, right], ['left', 'right'],
                     'synonym_bigram_%s.xlsx' % center_word)
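# A minimal usage sketch for synonym_bigram (not part of the original module):
# `sample_texts` is made-up data, and tk / clean / corpus are assumed to be this
# repo's tokenizer, text-cleaning and Excel-export helpers.
def _demo_synonym_bigram():
    sample_texts = ['这家店价格很便宜', '价格实惠，物流也快', '运费的价格偏高']
    synonym_bigram(sample_texts, '价格')  # writes synonym_bigram_价格.xlsx with 'left'/'right' sheets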
def extract(self, text, top_n=10, flags=False):
    """Extract keywords, ranking words by their accumulated IDF weight."""
    counter = Counter()
    for w in cut(text):
        counter[w] += self.get_idf(w)
    if flags:
        return [(w, tk.get_flag(w)) for w, i in counter.most_common(top_n)]
    return [w for w, i in counter.most_common(top_n)]
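# extract is written as a method, but its enclosing class is not shown in this
# module. A hypothetical wrapper for illustration only: the class name, the idf
# dict and the fallback weight below are all assumptions, not this repo's API.
class _IdfExtractorSketch:
    def __init__(self, idf):
        self.idf = idf  # word -> inverse document frequency

    def get_idf(self, word):
        return self.idf.get(word, 1.0)  # neutral weight for unseen words

    extract = extract  # reuse the module-level function as a method

# _IdfExtractorSketch({'价格': 3.2, '便宜': 2.7}).extract('价格很便宜', top_n=5)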
def get_flag(word):
    if word in corpus.STOP_WORDS:
        return False
    flag = tk.get_flag(word)
    if flag in flag_filter:
        return False
    return True
def clf_word(texts, labels, model_num=0, detail=False):
    # Vectorize
    vectorizer = TfidfVectorizer(tokenizer=cut)
    x = vectorizer.fit_transform(texts)
    # Fit the model
    model = models[model_num]
    clf = model()
    clf.fit(x, labels)
    classes = clf.classes_
    print(model.__name__, clf.score(x, labels), *classes)
    # Classify individual words
    c = Counter(w for t in texts for w in cut(t)).most_common(N)
    if detail is False:
        ls = []
        for word, freq in c:
            flag = tk.get_flag(word)  # POS tag
            predict_proba = clf.predict_proba(vectorizer.transform([word]))[0]
            max_index = argmax(predict_proba)
            max_proba = predict_proba[max_index]  # probability
            label = classes[max_index]  # predicted class
            ls.append([freq, flag, word, label, max_proba, tk.bar(max_proba)])
        corpus.ls2sheet(ls, ['freq', 'flag', 'word', 'label', 'probability', 'bar'],
                        'clf_word_' + model.__name__)
    else:
        maximum = c[0][1] ** .5
        ls = []
        for word, freq in c:
            flag = tk.get_flag(word)  # POS tag
            predict_proba = clf.predict_proba(vectorizer.transform([word]))[0]  # class probabilities
            label = classes[argmax(predict_proba)]  # predicted class
            ls.append([flag, word, label, *predict_proba, freq,
                       tk.bar(freq ** .5, maximum)])
        corpus.ls2sheet(ls, ['flag', 'word', 'label', *clf.classes_, 'freq', 'bar'],
                        'clf_word_detail_' + model.__name__)
        for i in reversed(ls):
            print(i[1], i[2], i[-1])
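# A minimal usage sketch for clf_word (the data is made up; `models` is assumed
# to be the module-level list of scikit-learn classifier classes and N the cap
# on how many words get scored). Note the chosen model must implement
# predict_proba, e.g. MultinomialNB or LogisticRegression.
def _demo_clf_word():
    sample_texts = ['物流很快，很满意', '包装破损，很生气', '质量不错', '客服态度差']
    sample_labels = ['pos', 'neg', 'pos', 'neg']
    clf_word(sample_texts, sample_labels, model_num=0, detail=False)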
def trigram_flag(texts, n=2, stop_words=STOP_WORDS):
    """Statistical language model (with POS tags)."""
    c = Counter()
    for text in texts:
        for sentence in clean.ngram(clear(text)):
            words = [w for w in tk.cut(sentence) if w not in stop_words]
            for i in range(len(words) + 1 - n):
                word = ' '.join(words[i:i + n])
                flag = ' '.join(tk.get_flag(w) for w in words[i:i + n])
                c[(word, flag)] += 1
    u = c.most_common()[0][1]  # top frequency, for bar scaling
    c = [(i, j, k, tk.bar(k, u)) for (i, j), k in c.most_common(N)]
    DataFrame(c, columns=['word', 'flag', 'freq', 'bar']).to_excel(
        '%dgram_flag.xlsx' % n, index=False)
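# Usage sketch for trigram_flag (made-up data; clear / clean / tk / STOP_WORDS / N
# are this repo's helpers and module-level constants):
def _demo_trigram_flag():
    sample_texts = ['价格很便宜，物流也很快', '价格实惠，质量不错']
    trigram_flag(sample_texts, n=2)  # writes 2gram_flag.xlsx
    trigram_flag(sample_texts, n=3)  # writes 3gram_flag.xlsx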
def trigram_flag_sort(texts, n=2, stop_words=STOP_WORDS):
    """Statistical language model (with POS tags + sorted n-grams)."""
    c = Counter()
    for text in texts:
        for sentence in clean.ngram(clear(text)):
            words = [w for w in tk.cut(sentence) if w not in stop_words]
            for i in range(len(words) + 1 - n):
                # sort by (flag, word) so word order does not split counts:
                # 'A B' and 'B A' land in the same bucket
                wf = sorted((tk.get_flag(w), w) for w in words[i:i + n])
                word = ' '.join(j[1] for j in wf)
                flag = ' '.join(j[0] for j in wf)
                c[(word, flag)] += 1
    c = [(k, j, i) for (i, j), k in c.most_common(N)]
    DataFrame(c, columns=['freq', 'flag', 'word']).to_excel(
        '%dgram_flag_sort.xlsx' % n, index=False)
def synonym_w2v(texts, words):
    for word in words:
        tk.add_word(word, flag='TEMP')  # make sure each query word survives tokenization
    wv, counter = modeling(texts, loop=False)
    standard = counter.most_common()[0][1] ** .5  # top frequency, for bar scaling
    ls_of_df, names = [], []
    for word in words:
        try:
            ls = [(w, tk.get_flag(w), s, counter[w],
                   tk.bar(counter[w] ** .5, standard))
                  for w, s in wv.similar_by_word(word, 100)]
        except KeyError:
            continue  # word never made it into the word2vec vocabulary
        ls_of_df.append(
            corpus.ls2df(ls, ['word', 'flag', 'similar', 'frequency', 'bar']))
        names.append(clean.re.sub(r'\W', '', word))  # keep sheet names aligned with the dataframes
    corpus.df2sheets(ls_of_df, names, 'synonym_w2v')
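# Usage sketch for synonym_w2v. `modeling` is assumed to train word2vec over
# `texts` and return (keyed vectors, a word-frequency Counter); the corpus
# below is made up and far too small to train a meaningful model.
def _demo_synonym_w2v():
    sample_texts = ['价格很便宜', '价钱公道', '物流很快', '快递迅速']
    synonym_w2v(sample_texts, ['价格', '物流'])  # writes synonym_w2v, one sheet per found word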
def synonym_neighbor(texts, center_word, filtrate=get_flag, half=5):
    """Compound words: distance-weighted neighbors of a center word."""
    tk.add_word(center_word, 2000, 'CENTER')  # force the tokenizer to keep the center word whole
    c = Counter()
    for text in texts:
        if center_word in text:
            for sentence in clean.text2phrase(text):
                words = [w for w in tk.cut(sentence) if filtrate(w)]
                length = len(words)
                for i in range(length):
                    if words[i] == center_word:
                        for j in range(max(i - half, 0), min(i + 1 + half, length)):
                            word = words[j]
                            flag = tk.get_flag(word)
                            # weight falls off with distance: 1, 1/2, 1/3, ...
                            c[(word, flag)] += 1 / max(abs(j - i), 1)
    u = c.most_common()[1][1]  # [0] is the center word itself, so scale by the runner-up
    df = corpus.ls2df([(i, j, k, tk.bar(k, u)) for (i, j), k in c.most_common()],
                      ['word', 'flag', 'freq', 'bar'])
    corpus.df2sheet(df, 'synonym_neighbor_%s.xlsx' % center_word)
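# Usage sketch for synonym_neighbor (made-up data; half=5 scores up to five
# words on each side of every occurrence of the center word):
def _demo_synonym_neighbor():
    sample_texts = ['这家店价格很便宜', '价格实惠，物流也快', '运费的价格偏高']
    synonym_neighbor(sample_texts, '价格', half=5)  # writes synonym_neighbor_价格.xlsx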
def cut(text):
    for sentence in clean.ngram(text.strip()):
        for word in tk.cut(sentence):
            if clean.is_word(word) and tk.get_flag(word) not in discarded_flags:
                yield word
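# cut is a generator, so wrap it in list() to materialize the tokens. A sketch,
# assuming discarded_flags is the module-level set of POS tags to drop:
#     list(cut('价格很便宜'))  # -> roughly ['价格', '便宜'], depending on the tokenizer's dictionary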