def frequency(texts, fname='frequency.xlsx'):
    """Count word frequencies in ``texts`` and export the top ``N`` to Excel.

    Each text is split with ``clean.ngram``, every resulting segment is
    tokenized with ``tk1.cut``, and tokens rejected by ``clean.is_word`` are
    dropped.  The ``N`` most common words are written to ``fname`` with the
    columns ``word``, ``flag`` (looked up through ``tk1.get_flag``) and
    ``freq``.  Afterwards ``tk1.bar`` is called once per exported word, from
    the least to the most frequent.

    Args:
        texts: Iterable of texts; each item is passed to ``clean.ngram``.
        fname: Path of the Excel workbook to write.
    """
    # NOTE(review): a second function named ``frequency`` appears further down
    # in this source; if both live in one module the later one shadows this.
    c = Counter(
        w
        for t in texts
        for s in clean.ngram(t)
        for w in tk1.cut(s)
        if clean.is_word(w)
    ).most_common(N)
    DataFrame(
        [(w, tk1.get_flag(w), f) for w, f in c],
        columns=['word', 'flag', 'freq'],
    ).to_excel(fname, index=False)
    if not c:
        # Nothing was counted, so there is no top frequency to scale against
        # (indexing c[0] would raise IndexError).
        return
    # Square roots compress the gap between very common and rare words.
    maximum = c[0][1] ** .5
    for w, f in reversed(c):
        tk1.bar(f ** .5, maximum, w)
def synonym_bigram(texts, center_word, filtrate=get_flag):
    """Count the bigrams that contain ``center_word`` and export them.

    Only texts containing ``center_word`` are processed.  Each one is split
    with ``clean.ngram`` and tokenized with ``tk.cut``; tokens for which
    ``filtrate`` returns a falsy value are discarded.  Every adjacent token
    pair is then inspected:

    * ``left`` counts pairs whose first token is ``center_word``;
    * ``right`` counts pairs whose second token is ``center_word``.

    Both tables (columns ``word``, ``flag``, ``freq``, ``bar``) are handed to
    ``corpus.df2sheets`` with the sheet names ``left`` and ``right`` and the
    file name ``synonym_bigram_<center_word>.xlsx``.

    Args:
        texts: Iterable of text strings.
        center_word: Word whose neighbours are collected.
        filtrate: Predicate applied to each token; falsy results drop it.
    """
    # NOTE(review): presumably registers the centre word so that tk.cut keeps
    # it as one token (2000 looks like a frequency, 'CENTER' like its flag) --
    # confirm against the tokenizer's add_word signature.
    tk.add_word(center_word, 2000, 'CENTER')
    left, right = Counter(), Counter()
    for text in texts:
        if center_word not in text:
            continue
        for sentence in clean.ngram(text):
            words = [w for w in tk.cut(sentence) if filtrate(w)]
            for i in range(len(words) - 1):
                pair = words[i:i + 2]
                is_first = pair[0] == center_word
                is_second = pair[1] == center_word
                if not (is_first or is_second):
                    continue
                # The key is identical for both tables, so build it once.
                key = (
                    ' '.join(pair),
                    ' '.join(tk.get_flag(w) for w in pair),
                )
                if is_first:
                    left[key] += 1
                if is_second:
                    right[key] += 1
    # Shared scale for both sheets.  Either counter may legitimately be empty
    # (e.g. the centre word never appears as the second token), so avoid
    # indexing most_common()[0], which would raise IndexError.
    u = max(list(left.values()) + list(right.values()), default=0)
    columns = ['word', 'flag', 'freq', 'bar']
    left = corpus.ls2df(
        [(i, j, k, tk.bar(k, u)) for (i, j), k in left.most_common()],
        columns)
    right = corpus.ls2df(
        [(i, j, k, tk.bar(k, u)) for (i, j), k in right.most_common()],
        columns)
    corpus.df2sheets(
        [left, right], ['left', 'right'],
        'synonym_bigram_%s.xlsx' % center_word)
def new_word_flag(texts, dictionary=_dict, fname='new_word_flag.xlsx'):
    """Export frequencies of (word, flag) pairs that are not in ``dictionary``.

    Each text is split with ``ngram`` and tokenized with ``tk2.cut``; the
    tokens expose ``.word`` and ``.flag`` attributes.  A token is counted when
    its word is absent from ``dictionary`` and ``fullmatch`` accepts it.  All
    counted pairs, most frequent first, are written to ``fname`` with the
    columns ``word``, ``flag`` and ``freq``.

    Args:
        texts: Iterable of texts; each item is passed to ``ngram``.
        dictionary: Container of already-known words (membership-tested).
        fname: Path of the Excel workbook to write.
    """
    # NOTE(review): another ``new_word_flag`` is defined further down in this
    # source; if both live in one module the later one shadows this.
    counts = Counter()
    for text in texts:
        for segment in ngram(text):
            for token in tk2.cut(segment):
                if token.word in dictionary:
                    continue
                if fullmatch(token.word):
                    counts[(token.word, token.flag)] += 1
    rows = [
        (word, flag, freq)
        for (word, flag), freq in counts.most_common()
    ]
    table = DataFrame(rows, columns=['word', 'flag', 'freq'])
    table.to_excel(fname, index=False)
def new_word(texts, fname='new_word.xlsx'):
    """Export the ``N`` most frequent words missing from ``dictionary``.

    Each text is split with ``clean.ngram`` and tokenized with ``tk1.cut``.
    A token is counted when ``clean.is_word`` accepts it and it is not in the
    module-level ``dictionary``.  The result is written to ``fname`` with the
    columns ``word`` and ``freq``; afterwards ``tk1.bar`` is called once per
    exported word, from the least to the most frequent.

    Args:
        texts: Iterable of texts; each item is passed to ``clean.ngram``.
        fname: Path of the Excel workbook to write.
    """
    # NOTE(review): a second function named ``new_word`` appears further down
    # in this source; if both live in one module the later one shadows this.
    c = Counter(
        w
        for t in texts
        for s in clean.ngram(t)
        for w in tk1.cut(s)
        if clean.is_word(w) and w not in dictionary
    ).most_common(N)
    DataFrame(c, columns=['word', 'freq']).to_excel(fname, index=False)
    if not c:
        # No new word was found; c[0] would raise IndexError.
        return
    # Square roots compress the gap between very common and rare words.
    maximum = c[0][1] ** .5
    for w, f in reversed(c):
        tk1.bar(f ** .5, maximum, w)
def new_word_flag(texts, fname='new_word_flag.xlsx'):
    """Export the ``N`` most frequent unknown (word, flag) pairs with bars.

    Each text is split with ``clean.ngram`` and tokenized with ``tk2.cut``;
    the tokens expose ``.word`` and ``.flag`` attributes.  A token is counted
    when ``clean.is_word`` accepts its word and the word is not in the
    module-level ``dictionary``.  The result is written to ``fname`` with the
    columns ``word``, ``flag``, ``freq`` and ``bar`` (the value returned by
    ``tk2.bar``).  Afterwards ``tk1.bar`` is called once per exported pair,
    from the least to the most frequent, labelled with ``"<word> <flag>"``.

    Args:
        texts: Iterable of texts; each item is passed to ``clean.ngram``.
        fname: Path of the Excel workbook to write.
    """
    c = Counter(
        (w.word, w.flag)
        for t in texts
        for s in clean.ngram(t)
        for w in tk2.cut(s)
        if clean.is_word(w.word) and w.word not in dictionary
    ).most_common(N)
    # Square-root scale of the top frequency.  With no results the value is
    # never used (both loops below are empty), so fall back to 0 instead of
    # letting c[0] raise IndexError.
    maximum = c[0][1] ** .5 if c else 0
    DataFrame(
        [(i, j, k, tk2.bar(k ** .5, maximum)) for (i, j), k in c],
        columns=['word', 'flag', 'freq', 'bar'],
    ).to_excel(fname, index=False)
    # NOTE(review): the column uses tk2.bar while this loop uses tk1.bar --
    # kept as in the original; confirm the mix is intentional.
    for w, f in reversed(c):
        tk1.bar(f ** .5, maximum, ' '.join(w))
def trigram(texts, n=2, stop_words=STOP_WORDS):
    """Count word n-grams and export the ``N`` most common ones to Excel.

    Each text goes through ``clear`` and ``clean.ngram``; every resulting
    sentence is tokenized with ``tk.cut`` and tokens found in ``stop_words``
    are removed.  Every window of ``n`` consecutive tokens is joined with a
    single space and counted.  The output file is named ``<n>gram.xlsx`` and
    has the columns ``freq`` and ``word`` (in that order).

    Args:
        texts: Iterable of texts.
        n: Number of tokens per n-gram.
        stop_words: Container of tokens to ignore (membership-tested).
    """
    counts = Counter()
    for text in texts:
        for sentence in clean.ngram(clear(text)):
            tokens = [w for w in tk.cut(sentence) if w not in stop_words]
            window_total = len(tokens) + 1 - n
            counts.update(
                ' '.join(tokens[start:start + n])
                for start in range(window_total)
            )
    table = DataFrame(counts.most_common(N), columns=['word', 'freq'])
    # Frequency goes first in the exported sheet.
    reordered = table[['freq', 'word']]
    reordered.to_excel('%dgram.xlsx' % n, index=False)
def trigram_flag(texts, n=2, stop_words=STOP_WORDS):
    """Count word n-grams together with their flags and export the top ``N``.

    Each text goes through ``clear`` and ``clean.ngram``; every resulting
    sentence is tokenized with ``tk.cut`` and tokens found in ``stop_words``
    are removed.  Every window of ``n`` consecutive tokens is counted under
    the key ``(words joined by space, flags joined by space)``, where flags
    come from ``tk.get_flag``.  The output file is named
    ``<n>gram_flag.xlsx`` and has the columns ``word``, ``flag``, ``freq``
    and ``bar`` (the value returned by ``tk.bar(freq, highest_freq)``).

    Args:
        texts: Iterable of texts.
        n: Number of tokens per n-gram.
        stop_words: Container of tokens to ignore (membership-tested).
    """
    c = Counter()
    for text in texts:
        for sentence in clean.ngram(clear(text)):
            words = [w for w in tk.cut(sentence) if w not in stop_words]
            for i in range(len(words) + 1 - n):
                window = words[i:i + n]
                word = ' '.join(window)
                flag = ' '.join(tk.get_flag(w) for w in window)
                c[(word, flag)] += 1
    # Highest frequency is the scale for the bars.  max(..., default=0) avoids
    # both the IndexError on an empty counter and a full sort of the counter
    # just to read its first entry.
    u = max(c.values(), default=0)
    rows = [(i, j, k, tk.bar(k, u)) for (i, j), k in c.most_common(N)]
    DataFrame(rows, columns=['word', 'flag', 'freq', 'bar']).to_excel(
        '%dgram_flag.xlsx' % n, index=False)
def trigram_flag_sort(texts, n=2, stop_words=STOP_WORDS):
    """Count order-insensitive word n-grams with flags and export the top ``N``.

    Works like a flagged n-gram count, except that the ``(flag, word)`` pairs
    inside each window are sorted before the key is built, so windows holding
    the same tokens in a different order are counted together.  The output
    file is named ``<n>gram_flag_sort.xlsx`` and has the columns ``freq``,
    ``flag`` and ``word``.

    Args:
        texts: Iterable of texts.
        n: Number of tokens per n-gram.
        stop_words: Container of tokens to ignore (membership-tested).
    """
    counts = Counter()
    for text in texts:
        for sentence in clean.ngram(clear(text)):
            tokens = [w for w in tk.cut(sentence) if w not in stop_words]
            for start in range(len(tokens) + 1 - n):
                # Sort by flag first, then by word.
                pairs = [(tk.get_flag(w), w) for w in tokens[start:start + n]]
                pairs.sort()
                joined_words = ' '.join(token for _, token in pairs)
                joined_flags = ' '.join(tag for tag, _ in pairs)
                counts[(joined_words, joined_flags)] += 1
    rows = [
        (freq, flags, words)
        for (words, flags), freq in counts.most_common(N)
    ]
    table = DataFrame(rows, columns=['freq', 'flag', 'word'])
    table.to_excel('%dgram_flag_sort.xlsx' % n, index=False)
def frequency(texts, fname='frequency.xlsx'):
    """Count word frequencies in ``texts`` and export all of them to Excel.

    Each text is split with ``ngram`` and tokenized with ``tk0.cut``; only
    tokens accepted by ``fullmatch`` are counted.  Every counted word, most
    frequent first, is written to ``fname`` with the columns ``word``,
    ``flag`` (looked up through ``tk0.get_flag``) and ``freq``.

    Args:
        texts: Iterable of texts; each item is passed to ``ngram``.
        fname: Path of the Excel workbook to write.
    """
    counts = Counter()
    for text in texts:
        for segment in ngram(text):
            counts.update(
                token for token in tk0.cut(segment) if fullmatch(token)
            )
    rows = [
        (word, tk0.get_flag(word), freq)
        for word, freq in counts.most_common()
    ]
    table = DataFrame(rows, columns=['word', 'flag', 'freq'])
    table.to_excel(fname, index=False)
def new_word(texts, dictionary=_dict, fname='new_word.xlsx'):
    """Export frequencies of words that are not in ``dictionary``.

    Each text is split with ``ngram`` and tokenized with ``tk1.cut``.  A token
    is counted when it is absent from ``dictionary`` and ``fullmatch`` accepts
    it.  Every counted word, most frequent first, is written to ``fname`` with
    the columns ``word`` and ``freq``.

    Args:
        texts: Iterable of texts; each item is passed to ``ngram``.
        dictionary: Container of already-known words (membership-tested).
        fname: Path of the Excel workbook to write.
    """
    counts = Counter()
    for text in texts:
        for segment in ngram(text):
            for token in tk1.cut(segment):
                if token in dictionary:
                    continue
                if fullmatch(token):
                    counts[token] += 1
    ranked = counts.most_common()
    table = DataFrame(ranked, columns=['word', 'freq'])
    table.to_excel(fname, index=False)
def cut(text):
    """Yield the tokens of ``text`` that pass the word and flag filters.

    The stripped text is split with ``clean.ngram`` and each sentence is
    tokenized with ``tk.cut``.  A token is yielded only when
    ``clean.is_word`` accepts it and its ``tk.get_flag`` value is not in
    ``discarded_flags``.

    Args:
        text: String to tokenize.

    Yields:
        The accepted tokens, in the order ``tk.cut`` produces them.
    """
    for sentence in clean.ngram(text.strip()):
        for token in tk.cut(sentence):
            if not clean.is_word(token):
                continue
            # The flag is only looked up for tokens that are words.
            if tk.get_flag(token) in discarded_flags:
                continue
            yield token