Beispiel #1
0
def synonym_bigram(texts, center_word, filtrate=get_flag):
    """Count the bigrams that contain ``center_word`` and export them to Excel.

    ``center_word`` is first registered with the tokenizer through
    ``tk.add_word(center_word, 2000, 'CENTER')``. Every text containing the
    word is split by ``clean.ngram``, tokenized by ``tk.cut`` and filtered
    with ``filtrate``. Adjacent token pairs are then tallied into two tables:

    * ``left``  -- pairs whose first token is ``center_word``;
    * ``right`` -- pairs whose second token is ``center_word``.

    Both tables are written to ``synonym_bigram_<center_word>.xlsx`` as the
    sheets ``left`` and ``right`` with the columns ``word``, ``flag``,
    ``freq`` and ``bar``.

    Args:
        texts: Iterable of text strings to scan.
        center_word: The word whose neighbouring tokens are collected.
        filtrate: Predicate deciding whether a token is kept.

    Returns:
        None. The result is written by ``corpus.df2sheets``.
    """
    tk.add_word(center_word, 2000, 'CENTER')
    left, right = Counter(), Counter()
    for text in texts:
        if center_word not in text:
            continue
        for sentence in clean.ngram(text):
            words = [w for w in tk.cut(sentence) if filtrate(w)]
            for first, second in zip(words, words[1:]):
                if center_word not in (first, second):
                    continue
                # The key is identical for both tables, so build it once.
                key = (' '.join((first, second)),
                       ' '.join((tk.get_flag(first), tk.get_flag(second))))
                if first == center_word:
                    left[key] += 1
                if second == center_word:
                    right[key] += 1
    # Shared scale for the 'bar' column of both sheets. ``default`` keeps the
    # function from raising IndexError when one (or both) tables are empty,
    # e.g. when the center word only ever occurs at one end of a sentence.
    u = max([*left.values(), *right.values()], default=1)
    left = corpus.ls2df([(word, flag, freq, tk.bar(freq, u))
                         for (word, flag), freq in left.most_common()],
                        ['word', 'flag', 'freq', 'bar'])
    right = corpus.ls2df([(word, flag, freq, tk.bar(freq, u))
                          for (word, flag), freq in right.most_common()],
                         ['word', 'flag', 'freq', 'bar'])
    corpus.df2sheets([left, right], ['left', 'right'],
                     'synonym_bigram_%s.xlsx' % center_word)
Beispiel #2
0
 def extract(self, text, top_n=10, flags=False):
     """Rank the tokens of ``text`` by their accumulated IDF weight.

     Each token yielded by ``cut(text)`` adds ``self.get_idf(token)`` to its
     score, so repeated tokens accumulate weight.

     Args:
         text: The text to extract keywords from.
         top_n: Number of keywords returned when ``flags`` is falsy.
         flags: When truthy, return ``(word, tk.get_flag(word))`` pairs.

     Returns:
         A list of words (best first), or a list of ``(word, flag)`` pairs.
         NOTE: ``top_n`` is not applied when ``flags`` is truthy; every
         scored token is returned in that case.
     """
     scores = Counter()
     for token in cut(text):
         scores[token] += self.get_idf(token)
     if not flags:
         return [token for token, _ in scores.most_common(top_n)]
     return [(token, tk.get_flag(token)) for token, _ in scores.most_common()]
Beispiel #3
0
def get_flag(word):
    """Tell whether ``word`` passes the stop-word and flag filters.

    Returns:
        False when ``word`` is in ``corpus.STOP_WORDS`` or when its
        ``tk.get_flag`` value is in ``flag_filter``; True otherwise.
    """
    # Short-circuit: the flag lookup is skipped for stop words.
    return (word not in corpus.STOP_WORDS
            and tk.get_flag(word) not in flag_filter)
Beispiel #4
0
def clf_word(texts, labels, model_num=0, detail=False):
    """Fit a classifier on TF-IDF features, then classify the frequent words.

    The texts are vectorized with ``TfidfVectorizer(tokenizer=cut)``, the
    model ``models[model_num]`` is fitted on them, and its name, training
    score and classes are printed. Each of the ``N`` most frequent tokens is
    then fed to the classifier on its own, and the result is exported with
    ``corpus.ls2sheet``.

    Args:
        texts: Training texts (iterated more than once).
        labels: Target label for each text.
        model_num: Index into ``models`` selecting the classifier class.
        detail: When exactly ``False``, export only the best class and its
            probability per word. Otherwise export the probability of every
            class and print ``word, label, bar`` for each row in reverse.

    Returns:
        None.
    """
    # Vectorize
    vectorizer = TfidfVectorizer(tokenizer=cut)
    features = vectorizer.fit_transform(texts)

    # Fit the model
    model = models[model_num]
    clf = model()
    clf.fit(features, labels)
    classes = clf.classes_
    print(model.__name__, clf.score(features, labels), *classes)

    # Classify individual words
    frequent = Counter(
        token for text in texts for token in cut(text)).most_common(N)

    if detail is False:
        rows = []
        for word, freq in frequent:
            flag = tk.get_flag(word)  # part of speech
            probabilities = clf.predict_proba(
                vectorizer.transform([word]))[0]
            best = argmax(probabilities)
            best_proba = probabilities[best]  # probability of the top class
            rows.append([
                freq, flag, word, classes[best], best_proba,
                tk.bar(best_proba)
            ])
        header = ['freq', 'flag', 'word', 'label', 'probability', 'bar']
        corpus.ls2sheet(rows, header, 'clf_word_' + model.__name__)
        return

    # Detailed report: one probability column per class; the bar is scaled
    # against the square root of the highest word frequency.
    maximum = frequent[0][1]**.5
    rows = []
    for word, freq in frequent:
        flag = tk.get_flag(word)  # part of speech
        probabilities = clf.predict_proba(vectorizer.transform([word]))[0]
        label = classes[argmax(probabilities)]  # predicted class
        rows.append([
            flag, word, label, *probabilities, freq,
            tk.bar(freq**.5, maximum)
        ])
    header = ['flag', 'word', 'label', *clf.classes_, 'freq', 'bar']
    corpus.ls2sheet(rows, header, 'clf_word_detail' + model.__name__)
    for row in reversed(rows):
        print(row[1], row[2], row[-1])
0
def trigram_flag(texts, n=2, stop_words=STOP_WORDS):
    """Count n-grams with their part-of-speech flags and export them.

    Every text is passed through ``clear`` and split by ``clean.ngram``;
    each piece is tokenized with ``tk.cut`` and tokens found in
    ``stop_words`` are dropped. Each window of ``n`` consecutive tokens is
    counted under the key ``(joined words, joined flags)``. The ``N`` most
    frequent entries are written to ``<n>gram_flag.xlsx`` with the columns
    ``word``, ``flag``, ``freq`` and ``bar``.

    Args:
        texts: Iterable of text strings.
        n: Number of tokens per n-gram.
        stop_words: Container of tokens to ignore.

    Returns:
        None. The result is written to an Excel file.
    """
    counts = Counter()
    for text in texts:
        for sentence in clean.ngram(clear(text)):
            words = [w for w in tk.cut(sentence) if w not in stop_words]
            for i in range(len(words) + 1 - n):
                window = words[i:i + n]
                word = ' '.join(window)
                flag = ' '.join(tk.get_flag(w) for w in window)
                counts[(word, flag)] += 1
    # Highest frequency, used as the reference value for the 'bar' column.
    # max(..., default=...) avoids sorting the whole counter and no longer
    # raises IndexError when no n-gram was found at all.
    peak = max(counts.values(), default=1)
    rows = [(word, flag, freq, tk.bar(freq, peak))
            for (word, flag), freq in counts.most_common(N)]
    DataFrame(rows, columns=['word', 'flag', 'freq',
                             'bar']).to_excel('%dgram_flag.xlsx' % n,
                                              index=False)
Beispiel #6
0
def trigram_flag_sort(texts, n=2, stop_words=STOP_WORDS):
    """Count order-insensitive n-grams with part-of-speech flags.

    Works on windows of ``n`` consecutive tokens (after ``clear``,
    ``clean.ngram``, ``tk.cut`` and stop-word removal). Inside each window
    the ``(flag, word)`` pairs are sorted before being joined, so windows
    holding the same tokens in a different order share one counter key.
    The ``N`` most frequent entries are written to
    ``<n>gram_flag_sort.xlsx`` with the columns ``freq``, ``flag``, ``word``.

    Args:
        texts: Iterable of text strings.
        n: Number of tokens per n-gram.
        stop_words: Container of tokens to ignore.

    Returns:
        None. The result is written to an Excel file.
    """
    tally = Counter()
    for text in texts:
        for sentence in clean.ngram(clear(text)):
            tokens = [t for t in tk.cut(sentence) if t not in stop_words]
            window_count = len(tokens) + 1 - n
            for start in range(window_count):
                tagged = [(tk.get_flag(t), t) for t in tokens[start:start + n]]
                tagged.sort()
                joined_words = ' '.join(token for _, token in tagged)
                joined_flags = ' '.join(tag for tag, _ in tagged)
                tally[(joined_words, joined_flags)] += 1
    rows = []
    for (joined_words, joined_flags), freq in tally.most_common(N):
        rows.append((freq, joined_flags, joined_words))
    frame = DataFrame(rows, columns=['freq', 'flag', 'word'])
    frame.to_excel('%dgram_flag_sort.xlsx' % n, index=False)
Beispiel #7
0
def synonym_w2v(texts, words):
    """Export the 100 most similar words for each query word.

    Every query word is first registered with the tokenizer
    (``tk.add_word(word, flag='TEMP')``). ``modeling(texts, loop=False)``
    supplies the word vectors and a frequency counter. For each query word
    the result of ``wv.similar_by_word(word, 100)`` is turned into a table
    with the columns ``word``, ``flag``, ``similar``, ``frequency`` and
    ``bar``. All tables are written by ``corpus.df2sheets`` under the name
    ``synonym_w2v``, one sheet per query word.

    Args:
        texts: Training texts handed to ``modeling``.
        words: Query words; iterated more than once.

    Returns:
        None.
    """
    for word in words:
        tk.add_word(word, flag='TEMP')
    wv, counter = modeling(texts, loop=False)
    # Square root of the highest frequency: reference for the 'bar' column.
    standard = counter.most_common(1)[0][1]**.5
    frames, sheet_names = [], []
    for word in words:
        try:
            neighbours = wv.similar_by_word(word, 100)
        except KeyError:
            # The lookup failed for this word (presumably it is not in the
            # model vocabulary): skip it.
            continue
        rows = [(w, tk.get_flag(w), s, counter[w],
                 tk.bar(counter[w]**.5, standard)) for w, s in neighbours]
        frames.append(
            corpus.ls2df(rows,
                         ['word', 'flag', 'similar', 'frequency', 'bar']))
        # Collect the sheet name together with its table so that skipped
        # words cannot shift the names against the tables.
        sheet_names.append(clean.re.sub(r'\W', '', word))
    corpus.df2sheets(frames, sheet_names, 'synonym_w2v')
Beispiel #8
0
def synonym_neighbor(texts, center_word, filtrate=get_flag, half=5):
    """Score the words that occur near ``center_word`` and export them.

    ``center_word`` is registered with the tokenizer through
    ``tk.add_word(center_word, 2000, 'CENTER')``. Each text containing it is
    split by ``clean.text2phrase``, tokenized by ``tk.cut`` and filtered with
    ``filtrate``. For every occurrence of ``center_word``, each token at most
    ``half`` positions away receives the weight ``1 / distance``; the center
    position itself receives 1. The ranked result is written to
    ``synonym_neighbor_<center_word>.xlsx`` with the columns ``word``,
    ``flag``, ``freq`` and ``bar``.

    Args:
        texts: Iterable of text strings to scan.
        center_word: The word whose neighbourhood is analysed.
        filtrate: Predicate deciding whether a token is kept.
        half: Window radius, in tokens, on each side of the center word.

    Returns:
        None. The result is written by ``corpus.df2sheet``.
    """
    tk.add_word(center_word, 2000, 'CENTER')
    c = Counter()
    for text in texts:
        if center_word not in text:
            continue
        for sentence in clean.text2phrase(text):
            words = [w for w in tk.cut(sentence) if filtrate(w)]
            length = len(words)
            for i, current in enumerate(words):
                if current != center_word:
                    continue
                for j in range(max(i - half, 0), min(i + 1 + half, length)):
                    word = words[j]
                    c[(word, tk.get_flag(word))] += 1 / max(abs(j - i), 1)
    ranked = c.most_common()  # sort once, reuse for scale and rows
    # The 'bar' column is scaled by the runner-up: the top entry is normally
    # the center word itself, which scores 1 for each of its occurrences.
    # Fall back gracefully instead of raising IndexError when fewer than two
    # distinct entries were collected.
    if len(ranked) > 1:
        u = ranked[1][1]
    elif ranked:
        u = ranked[0][1]
    else:
        u = 1
    df = corpus.ls2df([(word, flag, weight, tk.bar(weight, u))
                       for (word, flag), weight in ranked],
                      ['word', 'flag', 'freq', 'bar'])
    corpus.df2sheet(df, 'synonym_neighbor_%s.xlsx' % center_word)
Beispiel #9
0
def cut(text):
    """Yield the usable tokens of ``text``.

    The stripped text is split by ``clean.ngram`` and each piece is
    tokenized with ``tk.cut``. A token is yielded only when
    ``clean.is_word`` accepts it and its ``tk.get_flag`` value is not in
    ``discarded_flags``.
    """
    for sentence in clean.ngram(text.strip()):
        for token in tk.cut(sentence):
            if not clean.is_word(token):
                continue
            if tk.get_flag(token) in discarded_flags:
                continue
            yield token