Example #1
def split_into_sentences(line):
    """Split a line into sentences; each sentence is yielded as a list of tokens."""
    tokens = []
    en_token = []  # accumulates consecutive non-Chinese (e.g. ASCII) characters

    def close_token(token):
        # Flush the pending multi-character token into the current sentence.
        if token:
            tokens.append(''.join(token))
            del token[:]

    for c in line:
        if is_terminator(c):
            # Sentence terminator: flush the pending token and emit the sentence.
            close_token(en_token)
            if not tokens:
                continue  # skip empty sentences (e.g. repeated terminators)
            tokens.append(c)
            yield tokens
            tokens = []
        elif is_punct(c):
            close_token(en_token)
            tokens.append(c)
        elif is_zh(c):
            # Every Chinese character becomes a token of its own.
            close_token(en_token)
            tokens.append(c)
        elif c == u' ' or c == u'\t':
            # Whitespace only separates tokens; it is not kept.
            close_token(en_token)
        else:
            en_token.append(c)
    if tokens:
        yield tokens
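
The helper predicates is_terminator, is_punct and is_zh are not shown in this example. A minimal sketch of what they could look like, assuming is_zh tests the basic CJK Unified Ideographs block and the terminator/punctuation sets are typical Chinese ones (the exact character sets are an assumption, not taken from the original code):

def is_zh(c):
    # Assumption: a character is "Chinese" if it lies in the basic
    # CJK Unified Ideographs block.
    return u'\u4e00' <= c <= u'\u9fff'

def is_terminator(c):
    # Assumed sentence-ending punctuation (Chinese and ASCII).
    return c in u'。！？.!?'

def is_punct(c):
    # Assumed non-terminating punctuation.
    return c in u'，、；：“”（）,:;()'

# Each yielded sentence is a list of tokens:
for sentence in split_into_sentences(u'今天天气不错！Let us go.'):
    print(sentence)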
Example #3
def predict_class(model, data_str):
    """Return the index of the most probable class for data_str, or None."""
    best_n = len(word2index)
    if not data_str:
        return None
    line = data_str.strip()
    line_t = jieba.cut(line, cut_all=False)
    objs = []
    for item in line_t:
        if hanzi_util.is_zh(item[0]):
            if item not in word2index:  # single-character words were already dropped
                objs.append(word2index["U"])  # unknown-word bucket
            else:
                objs.append(word2index[item])
        else:
            objs.append(word2index["U"])
    # Build a binary bag-of-words vector over the selected features.
    objs = list(set(objs))
    objs_vector = np.zeros((1, best_n))
    for item in objs:
        objs_vector[0, item] = 1.0
    predict_prob = model.predict_proba(objs_vector, verbose=False)

    return predict_prob.argmax()
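
predict_class returns the index of the highest-probability class. Assuming model is a trained classifier exposing predict_proba and train_tags is the list of tag names built in Example #5 (both assumptions about the surrounding code), a usage sketch:

# Hypothetical usage: map the predicted class index back to its tag name.
class_id = predict_class(model, "这部电影真的很好看")
if class_id is not None:
    print(train_tags[class_id])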
Example #4
 def is_not_chinese_word(self, word):
     return not (word and is_zh(word[0]))
Example #5
def build_train_data(best_n):
    global word2index
    global index2word
    global train_vector
    global train_vtag
    global train_tags
    global wordvector
    word2index = {}
    index2word = {}
    train_tags = []
    wordvector = []

    t_word2index = {}
    t_index2word = {}
    t_train_info = {}
    word2vec_str = []

    print("BUILD TRAIN DATA...")

    word_fd = FreqDist()  # frequency of every word across all classes
    cond_word_fd = ConditionalFreqDist()  # per-class word frequencies (e.g. positive vs. negative texts)

    for parent, dirname, filenames in os.walk(DATA_DIR):
        for filename in filenames:
            if filename[-4:] != ".txt":
                continue
            tag_name = filename[:-4]
            print("正在处理:%s" % (tag_name))
            train_tags.append(tag_name)
            tag_id = train_tags.index(tag_name)
            t_train_info[tag_id] = []
            line_num = 0
            with open(DATA_DIR + "/" + filename, "r") as fin:
                while True:
                    try:
                        line = fin.readline()
                    except UnicodeDecodeError as e:
                        print("Unicode Error! filename=%s, line_num=%d" % (filename, line_num))
                        continue
                    if not line:
                        print("文件已处理完! filename=%s, line_num=%d" % (filename, line_num))
                        break

                    line_num += 1
                    if not line_num % 500:
                        print("LINE:%d" % (line_num))
                    line = line.strip()
                    line_t = jieba.cut(line, cut_all=False)
                    objs = []
                    objs_str = []
                    for item in line_t:
                        if hanzi_util.is_zh(item[0]):
                            if item not in t_word2index:
                                item_id = len(t_word2index)
                                t_word2index[item] = item_id
                                t_index2word[item_id] = item
                            else:
                                item_id = t_word2index[item]
                            if item_id not in objs:
                                word_fd[item_id] += 1
                                cond_word_fd[tag_id][item_id] += 1
                                objs.append(item_id)
                            objs_str.append(item)
                    if objs:
                        t_train_info[tag_id].append(objs)
                    if objs_str:
                        word2vec_str.append(objs_str)

    w2v_model = word2vec.Word2Vec(
        sentences=word2vec_str, size=100, window=4, min_count=3, workers=4, sorted_vocab=1, iter=10
    )
    del word2vec_str
    print("word2vec DONE.")

    print("Randomize>>>")
    cond_word_sum = {}
    for tag in train_tags:
        tag_id = train_tags.index(tag)
        shuffle(t_train_info[tag_id])
        cond_word_sum[tag_id] = cond_word_fd[tag_id].N()
        print("SUM:%s->%d" % (tag, cond_word_sum[tag_id]))
    total_w_count = word_fd.N()
    print("TOTAL:%d" % (total_w_count))

    global sorted_word_scores
    sorted_word_scores = {}
    word_scores = {}

    word_scores_sub = {}
    print("CALC CHI-SQUARE...")
    for word, freq in word_fd.items():
        word_scores[word] = 0
        for tag in train_tags:
            tag_id = train_tags.index(tag)
            word_scores[word] += BigramAssocMeasures.chi_sq(
                cond_word_fd[tag_id][word], (freq, cond_word_sum[tag_id]), total_w_count
            )
    sorted_word_scores = sorted(word_scores.items(), key=lambda e: e[1], reverse=True)

    del cond_word_sum
    del word_fd
    del cond_word_fd

    if best_n < len(sorted_word_scores):
        sorted_word_scores = sorted_word_scores[:best_n]
    else:
        best_n = len(sorted_word_scores)

    # build the final word2index / index2word maps over the best_n selected words
    for index in range(best_n):
        word2index[t_index2word[sorted_word_scores[index][0]]] = index
        index2word[index] = t_index2word[sorted_word_scores[index][0]]

    for i in range(len(word2index)):
        if index2word[i] in w2v_model:
            wordvector.append(w2v_model[index2word[i]])
        else:
            print("RANDOM VECTOR FOR: %s" % (index2word[i]))
            wordvector.append(np.random.randn(100))

    # reserve one extra index for the unknown-word token 'U'
    word2index["U"] = best_n
    index2word[best_n] = "U"
    wordvector.append(np.random.randn(100))

    train_vector = []
    train_vtag = []

    # convert the collected samples into feature-index lists plus tag labels
    for tag_id in t_train_info:
        for l_index in range(len(t_train_info[tag_id])):
            objs = []
            for item in t_train_info[tag_id][l_index]:
                if t_index2word[item] in word2index:
                    objs.append(word2index[t_index2word[item]])
                else:
                    objs.append(word2index["U"])
            objs = list(set(objs))
            if objs:
                train_vector.append(objs)
                train_vtag.append(tag_id)

    del t_index2word
    del t_word2index
    del sorted_word_scores
    del t_train_info

    print("build_train_data finished!")
    return
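
build_train_data leaves each sample as a list of feature indices in train_vector, with the class id in train_vtag. A minimal sketch (the function name vectorize_train_data and the one-hot tag encoding are assumptions, not part of the original code) of how these could be densified for a model with the predict_proba interface used in Example #3:

import numpy as np

def vectorize_train_data():
    n_features = len(word2index)   # best_n selected words plus the 'U' bucket
    n_classes = len(train_tags)
    X = np.zeros((len(train_vector), n_features))
    y = np.zeros((len(train_vector), n_classes))
    for row, (objs, tag_id) in enumerate(zip(train_vector, train_vtag)):
        for item in objs:
            X[row, item] = 1.0     # binary bag-of-words feature
        y[row, tag_id] = 1.0       # one-hot class label
    return X, y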