예제 #1
0
def predict(sentence, num_words, model):
    """Predict whether ``sentence`` is a key-point sentence.

    The sentence is cleaned with ``u.remove_useless``, segmented with
    ``u.seg2words_long`` and truncated to ``num_words`` words. Every word
    that has a word vector is packed, in order, into the leading slots of a
    ``(1, num_words, size)`` float64 tensor; all remaining slots hold the
    padding value -1. ``size`` is a module-level name defined outside this
    block and is used as the length of the last axis (one word vector).

    Args:
        sentence: Raw sentence text.
        num_words: Number of word slots in the model input; longer
            sentences are cut down to this many words.
        model: Object exposing ``predict(data)``.

    Returns:
        ``np.argmax`` of the model output; per the original author's note
        this is 1 for a key-point sentence and 0 otherwise.
    """
    # Pre-fill with the padding value so that every slot not explicitly
    # written below is -1 rather than uninitialised memory.
    data = np.full((1, num_words, size), -1, dtype="float64")

    sentence = u.remove_useless(sentence)
    # Shrink to the sequence length the model was trained with.
    word_list = u.seg2words_long(sentence)[:num_words]

    vector_model = wv.load_model()
    filled = 0
    for word in word_list:
        vector = wv.get_vector(vector_model, word.encode('utf-8'))
        # An empty result marks a word without a vector. Use len() instead
        # of ``== []`` so the check is also safe for NumPy arrays.
        if len(vector) == 0:
            continue

        # Write to the next free slot (not the word's own index): skipped
        # words must not leave gaps, and the padding must not overwrite
        # vectors that were found.
        data[0, filled, :] = vector
        filled += 1

    prediction = model.predict(data)
    return np.argmax(prediction)
예제 #2
0
def save_data(line_list, data_path, ignore):
    """Convert text lines to word vectors and append them to ``data_path``.

    Every kept line is written as one record::

        <label>
        <n0> <n1> ...      (one row per word vector, a space after each number)
        %

    A line containing ``|`` gets label 1 (the marker is stripped before
    segmentation); any other line gets label 0. Words without a word vector
    are dropped, and lines that end up with no vectors are not written.

    After all lines are processed, two header lines are inserted at the very
    beginning of the file: the number of records written by this call and
    the largest number of word vectors found in a single line.

    Args:
        line_list: Mutable sequence of text lines. Each visited entry is
            overwritten in place with its cleaned form (``|`` removed).
        data_path: Path of the data file. It is opened in append mode, so
            existing content is kept and ends up after the new header. The
            file is created if missing, even when no record is written.
        ignore: When truthy, unlabelled lines are sampled: one is kept only
            if ``random.randint(0, 9) < 3`` and skipped otherwise.
    """
    num_lines = 0
    largest_num = 0
    total = len(line_list)

    vector_model = wv.load_model()

    # One handle for the whole run; the context manager closes it even if
    # cleaning, segmentation or the vector lookup raises.
    with open(data_path, "a") as out:
        for i, raw_line in enumerate(line_list):
            if (i + 1) % 50 == 0:
                print("第" + str(i + 1) + "行 (" + str(i + 1) + "/" +
                      str(total) + ")")

            # Strip irrelevant content.
            line = u.remove_useless(raw_line)

            # Derive the label from the "|" marker.
            label = 0
            if "|" in line:
                label = 1
                line = line.replace("|", "")

            # Keep the historical side effect: the caller's list holds the
            # cleaned text afterwards.
            line_list[i] = line

            # Down-sample unlabelled lines when requested.
            if label == 0 and ignore and random.randint(0, 9) >= 3:
                continue

            # Look up a vector for every word; the model keys are UTF-8.
            total_vector = []
            for word in u.seg2words_long(line):
                vector = wv.get_vector(vector_model, word.encode('utf-8'))
                # len() instead of ``== []`` so NumPy arrays are handled too.
                if len(vector) == 0:
                    continue
                total_vector.append(vector)

            # Track the longest sequence seen.
            largest_num = max(largest_num, len(total_vector))

            # Skip lines that produced no vectors at all.
            if not total_vector:
                continue

            num_lines += 1

            # Emit the record.
            out.write(str(label) + "\n")
            for vector in total_vector:
                out.write("".join(str(num) + " " for num in vector) + "\n")
            out.write("%\n")

    # Prepend the two dimension lines so that training code can read the
    # data shape before parsing the records.
    with open(data_path, 'r+') as f:
        content = f.read()
        f.seek(0, 0)
        f.write(str(num_lines) + "\n")
        f.write(str(largest_num) + "\n")
        f.write(content)