Example #1
def main():
    # Read sheet 2
    comments = read_xl_by_line(sheet_2_input)
    # Read attachments 3 and 4 into tuples
    sheet_3 = read_xl_by_line(sheet_3_input)
    sheet_4 = read_xl_by_line(sheet_4_input)

    # Segment words, remove stop words, and build the message-object dicts
    stop_words = load_word_list(stop_words_input)
    comm_dict_2 = Comm.generate_comm_dict(comments, cut_all=cut_all, stop_words_lt=stop_words)
    comm_dict_3 = Comm.generate_comm_dict(sheet_3, cut_all=cut_all, stop_words_lt=stop_words)
    comm_dict_4 = Comm.generate_comm_dict(sheet_4, cut_all=cut_all, stop_words_lt=stop_words)

    # Write the segmented corpus of all three sheets to a text file
    with open(line_sentence_output, "w", encoding="utf8") as text_file:
        for d in (comm_dict_2, comm_dict_3, comm_dict_4):
            for c in d.values():
                text_file.write(" ".join(c.seg_topic) + "\n")  # message topic
                text_file.write(" ".join(c.seg_detail) + "\n")  # message detail

    # Train the word2vec model (using a parameter set found via Bayesian optimization)
    word2vec_model = Word2Vec(LineSentence(line_sentence_output),
                              alpha=0.01667,
                              min_count=6,
                              size=176,
                              window=36)
    # Save the word2vec model in text format
    word2vec_model.wv.save_word2vec_format(word2vec_model_path, binary=False)
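
# Note: load_word_list and read_xl_by_line are project helpers that are not shown in
# these examples. A minimal sketch of load_word_list, assuming it simply reads one
# stop word per line from a UTF-8 text file (an assumption, not the repository's
# actual implementation):
def load_word_list(path):
    with open(path, encoding="utf8") as f:
        return [line.strip() for line in f if line.strip()]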
Example #2
def fetch_issue1_dataset():
    """
    加载留言分类问题的默认数据集
    :return: tuple(文档向量, 标签标志, 标签名)
    """
    stop = fetch_default_stop_words()
    comments = read_xl_by_line(sheet_2_input)

    # Load the pre-trained word-vector (wv) model
    wv_model = gensim.models.KeyedVectors.load_word2vec_format(
        word2vec_model_path, binary=False)

    # Preprocess and build the message objects
    comm_dict = Comm.generate_comm_dict(comments,
                                        stop_words_lt=stop,
                                        cut_all=True,
                                        full_dataset=True)
    # All first-level labels appearing in sheet 2
    target_names = list(set([row[5] for row in comments]))
    # Convert the first-level labels annotated in sheet 2 into numbers (their index in target_names)
    targets = [target_names.index(row[5]) for row in comments]

    # Compute document vectors; the vectorization model is the pre-trained Word2Vec
    vec = [doc_vec(comm_dict[row[0]].seg_detail, model=wv_model) for row in comments]

    return vec, targets, target_names
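
# Note: doc_vec is defined elsewhere in the project. A minimal sketch, assuming it
# averages the Word2Vec vectors of the in-vocabulary words (the zero-vector fallback
# for empty documents is also an assumption):
import numpy as np

def doc_vec(words, model):
    vectors = [model[word] for word in words if word in model]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)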
Example #3
def main():
    # Build the vectorization tool
    count_vect = CountVectorizer()
    comments = read_xl_by_line("../../resources/xls/e2.xlsx")  # message texts
    stop_words = load_word_list("../../resources/special-words/stop_words.txt")  # stop words
    comm_dict_2 = Comm.generate_comm_dict(comments, True, stop_words)

    line_sents = [comm_dict_2[row[0]].seg_topic + comm_dict_2[row[0]].seg_detail for row in comments]
    sents = list(map(lambda x: " ".join(x), line_sents))

    # All first-level labels appearing in sheet 2
    target_names = list(set([row[5] for row in comments]))
    # Convert the first-level labels annotated in sheet 2 into numbers (their index in target_names)
    targets = [target_names.index(row[5]) for row in comments]

    # Split into training and test sets
    x_train, x_test, y_train, y_test \
        = train_test_split(sents, targets, test_size=0.3)

    x_train_counts = count_vect.fit_transform(x_train)  # fit the count vectorizer

    # Fit a TF-IDF transformer on the raw term counts
    tf_transformer = TfidfTransformer().fit(x_train_counts)
    # Transform the training counts into TF-IDF features
    x_train_tf = tf_transformer.transform(x_train_counts)

    # Train/fit a naive Bayes classifier on the feature set (X) and targets
    clf = MultinomialNB()
    clf.fit(x_train_tf.toarray(), y_train)

    test_tf = tf_transformer.transform(count_vect.transform(x_test))

    predicted = clf.predict(test_tf.toarray())  # predict labels for the test set

    print(classification_report(y_true=y_test, y_pred=predicted, target_names=target_names))
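
# Side note (not from the repository): the CountVectorizer + TfidfTransformer +
# MultinomialNB steps above can be collapsed into a single scikit-learn Pipeline built
# on TfidfVectorizer; a minimal equivalent sketch:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

def tfidf_nb_pipeline():
    # TfidfVectorizer fuses the counting and TF-IDF weighting steps; MultinomialNB
    # accepts the resulting sparse matrix directly, so no toarray() is needed.
    return Pipeline([("tfidf", TfidfVectorizer()), ("nb", MultinomialNB())])
# Usage: clf = tfidf_nb_pipeline(); clf.fit(x_train, y_train); clf.predict(x_test)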
Example #4
def tf_idf_knn_clf():
    """使用tf-idf作为特征的kNN分类器"""
    sheet_2 = read_xl_by_line(sheet_2_input)
    stop = dataset.fetch_default_stop_words()
    comm_dict_2 = dataset.fetch_data("full_dataset_sheet_2", stop_words=stop,
                                     cut_all=False, remove_duplicates=False)

    _, targets, target_names = dataset.fetch_issue1_dataset()

    seg_sheet_2 = [comm_dict_2[row[0]].seg_topic + comm_dict_2[row[0]].seg_detail for row in sheet_2]
    sents = list(map(lambda x: " ".join(x), seg_sheet_2))

    count_vect = CountVectorizer()

    # Split into training and test sets
    x_train, x_test, y_train, y_test \
        = train_test_split(sents, targets, test_size=0.3)

    x_train_counts = count_vect.fit_transform(x_train)  # fit the count vectorizer

    # Fit a TF-IDF transformer on the raw term counts
    tf_transformer = TfidfTransformer().fit(x_train_counts)
    # Transform the training counts into TF-IDF features
    x_train_tf = tf_transformer.transform(x_train_counts)

    # Train/fit a kNN classifier on the feature set (X) and targets
    clf = KNeighborsClassifier(algorithm="brute", leaf_size=11, n_neighbors=13)
    clf.fit(x_train_tf.toarray(), y_train)

    # Convert the texts in x_test into their tf-idf matrix representation
    test_tf = tf_transformer.transform(count_vect.transform(x_test))
    predicted = clf.predict(test_tf)  # predict the classes of x_test

    acc = metrics.accuracy_score(y_test, predicted)
    # f1 = metrics.f1_score(y_test, predicted)
    print(f"acc: {acc}")
Example #5
def main():
    rows = read_xl_by_line("../../resources/xls/e3.xlsx")  # load the attachment sheet as tuples
    # TODO: should the "message topic" be used during clustering?
    lines = [row[2] + "。" + row[4] for row in rows]

    stop = fetch_default_stop_words()  # default stop words
    stop.extend(["\t", "\n", ""])  # additional stop words

    # Required part-of-speech (POS) tags
    word_type = [
        "n",
        "s",
        "ns",
        "nt",
        "nw",
        "nz",  # 各种名词
        "a",
        "an",  # 形容词
        "LOC",
        "ORG"
    ]  # proper place names and organization names

    # Pick out words with the specified POS tags and collect them into lists (one word list per message text)
    specific_only = pick_specific_type_words(text=lines,
                                             types=word_type,
                                             stop_words=stop)

    # --------------------------------------
    # Cluster the texts after weighting words of the specified POS tags

    # Load a previously trained Word2Vec model
    # wv_model = gensim.models.KeyedVectors.load_word2vec_format(
    #     "../resources/word2vec_build_on_all_text", binary=False)

    # Build a word2vec model from the attachment-3 corpus
    line_sents = fetch_data("example_3", stop_words=stop, mode="lines")
    wv_model = Word2Vec(line_sents, size=400, window=5, sg=1, min_count=5)

    # Compute document vectors for all documents (word lists), giving words of the
    # specified POS tags a weight of 2 (all other words default to weight 1)
    weight = {
        key: 2
        for key in set([word for words in specific_only for word in words])
    }
    docs_vec = [
        doc_vec_with_weight(doc, model=wv_model, weight=weight) for doc in
        fetch_data("example_3", cut_all=False, mode="lines", stop_words=stop)
    ]

    # Standardize the data with pandas (z-score)
    data = pd.DataFrame(docs_vec)
    data_zs = (data - data.mean()) / data.std()

    # Mean Shift model
    ms_model = MeanShift()
    ms_model.fit(data_zs)  # fit the model

    # KMeans model
    # km_model = KMeans(n_clusters=5, max_iter=500)
    # km_model.fit(data_zs)

    labels = ms_model.labels_
    output_cluster(labels, lines)
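
# Note: doc_vec_with_weight is defined elsewhere in the project. A minimal sketch,
# assuming it computes a weighted average of word vectors, with words missing from
# the weight dict defaulting to weight 1 (these details are assumptions):
import numpy as np

def doc_vec_with_weight(words, model, weight):
    vectors, weights = [], []
    for word in words:
        if word in model:
            vectors.append(model[word])
            weights.append(weight.get(word, 1))
    if not vectors:
        return np.zeros(model.vector_size)
    return np.average(vectors, axis=0, weights=weights)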
Example #6
def main():
    comments_with_likes = read_xl_by_line("../../resources/full_dataset/full_dataset_sheet_3.xlsx")  # load attachment 3
    stop_words = fetch_default_stop_words()
    # Segment words, remove stop words, and build the dict
    comm_dict_3 = fetch_data("full_dataset_sheet_3", stop_words=stop_words, cut_all=True, remove_duplicates=False)
    print("data loading completed." + str(time.asctime(time.localtime(time.time()))))

    # ----------------------------------------
    # Option 1: build a Word2Vec model from attachment 3
    # line_sents = fetch_data("example_3", stop_words=stop_words, mode="lines")
    # wv_model = Word2Vec(line_sents,
    #                     size=400, window=5, sg=1, min_count=5)

    # Option 2: build a Word2Vec model from the corpora of all three attachments
    # wv_model = Word2Vec(LineSentence("../resources/line_sents.txt"),
    #   size=400, window=5, sg=1, min_count=5)

    # Option 3: load a model previously trained in the cloud
    wv_model = gensim.models.KeyedVectors.load_word2vec_format(
        "../resources/wv_model_full_dataset_0425", binary=False)
    print("wv model loading completed." + str(time.asctime(time.localtime(time.time()))))
    # ----------------------------------------

    # Compute document vectors based on the Word2Vec model
    docs_vec = [doc_vec(
        comm_dict_3[row[0]].seg_topic + comm_dict_3[row[0]].seg_detail,
        model=wv_model) for row in comments_with_likes]
    print("doc vec processing completed.", str(time.asctime(time.localtime(time.time()))))

    # Standardize the data with pandas (z-score)
    data = pd.DataFrame(docs_vec)
    data_zs = (data - data.mean()) / data.std()

    # ----------------------------------------
    # Clustering option 1: K-Means
    # k = 1000  # number of clusters
    # iteration = 500  # maximum number of iterations
    # km_model = KMeans(n_clusters=k, n_jobs=4, max_iter=iteration)
    #
    # # Train/fit the model
    # km_model.fit(data_zs)
    # joblib.dump(km_model, "../resources/km_model_sheet_3_wv")

    # Load a trained K-Means model
    # km_model = joblib.load("../resources/km_model_sheet_3_wv")

    # Print the number of samples in each cluster
    # print(pd.Series(km_model.labels_).value_counts())
    # labels = km_model.labels_  # get the cluster labels of the samples

    # ------------------------------------------------
    # Clustering option 2: Mean Shift
    ms_model = MeanShift(bandwidth=11)
    ms_model.fit(data_zs)  # fit the model

    # Load a trained Mean Shift model
    # ms_model = joblib.load("../resources/ms_model_sheet_3_wv")
    labels = ms_model.labels_

    # Save the model to the resources directory
    # joblib.dump(ms_model, "../resources/ms_model_sheet_3_wv")

    # ------------------------------------------------
    # Clustering option 3: Affinity Propagation (AP) clustering
    # ap_model = AffinityPropagation()
    # ap_model.fit(data_zs)
    # labels = ap_model.labels_
    print("model fit/load completed.", str(time.asctime(time.localtime(time.time()))))

    # Group rows by cluster label into a dict
    comm_cluster = {}
    for index, row in enumerate(comments_with_likes):
        if labels[index] in comm_cluster:
            comm_cluster[labels[index]].append(row)
        else:
            comm_cluster[labels[index]] = [row]
    print("label picking completed." + str(time.asctime(time.localtime(time.time()))))

    # Hotness evaluation of each cluster
    # for key in comm_cluster.keys():
    #     print(f"Label {key}", HotspotAssess(comm_cluster[key]).score)
    #     # for row in comm_cluster[key]:
    #     #     print(row[5])
    #     print("-------------------------------")

    # Only keep clusters with at least 3 messages; the rest are treated as "outliers"
    more_valuable_clusters = [cluster for cluster in comm_cluster.values() if len(cluster) >= 3]
    # Sort by hotness (descending) to pick out the hottest clusters
    sorted_clusters = sorted(more_valuable_clusters, key=lambda c: HotspotEvaluation(c).score, reverse=True)
    print("ranking completed." + str(time.asctime(time.localtime(time.time()))))

    for cluster in sorted_clusters:
        print(cluster)
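
# Side note (not from the repository): the label-grouping loop above can be written
# more compactly with collections.defaultdict; an equivalent self-contained sketch:
from collections import defaultdict

def group_by_label(rows, labels):
    # Group the attachment rows by their cluster label
    clusters = defaultdict(list)
    for row, label in zip(rows, labels):
        clusters[label].append(row)
    return dict(clusters)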
Example #7
def main():
    # Load attachment 3
    comments_with_likes = read_xl_by_line(sheet_3_input)
    # Load the stop words
    stop_words = fetch_default_stop_words()
    # Segment words, remove stop words, and build the dict
    comm_dict_3 = fetch_data("full_dataset_sheet_3",
                             stop_words=stop_words,
                             cut_all=True,
                             remove_duplicates=False)
    print("data loading completed." +
          str(time.asctime(time.localtime(time.time()))))

    # Load the pre-trained word-vector (wv) model
    wv_model = gensim.models.KeyedVectors.load_word2vec_format(
        word2vec_model_path, binary=False)
    print("wv model loading completed." +
          str(time.asctime(time.localtime(time.time()))))

    # Compute Word2Vec-based document vectors and standardize them
    docs_vec = [
        doc_vec(comm_dict_3[row[0]].seg_topic + comm_dict_3[row[0]].seg_detail,
                model=wv_model) for row in comments_with_likes
    ]
    print("doc vec processing completed.",
          str(time.asctime(time.localtime(time.time()))))
    data = pd.DataFrame(docs_vec)
    data_zs = (data - data.mean()) / data.std()

    # Train the Mean Shift model
    ms_model = MeanShift(bandwidth=4)
    ms_model.fit(data_zs)  # fit the model

    labels = ms_model.labels_
    print("mean-shift model fit/load completed.",
          str(time.asctime(time.localtime(time.time()))))

    # Group rows by cluster label into a dict
    comm_cluster = {}
    for index, row in enumerate(comments_with_likes):
        if labels[index] in comm_cluster:
            comm_cluster[labels[index]].append(row)
        else:
            comm_cluster[labels[index]] = [row]
    print("label picking completed." +
          str(time.asctime(time.localtime(time.time()))))

    # Only keep clusters with at least 3 messages; the rest are treated as "outliers"
    more_valuable_clusters = [
        cluster for cluster in comm_cluster.values() if len(cluster) >= 3
    ]
    # Sort by hotness to pick the five hottest clusters
    sorted_clusters = sorted(more_valuable_clusters,
                             key=lambda c: HotspotEvaluation(c).score,
                             reverse=True)
    print("ranking completed." +
          str(time.asctime(time.localtime(time.time()))))

    # Run LDA topic modeling on the five hottest clusters to extract keywords
    top_5 = sorted_clusters[:5]
    for cluster in top_5:
        print(HotspotEvaluation(cluster).score)
        print(draw_cluster_key_word(cluster))
        print("----------------------")

    # Generate the hot-topic table
    cluster_sheet_title = ("热度排名", "问题ID", "热度指数", "时间范围", "地点/人群", "问题描述")
    cluster_sheet_rows = [
        (index + 1, index + 1, HotspotEvaluation(cluster).score,
         HotspotEvaluation(cluster).date_range_str, None,
         ",".join(draw_cluster_key_word(cluster)))
        for index, cluster in enumerate(top_5)
    ]
    # Write to the Excel sheet
    write_rows(path=cluster_sheet_path,
               rows=cluster_sheet_rows,
               title=cluster_sheet_title)

    # Generate the detail table of messages under the hot topics
    detail_sheet_title = ("问题ID", "留言编号", "留言用户", "留言主题", "留言时间", "留言详情",
                          "点赞数", "反对数")
    detail_sheet_rows = [
        (index + 1, *row[:5], row[6], row[5])  # the last two columns are in reverse order relative to attachment 4
        for index, cluster in enumerate(top_5) for row in cluster
    ]
    # Write to the Excel sheet
    write_rows(path=detail_sheet_path,
               rows=detail_sheet_rows,
               title=detail_sheet_title)
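
# Note: draw_cluster_key_word is defined elsewhere in the project; the comment above
# only says it extracts keywords via LDA topic modeling. A minimal sketch of that
# idea, fitting a single-topic gensim LDA model on one cluster's segmented texts
# (function name, signature and parameters are illustrative assumptions):
from gensim import corpora
from gensim.models import LdaModel

def lda_keywords(segmented_docs, num_keywords=6):
    # segmented_docs: list of token lists, one per message in the cluster
    dictionary = corpora.Dictionary(segmented_docs)
    corpus = [dictionary.doc2bow(doc) for doc in segmented_docs]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=1)
    # show_topic returns (word, probability) pairs for the requested topic
    return [word for word, _ in lda.show_topic(0, topn=num_keywords)]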
Example #8
def fetch_data(ds_name: str, cut_all=True, mode='dict', stop_words=None, remove_duplicates=True):
    """
    加载数据集
    :param ds_name: 数据集名
    1.以"example"开头的数据集为示例小数据集
    2.以"full_dataset"开头的是完整数据集
    3.其他

    :param cut_all: 是否启用全模式
    :param mode: 两种返回模式
    1.'dict': 存储Comm对象(留言对象)的字典,键为留言id,值为留言对象
    2.'lines': 已分词的词链表,list of list(words in a sentence),原文本中的每句话为一个列表元素

    :param stop_words: 停用词表
    :param remove_duplicates: 是否针对“留言详情”去重
    :return: list or dict
    """
    stop_words = [] if stop_words is None else stop_words
    comments = []
    comm_dict = {}

    if ds_name.startswith("example"):

        if ds_name == "example_3":
            comments = read_xl_by_line("../resources/xls/e3.xlsx")  # load attachment 3
        elif ds_name == "example_2":
            comments = read_xl_by_line("../resources/xls/e2.xlsx")
        elif ds_name == "example_4":
            comments = read_xl_by_line("../resources/xls/e4.xlsx")
        elif ds_name == "example_all":
            # the combined corpora of the three attachments
            return fetch_data("example_2", cut_all, mode, stop_words) \
                   + fetch_data("example_3", cut_all, mode, stop_words) \
                   + fetch_data("example_4", cut_all, mode, stop_words)
        comm_dict = Comm.generate_comm_dict(comments, cut_all=cut_all, stop_words_lt=stop_words)

    if ds_name.startswith("full_dataset"):
        if ds_name == "full_dataset_sheet_2":
            comments = read_xl_by_line(sheet_2_input)
        elif ds_name == "full_dataset_sheet_3":
            comments = read_xl_by_line(sheet_3_input)
        elif ds_name == "full_dataset_sheet_4":
            comments = read_xl_by_line(sheet_4_input)
        else:
            raise UnknownDataset
        # The column order of attachment 3 in the full dataset differs from the sample
        # data, so the full_dataset flag is passed to adapt to it
        comm_dict = Comm.generate_comm_dict(comments,
                                            cut_all=cut_all,
                                            stop_words_lt=stop_words,
                                            full_dataset=True)

    if ds_name == "sheet_4_labeled":  # 加载手工标注后的附件四
        comments = read_xl_by_line(sheet_4_labeled_input)
        comm_dict = Comm.generate_comm_dict(comments,
                                            cut_all=cut_all,
                                            stop_words_lt=stop_words,
                                            full_dataset=True)

    if remove_duplicates is True:
        # De-duplicate on the "message detail" field (relying on dict-key uniqueness)
        detail_key_dict = {elem.detail: elem for key, elem in comm_dict.items()}
        comm_dict = {elem.comm_id: elem for key, elem in detail_key_dict.items()}

    if mode == 'dict':
        return comm_dict
    if mode == 'lines':
        return [comm_dict[row[0]].seg_topic + comm_dict[row[0]].seg_detail for row in comments]
    if mode == 'reply_lines':
        return [comm_dict[row[0]].seg_reply for row in comments]
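
# Usage, as seen in the examples above (dataset names and flags taken from those
# examples):
# stop_words = fetch_default_stop_words()
# comm_dict_3 = fetch_data("full_dataset_sheet_3", stop_words=stop_words,
#                          cut_all=True, remove_duplicates=False)
# line_sents = fetch_data("example_3", stop_words=stop_words, mode="lines")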