def main():
    # Directory where the results are saved
    result_dir = r'D:\semantic analysis\2016-10-05结果\html标记分句2//'
    txt_dir = r"D:\semantic analysis\2016-10-05结果\新词分句//"
    set_dir = r"D:\semantic analysis\2016-10-05结果\新词//"

    k_list = util.get_key_list()

    for key in k_list:
        print(key)
        # Sentence text files for this keyword
        file_list = sorted(util.get_file_list(txt_dir + key, ".txt"))
        # New-word set files (.pkl) for this keyword
        set_list = sorted(util.get_file_list(set_dir + key, ".pkl"))

        util.create_directory(result_dir + "新词//" + key + "//")

        for i in range(len(file_list)):
            s_list = util.get_list_from_file(txt_dir + key + "//" +
                                             set_list[i][0:-4] + ".txt")
            new_word_list = util.get_nw(set_dir + key + "//" + set_list[i])
            # Drop duplicate sentences to avoid redundant computation
            s_list = list(set(s_list))
            w_list = remark(s_list, new_word_list, key)
            html_name = file_list[i][:-4] + '.html'
            util.save_file(result_dir + "新词//" + key + "//" + html_name,
                           w_list)
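
# `remark` is not defined in this listing. A minimal sketch of what it might
# look like, assuming it wraps every new word found in a sentence in an HTML
# tag so the saved .html files highlight them; the tag and class name below
# are illustrative only, and `key` is accepted just to match the call above.
def remark(s_list, new_word_list, key):
    marked = []
    for sentence in s_list:
        # Naive string replacement; overlapping new words are not handled.
        for word in new_word_list:
            if word in sentence:
                sentence = sentence.replace(
                    word, '<span class="new-word">{0}</span>'.format(word))
        marked.append(sentence)
    return marked
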
def main():
    k_list = util.get_key_list()
    jieba.set_dictionary(r"D:\semantic analysis\分词\词库\导出结果\dict1.txt")
    jieba.initialize()
    for key in k_list:
        print(key)
        file_list = util.get_file_list(
            'D://semantic analysis//analyze_data//fc//' + key, ".txt")
        # Create the output directories
        mk_dir('./w')
        mk_dir('./p')

        for n_file in file_list:
            s_list = util.get_list_from_file(n_file)
            # Drop duplicate sentences to avoid redundant computation
            print(len(s_list))
            s_list = list(set(s_list))
            print(len(s_list))
            wg = nx.Graph()
            pg = nx.Graph()

            for sentence in s_list:
                # Build the co-occurrence network for the whole sentence
                ll = util.input_filer(sentence)
                wg = add_s2g(wg, ' '.join(ll))

                # Only build the network for the clauses that contain the keyword
                for ss in ll:
                    if key in ss:
                        pg = add_s2g(pg, ss)

            pkl_name = n_file[:-4] + '.pkl'
            util.save_nw(pg, './/p//' + pkl_name)
            util.save_nw(wg, './/w//' + pkl_name)

            print(n_file)
            print(
                time.strftime('%Y-%m-%d %H:%M:%S',
                              time.localtime(time.time())))

            with open('record.txt', 'a', encoding='utf-8') as rf:
                rf.write(n_file + '\n')
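
# `add_s2g` is not shown in this listing. A minimal sketch, assuming it adds
# every pair of distinct words in a space-separated sentence to a NetworkX
# graph as a co-occurrence edge, counting repeats in a 'weight' attribute.
import itertools

import networkx as nx


def add_s2g(graph, sentence):
    words = set(w for w in sentence.split() if w)
    for u, v in itertools.combinations(words, 2):
        if graph.has_edge(u, v):
            graph[u][v]['weight'] += 1
        else:
            graph.add_edge(u, v, weight=1)
    return graph
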
def main():
    # Directory where the results are saved
    result_dir = r'D:\semantic analysis\新纯文本\1新词分句//'
    txt_dir = r"D:\semantic analysis\新纯文本\1新词//"

    k_list = util.get_key_list()

    for key in k_list:
        print(key)
        # Text files for this keyword
        file_list = util.get_file_list(txt_dir + key, ".txt")

        # Create the output directory
        # mk_dir(result_dir+"新词整句//"+key)
        mk_dir(result_dir + key)

        for file in file_list:
            s_list = util.get_list_from_file(txt_dir + key + "//" + file)
            # Drop duplicate sentences to avoid redundant computation
            # s_list = list(set(s_list))
            w_list, p_list = extract_sentence(s_list, key)
            util.save_file(result_dir + key + "//" + file, p_list, True)
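
# `mk_dir` is not defined in this listing. A minimal sketch, assuming it only
# creates the directory (and any missing parents) if it does not exist yet.
import os


def mk_dir(path):
    os.makedirs(path, exist_ok=True)
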
import re
import tool.util as util

key_list = util.get_key_list()

dict_dir = r"D:\semantic analysis\2016-10-09结果\词频1//"
for key in key_list:
    print(key)
    set_dict, file_name = util.get_objdict_list(dict_dir + key, ".txt")
    date_list = util.get_file_list(dict_dir + key, ".txt")
    pattern = re.compile(r"(\d*-\d*)-\d*")
    month_array = pattern.findall(" ".join(date_list))
    # Overridden below: only the year 2010 is processed here
    month_array = ["2010"]

    util.create_directory(r"D:\semantic analysis\2016-10-12结果\2010年频数统计//" +
                          key)

    # Loop over the matched months
    for month in month_array:
        pattern = re.compile("(" + month + r"-\d*-\d*)")
        date_array = pattern.findall(" ".join(date_list))
        print(date_array)
        # Merge the daily frequency dictionaries for this month
        r_dict = dict()
        for file_date in date_array:
            r_dict = util.union_dicts(set_dict[file_date + ".txt"], r_dict)
        util.save_dict_list(
            r_dict,
            r"D:\semantic analysis\2016-10-12结果\2010年频数统计//" + key + ".txt")
            # Mutual ratio: compare the two graphs against each other
            if mode == 1:
                r1, r2 = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))
                result_list.append((nw_list[ii][0:-4] + "\t" + str(r2)))
            # One-to-one comparison, saved per file pair
            elif mode == 0:
                result_list = com_function(copy.deepcopy(g1),
                                           copy.deepcopy(g2))
                util.save_file(
                    result_dir + key + "//" + nw_list[ii + lap][0:-4] + ".txt",
                    result_list)
            # n-to-one comparison
            elif mode == 2:
                r1 = com_function(copy.deepcopy(g1), copy.deepcopy(g2), type)
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))

            ii -= lap
        if mode != 0:
            result_list.reverse()
            util.save_file(result_dir + key + ".txt", result_list)


key_list = util.get_key_list() + util.get_key_list2()
pkl_dir = r"D:\semantic analysis\新结果\去虚词去单字\合成共现网络\{0}\p//"
# pkl_dir = r"D:\semantic analysis\新结果\合并图\{0}//"
result_dir = r"D:\semantic analysis\新结果\去虚词去单字\半年间边变化率//"
# result_dir = r"D:\semantic analysis\新结果\合并图\扯淡//"
loop_compare(cal_existed_edges_ratio, key_list, pkl_dir, result_dir, 2, 5)
# loop_compare(same_node_degree, key_list, pkl_dir, result_dir, 0)
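
# `loop_compare` and `cal_existed_edges_ratio` are defined elsewhere. A minimal
# sketch of the edge comparison, assuming it returns the fraction of the second
# graph's edges that already exist in the first graph; the exact definition
# behind the saved "edge change rate" results may differ.
def cal_existed_edges_ratio(g1, g2, *args):
    if g2.number_of_edges() == 0:
        return 0.0
    existed = sum(1 for u, v in g2.edges() if g1.has_edge(u, v))
    return existed / g2.number_of_edges()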