import tool.util as util


def main():
    # Directories: HTML-marked output, sentence-split new-word texts, new-word sets
    result_dir = r'D:\semantic analysis\2016-10-05结果\html标记分句2//'
    txt_dir = r"D:\semantic analysis\2016-10-05结果\新词分句//"
    set_dir = r"D:\semantic analysis\2016-10-05结果\新词//"
    k_list = util.get_key_list()
    for key in k_list:
        print(key)
        # Text files for this key
        file_list = sorted(util.get_file_list(txt_dir + key, ".txt"))
        # New-word set (.pkl) files for this key
        set_list = sorted(util.get_file_list(set_dir + key, ".pkl"))
        util.create_directory(result_dir + "新词//" + key + "//")
        i = 0
        while i < len(file_list):
            s_list = util.get_list_from_file(
                txt_dir + key + "//" + set_list[i][0:-4] + ".txt")
            new_word_list = util.get_nw(set_dir + key + "//" + set_list[i])
            # Drop duplicate sentences to avoid redundant computation
            s_list = list(set(s_list))
            w_list = remark(s_list, new_word_list, key)
            html_name = file_list[i][:-4] + '.html'
            util.save_file(result_dir + "新词//" + key + "//" + html_name, w_list)
            i += 1
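# `remark` is defined elsewhere in this project. Since its output is saved as
# .html files under "html标记分句2", a plausible reading is that it wraps each
# new word occurring in a sentence in an HTML tag so it is highlighted in the
# rendered page. The sketch below is an assumption: the <span> markup, the
# function name, and the unused `key` argument are illustrative only, not the
# project's actual implementation.
def remark_sketch(s_list, new_word_list, key):
    marked = []
    for sentence in s_list:
        line = sentence
        for word in new_word_list:
            if word in line:
                line = line.replace(
                    word, '<span style="color:red">' + word + '</span>')
        marked.append(line + '<br/>')
    return marked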
import time

import jieba
import networkx as nx

import tool.util as util


def main():
    k_list = util.get_key_list()
    jieba.set_dictionary(r"D:\semantic analysis\分词\词库\导出结果\dict1.txt")
    jieba.initialize()
    for key in k_list:
        print(key)
        file_list = util.get_file_list(
            'D://semantic analysis//analyze_data//fc//' + key, ".txt")
        # Create the output directories
        mk_dir('./w')
        mk_dir('./p')
        for n_file in file_list:
            s_list = util.get_list_from_file(n_file)
            # Drop duplicate sentences to avoid redundant computation
            print(len(s_list))
            s_list = list(set(s_list))
            print(len(s_list))
            wg = nx.Graph()
            pg = nx.Graph()
            for sentence in s_list:
                # Build the network over the whole sentence
                ll = util.input_filer(sentence)
                wg = add_s2g(wg, ' '.join(ll))
                # Build the network only over clauses containing the keyword
                for ss in ll:
                    if key in ss:
                        pg = add_s2g(pg, ss)
            pkl_name = n_file[:-4] + '.pkl'
            util.save_nw(pg, './/p//' + pkl_name)
            util.save_nw(wg, './/w//' + pkl_name)
            print(n_file)
            print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
            # Record progress so interrupted runs can be resumed
            with open('record.txt', 'a', encoding='utf-8') as rf:
                rf.write(n_file + '\n')
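# `add_s2g` ("add sentence to graph") is defined elsewhere. A minimal sketch of
# the usual word co-occurrence construction it appears to perform: split the
# space-separated, jieba-segmented sentence into words and connect every pair
# of distinct words that co-occur in it, accumulating an edge weight. The
# weighting scheme and any token filtering are assumptions, not the project's
# actual code.
import itertools


def add_s2g_sketch(g, sentence):
    # g is a networkx.Graph; sentence is a space-separated segmented string
    words = [w for w in sentence.split() if w]
    for u, v in itertools.combinations(set(words), 2):
        if g.has_edge(u, v):
            g[u][v]['weight'] += 1
        else:
            g.add_edge(u, v, weight=1)
    return g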
import tool.util as util


def main():
    # Directories: clause-level output and new-word input texts
    result_dir = r'D:\semantic analysis\新纯文本\1新词分句//'
    txt_dir = r"D:\semantic analysis\新纯文本\1新词//"
    k_list = util.get_key_list()
    for key in k_list:
        print(key)
        # Text files for this key
        file_list = util.get_file_list(txt_dir + key, ".txt")
        # Create the output directory
        # mk_dir(result_dir + "新词整句//" + key)
        mk_dir(result_dir + key)
        for file in file_list:
            s_list = util.get_list_from_file(txt_dir + key + "//" + file)
            # Drop duplicate sentences to avoid redundant computation
            # s_list = list(set(s_list))
            w_list, p_list = extract_sentence(s_list, key)
            util.save_file(result_dir + key + "//" + file, p_list, True)
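# `extract_sentence` is defined elsewhere and returns two lists (w_list, p_list).
# Judging from the directory names ("新词整句" whole sentences vs. "1新词分句"
# clause splits), a plausible reading is that w_list keeps the whole sentences
# containing the keyword while p_list keeps only the clauses containing it.
# The clause delimiters and the function layout below are assumptions.
import re


def extract_sentence_sketch(s_list, key):
    w_list, p_list = [], []
    for sentence in s_list:
        if key not in sentence:
            continue
        w_list.append(sentence)
        # Split the sentence into clauses on common Chinese punctuation
        for clause in re.split(r'[，。；！？、]', sentence):
            if key in clause:
                p_list.append(clause)
    return w_list, p_list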
import re

import tool.util as util

key_list = util.get_key_list()
dict_dir = r"D:\semantic analysis\2016-10-09结果\词频1//"
for key in key_list:
    print(key)
    set_dict, file_name = util.get_objdict_list(dict_dir + key, ".txt")
    date_list = util.get_file_list(dict_dir + key, ".txt")
    pattern = re.compile(r"(\d*-\d*)-\d*")
    month_array = pattern.findall(" ".join(date_list))
    # Restrict this run to the year 2010 (overrides the extracted months)
    month_array = ["2010"]
    util.create_directory(
        r"D:\semantic analysis\2016-10-12结果\2010年频数统计//" + key)
    # Loop over the periods (here only "2010")
    for month in month_array:
        pattern = re.compile("(" + month + r"-\d*-\d*)")
        date_array = pattern.findall(" ".join(date_list))
        print(date_array)
        # Merge the per-date frequency dictionaries for this period
        r_dict = dict()
        for file_date in date_array:
            r_dict = util.union_dicts(set_dict[file_date + ".txt"], r_dict)
        util.save_dict_list(
            r_dict,
            r"D:\semantic analysis\2016-10-12结果\2010年频数统计//" + key + ".txt")
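# `util.union_dicts` merges two word-frequency dictionaries. A minimal sketch
# of the usual count-merging behaviour (summing counts for shared words); the
# real helper may differ in details such as non-numeric values or key filtering.
def union_dicts_sketch(d1, d2):
    merged = dict(d2)
    for word, count in d1.items():
        merged[word] = merged.get(word, 0) + count
    return merged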
# Fragment: the inner part of loop_compare, which compares consecutive
# co-occurrence networks (g1, g2) with the supplied comparison function.
            # mutual ratio
            if mode == 1:
                r1, r2 = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))
                result_list.append(nw_list[ii][0:-4] + "\t" + str(r2))
            # one-to-one
            elif mode == 0:
                result_list = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                util.save_file(
                    result_dir + key + "//" + nw_list[ii + lap][0:-4] + ".txt",
                    result_list)
            # n-to-one
            elif mode == 2:
                r1 = com_function(copy.deepcopy(g1), copy.deepcopy(g2), type)
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))
            ii -= lap
        if mode != 0:
            result_list.reverse()
            util.save_file(result_dir + key + ".txt", result_list)


key_list = util.get_key_list() + util.get_key_list2()
pkl_dir = r"D:\semantic analysis\新结果\去虚词去单字\合成共现网络\{0}\p//"
# pkl_dir = r"D:\semantic analysis\新结果\合并图\{0}//"
result_dir = r"D:\semantic analysis\新结果\去虚词去单字\半年间边变化率//"
# result_dir = r"D:\semantic analysis\新结果\合并图\扯淡//"
loop_compare(cal_existed_edges_ratio, key_list, pkl_dir, result_dir, 2, 5)
# loop_compare(same_node_degree, key_list, pkl_dir, result_dir, 0)
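# `cal_existed_edges_ratio` is one of the comparison functions passed to
# loop_compare. Given that the output directory is "半年间边变化率" (half-year
# edge change ratio), a plausible reading is: the share of edges of one network
# that already exist in the other. This sketch, including the optional third
# argument corresponding to `type` in mode 2, is an assumption, not the real code.
def cal_existed_edges_ratio_sketch(g_new, g_old, type=None):
    # g_new and g_old are networkx graphs loaded from the .pkl network files
    if g_new.number_of_edges() == 0:
        return 0.0
    shared = sum(1 for u, v in g_new.edges() if g_old.has_edge(u, v))
    return shared / g_new.number_of_edges()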