def cal_mcs(pkl_dir, mcs_dir, is_front, key_word, lap=1):
    f_list = util.get_file_list(pkl_dir, '.pkl')
    os.chdir(pkl_dir)
    # sort in ascending (chronological) order
    nw_list = sorted(f_list)
    record_list = []
    num_list = []
    enum_list = []
    ii = len(nw_list) - 1
    # g2 is the later network, g1 the earlier one; start from the last network
    g2 = util.get_nw(nw_list[ii])
    # iteratively fold the preceding networks into the subgraph
    k = 1
    while k < lap:
        g2 = mcs(g2, util.get_nw(nw_list[ii - k]))
        k += 1
    while ii - lap >= 0:
        jj = ii
        ii -= lap
        # print(nw_list[ii])
        g1 = util.get_nw(nw_list[ii])
        # iteratively fold the preceding networks into the subgraph
        k = 1
        while k < lap:
            g1 = mcs(g1, util.get_nw(nw_list[ii - k]))
            k += 1
        # build the connected common subgraph; keep g1 intact so the
        # is_front branch below can compare against the earlier network
        g_sub = mcs(g2, g1)
        # build the output filename
        filename = nw_list[ii][0:-4] + '-' + nw_list[jj][0:-4] + '.pkl'
        # 2016-09-20: save intermediate results for testing
        util.save_nw(
            g_sub,
            'D://semantic analysis//nalyze_data//result//过程结果//连通子图//' + filename)
        if is_front:
            # ratio of the common subgraph of 1 and 2 to network 1
            pr = mcs_ratio(g_sub, g1, key_word)
            record_list.append(nw_list[jj][0:-4] + '\t' + str(pr))
        else:
            # ratio of the common subgraph of 1 and 2 to network 2
            pr = mcs_ratio(g_sub, g2, key_word)
            record_list.append(nw_list[jj][0:-4] + '\t' + str(pr))
        num_list.append(nw_list[jj][0:-4] + '\t' + str(g_sub.number_of_nodes()))
        enum_list.append(nw_list[jj][0:-4] + '\t' + str(g_sub.number_of_edges()))
        # count nodes
        # with open(mcs_dir + filename[0:-4]+'.txt','w',encoding='utf-8') as file:
        #     for node in g_sub.nodes():
        #         file.write(node+'\n')
        # util.save_nw(g_sub, mcs_dir + filename)
        g2 = g_sub
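# `mcs` and `mcs_ratio` are project helpers not defined in this snippet. A
# minimal sketch consistent with how they are used above (a node-intersection
# "common subgraph" and a node-count ratio) — an assumption about their
# behavior, not the project's actual implementation:
#
# def mcs(g1, g2):
#     common = set(g1.nodes()) & set(g2.nodes())
#     return g1.subgraph(common).copy()
#
# def mcs_ratio(sub, g, key_word):
#     return sub.number_of_nodes() / g.number_of_nodes()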
def loop_compare(com_function, keyword_list, pkl_dir1, result_dir,
                 mode=1, lap=1, type="pkl"):
    for key in keyword_list:
        # record the current keyword in a module-level global used elsewhere
        global keyword
        keyword = key
        print(key)
        if mode == 0:
            util.create_directory(result_dir + key + "//")
        pkl_dir = pkl_dir1.format(key)
        f_list = util.get_file_list(pkl_dir, '.pkl')
        os.chdir(pkl_dir)
        result_list = []
        # sort in ascending (chronological) order
        nw_list = sorted(f_list)
        ii = len(nw_list) - 1
        while ii - 2 * lap >= 0:
            g2 = util.get_nw(nw_list[ii])
            # iteratively fold the preceding networks into the subgraph
            # k = 1
            # while k < lap:
            #     g2 = nx.compose(g2, util.get_nw(nw_list[ii - k]))
            #     k += 1
            ii -= lap
            g1 = util.get_nw(nw_list[ii])
            # iteratively fold the preceding networks into the subgraph
            # k = 1
            # while k < lap:
            #     g1 = nx.compose(g1, util.get_nw(nw_list[ii - k]))
            #     k += 1
            # build the connected common subgraph
            # mutual ratios
            if mode == 1:
                r1, r2 = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))
                result_list.append(nw_list[ii][0:-4] + "\t" + str(r2))
            # one-to-one
            elif mode == 0:
                result_list = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                util.save_file(
                    result_dir + key + "//" + nw_list[ii + lap][0:-4] + ".txt",
                    result_list)
            # n-to-one
            elif mode == 2:
                r1 = com_function(copy.deepcopy(g1), copy.deepcopy(g2), type)
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))
            ii -= lap
        if mode != 0:
            result_list.reverse()
            util.save_file(result_dir + key + ".txt", result_list)
def loop_compare(com_function, keyword_list, pkl_dir1, result_dir,
                 mode=1, lap=1, type="pkl"):
    for key in keyword_list:
        print(key)
        if mode == 0:
            util.create_directory(result_dir + key + "//")
        pkl_dir = pkl_dir1.format(key)
        f_list = util.get_file_list(pkl_dir, '.txt')
        os.chdir(pkl_dir)
        result_list = []
        # sort in ascending (chronological) order
        nw_list = sorted(f_list)
        ii = len(nw_list) - 1
        while ii - 2 * lap >= 0:
            g2 = util.txt2dict(util.get_list_from_file(nw_list[ii]))
            # iteratively fold the preceding networks into the subgraph
            # note: g2 is a dict here, so this compose loop is only safe when lap == 1
            k = 1
            while k < lap:
                g2 = nx.compose(g2, util.get_nw(nw_list[ii - k]))
                k += 1
            ii -= lap
            g1 = util.txt2dict(util.get_list_from_file(nw_list[ii]))
            d1 = util.get_nw(
                "D:\semantic analysis\新结果\去虚词去单字共现网络//{0}//p//".format(key)
                + nw_list[ii].split(".")[0] + ".pkl")
            # iteratively fold the preceding networks into the subgraph
            k = 1
            while k < lap:
                g1 = nx.compose(g1, util.get_nw(nw_list[ii - k]))
                k += 1
            # build the connected common subgraph
            if mode == 1:
                r1, r2 = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))
                result_list.append(nw_list[ii][0:-4] + "\t" + str(r2))
            elif mode == 0:
                result_list = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                util.save_file(
                    result_dir + key + "//" + nw_list[ii + lap][0:-4] + ".txt",
                    result_list)
            elif mode == 2:
                r1 = com_function(copy.deepcopy(g1), copy.deepcopy(g2), d1)
                # result_list.append(str(r1))
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))
            ii -= lap
        if mode != 0:
            result_list.reverse()
            util.save_file(result_dir + key + ".txt", result_list)
def cal_mcs(pkl_dir):
    os.chdir(pkl_dir)
    pkl_list = util.get_file_list(pkl_dir, '.pkl')
    # sort in ascending order
    pkl_list = sorted(pkl_list)
    m_set = util.get_nw(r"D:\semantic analysis\2016-10-03结果\达人//上升集合1.pkl")
    for file in pkl_list:
        set1 = util.get_nw(file)
        # fraction of the reference set that also appears in each daily set
        r_set = m_set & set1
        print(len(r_set) / len(m_set))
def main1(keyword):
    dirr = 'D:\semantic analysis\pNet1\\' + keyword + '//p//'
    r_dir = 'D:\semantic analysis//3次采集结果\连续比例4//'
    pkl_list = util.get_file_list(dirr, '.pkl')
    pkl_list = sorted(pkl_list)
    for pkl in pkl_list:
        print(pkl)
    ll = len(pkl_list) - 1
    ii = ll
    g = util.get_nw(dirr + '\\' + pkl_list[ii])
    r_list = []
    n_list = []
    # build the common subgraph of the last five graphs
    while ii >= ll - 3:
        ii -= 1
        g2 = util.get_nw(dirr + '\\' + pkl_list[ii])
        g = mcs(g2, g)
        print(pkl_list[ii] + '\t' + str(g.number_of_nodes()))
    ii = len(pkl_list) - 1
    while ii > 0:
        ii -= 1
        g2 = util.get_nw(dirr + '\\' + pkl_list[ii])
        rr, nn = mcs_ratio_advanced(g2, g, keyword)
        r_list.append(pkl_list[ii][0:-4] + '\t' + str(rr))
        n_list.append(pkl_list[ii][0:-4] + '\t' + str(nn))
    util.save_file(r_dir + keyword + '.txt', r_list)
    util.save_file(r_dir + 'n' + keyword + '.txt', n_list)


# for key in key_list:
#     print(key)
#     main1(key)

# g2 = util.get_nw('D:\semantic analysis\c_date\给力\p//2011-03-31.pkl')
# g1 = util.get_nw('D:\semantic analysis\c_date\给力\p//2011-03-30.pkl')
# g1 = util.get_nw('D:\semantic analysis\c_date\给力\p//2011-03-29.pkl')
# g4 = util.get_nw('D:\semantic analysis\c_date\给力\p//2011-03-28.pkl')
#
# g5 = mcs(g2, g1)
# g6 = mcs(g1, g5)
# g7 = mcs(g4, g6)
# r = g1.number_of_nodes() / g2.number_of_nodes()
# print("node count: " + str(g2.number_of_nodes()))
# print("node count: " + str(g1.number_of_nodes()))
# print("nodes in the common subgraph: " + str(g5.number_of_nodes()))
# print("nodes in the common subgraph: " + str(g6.number_of_nodes()))
# print("nodes in the common subgraph: " + str(g7.number_of_nodes()))
# print("ratio 1: " + str(g1.number_of_nodes() / g2.number_of_nodes()))
# print("ratio 2: " + str(g1.number_of_nodes() / g1.number_of_nodes()))
def cal_union_set(keyword, pkl_dir):
    set_list = sorted(util.get_file_list(pkl_dir, "pkl"))
    r_list = []
    d_list = []
    i = 0
    r_set0 = set()
    # cumulative union size per snapshot
    while i < len(set_list):
        r_set0 = r_set0 | util.get_nw(pkl_dir + set_list[i])
        r_list.append(set_list[i][0:-4] + '\t' + str(len(r_set0)))
        d_list.append(len(r_set0))
        i += 1
    i = 0
    r_list2 = []
    d_list2 = []
    # per-step growth of the union
    while i < len(r_list) - 1:
        r_list2.append(set_list[i][0:-4] + '\t' + str(d_list[i + 1] - d_list[i]))
        d_list2.append(d_list[i + 1] - d_list[i])
        i += 1
    # growth ratio, skipping the first 15 snapshots
    i = 15
    r_list3 = []
    while i < len(d_list2):
        r_list3.append(set_list[i][0:-4] + '\t' + str(d_list2[i] / d_list[i]))
        i += 1
    return r_list, r_list2, r_list3
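# Usage sketch for cal_union_set (the paths are illustrative placeholders):
# r1 holds the cumulative union size per snapshot, r2 the per-step growth,
# and r3 the growth ratio from the 16th snapshot onward.
#
# r1, r2, r3 = cal_union_set("感觉", r"D:\semantic analysis\常用词的分词集合\感觉\\")
# util.save_file(r"D:\result\感觉union.txt", r1)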
def cal_mcs(div, num, pkl_dir):
    os.chdir(pkl_dir)
    pkl_list = util.get_file_list(pkl_dir, '.pkl')
    # sort in ascending order
    pkl_list = sorted(pkl_list)
    # part_num = len(pkl_list) // div
    part_num = 200
    print("part_num")
    print(part_num)
    if part_num < num:
        print("invalid parameters")
        return
    i = 0
    set_list = []
    # loop over the blocks
    while i < div:
        # merge the first `num` sets within this block
        j = 0
        r_set = set()
        while j < num:
            pkl_num = part_num * i + j
            set1 = util.get_nw(pkl_list[pkl_num])
            r_set = set1 | r_set
            j += 1
        # append the merged result to the list
        set_list.append(r_set)
        print(len(r_set))
        i += 1
def loop_compare(keyword_list, pkl_dir1, txt_dir1, result_dir, mode=1, lap=1):
    for key in keyword_list:
        print(key)
        if mode == 0:
            util.create_directory(result_dir + key + "//")
        pkl_dir = pkl_dir1.format(key)
        txt_dir = txt_dir1.format(key)
        # get the list of dates
        d_list = util.get_file_list(pkl_dir, '.pkl')
        d_list = [d.split(".")[0] for d in d_list]
        result_list = []
        # sort in ascending order
        d_list = sorted(d_list)
        ii = len(d_list) - 1
        while ii - lap >= 0:
            g1 = get_core_graph(pkl_dir + d_list[ii] + ".pkl")
            d1 = get_txt_dict(txt_dir + d_list[ii] + ".txt")
            # iteratively fold the preceding networks into the subgraph
            k = 1
            while k < lap:
                g1 = nx.compose(g1, util.get_nw(d_list[ii - k]))
                k += 1
            result_list.append(compare_function(d1, g1))
            ii -= lap
        util.save_file(result_dir + key + ".txt", result_list)
def main():
    # directories where the results are saved
    result_dir = r'D:\semantic analysis\2016-10-05结果\html标记分句2//'
    txt_dir = r"D:\semantic analysis\2016-10-05结果\新词分句//"
    set_dir = r"D:\semantic analysis\2016-10-05结果\新词//"
    k_list = util.get_key_list()
    for key in k_list:
        print(key)
        # sentence files
        file_list = sorted(util.get_file_list(txt_dir + key, ".txt"))
        # set files
        set_list = sorted(util.get_file_list(set_dir + key, ".pkl"))
        util.create_directory(result_dir + "新词//" + key + "//")
        i = 0
        while i < len(file_list):
            s_list = util.get_list_from_file(
                txt_dir + key + "//" + set_list[i][0:-4] + ".txt")
            new_word_list = util.get_nw(set_dir + key + "//" + set_list[i])
            # drop duplicate sentences to avoid counting them twice
            s_list = list(set(s_list))
            w_list = remark(s_list, new_word_list, key)
            html_name = file_list[i][:-4] + '.html'
            util.save_file(result_dir + "新词//" + key + "//" + html_name, w_list)
            i += 1
def count_num_of_node(pkl_dir):
    pkl_file_list = util.get_file_list(pkl_dir, '.pkl')
    r_list = []
    for file in pkl_file_list:
        g = util.get_nw(file)
        # the first 10 characters of the filename are the date
        s = file[0:10] + '\t' + str(g.number_of_nodes())
        r_list.append(s)
    return r_list
def main1():
    # date_list = ["2012-08-05","2011-04-05","2011-03-28","2011-10-20","2012-12-30","2011-07-30","2011-06-09","2012-02-05","2012-12-16","2011-08-01","2011-05-19","2013-09-01","2012-08-01","2013-12-01"]
    # key_list = ["吐槽","纠结","淡定","自拍","正能量","山寨","达人","腹黑","接地气","扯淡","闷骚","不明觉厉","完爆","人艰不拆"]
    date_list = [
        "2013-12-31", "2013-12-31", "2013-12-31", "2013-12-31", "2013-12-31",
        "2013-12-31", "2013-12-31", "2013-12-31", "2013-12-31", "2013-12-31",
        "2013-12-31"
    ]
    key_list = [
        '努力', '感觉', '简单', '无聊', '希望', '美好', '气质', '害怕', '喜欢',
        '不约而同', '喜闻乐见',
    ]
    # directories where the results are saved
    result_dir = r'D:\semantic analysis\2016-10-09结果\html标记结果//'
    txt_dir = r"D:\semantic analysis\纯文本\常用词分句//"
    set_dir = r"D:\semantic analysis\2016-10-09结果\中间结果//"
    i = 0
    while i < len(key_list):
        key = key_list[i]
        print(key)
        # sentence files
        file_list = sorted(util.get_file_list(txt_dir + key, ".txt"))
        # set files
        set_dir_list = util.get_file_list(set_dir + key, ".pkl")
        set_list = []
        for set_list_dir in set_dir_list:
            set_list.append(util.get_nw(set_dir + key + "//" + set_list_dir))
            print(set_list_dir)
        util.create_directory(result_dir + key + "//")
        rr = cal_index2(date_list[i], txt_dir + key_list[i])
        j = 0
        # for each segment
        while j < len(rr):
            k = 0
            while k < rr[j]:
                print(file_list[k][:-4])
                print(rr[j])
                txt_list = util.get_list_from_file(
                    txt_dir + key + "//" + file_list[k])
                w_list = remark(txt_list, set_list[j], key)
                html_name = file_list[k][:-4] + '.html'
                util.save_file(result_dir + key + "//" + html_name, w_list)
                k += 1
            j += 1
        i += 1
def loop_compare(com_function, keyword_list, pkl_dir1, result_dir,
                 mode=1, lap=1, type="pkl"):
    for keyword in keyword_list:
        pkl_dir = pkl_dir1.format(keyword)
        f_list = util.get_file_list(pkl_dir, '.pkl')
        os.chdir(pkl_dir)
        # sort in ascending (chronological) order
        nw_list = sorted(f_list)
        record_list = []
        ii = len(nw_list) - 1
        # g2 is the later network, g1 the earlier one; start from the last network
        g2 = util.get_nw(nw_list[ii])
        # iteratively fold the preceding networks into the subgraph
        k = 1
        while k < lap:
            g2 = nx.compose(g2, util.get_nw(nw_list[ii - k]))
            k += 1
        while ii - lap >= 0:
            jj = ii
            ii -= lap
            # print(nw_list[ii])
            g1 = util.get_nw(nw_list[ii])
            # iteratively fold the preceding networks into the subgraph
            k = 1
            while k < lap:
                g1 = nx.compose(g1, util.get_nw(nw_list[ii - k]))
                k += 1
            # compute the ratio
            r1 = com_function(g1, g2)
            record_list.append(nw_list[jj][0:-4] + '\t' + str(r1))
            g2 = g1
        record_list.reverse()
        util.save_file(result_dir + keyword + ".txt", record_list)
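# Usage sketch for this variant of loop_compare, with a hypothetical comparison
# function (the directory templates below are placeholders, not project paths):
#
# def node_overlap_ratio(g1, g2):
#     common = set(g1.nodes()) & set(g2.nodes())
#     return len(common) / g2.number_of_nodes()
#
# loop_compare(node_overlap_ratio, util.get_key_list(),
#              "D:\\pkl\\{0}\\p//", "D:\\result//", lap=1)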
def cal_node_mcs(pkl_dir, mcs_dir, key_word, lap=2):
    f_list = util.get_file_list(pkl_dir, '.pkl')
    os.chdir(pkl_dir)
    # sort in ascending order
    nw_list = sorted(f_list)
    record_list = []
    num_list = []
    enum_list = []
    ii = len(nw_list) - 1
    while (ii - lap + 1) >= 0:
        # print(nw_list[ii])
        g1 = util.get_nw(nw_list[ii])
        # iteratively fold the preceding networks into the subgraph
        k = 1
        while k < lap:
            g1 = mcs(g1, util.get_nw(nw_list[ii - k]))
            k += 1
        # build the output filename
        filename = nw_list[ii][0:-4] + '.pkl'
        # save the result (note: this rebinds the pkl_dir parameter)
        pkl_dir = r"D:\semantic analysis\公共子图节点数\新词\30公共子图//" + key_word + "//"
        util.create_directory(pkl_dir)
        util.save_nw(g1, pkl_dir + nw_list[ii][0:-4])
        num_list.append(nw_list[ii][0:-4] + '\t' + str(g1.number_of_nodes()))
        enum_list.append(nw_list[ii][0:-4] + '\t' + str(g1.number_of_edges()))
        # count nodes
        # with open(mcs_dir + filename[0:-4]+'.txt','w',encoding='utf-8') as file:
        #     for node in g1.nodes():
        #         file.write(node+'\n')
        # util.save_nw(g1, mcs_dir + filename)
        ii -= lap
    # util.save_file(mcs_dir + key_word+'mcs.txt', record_list)
    util.save_file(mcs_dir + 'n' + key_word + 'mcs.txt', num_list)
    util.save_file(mcs_dir + 'e' + key_word + 'mcs.txt', enum_list)
def loop_key2(pkl_dir, result_dir, key_word, lap=1):
    pkl_dir = pkl_dir.format(key_word)
    f_list = util.get_file_list(pkl_dir, '.pkl')
    os.chdir(pkl_dir)
    # sort in ascending order
    nw_list = sorted(f_list)
    ii = 0
    # start from the earliest network and walk forward
    g1 = util.get_nw(nw_list[ii])
    util.create_directory(result_dir + key_word)
    while ii < len(nw_list) - lap:
        ii += lap
        g2 = util.get_nw(nw_list[ii])
        # build the output filename
        filename = nw_list[ii][0:-4] + '.txt'
        result_list = extract_new_nodes_attributes(g1, g2)
        util.save_file(result_dir + key_word + "//" + filename, result_list)
        # accumulate everything seen so far into g1
        g1 = nx.compose(g1, g2)
def cal_mcs(pkl_dir):
    os.chdir(pkl_dir)
    pkl_list = util.get_file_list(pkl_dir, '.pkl')
    # sort in ascending order
    pkl_list = sorted(pkl_list)
    r_set = set()
    r_set1 = set()
    i = 0
    # union of the first 40 sets and union of the last 40 sets
    while i < 40:
        set1 = util.get_nw(pkl_list[i])
        set2 = util.get_nw(pkl_list[len(pkl_list) - 1 - i])
        r_set = set1 | r_set
        r_set1 = set2 | r_set1
        i += 1
    print(len(r_set))
    print(len(r_set1))
    # overlap between the two unions
    r_set2 = r_set & r_set1
    print(len(r_set2))
    print(len(r_set2) / len(r_set))
def loop_key(pkl_dir, result_dir, key_word, lap=1):
    pkl_dir = pkl_dir.format(key_word)
    f_list = util.get_file_list(pkl_dir, '.pkl')
    os.chdir(pkl_dir)
    # sort in ascending order
    nw_list = sorted(f_list)
    ii = len(nw_list) - 1
    # g2 is the later network, g1 the earlier one; start from the last network
    g2 = util.get_nw(nw_list[ii])
    util.create_directory(result_dir + key_word)
    while ii > 0:
        jj = ii
        ii -= lap
        # print(nw_list[ii])
        g1 = util.get_nw(nw_list[ii])
        # build the output filename
        filename = nw_list[ii][0:-4] + '-' + nw_list[jj][0:-4] + '.txt'
        result_list = cal_connect_real_probability(g1, g2, key_word)
        util.save_file(result_dir + key_word + "//" + filename, result_list)
        g2 = g1
def get_core_graph(pkl_dir):
    g = util.get_nw(pkl_dir)
    # with no k given, k_core returns the maximal core of the graph
    return nx.k_core(g)
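# Illustrative check of the default k_core behavior (not project code): every
# node of a complete graph on 4 nodes has degree 3, so the maximal core is the
# 3-core, i.e. the whole graph.
#
# import networkx as nx
# nx.k_core(nx.complete_graph(4)).number_of_nodes()  # -> 4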
import tool.util as util
import os

key_list = util.get_key_list2()
for keyword in key_list:
    print(keyword)
    dirr = 'D:\semantic analysis\常用词的分词集合\\' + keyword
    os.chdir(dirr)
    pkl_list = util.get_file_list(dirr, '.pkl')
    pkl_list = sorted(pkl_list)
    util.create_directory(r"D:\semantic analysis\2016-10-05结果//" + keyword)
    i = 0
    s = util.get_nw(pkl_list[0])
    while i < len(pkl_list):
        # words seen so far, plus today's words
        s1 = util.get_nw(pkl_list[i]) | s
        # words that appear for the first time today
        s2 = s1 - s
        print(len(s2))
        util.save_nw(
            s2,
            r"D:\semantic analysis\2016-10-05结果//" + keyword + "//" + pkl_list[i])
        s = s1
        i += 1
import tool.util as util
import networkx as nx

file_path = "D:\semantic analysis\新结果\共现网络\{0}\p//"
result_path = "D:\semantic analysis\结果\度分布比率\{0}//"
# file_path = "D:\semantic analysis\分词网络\pNet\{0}\p//"
keyword_list = ["美好"]
for keyword in keyword_list:
    # current working path
    cur_path = file_path.format(keyword)
    # get the file list
    file_list = sorted(util.get_file_list(cur_path, ".pkl"))
    for file in file_list:
        item = file.split(".")[0] + "\t"
        g = util.get_nw(cur_path + file)
        # average shortest path length over all node pairs
        item += str(nx.average_shortest_path_length(g))
        # average clustering coefficient
        item = item + "\t" + str(nx.average_clustering(g))
        # item = item + "\t" + str(nx.average_neighbor_degree(g))
        # item = item + "\t" + str(nx.average_degree_connectivity(g))
        item = item + "\t" + str(nx.diameter(g))
        # item = item + "\t" + str(nx.degree_centrality(g))
        print(item)
        # all degrees
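# Note: nx.average_shortest_path_length and nx.diameter raise an error on a
# disconnected graph. A hedged sketch of restricting to the largest connected
# component first (an assumption; these snapshots may already be connected):
#
# if not nx.is_connected(g):
#     g = g.subgraph(max(nx.connected_components(g), key=len)).copy()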
import networkx as nx
import tool.util as util
import os

dir = "D:\semantic analysis\新结果\去虚词去单字共现网络\纠结\p//"
os.chdir(dir)
file_list = util.get_file_list(dir, "pkl")
s1 = set()
l1 = list()
sg = util.get_nw(file_list[0])
# union (compose) of the first 20 networks
for file in file_list[0:20]:
    g = util.get_nw(file)
    sg = nx.compose(g, sg)
gs = set(sg.nodes())
print("gs ", end=" ")
print(len(gs))
# k-core node sets of the first 20 networks, as a set and as a list
for file in file_list[0:20]:
    g = util.get_nw(file)
    s1 = s1 | set(nx.k_core(g).nodes())
    l1.extend(nx.k_core(g).nodes())
print("s1 " + str(len(s1)))
print("l1 " + str(len(l1)))
s2 = set()
l2 = list()
# k-core node sets of the last 50 networks
for file in file_list[-50:]:
    g = util.get_nw(file)
    s2 = s2 | set(nx.k_core(g).nodes())
    core_num_dict = nx.core_number(graph)
    max_num = max(core_num_dict.values())
    # convert core numbers to layers counted from the innermost core:
    # a node in the maximal core gets 1; nodes missing from the dict get 0
    for node in node_list:
        temp_list.append(max_num + 1 - core_num_dict.get(node, max_num + 1))
    temp_list.append(max_num)
    result_list.append(temp_list)
    # "总层数": total number of layers
    result_list[0].append("总层数")
    return result_list


index = 0

if __name__ == '__main__':
    keyword_list = util.get_key_list2() + util.get_key_list()
    # keyword_list = ["纠结"]
    workbook = xlsxwriter.Workbook(
        "D:\semantic analysis\新结果\去虚词去单字/kcore共现网络变化整体{0}.xlsx".format(
            str(index)))
    for keyword in keyword_list:
        dd = "D:\semantic analysis\新结果\去虚词去单字\共现网络\{0}\p//".format(keyword)
        i_file = r"D:\semantic analysis\新结果\去虚词去单字\总网络\{0}\p//".format(keyword)
        i_f = util.get_file_list(i_file, ".pkl")[0]
        nn = util.get_file_list(dd, ".pkl")[index]
        print(keyword)
        ig = util.get_nw(i_file + i_f)
        rl = loop_compare(ig, dd)
        calculate_attribute(rl)
        create_excel(workbook, rl, keyword)
    workbook.close()
            ss = num_dict.get(d, set())
            ss.add(ori_node)
            num_dict[d] = ss
            result_dict[d] = result_dict.get(d, 0) + 1
    keys = list(result_dict.keys())
    keys.sort()
    result_list = list()
    for k in keys:
        result_list.append(
            str(k) + "\t" + str(result_dict.get(k, 0) / len(num_dict[k])))
    return result_list


# Attributes of newly added nodes: word frequency.
# g1 and g2 are the two networks under study; g1 is the earlier, g2 the later.
word_freq_dict = util.get_nw(
    r"D:\semantic analysis\新结果\去虚词去单字\2017-4-9整理\词频分布\词频1.pkl")


def extract_new_nodes_attributes2(g1, g2, key_word):
    s1 = set(g1.nodes())
    s = set(g2.nodes()) - s1
    s1.remove(key_word)
    # nodes of the new network that are connected to the newly added nodes
    ori_list = list()
    for word in s:
        ori_list.extend(list(nx.all_neighbors(g2, word)))
    # result dict for the distribution
    result_dict = dict()
    for ori_node in ori_list:
        if ori_node in s1:
            d = word_freq_dict.get(ori_node, 0)
import tool.util as util

name = "046.pkl"
print(name)
s1 = util.get_nw(r"D:\semantic analysis\2016-10-09结果\最后结果\纠结\\" + name)
print(s1)
# Connect the nodes of the window together: each node is linked to the next two.
def add_gram_edges(self, nodes):
    i = 0
    j = 0
    while i < len(nodes):
        j = i
        while j < i + 2 and j < len(nodes) - 1:
            j += 1
            self.add_edge(nodes[i], nodes[j])
        i += 1
    # self.add_edge(nodes[len(nodes)-3], nodes[len(nodes)-1])


# Build and return the generated network.
def get_network(self):
    # note: nx.from_numpy_matrix was removed in NetworkX 3.0
    # (use nx.from_numpy_array on newer versions)
    g = nx.from_numpy_matrix(self.matrix)
    return nx.relabel_nodes(g, self.get_num2word_dict())


if __name__ == '__main__':
    # word_set = []
    # for i in range(10):
    #     word_set.append(str(i))
    # print(word_set)
    # mm = MatrixNetwork(word_set)
    # mm.add_gram_edges(word_set)
    import tool.util as util
    mm = util.get_nw(r"D:\semantic analysis\结果\测试网络\美好\p//2009-09-22.pkl")
    util.show_nw(mm)
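# Worked example of add_gram_edges (illustrative): for nodes = [w0, w1, w2, w3]
# the loop adds the edges (w0,w1), (w0,w2), (w1,w2), (w1,w3), (w2,w3), i.e.
# each word is linked to the following two words — a sliding window of width 3.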
import tool.util as util
import os

key_list = util.get_key_list2()
for keyword in key_list:
    print(keyword)
    dirr = 'D:\semantic analysis\常用词的分词集合\\' + keyword
    os.chdir(dirr)
    pkl_list = util.get_file_list(dirr, '.pkl')
    pkl_list = sorted(pkl_list)
    util.create_directory(r"D:\semantic analysis\常用词的分词集合1//" + keyword)
    i = 0
    s = util.get_nw(pkl_list[0])
    while i < len(pkl_list) - 1:
        s1 = util.get_nw(pkl_list[i + 1])
        # intersection of two consecutive daily sets
        util.save_nw(
            s & s1,
            r"D:\semantic analysis\常用词的分词集合1//" + keyword + "//" + pkl_list[i])
        s = s1
        i += 1
import tool.util as util

dir = r"D:\semantic analysis\常用词的分词集合\感觉\\"
set_list = sorted(util.get_file_list(dir, "pkl"))
# for file in set_list:
#     print(len(util.get_nw(dir+file)))
i = 0
r_set0 = set()
# cumulative union size over all sets
while i < len(set_list):
    r_set0 = r_set0 | util.get_nw(dir + set_list[i])
    print(len(r_set0))
    i += 1

# r_set = set()
# while i < 200:
#     r_set = r_set | util.get_nw(dir+set_list[i])
#     i += 1
#
# r_set2 = set()
# while i < 210:
#     r_set2 = r_set2 | util.get_nw(dir+set_list[i])
#     i += 1
#
# r_set3 = r_set & r_set2 & r_set0
# print(len(r_set0))
# print(len(r_set))
# print(len(r_set2))
# print(len(r_set3))
# util.save_nw(r_set3, r"D:\semantic analysis\2016-10-03结果\达人//上升集合2.pkl")
"2013-12-31", "2013-12-31", "2013-12-31", "2013-12-31", "2013-12-31", "2013-12-31" ] key_list = util.get_key_list2() k = 0 for key in key_list: print(key) dir = "D:\semantic analysis\常用词的分词集合//" # index_list = cal_index2(date_list[k], dir+key) index_list = [100, 125, 150] print(index_list) k += 1 file_list = util.get_file_list(dir + key, ".pkl") set_list = [] # 获取目录下所有set集合 os.chdir(dir + key) for file in file_list: set_list.append(util.get_nw(file)) # print(len(set_list)) rd_list, r_list = cal_difference(index_list, set_list) r_dir = r"D:\semantic analysis\2016-10-09结果\中间结果//" util.create_directory(r_dir + key) i = 0 while i < len(rd_list): print(len(rd_list[i])) print(len(r_list[i])) print(len(rd_list[i]) / len(r_list[i])) # util.save_nw(r_set, r_dir+key+"//"+str(index_list[i]).zfill(3)+".pkl") i += 1