def loop_compare(keyword_list, pkl_dir1, txt_dir1, result_dir, mode=1, lap=1):
    for key in keyword_list:
        print(key)
        if mode == 0:
            util.create_directory(result_dir + key + "//")
        pkl_dir = pkl_dir1.format(key)
        txt_dir = txt_dir1.format(key)
        # Get the list of dates
        d_list = util.get_file_list(pkl_dir, '.pkl')
        d_list = [d.split(".")[0] for d in d_list]
        result_list = []
        # Sort in ascending order
        d_list = sorted(d_list)
        ii = len(d_list) - 1
        while ii - lap >= 0:
            g1 = get_core_graph(pkl_dir + d_list[ii] + ".pkl")
            d1 = get_txt_dict(txt_dir + d_list[ii] + ".txt")
            # Iteratively compose the subgraphs of the preceding days
            k = 1
            while k < lap:
                # d_list holds bare date strings, so the directory and
                # extension must be re-attached before loading
                g1 = nx.compose(g1, util.get_nw(pkl_dir + d_list[ii - k] + ".pkl"))
                k += 1
            result_list.append(compare_function(d1, g1))
            ii -= lap
        util.save_file(result_dir + key + ".txt", result_list)
def main():
    # Directory where the results are saved
    result_dir = r'D:\semantic analysis\2016-10-05结果\html标记分句2//'
    txt_dir = r"D:\semantic analysis\2016-10-05结果\新词分句//"
    set_dir = r"D:\semantic analysis\2016-10-05结果\新词//"
    k_list = util.get_key_list()
    for key in k_list:
        print(key)
        # Text files
        file_list = sorted(util.get_file_list(txt_dir + key, ".txt"))
        # Word-set files
        set_list = sorted(util.get_file_list(set_dir + key, ".pkl"))
        util.create_directory(result_dir + "新词//" + key + "//")
        i = 0
        while i < len(file_list):
            s_list = util.get_list_from_file(txt_dir + key + "//" + set_list[i][0:-4] + ".txt")
            new_word_list = util.get_nw(set_dir + key + "//" + set_list[i])
            # Drop duplicate sentences to avoid counting them twice
            s_list = list(set(s_list))
            w_list = remark(s_list, new_word_list, key)
            html_name = file_list[i][:-4] + '.html'
            util.save_file(result_dir + "新词//" + key + "//" + html_name, w_list)
            i += 1
def loop_compare(com_function, keyword_list, pkl_dir1, result_dir, mode=1, lap=1, type="pkl"):
    for key in keyword_list:
        global keyword
        keyword = key
        print(key)
        if mode == 0:
            util.create_directory(result_dir + key + "//")
        pkl_dir = pkl_dir1.format(key)
        f_list = util.get_file_list(pkl_dir, '.pkl')
        os.chdir(pkl_dir)
        result_list = []
        # Sort in ascending order
        nw_list = sorted(f_list)
        ii = len(nw_list) - 1
        while ii - 2 * lap >= 0:
            g2 = util.get_nw(nw_list[ii])
            # Iteratively compose subgraphs
            # k = 1
            # while k < lap:
            #     g2 = nx.compose(g2, util.get_nw(nw_list[ii - k]))
            #     k += 1
            ii -= lap
            g1 = util.get_nw(nw_list[ii])
            # Iteratively compose subgraphs
            # k = 1
            # while k < lap:
            #     g1 = nx.compose(g1, util.get_nw(nw_list[ii - k]))
            #     k += 1
            # Generate connected subgraphs
            # Mutual ratio
            if mode == 1:
                r1, r2 = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))
                result_list.append(nw_list[ii][0:-4] + "\t" + str(r2))
            # One-to-one
            elif mode == 0:
                result_list = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                util.save_file(result_dir + key + "//" + nw_list[ii + lap][0:-4] + ".txt",
                               result_list)
            # n-to-one
            elif mode == 2:
                r1 = com_function(copy.deepcopy(g1), copy.deepcopy(g2), type)
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))
            ii -= lap
        if mode != 0:
            result_list.reverse()
            util.save_file(result_dir + key + ".txt", result_list)
def main2():
    fir = r"D:\semantic analysis\整理文本\正能量//"
    xml_list = util.get_file_list(fir, "txt")
    for xml in xml_list:
        w_list = set(util.get_list_from_file(fir + xml))
        r_list = remark(w_list, ['正能量'], None)
        html_name = xml[:-4] + '.html'
        util.save_file(fir + html_name, r_list)
def loop_compare(com_function, keyword_list, pkl_dir1, result_dir, mode=1, lap=1, type="pkl"):
    for key in keyword_list:
        print(key)
        if mode == 0:
            util.create_directory(result_dir + key + "//")
        pkl_dir = pkl_dir1.format(key)
        f_list = util.get_file_list(pkl_dir, '.txt')
        os.chdir(pkl_dir)
        result_list = []
        # Sort in ascending order
        nw_list = sorted(f_list)
        ii = len(nw_list) - 1
        while ii - 2 * lap >= 0:
            g2 = util.txt2dict(util.get_list_from_file(nw_list[ii]))
            # Iteratively compose subgraphs (only runs when lap > 1)
            k = 1
            while k < lap:
                g2 = nx.compose(g2, util.get_nw(nw_list[ii - k]))
                k += 1
            ii -= lap
            g1 = util.txt2dict(util.get_list_from_file(nw_list[ii]))
            d1 = util.get_nw(
                "D:\semantic analysis\新结果\去虚词去单字共现网络//{0}//p//".format(key)
                + nw_list[ii].split(".")[0] + ".pkl")
            # Iteratively compose subgraphs (only runs when lap > 1)
            k = 1
            while k < lap:
                g1 = nx.compose(g1, util.get_nw(nw_list[ii - k]))
                k += 1
            # Generate connected subgraphs
            if mode == 1:
                r1, r2 = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))
                result_list.append(nw_list[ii][0:-4] + "\t" + str(r2))
            elif mode == 0:
                result_list = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                util.save_file(result_dir + key + "//" + nw_list[ii + lap][0:-4] + ".txt",
                               result_list)
            elif mode == 2:
                r1 = com_function(copy.deepcopy(g1), copy.deepcopy(g2), d1)
                # result_list.append(str(r1))
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))
            ii -= lap
        if mode != 0:
            result_list.reverse()
            util.save_file(result_dir + key + ".txt", result_list)
def main1():
    # date_list = ["2012-08-05","2011-04-05","2011-03-28","2011-10-20","2012-12-30","2011-07-30","2011-06-09","2012-02-05","2012-12-16","2011-08-01","2011-05-19","2013-09-01","2012-08-01","2013-12-01"]
    # key_list = ["吐槽","纠结","淡定","自拍","正能量","山寨","达人","腹黑","接地气","扯淡","闷骚","不明觉厉","完爆","人艰不拆"]
    date_list = ["2013-12-31"] * 11  # one date per keyword below
    key_list = ['努力', '感觉', '简单', '无聊', '希望', '美好',
                '气质', '害怕', '喜欢', '不约而同', '喜闻乐见']
    # Directory where the results are saved
    result_dir = r'D:\semantic analysis\2016-10-09结果\html标记结果//'
    txt_dir = r"D:\semantic analysis\纯文本\常用词分句//"
    set_dir = r"D:\semantic analysis\2016-10-09结果\中间结果//"
    i = 0
    while i < len(key_list):
        key = key_list[i]
        print(key)
        # Text files
        file_list = sorted(util.get_file_list(txt_dir + key, ".txt"))
        # Word-set files
        set_dir_list = util.get_file_list(set_dir + key, ".pkl")
        set_list = []
        for set_list_dir in set_dir_list:
            set_list.append(util.get_nw(set_dir + key + "//" + set_list_dir))
            print(set_list_dir)
        util.create_directory(result_dir + key + "//")
        rr = cal_index2(date_list[i], txt_dir + key_list[i])
        j = 0
        # For each segment
        while j < len(rr):
            k = 0
            while k < rr[j]:
                print(file_list[k][:-4])
                print(rr[j])
                txt_list = util.get_list_from_file(txt_dir + key + "//" + file_list[k])
                w_list = remark(txt_list, set_list[j], key)
                html_name = file_list[k][:-4] + '.html'
                util.save_file(result_dir + key + "//" + html_name, w_list)
                k += 1
            j += 1
        i += 1
def main1(keyword):
    dirr = 'D:\semantic analysis\pNet1\\' + keyword + '//p//'
    r_dir = 'D:\semantic analysis//3次采集结果\连续比例4//'
    pkl_list = util.get_file_list(dirr, '.pkl')
    pkl_list = sorted(pkl_list)
    for pkl in pkl_list:
        print(pkl)
    ll = len(pkl_list) - 1
    ii = ll
    g = util.get_nw(dirr + '\\' + pkl_list[ii])
    r_list = []
    n_list = []
    # Build the common subgraph of the last five networks
    while ii >= ll - 3:
        ii -= 1
        g2 = util.get_nw(dirr + '\\' + pkl_list[ii])
        g = mcs(g2, g)
        print(pkl_list[ii] + '\t' + str(g.number_of_nodes()))
    ii = len(pkl_list) - 1
    while ii > 0:
        ii -= 1
        g2 = util.get_nw(dirr + '\\' + pkl_list[ii])
        rr, nn = mcs_ratio_advanced(g2, g, keyword)
        r_list.append(pkl_list[ii][0:-4] + '\t' + str(rr))
        n_list.append(pkl_list[ii][0:-4] + '\t' + str(nn))
    util.save_file(r_dir + keyword + '.txt', r_list)
    util.save_file(r_dir + 'n' + keyword + '.txt', n_list)


# for key in key_list:
#     print(key)
#     main1(key)

# g2 = util.get_nw('D:\semantic analysis\c_date\给力\p//2011-03-31.pkl')
# g1 = util.get_nw('D:\semantic analysis\c_date\给力\p//2011-03-30.pkl')
# g1 = util.get_nw('D:\semantic analysis\c_date\给力\p//2011-03-29.pkl')
# g4 = util.get_nw('D:\semantic analysis\c_date\给力\p//2011-03-28.pkl')
#
# g5 = mcs(g2, g1)
# g6 = mcs(g1, g5)
# g7 = mcs(g4, g6)
# r = g1.number_of_nodes() / g2.number_of_nodes()
# print("Node count: " + str(g2.number_of_nodes()))
# print("Node count: " + str(g1.number_of_nodes()))
# print("Nodes in the common subgraph: " + str(g5.number_of_nodes()))
# print("Nodes in the common subgraph: " + str(g6.number_of_nodes()))
# print("Nodes in the common subgraph: " + str(g7.number_of_nodes()))
# print("Ratio 1: " + str(g1.number_of_nodes() / g2.number_of_nodes()))
# print("Ratio 2: " + str(g1.number_of_nodes() / g1.number_of_nodes()))
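# mcs() and mcs_ratio_advanced() above are defined elsewhere in the project.
# A minimal sketch of what mcs() is assumed to compute -- the common subgraph
# induced by the shared node set (the real implementation may differ); kept
# under a separate name so it does not shadow the project's version:
def mcs_sketch(g1, g2):
    common_nodes = set(g1.nodes()) & set(g2.nodes())
    # subgraph() keeps only the edges whose endpoints both survive
    return g1.subgraph(common_nodes).copy()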
def post(self):
    username = self.get_argument('username')
    answer_id = safe_str_to_int(self.get_argument('answer_id'))
    ask_content = self.get_argument('ask_content')
    original_question_id = safe_str_to_int(self.get_argument('original_question_id'))
    be_asked_username = self.get_argument('be_asked_username')
    options = None
    # Get the picture/sound files attached to the question
    files = self.request.files
    if files:
        keys = ['ask_pic_file', 'ask_sound_file']
        for key in keys:
            if key in files:
                tmp_file = files[key][0]
                file_name = tmp_file['filename']
                from tool.util import get_file_extension, save_file
                suffix = get_file_extension(file_name)
                from dbop.dbQuestion import get_latest_id
                index = get_latest_id("tb_ask")
                new_file_name = "{0}_{1}{2}".format("ask", index, suffix)
                msg0 = "[in postQuestionServer] new_file_name=" + new_file_name
                logging.info(msg0)
                file_content = tmp_file['body']
                # Inject the URL fields
                tmp_dict = dict()
                if key == 'ask_pic_file':
                    tmp_dict['ask_pic_url'] = save_file(new_file_name, file_content, 2)
                    tmp_dict['ask_pic_url'] = "http://" + ConfigManager().get_config('host') + ":" + \
                        str(ConfigManager().get_config('port')) + tmp_dict['ask_pic_url']
                elif key == 'ask_sound_file':
                    tmp_dict['ask_sound_url'] = save_file(new_file_name, file_content, 3)
                    tmp_dict['ask_sound_url'] = "http://" + ConfigManager().get_config('host') + ":" + \
                        str(ConfigManager().get_config('port')) + tmp_dict['ask_sound_url']
                if options is None:
                    options = tmp_dict.copy()
                else:
                    options.update(tmp_dict)
    self.set_header("Content-Type", "application/json;charset=utf8")
    result = ask_question(username, answer_id, ask_content, original_question_id,
                          be_asked_username, options=options)
    self.write(result)
    self.finish()
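# Hedged client-side sketch of exercising this handler with an attached
# picture. The endpoint path and host are assumptions, not taken from the
# source; uses the requests library:
# import requests
# requests.post("http://localhost:8000/postQuestion",
#               data={"username": "u1", "answer_id": "3", "ask_content": "hi",
#                     "original_question_id": "7", "be_asked_username": "u2"},
#               files={"ask_pic_file": open("pic.jpg", "rb")})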
def loop_compare(com_function, keyword_list, pkl_dir1, result_dir, mode=1, lap=1, type="pkl"):
    for keyword in keyword_list:
        pkl_dir = pkl_dir1.format(keyword)
        f_list = util.get_file_list(pkl_dir, '.pkl')
        os.chdir(pkl_dir)
        # Sort in ascending order
        nw_list = sorted(f_list)
        record_list = []
        ii = len(nw_list) - 1
        # g2 is the later network, g1 the earlier one; start from the last network
        g2 = util.get_nw(nw_list[ii])
        # Iteratively compose subgraphs
        k = 1
        while k < lap:
            g2 = nx.compose(g2, util.get_nw(nw_list[ii - k]))
            k += 1
        while ii - lap >= 0:
            jj = ii
            ii -= lap
            # print(nw_list[ii])
            g1 = util.get_nw(nw_list[ii])
            # Iteratively compose subgraphs
            k = 1
            while k < lap:
                g1 = nx.compose(g1, util.get_nw(nw_list[ii - k]))
                k += 1
            # Compute the ratio
            r1 = com_function(g1, g2)
            record_list.append(nw_list[jj][0:-4] + '\t' + str(r1))
            g2 = g1
        record_list.reverse()
        util.save_file(result_dir + keyword + ".txt", record_list)
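# A minimal invocation sketch for this variant of loop_compare. node_ratio is
# a hypothetical com_function; the directory layout mirrors the paths used
# elsewhere in these scripts but is an assumption:
def node_ratio(g1, g2):
    # ratio of node counts between consecutive networks
    return g1.number_of_nodes() / g2.number_of_nodes()

# loop_compare(node_ratio, util.get_key_list(),
#              r"D:\semantic analysis\pNet1\{0}\p//",
#              r"D:\semantic analysis\结果//", lap=1)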
def post(self):
    username = self.get_argument('username')
    password = self.get_argument('password')
    grade = safe_str_to_int(self.get_argument('grade'))
    identifier = safe_str_to_int(self.get_argument('identifier'))
    nickname = self.get_argument('nickname')
    subject = self.get_argument('subject', None)
    serial_number = self.get_argument('serial_number', None)
    options = self.get_argument('options', None)
    # Get the user's avatar
    files = self.request.files
    if files:
        key = 'avatar_file'
        if key in files:
            avatar_file = files[key][0]
            file_name = avatar_file['filename']
            from tool.util import get_file_extension, save_file
            suffix = get_file_extension(file_name)
            from dbop.dbUser import get_latest_id
            index = get_latest_id(username, is_new=True)
            new_file_name = "{0}_{1}{2}".format("user", index, suffix)
            msg0 = "[in registerServer] new_file_name=" + new_file_name
            logging.info(msg0)
            file_content = avatar_file['body']
            # Inject the avatar URL field
            tmp_dict = dict()
            tmp_dict['avatar_url'] = save_file(new_file_name, file_content, 1)
            tmp_dict['avatar_url'] = "http://" + ConfigManager().get_config('host') + ":" + \
                str(ConfigManager().get_config('port')) + tmp_dict['avatar_url']
            if options:
                options = safe_str_to_dict(options)
                options.update(tmp_dict)
            else:
                options = tmp_dict.copy()
    else:
        # Fall back to the system default avatar
        from tool.util import get_system_default_avatar_url
        tmp_dict = dict()
        tmp_dict['avatar_url'] = get_system_default_avatar_url()
        if options:
            options = safe_str_to_dict(options)
            options.update(tmp_dict)
        else:
            options = tmp_dict.copy()
    if subject:
        subject = safe_str_to_int(subject)
    if options:
        options = safe_str_to_dict(options)
    logging.info(options)
    self.set_header("Content-Type", "application/json;charset=utf8")
    result = register(username, password, grade, identifier, nickname, subject,
                      serial_number, options=options)
    self.write(result)
    self.finish()
def cal_node_mcs(pkl_dir, mcs_dir, key_word, lap=2):
    f_list = util.get_file_list(pkl_dir, '.pkl')
    os.chdir(pkl_dir)
    # Sort in ascending order
    nw_list = sorted(f_list)
    record_list = []
    num_list = []
    enum_list = []
    ii = len(nw_list) - 1
    while (ii - lap + 1) >= 0:
        # print(nw_list[ii])
        g1 = util.get_nw(nw_list[ii])
        # Iteratively intersect with the preceding networks
        k = 1
        while k < lap:
            g1 = mcs(g1, util.get_nw(nw_list[ii - k]))
            k += 1
        # Build the output file name
        filename = nw_list[ii][0:-4] + '.pkl'
        # Save the resulting common subgraph
        out_dir = r"D:\semantic analysis\公共子图节点数\新词\30公共子图//" + key_word + "//"
        util.create_directory(out_dir)
        util.save_nw(g1, out_dir + nw_list[ii][0:-4])
        num_list.append(nw_list[ii][0:-4] + '\t' + str(g1.number_of_nodes()))
        enum_list.append(nw_list[ii][0:-4] + '\t' + str(g1.number_of_edges()))
        # Count the nodes
        # with open(mcs_dir + filename[0:-4] + '.txt', 'w', encoding='utf-8') as file:
        #     for node in g1.nodes():
        #         file.write(node + '\n')
        # util.save_nw(g1, mcs_dir + filename)
        ii -= lap
    # util.save_file(mcs_dir + key_word + 'mcs.txt', record_list)
    util.save_file(mcs_dir + 'n' + key_word + 'mcs.txt', num_list)
    util.save_file(mcs_dir + 'e' + key_word + 'mcs.txt', enum_list)
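# Usage sketch for cal_node_mcs. The directories and the lap value are
# assumptions (lap=30 guessed from the "30公共子图" path hard-coded above):
# for key in util.get_key_list():
#     cal_node_mcs(r"D:\semantic analysis\pNet1//" + key + "//p//",
#                  r"D:\semantic analysis\公共子图节点数//", key, lap=30)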
def loop_key2(pkl_dir, result_dir, key_word, lap=1):
    pkl_dir = pkl_dir.format(key_word)
    f_list = util.get_file_list(pkl_dir, '.pkl')
    os.chdir(pkl_dir)
    # Sort in ascending order
    nw_list = sorted(f_list)
    ii = 0
    # g1 starts from the earliest network
    g1 = util.get_nw(nw_list[ii])
    util.create_directory(result_dir + key_word)
    while ii < len(nw_list) - lap:
        ii += lap
        g2 = util.get_nw(nw_list[ii])
        # Build the output file name
        filename = nw_list[ii][0:-4] + '.txt'
        result_list = extract_new_nodes_attributes(g1, g2)
        util.save_file(result_dir + key_word + "//" + filename, result_list)
        g1 = nx.compose(g1, g2)
def main():
    # Directory where the results are saved
    result_dir = r'D:\semantic analysis\新纯文本\1新词分句//'
    txt_dir = r"D:\semantic analysis\新纯文本\1新词//"
    k_list = util.get_key_list()
    for key in k_list:
        print(key)
        # Text files
        file_list = util.get_file_list(txt_dir + key, ".txt")
        # Create the output directory
        # mk_dir(result_dir + "新词整句//" + key)
        mk_dir(result_dir + key)
        for file in file_list:
            s_list = util.get_list_from_file(txt_dir + key + "//" + file)
            # Drop duplicate sentences to avoid counting them twice
            # s_list = list(set(s_list))
            w_list, p_list = extract_sentence(s_list, key)
            util.save_file(result_dir + key + "//" + file, p_list, True)
def loop_key(pkl_dir, result_dir, key_word, lap=1):
    pkl_dir = pkl_dir.format(key_word)
    f_list = util.get_file_list(pkl_dir, '.pkl')
    os.chdir(pkl_dir)
    # Sort in ascending order
    nw_list = sorted(f_list)
    ii = len(nw_list) - 1
    # g2 is the later network, g1 the earlier one; start from the last network
    g2 = util.get_nw(nw_list[ii])
    util.create_directory(result_dir + key_word)
    while ii > 0:
        jj = ii
        ii -= lap
        # print(nw_list[ii])
        g1 = util.get_nw(nw_list[ii])
        # Build the output file name
        filename = nw_list[ii][0:-4] + '-' + nw_list[jj][0:-4] + '.txt'
        result_list = cal_connect_real_probability(g1, g2, key_word)
        util.save_file(result_dir + key_word + "//" + filename, result_list)
        g2 = g1
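# Usage sketch for loop_key. The pkl directory template mirrors the other
# scripts here; the result directory is a hypothetical placeholder:
# loop_key(r"D:\semantic analysis\pNet1\{0}\p//",
#          r"D:\semantic analysis\结果\连接概率//", "给力", lap=1)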
r"D:\semantic analysis\2016-10-09结果\词频\希望//2011-09-30.txt") dd = util.txt2dict(w_list) key_list = util.get_key_list2() + util.get_key_list() txt_dir = r"D:\semantic analysis\2016-10-09结果\词频1//" for key in key_list: print(key) file_dir = txt_dir + key dict_list = [] r_list = [] file_list = util.get_file_list(file_dir, ".txt") for file_name in file_list: word_list = sorted( util.get_list_from_file(txt_dir + key + "//" + file_name)) dict_list.append(util.txt2dict(word_list)) # 循环求比值 i = 1 dict1 = dict_list[0] while i < len(dict_list): dict2 = dict_list[i] r_list.append(file_list[i - 1][0:-4] + "\t" + str(cal_mcs_ratio(dict1, dict2, key))) dict1 = dict2.copy() i += 1 util.save_file( r"D:\semantic analysis\2016-10-12结果\自身比例节点数//" + key + ".txt", r_list, False)
low_value -= 1
# Build the target set for the statistics
target_set = set()
while low_index > high_index:
    target_set.add(rank_key[low_index])
    low_index -= 1
node_sum = len(target_set)
r_list = []
r_list1 = []
dict_list, key_dict_value = util.get_objdict_list(
    r"D:\semantic analysis\2016-10-09结果\词频月//" + key, ".txt")
for dict_key in key_dict_value:
    word_dict = dict_list[dict_key]
    temp_set = set()
    for k, v in word_dict.items():
        temp_set.add(k)
    sum1 = len(temp_set & target_set)
    r_list.append(dict_key[0:-4] + "\t" + str(sum1))
    r_list1.append(dict_key[0:-4] + "\t" + str(sum1 / node_sum))
util.save_file(
    r"D:\semantic analysis\2016-10-12结果\2010年保留比例\新词\数量//" + key + ".txt", r_list)
util.save_file(
    r"D:\semantic analysis\2016-10-12结果\2010年保留比例\新词\比例//" + key + ".txt", r_list1)
def post(self):
    username = self.get_argument('username')
    nickname = self.get_argument('nickname', None)
    phone_number = self.get_argument('phone_number', None)
    name = self.get_argument('name', None)
    sex = safe_str_to_dict(self.get_argument('sex', None))
    birthday = self.get_argument('birthday', None)
    address = self.get_argument('address', None)
    grade = safe_str_to_int(self.get_argument('grade', None))
    subject = safe_str_to_int(self.get_argument('subject', None))
    options = dict()
    if nickname:
        options['nickname'] = nickname
    if phone_number:
        options['phone_number'] = phone_number
    if name:
        options['name'] = name
    if sex:
        options['sex'] = sex
    if birthday:
        options['birthday'] = birthday
    if address:
        options['address'] = address
    if grade:
        options['grade'] = grade
    if subject:
        options['subject'] = subject
    # Get the user's avatar
    props = None
    files = self.request.files
    logging.info("start!!!")
    logging.info(files)
    if files:
        key = 'avatar_file'
        if key in files:
            avatar_file = files[key][0]
            file_name = avatar_file['filename']
            from tool.util import get_file_extension, save_file
            suffix = get_file_extension(file_name)
            from dbop.dbUser import get_latest_id
            index = get_latest_id(username)
            new_file_name = "{0}_{1}{2}".format("user", index, suffix)
            msg0 = "[in modifyPersonalInformationServer] new_file_name=" + new_file_name
            logging.info(msg0)
            file_content = avatar_file['body']
            # Inject the avatar URL field
            tmp_dict = dict()
            tmp_dict['avatar_url'] = save_file(new_file_name, file_content, 1)
            tmp_dict['avatar_url'] = "http://" + ConfigManager().get_config('host') + ":" + \
                str(ConfigManager().get_config('port')) + tmp_dict['avatar_url']
            if props:
                props = safe_str_to_dict(props)
                props.update(tmp_dict)
            else:
                props = tmp_dict.copy()
            logging.info("yes!!!")
            logging.info(props)
    self.set_header("Content-Type", "application/json;charset=utf8")
    result = modify_personal_information(username, props=options, options=props)
    self.write(result)
    self.finish()
# Count the number of nodes in every subgraph
import tool.util as util
import networkx as nx


def count_num_of_node(pkl_dir):
    pkl_file_list = util.get_file_list(pkl_dir, '.pkl')
    r_list = []
    for file in pkl_file_list:
        # get_file_list returns bare file names, so prepend the directory
        g = util.get_nw(pkl_dir + file)
        s = file[0:10] + '\t' + str(g.number_of_nodes())
        r_list.append(s)
    return r_list


key_list = util.get_key_list()
pkl_dir1 = r'D:\semantic analysis\pNet1//'
df_dir = r'D://semantic analysis//3次采集结果//节点数//'
# for key in key_list:
#     util.create_directory(df_dir + key)
for key in key_list:
    print(key)
    r = count_num_of_node(pkl_dir1 + key + '//p//')
    util.save_file(df_dir + key + '.txt', r)
# Normalize the node counts; each input line is "date<TAB>count"
import tool.util as util

data_dir = 'D:\semantic analysis//3次采集结果\节点数'
file_list = util.get_file_list(data_dir, '.txt')
for f in file_list:
    x_list = []
    y_list = []
    r_data_list = []
    data_list = util.get_list_from_file(data_dir + '//' + f)
    for data in data_list:
        item = data.split('\t')
        x_list.append(item[0])
        y_list.append(float(item[1]))
    my = max(y_list)
    i = 0
    while i < len(x_list):
        r_data_list.append(x_list[i] + '\t' + str(y_list[i] / my))
        i += 1
    util.save_file(data_dir + '//' + f[0:-4] + 'mcs.txt', r_data_list)
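# Worked example: if the largest daily count in a file is 200 nodes, a day
# with 50 nodes is written out as "<date>\t0.25".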
for key in key_word:
    print(key)
    pynlpir.nlpir.AddUserWord(c_char_p(key.encode()))

result_dir = r"D:\semantic analysis\新结果\去重去虚词去单字词频数//"
fold_list_dir = r"D:\semantic analysis\新纯文本\1常用词分句//"
for key in key_word:
    print(key)
    file_list = sorted(util.get_file_list(fold_list_dir + key, ".txt"))
    # Loop over the files
    for txt_file in file_list:
        print(txt_file)
        # Drop duplicates
        s_list = set(util.get_list_from_file(fold_list_dir + key + "//" + txt_file))
        # Get the word-count dict from segmentation
        rr = count_word(s_list, key)
        # if "无力" in rr:
        #     print(rr["无力"] / rr["吐槽"])
        # Sort the keys by value
        kk = sort_by_value(rr)
        w_list = create_dict_list(kk, rr)
        # Create the output directory
        util.create_directory(result_dir + key)
        util.save_file(result_dir + key + "//" + txt_file, w_list, False)
# Close the segmenter
pynlpir.close()
import tool.util as util

# Count the total occurrences of every word across all files
key_list = util.get_key_list() + util.get_key_list2()
txt_dir = r"D:\semantic analysis\2016-10-09结果\词频1//"
r_dir = r"D:\semantic analysis\2016-10-09结果\总数1//"
for key in key_list:
    print(key)
    file_dir = txt_dir + key
    dict_list = []
    r_list = []
    file_list = util.get_file_list(file_dir, ".txt")
    for file_name in file_list:
        word_list = sorted(util.get_list_from_file(txt_dir + key + "//" + file_name))
        dict_list.append(util.txt2dict(word_list))
    # Fold the remaining daily dicts into the first one; starting the union
    # from dict_list[0] and iterating over the rest avoids counting the first
    # file twice
    r_dict = dict_list[0]
    for word_dict in dict_list[1:]:
        r_dict = util.union_dicts(r_dict, word_dict)
    # Sort the keys by value
    kk = util.sort_by_value(r_dict)
    w_list = util.create_dict_list(kk, r_dict)
    util.save_file(r_dir + key + ".txt", w_list, False)
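# union_dicts is assumed to merge two word-count dicts by summing counts
# key-wise, e.g. (hypothetical values):
# util.union_dicts({"a": 2, "b": 1}, {"a": 1, "c": 4})  # -> {"a": 3, "b": 1, "c": 4}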
import tool.util as util

key_list = util.get_key_list() + util.get_key_list2()
fre_path = r"D:\semantic analysis\结果\去重频率//"
result_path = r"D:\semantic analysis\结果\累计位置前50//"


def get_acc_ratio(ratio_list):
    # Accumulate the per-word ratios up to position 50 (or the end of the list)
    if len(ratio_list) - 1 < 1:
        return 0
    s = 0
    for index, r in enumerate(ratio_list):
        s += r
        # if s > 0.6:
        #     return index / (len(ratio_list) - 1)
        if index > 50 or index == len(ratio_list) - 1:
            return s


for key in key_list:
    file_list = util.get_file_list(fre_path + key + "//", ".txt")
    result_list = []
    for file in file_list:
        rl = util.get_list_from_dicttxt(fre_path + key + "//" + file)
        result_list.append(get_acc_ratio(rl))
    util.save_file(result_path + key + ".txt", result_list)
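# Quick sanity check of get_acc_ratio: once the index passes 50 or reaches the
# end of the list, the running sum of ratios is returned, so a short list sums
# to its total:
assert abs(get_acc_ratio([0.4, 0.3, 0.2, 0.1]) - 1.0) < 1e-9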
w_file.write("http://weibo.com/" + str(c[0]) + "/info?mod=pedit_more" + '\n') # with open("raddress2.txt","w") as rfile: # for ii in cur: # rfile.write(ii[1]+'\n') cur.close() #关闭游标 conn.commit() #向数据库中提交任何未解决的事务,对不支持事务的数据库不进行任何操作 conn.close() #关闭到数据库的连接,释放数据库资源 dd = [ "2010-05-15", "2010-08-02", "2010-09-26", "2010-11-03", "2011-01-11", "2011-03-20", "2011-06-01", "2011-10-16", "2012-02-16", "2012-07-09", "2012-11-19" ] for d in dd: extract_url("fh", d) ll = util.get_list_from_file(r"D:\semantic analysis\用户信息//userUrl.txt") lll = util.get_list_from_file(r"D:\semantic analysis\用户信息//user.txt") print(len(ll)) ss = set(ll) ss1 = set(lll) print(len(ss)) ss = ss - ss1 print(len(ss)) ll = list(ss) util.save_file(r"D:\semantic analysis\用户信息//userUrl1.txt", ll)
    while i < len(set_list):
        r_set0 = r_set0 | util.get_nw(pkl_dir + set_list[i])
        r_list.append(set_list[i][0:-4] + '\t' + str(len(r_set0)))
        d_list.append(len(r_set0))
        i += 1
    i = 0
    r_list2 = []
    d_list2 = []
    while i < len(r_list) - 1:
        r_list2.append(set_list[i][0:-4] + '\t' + str(d_list[i + 1] - d_list[i]))
        d_list2.append(d_list[i + 1] - d_list[i])
        i += 1
    i = 15
    r_list3 = []
    while i < len(d_list2):
        r_list3.append(set_list[i][0:-4] + '\t' + str(d_list2[i] / d_list[i]))
        i += 1
    return r_list, r_list2, r_list3


key_list = util.get_key_list()
for key in key_list:
    print(key)
    r1, r2, r3 = cal_union_set(key, "D:\semantic analysis\分词集合//" + key + "//")
    # util.save_file(r"D:\semantic analysis\2016-10-03结果\增量//" + key + ".txt", r2)
    # util.save_file(r"D:\semantic analysis\2016-10-03结果\总量//" + key + ".txt", r1)
    util.save_file(r"D:\semantic analysis\2016-10-03结果\比例//" + key + ".txt", r3)
def phantomjs_screen_html(self):
    # util.save_file takes the path first, then the content (matching every
    # other save_file call in these scripts)
    util.save_file('./hj.html', self.driver.page_source.encode('utf-8'))
import tool.util as util


def get_num_sentence(file_path):
    # Count the distinct sentences in the file
    return len(set(util.get_list_from_file(file_path)))


key_list = util.get_key_list2()
root = r"D:\semantic analysis\新纯文本\1常用词/"
for key in key_list:
    print(key)
    file_list = util.get_file_list(root + key, ".txt")
    r_list = []
    for file in file_list:
        ss = get_num_sentence(root + key + "//" + file)
        r_list.append(file[0:-4] + "\t" + str(ss))
    util.save_file(r"D:\semantic analysis\新结果//去重句子数//常用词//" + key + ".txt", r_list)
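# Quick check that get_num_sentence collapses duplicate lines before counting,
# assuming util.get_list_from_file returns the file's lines ("demo.txt" is a
# throwaway file created only for this check):
# with open("demo.txt", "w", encoding="utf-8") as f:
#     f.write("a\nb\na\n")
# assert get_num_sentence("demo.txt") == 2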