def combine(src_path, save_path):
    """Collapse place names to their first token and sum counts per file.

    Reads every ``.txt`` frequency dict under *src_path*; entries whose
    place string shares the same first whitespace-separated token are
    merged (counts summed), and one merged dict is written per input
    file under *save_path*.
    """
    file_list_dict, file_name_list = util.get_objdict_list(src_path, ".txt")
    for file_name, file_dict in file_list_dict.items():
        merged = dict()
        for place, count in file_dict.items():
            # First whitespace-separated token of the place string.
            prefix = place.partition(" ")[0]
            merged[prefix] = merged.get(prefix, 0) + count
        util.save_dict_list(merged, save_path + file_name)
import re
import tool.util as util

# Script: for each keyword, merge the daily word-frequency dicts of the
# (hard-coded) year 2010 into one dict and save it.
key_list = util.get_key_list()
dict_dir = r"D:\semantic analysis\2016-10-09结果\词频1//"
for key in key_list:
    print(key)
    # set_dict maps "<date>.txt" -> frequency dict; date_list lists the files.
    set_dict, file_name = util.get_objdict_list(dict_dir + key, ".txt")
    date_list = util.get_file_list(dict_dir + key, ".txt")
    # Extract "YYYY-MM" prefixes from the file names ...
    pattern = re.compile(r"(\d*-\d*)-\d*")
    month_array = pattern.findall(" ".join(date_list))
    # NOTE(review): ... but the result above is immediately discarded and
    # replaced with the hard-coded year "2010" -- presumably a deliberate
    # one-off run; confirm before reusing this script for other years.
    month_array = ["2010"]
    # NOTE(review): a directory named after the key is created here, yet the
    # result below is saved as "<key>.txt" BESIDE it, not inside it -- verify.
    util.create_directory(r"D:\semantic analysis\2016-10-12结果\2010年频数统计//" + key)
    # Loop over the "months" (here: the single year prefix "2010").
    for month in month_array:
        # Matches full dates beginning with the prefix, e.g. "2010-06-01".
        # (Non-raw string with \d -- works, but raw string would be cleaner.)
        pattern = re.compile(r"(" + month + "-\d*-\d*)")
        date_array = pattern.findall(" ".join(date_list))
        print(date_array)
        # Merge the frequency dicts of all matching dates.
        r_dict = dict()
        for file_date in date_array:
            r_dict = util.union_dicts(set_dict[file_date + ".txt"], r_dict)
        # NOTE(review): the output path does not include `month`, so each
        # iteration overwrites the previous one; harmless only while
        # month_array has a single element.
        util.save_dict_list(
            r_dict,
            r"D:\semantic analysis\2016-10-12结果\2010年频数统计//" + key + ".txt")
import re
import tool.util as util

# Script: union the word-frequency dicts of every keyword into a single
# grand-total dict and save it as one file.
keywords = util.get_key_list2()
dict_dir = r"D:\semantic analysis\2016-10-09结果\词频1//"
combined = {}
for key in keywords:
    print(key)
    # All per-file frequency dicts stored under this keyword's directory.
    freq_dicts = util.get_obj_list(dict_dir + key, ".txt")
    for freq_dict in freq_dicts:
        combined = util.union_dicts(freq_dict, combined)
util.save_dict_list(combined, r"D:\百度指数截屏//常用词总频率统计" + ".txt")
"2011-10-16", "2012-02-16", "2012-07-09", "2012-11-19"] ] py_list = ["zp","dd","sz","dr","ms","fh"] date_list_list = [["2010-06-01", "2011-01-21", "2011-07-09", "2011-11-21"]] py_list = ["tc"] root_path = r"D:\semantic analysis\用户信息\dict//" i = 0 while i < len(py_list): py = py_list[i] date_list = date_list_list[i] i += 1 for dd in date_list: user_id_list = extract_user_id(py,dd) place_list = [] place_dict = dict() if user_id_list: for user_id in user_id_list: place = get_place(user_id) if place: place_list.append(place) place_dict = dict((a, place_list.count(a)) for a in place_list) util.create_directory(root_path+py) util.save_dict_list(place_dict, root_path+py+"//"+dd+".txt")
import tool.util as util

# Script: convert each deduplicated word-COUNT dict file into a word-RATIO
# dict file (each word's share of the file's total count).
dict_path = r"D:\semantic analysis\结果\去重频数//"
result_path = r"D:\semantic analysis\结果\去重频率//"
keyword_list = util.get_key_list2() + util.get_key_list()
for key in keyword_list:
    print(key)
    # r_dict maps file name -> word-count dict for this keyword.
    r_dict, file_name_list = util.get_objdict_list(dict_path + key, ".txt")
    for file_name, word_dict in r_dict.items():
        # Drop the keyword itself so it does not dominate the ratios.
        word_dict.pop(key, None)
        # Fix: use the builtin sum() instead of shadowing it with a local,
        # and coerce values once (they may be stored as strings).
        total = sum(int(value) for value in word_dict.values())
        r_f_dict = {}
        if total:
            # Fix: the original divided the RAW value (possibly a str) by the
            # total, and would raise ZeroDivisionError on an empty dict.
            r_f_dict = {word: int(value) / total
                        for word, value in word_dict.items()}
        util.create_directory(result_path + key + "//")
        util.save_dict_list(r_f_dict, result_path + key + "//" + file_name)
# Count, for every word, the number of files (dicts) it appears in
# (document frequency). Original comment: 统计所有关键词出现的文件数.
def count_word(word_set, word_dict):
    """Increment word_dict's file count for each word in word_set (in place)."""
    for word in word_set:
        # Fix: single get() with default instead of an in/else branch.
        word_dict[word] = word_dict.get(word, 0) + 1


def count_word_from_dict(new_dict, word_dict):
    """Same as count_word, taking a frequency dict; only its keys matter."""
    # Fix: the original iterated .items() and ignored every value, duplicating
    # count_word's body. Iterating a dict yields its keys, so delegate.
    count_word(new_dict, word_dict)


# Fix: renamed 'dir' -> 'base_dir' (was shadowing the builtin dir()).
base_dir = r"D:\semantic analysis\2016-10-09结果\词频月//"
key_list = util.get_key_list()
for key in key_list:
    os.chdir(base_dir + key)
    print(key)
    # One frequency dict per monthly .txt file under this keyword.
    file_list = util.get_obj_list(base_dir + key, ".txt")
    r_word_dict = dict()
    for file_dict in file_list:
        count_word_from_dict(file_dict, r_word_dict)
    util.save_dict_list(r_word_dict,
                        r"D:\semantic analysis\2016-10-12结果\总频数月//" + key + ".txt")