def run(self):
    global dic
    dic = {}
    # word_cloude_create.send_process(20)
    if os.path.isfile(os.path.join(self.path, 'index.txt')) is False:
        print("Error, no index.txt")
        return
    data = data_ops.Data_ops(self.path)
    cnt_0 = 0
    all_0 = len(data.get_all_path())
    for file_path in data.get_all_path():
        word_cloude_create.send_process(int(50 * (cnt_0 / all_0)))
        cnt_words(cnt_file_path=file_path)
        cnt_0 += 1
    dic_b = dic.copy()
    list_b = []
    for key in list(dic_b.keys()):
        list_b.append([key, dic_b[key]])
    list_b.sort(key=lambda x: x[1])
    list_b = list_b[:-20]
    dic_b.clear()
    cnt_2 = 0
    all_2 = len(list_b)
    for line in list_b:
        if cnt_2 % 100 == 0:
            word_cloude_create.send_process(int(50 + (50 * (cnt_2 / all_2))))
            print("working for %s..." % line[0])
        dic_b[line[0]] = line[1]
        cnt_2 += 1
    word_cloude_create.send_process(100)
    self.send_cloude_dict.emit(dic, dic_b)
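# Note on run() above: the first loop maps word counting onto roughly the
# 0-50% range of the progress bar and the second loop maps the rebuild of
# dic_b onto 50-100%, before send_process(100) marks completion. dic keeps
# the full word counts, while dic_b is sorted by count and has the 20 most
# frequent words removed by the list_b[:-20] slice.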
def read(work_path: str):
    # per-day probability that a post is a rumor
    is_rumor = {}
    # per-day probability that a post is not a rumor
    not_is_rumor = {}
    # number of Weibo posts on the same day
    weibo_num = {}
    global my_data
    my_data = data_ops.Data_ops(work_path)
    path = my_data.get_all_path()
    date = []
    # list of "is a rumor" probabilities
    is_Rumor_probability = []
    # list of "is not a rumor" probabilities
    not_is_Rumor_probability = []
    # for i in path:
    #     p = division(i)
    #     # date
    #     date.append(p)
    # read test_results.tsv (note: the path is hardcoded here)
    f = open(r"E:\py\test_results.tsv", "r", encoding="utf-8")
    probability = f.readlines()
    for i in probability:
        d = i.split("\t")
        is_Rumor_probability.append(float(d[0]))
        not_is_Rumor_probability.append(float(d[1][:-1]))
    # read the Weibo files to get their dates
    for i in path:
        d = open(i, "r", encoding="utf-8")
        date.append(d.read().split("\t")[0])
        d.close()
    # count the number of posts on each day
    for i in date:
        if i in weibo_num.keys():
            weibo_num[i] += 1
        else:
            weibo_num[i] = 1
    # sum the "is a rumor" probabilities of the posts of each day
    for i in range(len(is_Rumor_probability)):
        if date[i] in is_rumor.keys():
            is_rumor[date[i]] += 1.0 * is_Rumor_probability[i]
        else:
            is_rumor[date[i]] = 1.0 * is_Rumor_probability[i]
    for i in range(len(not_is_Rumor_probability)):
        if date[i] in not_is_rumor.keys():
            not_is_rumor[date[i]] += 1.0 * not_is_Rumor_probability[i]
        else:
            not_is_rumor[date[i]] = 1.0 * not_is_Rumor_probability[i]
    # average the summed probabilities over the number of posts per day
    for i in is_rumor.keys():
        is_rumor[i] = is_rumor[i] / weibo_num[i]
    for i in not_is_rumor.keys():
        not_is_rumor[i] = not_is_rumor[i] / weibo_num[i]
    f.close()
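# The parsing above assumes each row of test_results.tsv looks like
# "<is-rumor prob>\t<not-is-rumor prob>\n" and that row i corresponds to the
# i-th file returned by get_all_path(). A minimal, self-contained sketch of
# the same per-day averaging, using made-up dates and probabilities:
def _per_day_average_sketch():
    dates = ["2020-01-01", "2020-01-01", "2020-01-02"]
    probs = [0.9, 0.7, 0.2]
    sums, counts = {}, {}
    for day, prob in zip(dates, probs):
        sums[day] = sums.get(day, 0.0) + prob
        counts[day] = counts.get(day, 0) + 1
    # mean is-rumor probability per day, mirroring is_rumor[i] / weibo_num[i]
    return {day: sums[day] / counts[day] for day in sums}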
def get_bert_res_to_json(work_path: str, bert_train_res_file_path: str):
    """
    :param bert_train_res_file_path: path of the BERT prediction result file
    :param work_path: working directory, used to get the file paths (which provide the dates)
    :return: None
    """
    # the date list, the "is a rumor" probability list and the
    # "is not a rumor" probability list are all collected in this list
    total = []
    my_data = data_ops.Data_ops(work_path)
    path = my_data.get_all_path()
    # list of dates
    date = []
    # "is a rumor" probabilities
    is_Rumor_probability = []
    # "is not a rumor" probabilities
    not_is_Rumor_probability = []
    if len(path) == 0:
        return 0
    # parse the TSV result file
    with open(bert_train_res_file_path, "r", encoding="utf-8") as f:
        probability = f.readlines()
    for i in probability:
        d = i.split("\t")
        is_Rumor_probability.append(float(d[0]))
        not_is_Rumor_probability.append(float(d[1][:-1]))
    # read the Weibo files to get their dates
    for i in range(len(probability)):
        with open(path[i], "r", encoding="utf-8") as d:
            date.append(d.read().split("\t")[0])
    total.append(date)
    total.append(is_Rumor_probability)
    total.append(not_is_Rumor_probability)
    print(len(total))
    print(len(date))
    print(len(is_Rumor_probability))
    print(len(not_is_Rumor_probability))
    with open(os.path.join(work_path, "toal.json"), 'w+', encoding='utf-8') as f:
        json.dump(total, f, indent=4)
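# Minimal usage sketch; both paths below are placeholders for illustration only:
#   get_bert_res_to_json(work_path=r".\work_dir",
#                        bert_train_res_file_path=r".\test_results.tsv")
# This writes "<work_path>/toal.json" containing the list
# [date, is_Rumor_probability, not_is_Rumor_probability].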
def get_all_path():
    global my_ui
    my_data = data_ops.Data_ops(my_ui.my_page_corpus_lineEdit_workPath.text())
    if my_data.test() is True:
        my_sum = str(len(my_data.get_all_path()))
    else:
        my_sum = "NaN"
    res = {
        path_para.json_file: my_ui.my_page_corpus_lineEdit_from_json.text(),
        path_para.work_path: my_ui.my_page_corpus_lineEdit_workPath.text(),
        path_para.dict: my_ui.my_page_corpus_lineEdit_directory.text(),
        "dataset_name": str(my_ui.my_page_corpus_lineEdit_from_json.text())
        .split('\\')[-1].split('/')[-1].replace(".json", ''),
        "sum": my_sum
    }
    return res
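# Illustrative shape of the dict returned by get_all_path(); the literal
# values are assumptions, the keys come from path_para and the UI line edits:
#   {
#       path_para.json_file: "D:/data/covid.json",
#       path_para.work_path: "D:/data/work",
#       path_para.dict: "D:/data/dict",
#       "dataset_name": "covid",
#       "sum": "1024",
#   }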
def start(work_path: str):
    P_modification = {}
    N_modification = {}
    none_enum = {}
    I_enum = {}
    num = Num()
    weibo = WeiBo()
    data = data_ops.Data_ops(work_path)
    path = data.get_all_path()
    if len(path) == 0:
        return
    start_t = time.time()
    cnt = 0
    for i in range(len(path)):
        if len(path[i]) == 0:
            return
        p = division(path[i])
        if len(p[3]) == 0:
            p[3] += " "
        l = participle(p[3])
        num = statistics(l[0])
        weibo = calc(num)
        # negative posts
        if weibo.N_enum == 1 and weibo.P_enum == 0 and weibo.none_enum == 0 and weibo.I_enum == 0:
            if p[0] in N_modification.keys():
                N_modification[p[0]] += weibo.N_enum
            else:
                N_modification[p[0]] = weibo.N_enum
        # positive posts
        if weibo.N_enum == 0 and weibo.P_enum == 1 and weibo.none_enum == 0 and weibo.I_enum == 0:
            if p[0] in P_modification.keys():
                P_modification[p[0]] += weibo.P_enum
            else:
                P_modification[p[0]] = weibo.P_enum
        # posts with intense wording
        if weibo.N_enum == 0 and weibo.P_enum == 0 and weibo.none_enum == 0 and weibo.I_enum == 1:
            if p[0] in I_enum.keys():
                I_enum[p[0]] += weibo.I_enum
            else:
                I_enum[p[0]] = weibo.I_enum
        # posts that fall into none of the categories
        if weibo.N_enum == 0 and weibo.P_enum == 0 and weibo.none_enum == 1 and weibo.I_enum == 0:
            if p[0] in none_enum.keys():
                none_enum[p[0]] += weibo.none_enum
            else:
                none_enum[p[0]] = weibo.none_enum
        # print(weibo.P_modification)
        # print("--------------------")
        # print(weibo.N_modification)
        # the dictionaries above are keyed by date (p[0]); report progress periodically
        if cnt % 51 == 0:
            show_time(start_t=start_t, p=(cnt / len(path)))
        cnt += 1
    P_keys = list(P_modification.keys())
    N_keys = list(N_modification.keys())
    I_keys = list(I_enum.keys())
    none_keys = list(none_enum.keys())
    # fill in missing dates between P and I
    for i in P_keys:
        if i not in I_keys:
            I_enum[i] = 0
    for i in I_keys:
        if i not in P_keys:
            P_modification[i] = 0
    # fill in missing dates between N and none
    for i in N_keys:
        if i not in none_keys:
            none_enum[i] = 0
    for i in none_keys:
        if i not in N_keys:
            N_modification[i] = 0
    # refresh the key lists
    P_keys = list(P_modification.keys())
    N_keys = list(N_modification.keys())
    I_keys = list(I_enum.keys())
    none_keys = list(none_enum.keys())
    # fill in missing dates between P and N
    for i in P_keys:
        if i not in N_keys:
            N_modification[i] = 0
    for i in N_keys:
        if i not in P_keys:
            P_modification[i] = 0
    # fill in missing dates between I and none
    for i in I_keys:
        if i not in none_keys:
            none_enum[i] = 0
    for i in none_keys:
        if i not in I_keys:
            I_enum[i] = 0
    total = percentage(P_modification, N_modification, I_enum, none_enum)
    with open(os.path.join(work_path, "P.json"), 'w+', encoding='utf-8') as f:
        json.dump(file_tools.transformer_direction(P_modification), f, ensure_ascii=False)
    with open(os.path.join(work_path, "N.json"), 'w+', encoding='utf-8') as f:
        json.dump(file_tools.transformer_direction(N_modification), f, ensure_ascii=False)
    with open(os.path.join(work_path, "I.json"), 'w+', encoding='utf-8') as f:
        json.dump(file_tools.transformer_direction(I_enum), f, ensure_ascii=False)
    with open(os.path.join(work_path, "none.json"), 'w+', encoding='utf-8') as f:
        json.dump(file_tools.transformer_direction(none_enum), f, ensure_ascii=False)
    # write the generated percentage dictionaries to JSON files
    # share of positive posts per day
    with open(os.path.join(work_path, "P_percentage.json"), 'w+', encoding='utf-8') as f:
        json.dump(file_tools.transformer_direction(total[0]), f, ensure_ascii=False)
    # share of negative posts per day
    with open(os.path.join(work_path, "N_percentage.json"), 'w+', encoding='utf-8') as f:
        json.dump(file_tools.transformer_direction(total[1]), f, ensure_ascii=False)
    # share of intensely worded posts per day
    with open(os.path.join(work_path, "I_percentage.json"), 'w+', encoding='utf-8') as f:
        json.dump(file_tools.transformer_direction(total[2]), f, ensure_ascii=False)
    # share of "none" posts per day
    with open(os.path.join(work_path, "none_percentage.json"), 'w+', encoding='utf-8') as f:
        json.dump(file_tools.transformer_direction(total[3]), f, ensure_ascii=False)
from data import data_ops
# import LAC
import jieba
import set_page_corpus_connect
import os
import sys

# Note: ops_data is only used for reading files; it does not resolve any paths!
ops_data = data_ops.Data_ops(r"")

# lac = LAC.LAC()


def transformer_direction(my_dict_obj: dict):
    """
    Convert a {date: value} dictionary into the {"date": [...], "data": [...]} layout.
    :param my_dict_obj: dictionary keyed by date
    :return: the converted dictionary
    """
    r = {
        "date": list(my_dict_obj.keys()),
        "data": list(my_dict_obj.values())
    }
    return r


# get the lists of evaluations and sentiments
def getSentiment():
    """
    :return: the lists of evaluations and sentiments
    """