def get_multitags_uda(conf_list):
    """Build one UDA predictor per tag config; return the predictors and the
    corresponding label lists."""
    uda_list = []
    labels_map = []
    for conf in conf_list:
        tokenizer = tokenization.FullTokenizer(vocab_file=conf['vocab_file'],
                                               do_lower_case=True)
        processor = raw_data_utils.CFFEXProcessor(conf["labels"])
        uda = get_uda.Get_UDA(conf, tokenizer, processor)
        uda_list.append(uda)
        labels_map.append(conf["labels"])
    return uda_list, labels_map
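# Usage sketch (illustrative, not from the original source): build one UDA
# predictor per tag and score a text with each. The conf dicts below are
# hypothetical; real configs come from conf.py and need at least 'vocab_file'
# and 'labels'. predict() and kill() are the methods used elsewhere in this
# repo.
#
# conf_list = [
#     {'vocab_file': 'albert/vocab.txt', 'labels': ['0', '1']},  # tag 0
#     {'vocab_file': 'albert/vocab.txt', 'labels': ['0', '1']},  # tag 1
# ]
# uda_list, labels_map = get_multitags_uda(conf_list)
# for uda, labels in zip(uda_list, labels_map):
#     probs = uda.predict(text)  # per-class probabilities for this tag
# for uda in uda_list:
#     uda.kill()                 # release the serving sessions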
def main(_):
    if FLAGS.max_seq_length > 512:
        raise ValueError(
            "Cannot use sequence length {:d} because the BERT model "
            "was only trained up to sequence length {:d}".format(
                FLAGS.max_seq_length, 512))

    # processor = raw_data_utils.get_processor(FLAGS.task_name)
    processor = raw_data_utils.CFFEXProcessor(FLAGS.labels,
                                              FLAGS.raw_data_dir,
                                              FLAGS.sup_train_file,
                                              FLAGS.sup_dev_file,
                                              FLAGS.unsup_file)
    # Create tokenizer
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    if FLAGS.data_type == "sup":
        sup_out_dir = FLAGS.output_base_dir
        tf.logging.info("Create sup. data: subset {} => {}".format(
            FLAGS.sub_set, sup_out_dir))
        proc_and_save_sup_data(
            processor, FLAGS.sub_set, FLAGS.raw_data_dir, sup_out_dir,
            tokenizer, FLAGS.max_seq_length, FLAGS.trunc_keep_right,
            FLAGS.worker_id, FLAGS.replicas, FLAGS.sup_size,
        )
    elif FLAGS.data_type == "unsup":
        assert FLAGS.aug_ops is not None, \
            "aug_ops is required to preprocess unsupervised data."
        unsup_out_dir = os.path.join(FLAGS.output_base_dir,
                                     FLAGS.aug_ops,
                                     str(FLAGS.aug_copy_num))
        data_stats_dir = os.path.join(FLAGS.raw_data_dir, "data_stats")
        tf.logging.info("Create unsup. data: subset {} => {}".format(
            FLAGS.sub_set, unsup_out_dir))
        proc_and_save_unsup_data(
            processor, FLAGS.sub_set, FLAGS.raw_data_dir, data_stats_dir,
            unsup_out_dir, tokenizer, FLAGS.max_seq_length,
            FLAGS.trunc_keep_right, FLAGS.aug_ops, FLAGS.aug_copy_num,
            FLAGS.worker_id, FLAGS.replicas)
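# Sketch of the TF1-style flag definitions this entry point assumes. Only
# flags referenced above are listed; defaults and help strings here are
# illustrative, not the original definitions.
#
# flags = tf.flags
# FLAGS = flags.FLAGS
# flags.DEFINE_string("data_type", "sup", "'sup' or 'unsup' preprocessing.")
# flags.DEFINE_string("raw_data_dir", None, "Directory holding the raw CSVs.")
# flags.DEFINE_string("output_base_dir", None, "Where processed data is written.")
# flags.DEFINE_string("sub_set", "train", "Subset to process.")
# flags.DEFINE_string("vocab_file", None, "BERT/ALBERT vocab file.")
# flags.DEFINE_bool("do_lower_case", True, "Lowercase during tokenization.")
# flags.DEFINE_integer("max_seq_length", 128, "Must be <= 512 for BERT.")
# flags.DEFINE_bool("trunc_keep_right", True, "Keep the right end on truncation.")
# flags.DEFINE_string("aug_ops", None, "Augmentation ops for unsup data.")
# flags.DEFINE_integer("aug_copy_num", 0, "Which augmented copy to generate.")
# flags.DEFINE_integer("worker_id", 0, "Shard id of this worker.")
# flags.DEFINE_integer("replicas", 1, "Total number of workers.")
# flags.DEFINE_integer("sup_size", -1, "Number of supervised examples to keep.")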
def illegal_eval_uda():
    tokenizer = tokenization.FullTokenizer(vocab_file=conf['vocab_file'],
                                           do_lower_case=True)
    processor = raw_data_utils.CFFEXProcessor(conf["labels"])
    uda = get_uda.Get_UDA(conf, tokenizer, processor)

    eval_data = read_csv('ssf_uda/data/CFFEX/illegal', 'eval_0.2.csv')
    check_data = pd.DataFrame(columns=('content', 'label', 'predict'))
    # Legacy 8-way label map, kept for reference:
    # label_dict = {7: "恶意抹黑监管机构", 3: "非法发行证券", 1: "非法荐股",
    #               4: "非法配资", 5: "负面言论风险言论", 6: "煽动维权诈骗",
    #               2: "诱导开户", 0: "正常"}
    # Index 0 ("未涉嫌违规") marks compliant text; the rest are violation types.
    label_dict = ["未涉嫌违规", "非法投资咨询", "非法诱导投资", "非法经营活动",
                  "维权追损二次诈骗"]
    labels = conf['labels']

    TP = 0.0
    FN = 0.0
    FP = 0.0
    false_examples = []
    for i in range(len(eval_data)):
        content = eval_data.iloc[i]['content']
        label = label_dict[int(eval_data.iloc[i]['label'])]
        probs = uda.predict(content)
        label_p = label_dict[int(get_label(labels, probs))]
        # DataFrame.append is the pre-2.0 pandas API used throughout this repo.
        check_data = check_data.append(
            [{'content': content,
              'label': json.dumps(label, ensure_ascii=False),
              'predict': json.dumps(label_p, ensure_ascii=False)}],
            ignore_index=True)
        # Binary metrics: any violation label counts as positive.
        if label != "未涉嫌违规" and label_p != "未涉嫌违规":
            TP += 1
        elif label != "未涉嫌违规" and label_p == "未涉嫌违规":
            # false_examples.append([content, label, label_p])
            FN += 1
        elif label == "未涉嫌违规" and label_p != "未涉嫌违规":
            false_examples.append([content, label, label_p])
            FP += 1

    for w in false_examples:
        print(w[0])
        print("ori:{}\tpredict:{}".format(w[1], w[2]))
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    to_csv(check_data, 'ssf_uda/data/CFFEX/illegal', 'illegal_uda.csv')
    print("误识别为违规言论条数:{}".format(len(false_examples)))  # false positives
    print("违规言论精确率:{}, 召回率:{}".format(precision, recall))  # precision, recall
    uda.kill()
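# `get_label` is defined elsewhere in the repo. A minimal sketch of the
# behavior the caller above assumes (argmax over per-class probabilities,
# returning the label whose string form is a class index) -- an assumption,
# not the confirmed implementation:
#
# def get_label(labels, probs):
#     best = max(range(len(probs)), key=lambda j: probs[j])
#     return labels[best]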
def main(_):
    # --- One-off data-prep experiments, kept commented out for reference ---

    # Split each per-tag CSV into an 80/20 train/eval set:
    # for i in range(5):
    #     data = read_csv('ssf_uda/data/CFFEX/multitags/data_tag_{}'.format(i),
    #                     'data_tag_{}.csv'.format(i))
    #     train, eval = split_set_by_ratio(data, 0.8)
    #     to_csv(train, 'ssf_uda/data/CFFEX/multitags/data_tag_{}'.format(i), 'train_0.8.csv')
    #     to_csv(eval, 'ssf_uda/data/CFFEX/multitags/data_tag_{}'.format(i), 'eval_0.2.csv')
    #     print(len(data))

    # tag_list = []
    # # print(str(tag_list).decode("string_escape"))
    # # for tag in tag_list:
    # #     print(tag)
    # tag_set = split_multi_tags(tag_list)
    # print(str(tag_set).decode("string_escape"))

    # train = read_csv('ssf_uda/data/CFFEX/illegal', 'train_0.8.csv')
    # show_info(train)

    # Threshold sweep for the regex-matching baseline:
    # for t in range(0, 30):
    #     cnt = 0
    #     for i in range(len(eval)):
    #         content = eval.iloc[i]["content"]
    #         label = eval.iloc[i]['label']
    #         res = re_match(content, label, t)
    #         if res == 1:
    #             cnt += 1
    #     print(1.0 * cnt / len(eval))
    #     cnt = 0
    #     print(1.0 * cnt / len(eval))

    # eval = read_csv('ssf_uda/data/CFFEX/illegal', 'eval_0.2.csv')

    # Per-label keyword collection (fragment; Python 2 idioms such as has_key):
    # label = train.iloc[i]['label']
    # if not label_keyword_map.has_key(label):
    #     label_keyword_map[label] = []
    # key_words = summary(train.iloc[i]['content'])
    # for word in key_words:
    #     if word not in label_keyword_map[label]:
    #         label_keyword_map[label].append(word)
    # for i in range(1, 5):
    #     print(str(label_keyword_map[i]).decode("unicode_escape"))

    # Keyword extraction over the train/eval/unsup CSVs:
    # train = dataframe_keyword_extraction(train)
    # to_csv(train, 'ssf_uda/data/CFFEX/illegal', 'train_extraction_0.8.csv')
    # unsup = read_csv('ssf_uda/data/CFFEX/illegal', 'unsup.csv')
    # unsup['label'] = 'unsup'
    # to_csv(unsup, 'ssf_uda/data/CFFEX/illegal', 'unsup.csv')
    # eval = read_csv('ssf_uda/data/CFFEX/illegal', 'eval_0.2.csv')
    # eval = dataframe_keyword_extraction(eval)
    # to_csv(eval, 'ssf_uda/data/CFFEX/illegal', 'eval_extraction_0.2.csv')
    # eval = dataframe_keyword_extraction(eval)
    # unsup = dataframe_keyword_extraction(unsup)
    # to_csv(unsup, 'ssf_uda/data/CFFEX/illegal', 'unsup_extraction.csv')
    # unsup.insert(1, 'label', 'unsup')
    # labels = ['unsup'] * len(unsup)
    # unsup['label'] = labels
    # print(unsup.head())
    # to_csv(unsup, 'ssf_uda/data/CFFEX/illegal', 'unsup.csv')

    # train_data = read_csv('ssf_uda/data/CFFEX/illegal', 'unsup.csv')
    # # train_data.drop(['text_split', 'emb_list'], axis=1, inplace=True)
    # show_info(train_data)
    # label_index = train_data.columns.get_loc('content')
    # for i in range(len(train_data)):
    #     content = train_data.iloc[i]['content']
    #     content = keyword_extraction(content)
    #     # train_data.iloc[i, label_index] = content
    #     # print(train_data.iloc[i]['content'])
    # show_info(train_data)
    # to_csv(train_data, 'ssf_uda/data/CFFEX/illegal', 'unsup.csv')
    # to_csv(train_data, 'ssf_uda/data/CFFEX/illegal', "train_extraction_0.8.csv")

    # eval = read_csv('ssf_uda/data/CFFEX/theme3', 'eval_0.2.csv')
    # unsup = read_csv('ssf_uda/data/CFFEX/illegal', 'unsup.csv')
    # show_info(train_data)
    # show_info(eval)
    # show_info(unsup)

    # --- Live preprocessing ---
    if FLAGS.process_for_split:
        train_data = read_csv(FLAGS.raw_data_dir, FLAGS.sup_train_file)
        dev_data = read_csv(FLAGS.raw_data_dir, FLAGS.sup_dev_file)
        # unsup_data = read_csv(FLAGS.raw_data_dir, FLAGS.unsup_file)
        train_data['text_split'] = train_data['content'].apply(
            lambda x: get_split(x, FLAGS.segment_len, FLAGS.segment_overlap))
        dev_data['text_split'] = dev_data['content'].apply(
            lambda x: get_split(x, FLAGS.segment_len, FLAGS.segment_overlap))
        # unsup_data['text_split'] = unsup_data['content'].apply(
        #     lambda x: get_split(x, FLAGS.segment_len, FLAGS.segment_overlap))
        to_csv(train_data, FLAGS.raw_data_dir, FLAGS.sup_train_file)
        to_csv(dev_data, FLAGS.raw_data_dir, FLAGS.sup_dev_file)
        # to_csv(unsup_data, FLAGS.raw_data_dir, FLAGS.unsup_file)

    if FLAGS.process_for_embedding:
        tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                               do_lower_case=True)
        processor = raw_data_utils.CFFEXProcessor(FLAGS.labels)
        uda = get_uda.Get_UDA(conf, tokenizer, processor)
        train_data = read_csv(FLAGS.raw_data_dir, FLAGS.sup_train_file)
        dev_data = read_csv(FLAGS.raw_data_dir, FLAGS.sup_dev_file)
        train_data = encode_emb_list(uda, train_data)
        dev_data = encode_emb_list(uda, dev_data)
        to_csv(train_data, FLAGS.raw_data_dir, FLAGS.sup_train_file)
        to_csv(dev_data, FLAGS.raw_data_dir, FLAGS.sup_dev_file)
        uda.kill()
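# `get_split` is defined elsewhere in the repo. A minimal sketch of the
# behavior its call sites suggest (an assumption based only on the flag
# names): slice `text` into windows of `seg_len` characters so neighbouring
# windows share `overlap` characters.
#
# def get_split(text, seg_len, overlap):
#     step = max(seg_len - overlap, 1)
#     return [text[i:i + seg_len]
#             for i in range(0, max(len(text) - overlap, 1), step)]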
import sys
reload(sys)  # Python 2 only: setdefaultencoding is removed from sys after startup
sys.setdefaultencoding('utf8')

sys.path.append('../ssf_uda')
sys.path.append('../ssf_uda/augmentation')
sys.path.append('../ssf_uda/albert')
sys.path.append('../ssf_uda/utils')

from django.views.decorators.csrf import csrf_exempt
from django.http import JsonResponse, HttpResponse
from ssf_uda import get_uda
from ssf_uda.utils import tokenization, raw_data_utils
from conf import *

# Load the model once at import time so every request reuses it.
tokenizer = tokenization.FullTokenizer(vocab_file=conf['vocab_file'],
                                       do_lower_case=True)
processor = raw_data_utils.CFFEXProcessor(conf["labels"])
# processor = raw_data_utils.CFFEXIllegalProcessor(conf["labels"])
uda = get_uda.Get_UDA(conf, tokenizer, processor)

# Class id -> label string, returned to clients as-is (in Chinese).
dic = {0: "未涉嫌违规", 1: "非法投资咨询", 2: "非法诱导投资",
       3: "非法经营活动", 4: "维权追损二次诈骗"}


# Heartbeat endpoint
def test(request):
    request.encoding = 'utf-8'
    return JsonResponse({"status": 200, "message": "connect success"}, safe=False)


@csrf_exempt
def get_illegal_type(request):
    request.encoding = 'utf-8'
    if request.method != "POST":
        return JsonResponse({"status": 405, "message": "Only POST supported"},
                            safe=False)
    else:
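# Illustrative client call for the heartbeat endpoint. The URL route is an
# assumption (urls.py is not shown above); the response body matches the view.
#
# import requests
# r = requests.get('http://localhost:8000/test')
# print(r.json())  # {"status": 200, "message": "connect success"}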