Example #1
def illegal_eval_uda():
    tokenizer = tokenization.FullTokenizer(vocab_file=conf['vocab_file'],
                                           do_lower_case=True)
    processor = raw_data_utils.CFFEXProcessor(conf["labels"])
    uda = get_uda.Get_UDA(conf, tokenizer, processor)

    eval_data = read_csv('cffex_uda/data/CFFEX/illegal', 'eval_0.2.csv')

    check_data = pd.DataFrame(columns=('content', 'label', 'predict'))

    # label_dict = {7: "maliciously smearing regulators", 3: "illegal securities issuance", 1: "illegal stock tipping",
    #               4: "illegal margin financing", 5: "negative / risky remarks", 6: "inciting rights-protection fraud",
    #               2: "luring account opening", 0: "normal"}
    label_dict = ["未涉嫌违规", "非法投资咨询", "非法诱导投资", "非法经营活动", "维权追损二次诈骗"]
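    # English gloss of the labels above: "not suspected of violation", "illegal investment consulting",
    # "illegal inducement to invest", "illegal business activity", "secondary fraud targeting loss-recovery victims"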
    labels = conf['labels']

    TP = 0.0
    FN = 0.0
    FP = 0.0
    false_examples = []
    for i in range(len(eval_data)):
        content = eval_data.iloc[i]['content']
        label = label_dict[int(eval_data.iloc[i]['label'])]
        probs = uda.predict(content)
        label_p = label_dict[int(get_label(labels, probs))]
        # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
        check_data = pd.concat(
            [check_data,
             pd.DataFrame([{
                 'content': content,
                 'label': json.dumps(label, ensure_ascii=False),
                 'predict': json.dumps(label_p, ensure_ascii=False)
             }])],
            ignore_index=True)

        if label != "未涉嫌违规" and label_p != "未涉嫌违规":
            TP += 1
        elif label != "未涉嫌违规" and label_p == "未涉嫌违规":
            # false_examples.append([content, label, label_p])
            FN += 1
        elif label == "未涉嫌违规" and label_p != "未涉嫌违规":
            false_examples.append([content, label, label_p])
            FP += 1

    for w in false_examples:
        print(w[0])
        print("ori:{}\tpredict:{}".format(w[1], w[2]))
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    to_csv(check_data, 'cffex_uda/data/CFFEX/illegal', 'illegal_uda.csv')
    print("Messages misclassified as violations: {}".format(len(false_examples)))
    print("Violation precision: {}, recall: {}".format(precision, recall))

    uda.kill()
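
The helper get_label used above is not defined in this excerpt. A minimal sketch, assuming conf['labels'] is a list of string label ids (e.g. ["0", "1", ...]) and probs is a per-class probability vector, so the caller's int(...) conversion and label_dict lookup still work:

import numpy as np

def get_label(labels, probs):
    # Hypothetical helper: pick the most probable class and return its label id,
    # which the caller converts to int and maps through label_dict.
    idx = int(np.argmax(probs))
    return labels[idx]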
Example #2
def main(_):

    if FLAGS.max_seq_length > 512:
        raise ValueError(
            "Cannot use sequence length {:d} because the BERT model "
            "was only trained up to sequence length {:d}".format(
                FLAGS.max_seq_length, 512))

    # processor = raw_data_utils.get_processor(FLAGS.task_name)
    processor = raw_data_utils.CFFEXProcessor(FLAGS.labels, FLAGS.raw_data_dir,
                                              FLAGS.sup_train_file,
                                              FLAGS.sup_dev_file,
                                              FLAGS.unsup_file)
    # Create tokenizer
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    if FLAGS.data_type == "sup":
        sup_out_dir = FLAGS.output_base_dir
        tf.logging.info("Create sup. data: subset {} => {}".format(
            FLAGS.sub_set, sup_out_dir))

        proc_and_save_sup_data(
            processor,
            FLAGS.sub_set,
            FLAGS.raw_data_dir,
            sup_out_dir,
            tokenizer,
            FLAGS.max_seq_length,
            FLAGS.trunc_keep_right,
            FLAGS.worker_id,
            FLAGS.replicas,
            FLAGS.sup_size,
        )
    elif FLAGS.data_type == "unsup":
        assert FLAGS.aug_ops is not None, \
            "aug_ops is required to preprocess unsupervised data."
        unsup_out_dir = os.path.join(FLAGS.output_base_dir, FLAGS.aug_ops,
                                     str(FLAGS.aug_copy_num))
        data_stats_dir = os.path.join(FLAGS.raw_data_dir, "data_stats")

        tf.logging.info("Create unsup. data: subset {} => {}".format(
            FLAGS.sub_set, unsup_out_dir))
        proc_and_save_unsup_data(processor, FLAGS.sub_set, FLAGS.raw_data_dir,
                                 data_stats_dir, unsup_out_dir, tokenizer,
                                 FLAGS.max_seq_length, FLAGS.trunc_keep_right,
                                 FLAGS.aug_ops, FLAGS.aug_copy_num,
                                 FLAGS.worker_id, FLAGS.replicas)
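
The flag definitions and entry point are outside this excerpt; TF 1.x preprocessing scripts like this one are usually launched as follows (a sketch, assuming the standard tf.app runner):

if __name__ == "__main__":
    # Parse FLAGS and invoke main(_) -- standard TensorFlow 1.x pattern.
    tf.app.run(main)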
Example #3
import json
import sys

from django.views.decorators.csrf import csrf_exempt

sys.path.append('../cffex_uda')
sys.path.append('../cffex_uda/augmentation')
sys.path.append('../cffex_uda/albert')
sys.path.append('../cffex_uda/utils')

from django.http import JsonResponse
from cffex_uda import get_uda
from cffex_uda.utils import tokenization, raw_data_utils
from conf import *

tokenizer = tokenization.FullTokenizer(vocab_file=conf['vocab_file'],
                                       do_lower_case=True)
processor = raw_data_utils.CFFEXProcessor(conf["labels"])
uda = get_uda.Get_UDA(conf, tokenizer, processor)


# Heartbeat / connectivity check
def test(request):
    request.encoding = 'utf-8'
    return JsonResponse({
        "status": 200,
        "message": "connect success"
    }, safe=False)


@csrf_exempt
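def predict(request):
    # NOTE: the original view body is cut off in this excerpt. The code below is a
    # hypothetical completion, assuming a JSON POST body with a "content" field and
    # the module-level `uda` object defined above; names and fields are assumptions.
    request.encoding = 'utf-8'
    body = json.loads(request.body)
    content = body.get('content', '')
    probs = uda.predict(content)
    # probs is assumed to be an iterable of per-class scores.
    return JsonResponse({
        "status": 200,
        "probs": [float(p) for p in probs]
    }, safe=False)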
Example #4
def main(_):

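    # The block below is leftover exploratory preprocessing / evaluation code,
    # kept commented out for reference.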
    # for i in range(5):
    #     data  = read_csv('cffex_uda/data/CFFEX/multitags/data_tag_{}'.format(i), 'data_tag_{}.csv'.format(i))
    #     train, eval = split_set_by_ratio(data,0.8)
    #     to_csv(train,'cffex_uda/data/CFFEX/multitags/data_tag_{}'.format(i), 'train_0.8.csv')
    #     to_csv(eval, 'cffex_uda/data/CFFEX/multitags/data_tag_{}'.format(i), 'eval_0.2.csv')
    # print(len(data))
    # tag_list = []
    # # print(str(tag_list).decode("string_escape"))
    # # for tag in tag_list:
    # #     print(tag)
    # tag_set = split_multi_tags(tag_list)
    # print(str(tag_set).decode("string_escape"))
    # train = read_csv('cffex_uda/data/CFFEX/illegal', 'train_0.8.csv')
    # show_info(train)

    # for t in range(0, 30):
    #     cnt = 0
    #     for i in range(len(eval)):
    #         content = eval.iloc[i]["content"]
    #         label = eval.iloc[i]['label']
    #         res = re_match(content, label, t)
    #         if res == 1:
    #             cnt +=1
    #     print(1.0*cnt/len(eval))
    # cnt = 0

    # print(1.0 * cnt / len(eval))
    # eval = read_csv('cffex_uda/data/CFFEX/illegal', 'eval_0.2.csv')
    #     label = train.iloc[i]['label']
    #     if not label_keyword_map.has_key(label):
    #         label_keyword_map[label] = []
    #     key_words = summary(train.iloc[i]['content'])
    #     for word in key_words:
    #         if word not in label_keyword_map[label]:
    #             label_keyword_map[label].append(word)
    # for i in range(1,5):
    #     print(str(label_keyword_map[i]).decode("unicode_escape"))
    # train = dataframe_keyword_extraction(train)
    # to_csv(train, 'cffex_uda/data/CFFEX/illegal', 'train_extraction_0.8.csv')
    # unsup = read_csv('cffex_uda/data/CFFEX/illegal', 'unsup.csv')
    # unsup['label'] = 'unsup'
    # to_csv(unsup, 'cffex_uda/data/CFFEX/illegal', 'unsup.csv')
    # eval = read_csv('cffex_uda/data/CFFEX/illegal', 'eval_0.2.csv')
    # eval = dataframe_keyword_extraction(eval)
    # to_csv(eval, 'cffex_uda/data/CFFEX/illegal', 'eval_extraction_0.2.csv')
    # eval = dataframe_keyword_extraction(eval)
    # unsup = dataframe_keyword_extraction(unsup)
    # to_csv(unsup, 'cffex_uda/data/CFFEX/illegal', 'unsup_extration.csv')
    # unsup.insert(1,'label','unsup')

    # labels = ['unsup']*len(unsup)
    # unsup['label'] = labels
    # print(unsup.head())
    # to_csv(unsup,'cffex_uda/data/CFFEX/illegal', 'unsup.csv')
    # train_data = read_csv('cffex_uda/data/CFFEX/illegal', 'unsup.csv')
    # # train_data.drop(['text_split','emb_list'],axis=1,inplace=True)
    # show_info(train_data)
    # label_index = train_data.columns.get_loc('content')
    # for i in range(len(train_data)):
    #     content = train_data.iloc[i]['content']
    #     content = keyword_extraction(content)
    #
    #     train_data.iloc[i,label_index] = content
    #     # print(train_data.iloc[i]['content'])
    # show_info(train_data)
    # to_csv(train_data,'cffex_uda/data/CFFEX/illegal','unsup.csv')
    # to_csv(train_data,'cffex_uda/data/CFFEX/illegal',"train_extraction_0.8.csv")

    # eval = read_csv('cffex_uda/data/CFFEX/theme3', 'eval_0.2.csv')
    # unsup = read_csv('cffex_uda/data/CFFEX/illegal', 'unsup.csv')
    # show_info(train_data)
    # show_info(eval)
    # show_info(unsup)

    if FLAGS.process_for_split:
        train_data = read_csv(FLAGS.raw_data_dir, FLAGS.sup_train_file)
        dev_data = read_csv(FLAGS.raw_data_dir, FLAGS.sup_dev_file)
        # unsup_data = read_csv(FLAGS.raw_data_dir, FLAGS.unsup_file)
        train_data['text_split'] = train_data['content'].apply(
            lambda x: get_split(x, FLAGS.segment_len, FLAGS.segment_overlap))
        dev_data['text_split'] = dev_data['content'].apply(
            lambda x: get_split(x, FLAGS.segment_len, FLAGS.segment_overlap))
        # unsup_data['text_split'] = unsup_data['content'].apply(lambda x: get_split(x, FLAGS.segment_len, FLAGS.segment_overlap))
        to_csv(train_data, FLAGS.raw_data_dir, FLAGS.sup_train_file)
        to_csv(dev_data, FLAGS.raw_data_dir, FLAGS.sup_dev_file)
        # to_csv(unsup_data, FLAGS.raw_data_dir, FLAGS.unsup_file)

    if FLAGS.process_for_embedding:
        tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                               do_lower_case=True)
        processor = raw_data_utils.CFFEXProcessor(FLAGS.labels)
        uda = get_uda.Get_UDA(conf, tokenizer, processor)
        train_data = read_csv(FLAGS.raw_data_dir, FLAGS.sup_train_file)
        dev_data = read_csv(FLAGS.raw_data_dir, FLAGS.sup_dev_file)
        train_data = encode_emb_list(uda, train_data)
        dev_data = encode_emb_list(uda, dev_data)
        to_csv(train_data, FLAGS.raw_data_dir, FLAGS.sup_train_file)
        to_csv(dev_data, FLAGS.raw_data_dir, FLAGS.sup_dev_file)
        uda.kill()
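
get_split and encode_emb_list are project helpers not shown in this excerpt. A minimal sketch of the splitting step, under the assumption that it slices a long text into overlapping character windows controlled by segment_len and segment_overlap:

def get_split(text, segment_len, segment_overlap):
    # Hypothetical sliding-window splitter: advance by
    # (segment_len - segment_overlap) characters at each step.
    step = max(1, segment_len - segment_overlap)
    return [text[start:start + segment_len]
            for start in range(0, max(1, len(text)), step)]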