Example #1
def get_multitags_uda(conf_list):
    """Build one UDA predictor and its label list for every per-tag config in conf_list."""
    uda_list = []
    labels_map = []
    for conf in conf_list:
        tokenizer = tokenization.FullTokenizer(vocab_file=conf['vocab_file'],
                                               do_lower_case=True)
        processor = raw_data_utils.CFFEXProcessor(conf["labels"])
        uda = get_uda.Get_UDA(conf, tokenizer, processor)
        uda_list.append(uda)
        labels_map.append(conf["labels"])
    return uda_list, labels_map
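
A minimal sketch of how this helper might be driven, assuming each per-tag config is a dict carrying at least the 'vocab_file' and 'labels' keys used above; the config file names and tag count below are placeholders, not taken from the original repo:

import json

# Hypothetical per-tag config files -- adjust paths and count to the actual setup.
conf_files = ['conf_tag_{}.json'.format(i) for i in range(5)]
conf_list = [json.load(open(path)) for path in conf_files]

uda_list, labels_map = get_multitags_uda(conf_list)
for uda, labels in zip(uda_list, labels_map):
    probs = uda.predict('some input text')   # Get_UDA.predict, as used in the later examples
    print(labels, probs)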
Example #2
def main(_):
    """Preprocess CFFEX data into supervised ("sup") or augmented unsupervised ("unsup") examples."""

    if FLAGS.max_seq_length > 512:
        raise ValueError(
            "Cannot use sequence length {:d} because the BERT model "
            "was only trained up to sequence length {:d}".format(
                FLAGS.max_seq_length, 512))

    # processor = raw_data_utils.get_processor(FLAGS.task_name)
    processor = raw_data_utils.CFFEXProcessor(FLAGS.labels, FLAGS.raw_data_dir,
                                              FLAGS.sup_train_file,
                                              FLAGS.sup_dev_file,
                                              FLAGS.unsup_file)
    # Create tokenizer
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    if FLAGS.data_type == "sup":
        sup_out_dir = FLAGS.output_base_dir
        tf.logging.info("Create sup. data: subset {} => {}".format(
            FLAGS.sub_set, sup_out_dir))

        proc_and_save_sup_data(
            processor,
            FLAGS.sub_set,
            FLAGS.raw_data_dir,
            sup_out_dir,
            tokenizer,
            FLAGS.max_seq_length,
            FLAGS.trunc_keep_right,
            FLAGS.worker_id,
            FLAGS.replicas,
            FLAGS.sup_size,
        )
    elif FLAGS.data_type == "unsup":
        assert FLAGS.aug_ops is not None, \
            "aug_ops is required to preprocess unsupervised data."
        unsup_out_dir = os.path.join(FLAGS.output_base_dir, FLAGS.aug_ops,
                                     str(FLAGS.aug_copy_num))
        data_stats_dir = os.path.join(FLAGS.raw_data_dir, "data_stats")

        tf.logging.info("Create unsup. data: subset {} => {}".format(
            FLAGS.sub_set, unsup_out_dir))
        proc_and_save_unsup_data(processor, FLAGS.sub_set, FLAGS.raw_data_dir,
                                 data_stats_dir, unsup_out_dir, tokenizer,
                                 FLAGS.max_seq_length, FLAGS.trunc_keep_right,
                                 FLAGS.aug_ops, FLAGS.aug_copy_num,
                                 FLAGS.worker_id, FLAGS.replicas)
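
The flags referenced by this main(_) are defined elsewhere in the preprocessing script; a trimmed sketch of how they are typically declared with TF 1.x flags, and how the script would be launched (defaults here are placeholders):

import tensorflow as tf

flags = tf.flags
FLAGS = flags.FLAGS

flags.DEFINE_string("data_type", "sup", "Either 'sup' or 'unsup'.")
flags.DEFINE_string("raw_data_dir", None, "Directory holding the raw CSV files.")
flags.DEFINE_string("output_base_dir", None, "Where the processed output is written.")
flags.DEFINE_string("vocab_file", None, "BERT vocab file for FullTokenizer.")
flags.DEFINE_bool("do_lower_case", True, "Lowercase the input before tokenizing.")
flags.DEFINE_integer("max_seq_length", 128, "Must stay <= 512 for the BERT checkpoint.")
# ... the remaining flags (labels, sub_set, aug_ops, aug_copy_num, worker_id, replicas, ...)
#     follow the same pattern and are omitted here.

if __name__ == "__main__":
    tf.app.run(main=main)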
Example #3
def illegal_eval_uda():
    # Relies on a module-level `conf` dict (vocab_file, labels, ...), as in the other examples.
    tokenizer = tokenization.FullTokenizer(vocab_file=conf['vocab_file'], do_lower_case=True)
    processor = raw_data_utils.CFFEXProcessor(conf["labels"])
    uda = get_uda.Get_UDA(conf, tokenizer, processor)

    eval_data = read_csv('ssf_uda/data/CFFEX/illegal', 'eval_0.2.csv')

    check_data = pd.DataFrame(columns=('content', 'label', 'predict'))

    # label_dict = {7:"恶意抹黑监管机构", 3:"非法发行证券",1:"非法荐股",4:"非法配资",5:"负面言论风险言论",6:"煽动维权诈骗",
    #               2:"诱导开户",0:"正常"}
    label_dict = ["未涉嫌违规", "非法投资咨询", "非法诱导投资", "非法经营活动", "维权追损二次诈骗"]
    labels = conf['labels']

    TP = 0.0
    FN = 0.0
    FP = 0.0
    false_examples = []
    for i in range(len(eval_data)):
        content = eval_data.iloc[i]['content']
        label = label_dict[int(eval_data.iloc[i]['label'])]
        probs = uda.predict(content)
        label_p = label_dict[int(get_label(labels, probs))]
        # DataFrame.append was removed in pandas 2.0; on newer pandas collect rows and use pd.concat instead.
        check_data = check_data.append([{'content': content, 'label': json.dumps(label, ensure_ascii=False),
                                         'predict': json.dumps(label_p, ensure_ascii=False)}], ignore_index=True)

        if label != "未涉嫌违规" and label_p != "未涉嫌违规":
            TP += 1
        elif label != "未涉嫌违规" and label_p == "未涉嫌违规":
            # false_examples.append([content, label, label_p])
            FN += 1
        elif label == "未涉嫌违规" and label_p != "未涉嫌违规":
            false_examples.append([content, label, label_p])
            FP += 1

    for w in false_examples:
        print(w[0])
        print("ori:{}\tpredict:{}".format(w[1], w[2]))
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    to_csv(check_data, 'ssf_uda/data/CFFEX/illegal', 'illegal_uda.csv')
    # The two prints below report: number of normal posts misidentified as violations,
    # then precision and recall for the violation class.
    print("误识别为违规言论条数:{}".format(len(false_examples)))
    print("违规言论精确率:{}, 召回率:{}".format(precision, recall))

    uda.kill()
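
Note that the two divisions above raise ZeroDivisionError when the eval set yields no predicted or no actual violations; a small guarded variant, illustrative only and not part of the original script:

def safe_prf(tp, fp, fn):
    # Precision / recall / F1 that fall back to 0.0 instead of dividing by zero.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1

print(safe_prf(TP, FP, FN))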
Example #4
def main(_):

    # --- Commented-out one-off data-prep experiments kept from the original script ---
    # for i in range(5):
    #     data  = read_csv('ssf_uda/data/CFFEX/multitags/data_tag_{}'.format(i), 'data_tag_{}.csv'.format(i))
    #     train, eval = split_set_by_ratio(data,0.8)
    #     to_csv(train,'ssf_uda/data/CFFEX/multitags/data_tag_{}'.format(i), 'train_0.8.csv')
    #     to_csv(eval, 'ssf_uda/data/CFFEX/multitags/data_tag_{}'.format(i), 'eval_0.2.csv')
    # print(len(data))
    # tag_list = []
    # # print(str(tag_list).decode("string_escape"))
    # # for tag in tag_list:
    # #     print(tag)
    # tag_set = split_multi_tags(tag_list)
    # print(str(tag_set).decode("string_escape"))
    # train = read_csv('ssf_uda/data/CFFEX/illegal', 'train_0.8.csv')
    # show_info(train)

    # for t in range(0, 30):
    #     cnt = 0
    #     for i in range(len(eval)):
    #         content = eval.iloc[i]["content"]
    #         label = eval.iloc[i]['label']
    #         res = re_match(content, label, t)
    #         if res == 1:
    #             cnt +=1
    #     print(1.0*cnt/len(eval))
    # cnt = 0

    # print(1.0 * cnt / len(eval))
    # eval = read_csv('ssf_uda/data/CFFEX/illegal', 'eval_0.2.csv')
    #     label = train.iloc[i]['label']
    #     if not label_keyword_map.has_key(label):
    #         label_keyword_map[label] = []
    #     key_words = summary(train.iloc[i]['content'])
    #     for word in key_words:
    #         if word not in label_keyword_map[label]:
    #             label_keyword_map[label].append(word)
    # for i in range(1,5):
    #     print(str(label_keyword_map[i]).decode("unicode_escape"))
    # train = dataframe_keyword_extraction(train)
    # to_csv(train, 'ssf_uda/data/CFFEX/illegal', 'train_extraction_0.8.csv')
    # unsup = read_csv('ssf_uda/data/CFFEX/illegal', 'unsup.csv')
    # unsup['label'] = 'unsup'
    # to_csv(unsup, 'ssf_uda/data/CFFEX/illegal', 'unsup.csv')
    # eval = read_csv('ssf_uda/data/CFFEX/illegal', 'eval_0.2.csv')
    # eval = dataframe_keyword_extraction(eval)
    # to_csv(eval, 'ssf_uda/data/CFFEX/illegal', 'eval_extraction_0.2.csv')
    # eval = dataframe_keyword_extraction(eval)
    # unsup = dataframe_keyword_extraction(unsup)
    # to_csv(unsup, 'ssf_uda/data/CFFEX/illegal', 'unsup_extration.csv')
    # unsup.insert(1,'label','unsup')

    # labels = ['unsup']*len(unsup)
    # unsup['label'] = labels
    # print(unsup.head())
    # to_csv(unsup,'ssf_uda/data/CFFEX/illegal', 'unsup.csv')
    # train_data = read_csv('ssf_uda/data/CFFEX/illegal', 'unsup.csv')
    # # train_data.drop(['text_split','emb_list'],axis=1,inplace=True)
    # show_info(train_data)
    # label_index = train_data.columns.get_loc('content')
    # for i in range(len(train_data)):
    #     content = train_data.iloc[i]['content']
    #     content = keyword_extraction(content)
    #
    #     train_data.iloc[i,label_index] = content
    #     # print(train_data.iloc[i]['content'])
    # show_info(train_data)
    # to_csv(train_data,'ssf_uda/data/CFFEX/illegal','unsup.csv')
    # to_csv(train_data,'ssf_uda/data/CFFEX/illegal',"train_extraction_0.8.csv")

    # eval = read_csv('ssf_uda/data/CFFEX/theme3', 'eval_0.2.csv')
    # unsup = read_csv('ssf_uda/data/CFFEX/illegal', 'unsup.csv')
    # show_info(train_data)
    # show_info(eval)
    # show_info(unsup)

    if FLAGS.process_for_split:
        train_data = read_csv(FLAGS.raw_data_dir, FLAGS.sup_train_file)
        dev_data = read_csv(FLAGS.raw_data_dir, FLAGS.sup_dev_file)
        # unsup_data = read_csv(FLAGS.raw_data_dir, FLAGS.unsup_file)
        train_data['text_split'] = train_data['content'].apply(
            lambda x: get_split(x, FLAGS.segment_len, FLAGS.segment_overlap))
        dev_data['text_split'] = dev_data['content'].apply(
            lambda x: get_split(x, FLAGS.segment_len, FLAGS.segment_overlap))
        # unsup_data['text_split'] = unsup_data['content'].apply(lambda x: get_split(x, FLAGS.segment_len, FLAGS.segment_overlap))
        to_csv(train_data, FLAGS.raw_data_dir, FLAGS.sup_train_file)
        to_csv(dev_data, FLAGS.raw_data_dir, FLAGS.sup_dev_file)
        # to_csv(unsup_data, FLAGS.raw_data_dir, FLAGS.unsup_file)

    if FLAGS.process_for_embedding:
        tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                               do_lower_case=True)
        processor = raw_data_utils.CFFEXProcessor(FLAGS.labels)
        # NOTE: `conf` is not defined inside this main(); the original code assumes a
        # module-level config dict, as in the other examples.
        uda = get_uda.Get_UDA(conf, tokenizer, processor)
        train_data = read_csv(FLAGS.raw_data_dir, FLAGS.sup_train_file)
        dev_data = read_csv(FLAGS.raw_data_dir, FLAGS.sup_dev_file)
        train_data = encode_emb_list(uda, train_data)
        dev_data = encode_emb_list(uda, dev_data)
        to_csv(train_data, FLAGS.raw_data_dir, FLAGS.sup_train_file)
        to_csv(dev_data, FLAGS.raw_data_dir, FLAGS.sup_dev_file)
        uda.kill()
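
get_split is not shown in these examples; a plausible sketch of the helper implied by the call above, splitting a long text into fixed-length, overlapping character windows. Treat it as an assumption about the signature, not the repo's actual implementation:

def get_split(text, segment_len, segment_overlap):
    # Hypothetical reimplementation: cut `text` into windows of `segment_len`
    # characters, each window starting `segment_len - segment_overlap` after the last.
    step = max(segment_len - segment_overlap, 1)
    return [text[i:i + segment_len] for i in range(0, max(len(text), 1), step)]

print(get_split("abcdefghij", segment_len=4, segment_overlap=2))
# ['abcd', 'cdef', 'efgh', 'ghij', 'ij']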
Example #5
        )

    def predict(self, text):
        # Feed one example into the input queue and pull the next result from the
        # estimator's predict() generator; returns None if prediction fails.
        predict_result = None
        try:
            counter = timeCounter('predict stage')
            counter()
            self.q.put(text)
            predict_result = next(self.predictor)
            counter()
        except Exception as e:
            print(repr(e))
        return predict_result


if __name__ == "__main__":
    tokenizer = tokenization.FullTokenizer(vocab_file=conf['vocab_file'],
                                           do_lower_case=True)
    processor = raw_data_utils.get_processor(conf['task_name'])
    uda = Get_UDA(conf, tokenizer, processor)
    tf.logging.info("***** Running prediction *****")
    # Example input (roughly: "Stock index futures are like Sun Wukong. Humans and gods
    # can't be close playmates; it's hard to play together happily.")
    text = '股指期货就像孙悟空。人跟神,不能做亲密的小伙伴,实在不好愉快的玩耍。'
    predict_result = uda.predict(text)
    # for key in predict_result.keys():
    #     predict_result[key] = predict_result[key].item()
    p = predict_result
    print(type(p['probabilities']))
    print(type(list(p['probabilities'])))
    # for p in predict_result:
    #     print(p)
    #     break
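
The result of predict() is an estimator output dict whose 'probabilities' entry holds per-class scores; a hedged example of turning that into a label name, assuming the score order matches conf['labels'] (the argmax convention is an assumption, not taken from the repo):

import numpy as np

probs = np.asarray(p['probabilities']).ravel()   # per-class scores from the predictor
labels = conf['labels']                          # label names in model order (assumed)
best = int(np.argmax(probs))
print("predicted label: {} (p={:.3f})".format(labels[best], probs[best]))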