Example #1
def main(args):
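    """Run GoogleSTT on args.in_wav_folder (assumed to drop .txt transcripts
    into tmp_ASR), then score every transcript with the KoELECTRA classifier
    and append one "<name>:<prediction>" line to args.out_infer_file."""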
    GoogleSTT(args.in_wav_folder)

    model = load_model(device)
    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator", do_lower_case=False)

    if os.path.exists(args.out_infer_file):
        os.remove(args.out_infer_file)

    out_infer_folder = os.path.dirname(args.out_infer_file)
    if not os.path.exists(out_infer_folder): 
        os.makedirs(out_infer_folder)
        
    wf = open(args.out_infer_file, "a")

    files = os.listdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'tmp_ASR'))
    for filename in tqdm(files):
        if os.path.splitext(filename)[1].lower() == '.txt':
            fullPath = os.path.join('tmp_ASR', filename).replace('\\', '/')

            args.infer_file = fullPath
            test_dataset  = load_and_cache_examples(args, tokenizer, mode="infer") 
            preds = evaluate(args, model, test_dataset)

            filename = os.path.splitext(filename)[0]
            text = filename + ':' + str(preds[0])
            print(text, file=wf)

    wf.close()
Example #2
def pred_sm(args):
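    """Score every .txt transcript in tmp_ASR with the KoELECTRA classifier,
    append one "<name>:<prediction>" line per file to args.out_infer_file,
    and return the file names and predictions as two parallel lists."""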
    #os.environ['TRANSFORMERS_CACHE'] = os.path.dirname(os.path.realpath(__file__))+'/cache/'

    model = load_model(device)
    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator", do_lower_case=False)

    if os.path.exists(args.out_infer_file):
        os.remove(args.out_infer_file)

    out_infer_folder = os.path.dirname(args.out_infer_file)
    if not os.path.exists(out_infer_folder): 
        os.makedirs(out_infer_folder)
        
    wf = open(args.out_infer_file, "a")
    resultData = []
    fileList = []
    files = os.listdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'tmp_ASR'))
    files.sort()
    for filename in tqdm(files):
        if os.path.splitext(filename)[1].lower() == '.txt':
            fullPath = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'tmp_ASR', filename).replace('\\', '/')

            args.infer_file = fullPath
            test_dataset  = load_and_cache_examples(args, tokenizer, mode="infer") 
            preds = evaluate(args, model, test_dataset)

            filename = os.path.splitext(filename)[0]
            text = filename + ':' + str(preds[0])
            print(text, file=wf)
            fileList.append(filename)
            resultData.append(str(preds[0]))

    wf.close()
    return fileList, resultData
def stack_base(args, processor, tokenizer, model, stack_train_examples,
               stack_dev_examples):
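    """Build stacking datasets from the given train/dev example lists, train
    the model on them, and log the resulting training and evaluation loss."""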
    train_dataset = load_and_cache_examples(args,
                                            processor,
                                            tokenizer,
                                            mode='stack',
                                            examples=stack_train_examples)
    eval_dataset = load_and_cache_examples(args,
                                           processor,
                                           tokenizer,
                                           mode='stack',
                                           examples=stack_dev_examples)
    train_loss = train(args, model, processor, tokenizer, train_dataset)
    logging.info("stack 训练结束:loss {}".format(train_loss))
    dev = evaluate(args, model, eval_dataset)
    logging.info("stack 验证结束:loss {}".format(dev))
Example #4
def main(cli_args):
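    """Load the task configuration from JSON, build the model and tokenizer,
    optionally train, and evaluate the saved checkpoint(s) on the test set."""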
    # Read from config file and make args
    with open(
            os.path.join(cli_args.config_dir, cli_args.task,
                         cli_args.config_file)) as f:
        args = AttrDict(json.load(f))
    logger.info("Training/evaluation parameters {}".format(args))

    args.output_dir = os.path.join(args.ckpt_dir, args.output_dir)

    init_logger()
    set_seed(args)

    processor = processors[args.task](args)
    labels = processor.get_labels()
    if output_modes[args.task] == "regression":
        config = CONFIG_CLASSES[args.model_type].from_pretrained(
            args.model_name_or_path, num_labels=tasks_num_labels[args.task])
    else:
        config = CONFIG_CLASSES[args.model_type].from_pretrained(
            args.model_name_or_path,
            num_labels=tasks_num_labels[args.task],
            id2label={str(i): label
                      for i, label in enumerate(labels)},
            label2id={label: i
                      for i, label in enumerate(labels)},
        )
    tokenizer = TOKENIZER_CLASSES[args.model_type].from_pretrained(
        args.model_name_or_path, do_lower_case=args.do_lower_case)
    model = MODEL_FOR_SEQUENCE_CLASSIFICATION[args.model_type].from_pretrained(
        args.model_name_or_path, config=config)

    # GPU or CPU
    args.device = "cuda" if torch.cuda.is_available(
    ) and not args.no_cuda else "cpu"
    model.to(args.device)

    # Load dataset
    train_dataset = load_and_cache_examples(
        args, tokenizer, mode="train") if args.train_file else None
    dev_dataset = load_and_cache_examples(
        args, tokenizer, mode="dev") if args.dev_file else None
    test_dataset = load_and_cache_examples(
        args, tokenizer, mode="test") if args.test_file else None

    if dev_dataset is None:
        args.evaluate_test_during_training = True  # If there is no dev set, evaluate on the test set during training

    if args.do_train:
        global_step, tr_loss = train(args, model, train_dataset, dev_dataset,
                                     test_dataset)
        logger.info(" global_step = {}, average loss = {}".format(
            global_step, tr_loss))

    results = {}
    if args.do_eval:
        checkpoints = list(
            os.path.dirname(c) for c in sorted(
                glob.glob(args.output_dir + "/**/" + "pytorch_model.bin",
                          recursive=True)))
        if not args.eval_all_checkpoints:
            checkpoints = checkpoints[-1:]
        else:
            logging.getLogger("transformers.configuration_utils").setLevel(
                logging.WARN)  # Reduce logging
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1]
            model = MODEL_FOR_SEQUENCE_CLASSIFICATION[
                args.model_type].from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args,
                              model,
                              test_dataset,
                              mode="test",
                              global_step=global_step)
            result = dict(
                (k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as f_w:
            for key in sorted(results.keys()):
                f_w.write("{} = {}\n".format(key, str(results[key])))
def evaluate(args, model, tokenizer, device, data_type, prefix=""):
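    """Run SQuAD-style inference over the requested split: collect start/end
    logits for every example, write the prediction files, and, for the dev
    split, return (F1, EM) computed against args.dev_file."""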
    # if data_type == 'test':
    #     dataset,examples,features = load_and_cache_examples(args, tokenizer, data_type, output_examples=True,prefix = prefix)
    # else:
    dataset, examples, features = load_and_cache_examples(args, tokenizer, data_type=data_type, output_examples=True, prefix=prefix)

    output_dir = os.path.join(args.output_dir,args.save_model_name)
    if not os.path.exists(output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(output_dir)

    # args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }


            example_indices = batch[3]

            outputs = model(**inputs)
            # pdb.set_trace()
        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix))


    output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        False,  # version_2_with_negative
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    if data_type == 'test':
        return 
    dev_dir = os.path.join(args.data_dir, args.dev_file)
    dev = json.load(open(dev_dir, 'r'))
    prediction = json.load(open(output_prediction_file))
    F1, EM, TOTAL, SKIP = Eval.evaluate(dev, prediction)
    
    return F1, EM
def train(args):
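    """Fine-tune BertForQuestionAnsweringWithMaskedLM on the MRC data,
    optionally interleaving masked-LM steps, evaluate on the dev set after
    each epoch, and save the best and latest checkpoints plus status.json."""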
    output_dir = os.path.join(args.output_dir,args.save_model_name)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    logfilename = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())+" "+args.save_model_name+".log.txt"
    fh = logging.FileHandler(os.path.join(output_dir,logfilename), mode='a', encoding='utf-8')
    fh.setLevel(logging.INFO)
    # ch = logging.StreamHandler(sys.stdout)
    # ch.setLevel(logging.INFO)
    logger.addHandler(fh)
    # logger.addHandler(ch)

    model_dir = os.path.join("model",'chinese_roberta_wwm_large_ext_pytorch')
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    train_dataset = load_and_cache_examples(args, tokenizer, data_type='train', output_examples=False, prefix=args.train_file.split('.')[0])
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
    
    # setup device
    if args.use_tpu:  # Colab TPU is not better than GPU
        import torch_xla
        import torch_xla.core.xla_model as xm
        import torch_xla.debug.metrics as met
        import torch_xla.distributed.parallel_loader as pl
        import torch_xla.distributed.xla_multiprocessing as xmp
        import torch_xla.utils.utils as xu
        device = xm.xla_device()
    else:
        device = torch.device('cuda:0')
    
    # model
    if args.do_finetune:
        status_dir = os.path.join(output_dir,"status.json")
        status = json.load(open(status_dir,'r'))
        current_model = os.path.join(output_dir, "current_model")
        model = BertForQuestionAnsweringWithMaskedLM.from_pretrained(current_model)
        
    else:
        origin_dir = os.path.join(args.output_dir,args.origin_model)
        model = BertForQuestionAnsweringWithMaskedLM.from_pretrained(origin_dir)
        status = {}
        status['best_epoch'] = 0
        status['best_EM'] = 0.0
        status['best_F1'] = 0.0
        status['current_epoch']  = 0
        # status['global_step'] = 0
        
    model.to(device)
    # Prepare optimizer and schedule (linear warmup and decay)
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1") 
    tr_loss = 0.0
    # global_step = 0
    model.zero_grad()
    epochs_trained = 0
    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    # F1,EM = evaluate(args,model,tokenizer,device)
    # logger.info("Dev F1 = %s, EM = %s on epoch %s",str(F1),str(EM),str(-1))
    # Train!
    ## Randomly shuffle the order of MLM and MRC steps while keeping the 2:1 MLM-to-MRC ratio
    # pdb.set_trace()
    if args.mlm:
        task_split = torch.cat((torch.ones(2*len(train_dataloader)),torch.zeros(len(train_dataloader))))
        task_split = task_split[torch.randperm(task_split.size(0))]

    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        tr_loss = 0
        
        mlm_proportion = float(2/3)
        for step, batch in enumerate(epoch_iterator):
            model.train()
            if args.mlm and task_split[(epoch % 3) * len(train_dataloader) + step] == 1:
                input_ids, masked_lm_labels = mask_tokens(batch[0], tokenizer, args)
                masked_lm_labels = masked_lm_labels.to(device)
                input_ids = input_ids.to(device)
                batch = tuple(t.to(device) for t in batch)
                
                inputs = {
                    "input_ids": input_ids,
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    "masked_lm_labels": masked_lm_labels,
                }
            else:
                if args.acp:
                    answer_content_labels = make_answer_content(batch[0],batch[3],batch[4])
                    answer_content_labels = answer_content_labels.to(device)
                else:
                    answer_content_labels = None
                # pdb.set_trace()
                batch = tuple(t.to(device) for t in batch)
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    "start_positions": batch[3],
                    "end_positions": batch[4],
                    "answer_content_labels":answer_content_labels
                }
            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            # loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step() 
                model.zero_grad()
            if (step + 1) % args.check_loss_step == 0 or step == len(train_dataloader) - 1:
                avg_loss = tr_loss / (step + 1)
                logger.info("\t average_step_loss=%s @ step = %s on epoch = %s", str(avg_loss), str(step + 1), str(epoch + 1))
                
        # F1 , EM  = 11,22
        if args.do_eval:
            F1, EM = evaluate(args, model, tokenizer, device, data_type='dev', prefix=args.dev_file.split('.')[0])
            logger.info("Dev F1 = %s, EM = %s on epoch %s", str(F1), str(EM), str(epoch + 1))
            # save the best model 
            output_dir = os.path.join(args.output_dir,args.save_model_name)
            if F1 > status['best_F1']:
                status['best_F1'] = F1
                status['best_EM'] = EM
                status['best_epoch'] = epoch
                model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                best_model_dir = os.path.join(output_dir,"best_model")
                # output_dir = os.path.join(output_dir, 'checkpoint-{}'.format(epoch + 1))
                if not os.path.exists(best_model_dir):
                    os.makedirs(best_model_dir)
                model_to_save.save_pretrained(best_model_dir)
                logger.info("best epoch %d has been saved to %s",epoch,best_model_dir)
            # save current model
        status['current_epoch'] = epoch
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_dir = os.path.join(args.output_dir,args.save_model_name)
        current_model_dir = os.path.join(output_dir,"current_model")
        
        if not os.path.exists(current_model_dir):
            os.makedirs(current_model_dir)
        model_to_save.save_pretrained(current_model_dir)
        logger.info("epoch %d has been saved to %s",epoch,current_model_dir)
        # save status
        status_dir = os.path.join(output_dir,"status.json")
        with open(status_dir, 'w', encoding='utf8') as status_file:
            json.dump(status, status_file)
Example #7
        print('ENSEMBLE_DIR_LIST:{}'.format(ensemble_dir_list))
    model_path_list = [x.strip() for x in ensemble_dir_list]
    print('model_path_list:{}'.format(model_path_list))

    # device = torch.device(f'cuda:{GPU_IDS[0]}')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = EnsembleModel(model=model, model_path_list=model_path_list, device=device, lamb=lamb)
    labels = base_predict(test_dataset, model, id2label, ensemble=True, vote=True)
    return labels




if __name__ == '__main__':
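    # Parse CLI args, build the model and tokenizer, then run single-model,
    # ensemble, and raw-text prediction on the test dataset.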
    args = Args().get_parser()
    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    processor = newsProcessor()
    args.label2id = processor.get_labels()
    args.id2label = {i: label for i, label in enumerate(args.label2id)}
    model, tokenizer = create_model(args)
    test_dataset = load_and_cache_examples(args, processor, tokenizer, mode='test')

    labels_list = single_predict(test_dataset, model, args.id2label)
    print(labels_list)

    labels_list = ensemble_predict(test_dataset, model, args.id2label)
    print(labels_list)

    text = ["对于我国的科技巨头华为而言,2019年注定是不平凡的一年,由于在5G领域遥遥领先于其他国家,华为遭到了不少方面的觊觎,并因此承受了太多不公平地对待,在零部件供应、核心技术研发、以及市场等多个领域受到了有意打压。但是华为并没有因此而一蹶不振,而是亮出了自己的一张又一张“底牌”,随着麒麟处理器、海思半导体以及鸿蒙操作系统的闪亮登场,华为也向世界证明了自己的实力,上演了一场几乎完美的绝地反击。"]
    label_list = text_predict(text, model, tokenizer, args.id2label)
    print(label_list)
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)

    args = Args().get_parser()
    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    processor = newsProcessor()
    args.label2id = processor.get_labels()
    args.id2label = {i: label for i, label in enumerate(args.label2id)}
    args.output_dir = os.path.join(args.output_dir, args.bert_type)
    model, tokenizer = create_model(args)
    model.to(args.device)

    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                processor,
                                                tokenizer,
                                                mode="train")
        train_loss = train(args, model, processor, tokenizer, train_dataset)
        logging.info("训练结束:loss {}".format(train_loss))

    if args.do_eval:
        eval_dataset = load_and_cache_examples(args,
                                               processor,
                                               tokenizer,
                                               mode="dev")
        eval_result = evaluate(args, model, eval_dataset)
        logging.info("Evaluation finished: {}".format(eval_result))

    if args.do_stack:
        stacking(args, processor, tokenizer, model)