def main(args):
    """Transcribe a folder of WAV files and write one prediction per transcript.

    Calls ``GoogleSTT`` on ``args.in_wav_folder``, then runs the classifier on
    every ``.txt`` file found in the ``tmp_ASR`` directory next to this script
    and writes ``<file stem>:<prediction>`` lines to ``args.out_infer_file``.

    Args:
        args: Namespace providing ``in_wav_folder`` and ``out_infer_file``.
            ``args.infer_file`` is overwritten for every transcript processed.
    """
    GoogleSTT(args.in_wav_folder)
    model = load_model(device)
    tokenizer = ElectraTokenizer.from_pretrained(
        "monologg/koelectra-base-v3-discriminator", do_lower_case=False)

    # Start from a fresh output file.
    if os.path.exists(args.out_infer_file):
        os.remove(args.out_infer_file)
    out_infer_folder = os.path.dirname(args.out_infer_file)
    # dirname is '' when the output file lives in the current directory;
    # os.makedirs('') would raise, so only create a real directory.
    if out_infer_folder:
        os.makedirs(out_infer_folder, exist_ok=True)

    asr_dir = os.path.dirname(os.path.realpath(__file__)) + '/tmp_ASR'
    files = os.listdir(asr_dir)

    # The context manager guarantees the results are flushed and the handle
    # is closed even if inference fails part-way through.
    with open(args.out_infer_file, "a") as wf:
        for filename in tqdm(files):
            stem, ext = os.path.splitext(filename)
            if ext.lower() != '.txt':
                continue
            # NOTE(review): the directory listing above is relative to this
            # script, but the inference path below is relative to the current
            # working directory - confirm both always point at the same folder.
            args.infer_file = "tmp_ASR/{}".format(filename)
            test_dataset = load_and_cache_examples(args, tokenizer, mode="infer")
            preds = evaluate(args, model, test_dataset)
            print(stem + ':' + str(preds[0]), file=wf)
def pred_sm(args):
    """Classify every transcript in ``tmp_ASR`` and return the predictions.

    Each ``.txt`` file in the ``tmp_ASR`` directory next to this script is fed
    to the model in sorted filename order. Results are written to
    ``args.out_infer_file`` as ``<file stem>:<prediction>`` lines and also
    returned to the caller.

    Args:
        args: Namespace providing ``out_infer_file``. ``args.infer_file`` is
            overwritten for every transcript processed.

    Returns:
        A ``(fileList, resultData)`` tuple of parallel lists: the file stems
        and the stringified first prediction for each of them.
    """
    model = load_model(device)
    tokenizer = ElectraTokenizer.from_pretrained(
        "monologg/koelectra-base-v3-discriminator", do_lower_case=False)

    # Start from a fresh output file.
    if os.path.exists(args.out_infer_file):
        os.remove(args.out_infer_file)
    out_infer_folder = os.path.dirname(args.out_infer_file)
    # dirname is '' when the output file lives in the current directory;
    # os.makedirs('') would raise, so only create a real directory.
    if out_infer_folder:
        os.makedirs(out_infer_folder, exist_ok=True)

    resultData = []
    fileList = []
    asr_dir = os.path.dirname(os.path.realpath(__file__)) + '/tmp_ASR'
    files = sorted(os.listdir(asr_dir))

    # The context manager guarantees the results are flushed and the handle
    # is closed even if inference fails part-way through.
    with open(args.out_infer_file, "a") as wf:
        for filename in tqdm(files):
            stem, ext = os.path.splitext(filename)
            if ext.lower() != '.txt':
                continue
            # Normalise Windows separators so the path always uses '/'.
            args.infer_file = (asr_dir + '/' + filename).replace('\\', '/')
            test_dataset = load_and_cache_examples(args, tokenizer, mode="infer")
            preds = evaluate(args, model, test_dataset)
            prediction = str(preds[0])
            print(stem + ':' + prediction, file=wf)
            fileList.append(stem)
            resultData.append(prediction)
    return fileList, resultData
def stack_base(args, processor, tokenizer, model, stack_train_examples, stack_dev_examples):
    """Train and then validate ``model`` on one pair of stacking example sets.

    Both example sets are converted with ``load_and_cache_examples`` in
    ``'stack'`` mode (train first, then dev). The training loss and the
    evaluation result are only logged; nothing is returned.
    """
    train_set, dev_set = [
        load_and_cache_examples(args, processor, tokenizer, mode='stack', examples=examples)
        for examples in (stack_train_examples, stack_dev_examples)
    ]

    loss = train(args, model, processor, tokenizer, train_set)
    logging.info("stack 训练结束:loss {}".format(loss))

    dev_outcome = evaluate(args, model, dev_set)
    logging.info("stack 验证结束:loss {}".format(dev_outcome))
def main(cli_args):
    """Train and/or evaluate a sequence-classification model from a JSON config.

    The config file at ``<config_dir>/<task>/<config_file>`` is loaded into an
    ``AttrDict``. Depending on ``args.do_train`` / ``args.do_eval`` the model
    is trained and the checkpoints found under ``args.output_dir`` are
    evaluated on the test dataset; evaluation metrics are written to
    ``<output_dir>/eval_results.txt``.

    Args:
        cli_args: Namespace providing ``config_dir``, ``task`` and
            ``config_file``.
    """
    # Read from config file and make args
    config_path = os.path.join(cli_args.config_dir, cli_args.task, cli_args.config_file)
    with open(config_path) as f:
        args = AttrDict(json.load(f))
    # NOTE(review): this message is logged before init_logger() runs - confirm
    # it is not dropped by the not-yet-configured logger.
    logger.info("Training/evaluation parameters {}".format(args))

    args.output_dir = os.path.join(args.ckpt_dir, args.output_dir)

    init_logger()
    set_seed(args)

    processor = processors[args.task](args)
    labels = processor.get_labels()
    if output_modes[args.task] == "regression":
        config = CONFIG_CLASSES[args.model_type].from_pretrained(
            args.model_name_or_path,
            num_labels=tasks_num_labels[args.task])
    else:
        config = CONFIG_CLASSES[args.model_type].from_pretrained(
            args.model_name_or_path,
            num_labels=tasks_num_labels[args.task],
            id2label={str(i): label for i, label in enumerate(labels)},
            label2id={label: i for i, label in enumerate(labels)},
        )
    tokenizer = TOKENIZER_CLASSES[args.model_type].from_pretrained(
        args.model_name_or_path, do_lower_case=args.do_lower_case)
    model = MODEL_FOR_SEQUENCE_CLASSIFICATION[args.model_type].from_pretrained(
        args.model_name_or_path, config=config)

    # GPU or CPU
    args.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
    model.to(args.device)

    # Load dataset (each split is optional and stays None when not configured)
    train_dataset = load_and_cache_examples(
        args, tokenizer, mode="train") if args.train_file else None
    dev_dataset = load_and_cache_examples(
        args, tokenizer, mode="dev") if args.dev_file else None
    test_dataset = load_and_cache_examples(
        args, tokenizer, mode="test") if args.test_file else None

    if dev_dataset is None:
        # If there is no dev dataset, only use testset
        args.evaluate_test_during_training = True

    if args.do_train:
        global_step, tr_loss = train(args, model, train_dataset, dev_dataset, test_dataset)
        logger.info(" global_step = {}, average loss = {}".format(global_step, tr_loss))

    results = {}
    if args.do_eval:
        # Every directory under output_dir that contains a saved model,
        # in lexicographic path order.
        checkpoints = [
            os.path.dirname(c) for c in sorted(
                glob.glob(args.output_dir + "/**/" + "pytorch_model.bin", recursive=True))
        ]
        if not args.eval_all_checkpoints:
            checkpoints = checkpoints[-1:]
        else:
            logging.getLogger("transformers.configuration_utils").setLevel(
                logging.WARN)  # Reduce logging
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # The text after the last '-' in the directory path is used as the step label.
            global_step = checkpoint.split("-")[-1]
            model = MODEL_FOR_SEQUENCE_CLASSIFICATION[args.model_type].from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, test_dataset, mode="test", global_step=global_step)
            # Suffix every metric with its step so checkpoints do not overwrite each other.
            result = dict(
                (k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as f_w:
            for key in sorted(results.keys()):
                f_w.write("{} = {}\n".format(key, str(results[key])))
def evaluate(args, model, tokenizer, device, data_type, prefix=""):
    """Run question-answering inference on one data split and optionally score it.

    Predictions are written to ``predictions_<prefix>.json`` and
    ``nbest_predictions_<prefix>.json`` inside
    ``<args.output_dir>/<args.save_model_name>``.

    Args:
        args: Run configuration (batch size, output paths, decoding options).
        model: Model to evaluate; wrapped in ``DataParallel`` when
            ``args.n_gpu > 1``.
        tokenizer: Tokenizer passed to dataset loading and prediction decoding.
        device: Device the batches are moved to.
        data_type: Split name handed to ``load_and_cache_examples``; ``'test'``
            skips scoring.
        prefix: Tag used for cache lookup, log messages and output file names.

    Returns:
        ``None`` when ``data_type == 'test'``; otherwise the ``(F1, EM)`` pair
        computed by ``Eval.evaluate`` against ``args.dev_file``.
    """
    dataset, examples, features = load_and_cache_examples(
        args, tokenizer, data_type=data_type, output_examples=True, prefix=prefix)

    output_dir = os.path.join(args.output_dir, args.save_model_name)
    if not os.path.exists(output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(output_dir)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    # Nothing inside the loop changes the mode, so switch to eval once.
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            # Slice row i out of every output tensor for this example.
            output = [to_list(tensor[i]) for tensor in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    # max(..., 1) keeps the per-example figure from raising on an empty dataset.
    logger.info(" Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / max(len(dataset), 1))

    # Compute predictions
    output_prediction_file = os.path.join(output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        False,
        args.null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores (skipped for the test split).
    if data_type == 'test':
        return

    dev_dir = os.path.join(args.data_dir, args.dev_file)
    # Context managers close both JSON files instead of leaking the handles.
    with open(dev_dir, 'r') as dev_f:
        dev = json.load(dev_f)
    with open(output_prediction_file) as pred_f:
        prediction = json.load(pred_f)
    F1, EM, TOTAL, SKIP = Eval.evaluate(dev, prediction)
    return F1, EM
def train(args):
    """Fine-tune the question-answering model, optionally interleaving MLM steps.

    After every epoch the current weights are saved to
    ``<output_dir>/current_model`` and a ``status.json`` file is updated.
    When ``args.do_eval`` is set the model is also scored on the dev split and
    the best-F1 weights are saved to ``<output_dir>/best_model``.

    Args:
        args: Run configuration. Reads, among others, ``output_dir``,
            ``save_model_name``, ``train_file``, ``dev_file``, ``use_tpu``,
            ``do_finetune``, ``origin_model``, ``mlm``, ``acp``, ``do_eval``
            and the optimizer/scheduler hyper-parameters.
    """
    output_dir = os.path.join(args.output_dir, args.save_model_name)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Mirror log records into a timestamped file inside the run directory.
    logfilename = (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                   + " " + args.save_model_name + ".log.txt")
    fh = logging.FileHandler(os.path.join(output_dir, logfilename), mode='a', encoding='utf-8')
    fh.setLevel(logging.INFO)
    logger.addHandler(fh)

    model_dir = os.path.join("model", 'chinese_roberta_wwm_large_ext_pytorch')
    tokenizer = BertTokenizer.from_pretrained(model_dir)

    train_dataset = load_and_cache_examples(
        args, tokenizer, data_type='train', output_examples=False,
        prefix=args.train_file.split('.')[0])
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    # setup device
    if args.use_tpu:  # Colab TPU is not better than GPU
        import torch_xla
        import torch_xla.core.xla_model as xm
        import torch_xla.debug.metrics as met
        import torch_xla.distributed.parallel_loader as pl
        import torch_xla.distributed.xla_multiprocessing as xmp
        import torch_xla.utils.utils as xu
        device = xm.xla_device()
    else:
        device = torch.device('cuda:0')

    # model
    if args.do_finetune:
        # Resume from the weights and bookkeeping of a previous run.
        status_dir = os.path.join(output_dir, "status.json")
        with open(status_dir, 'r') as status_f:
            status = json.load(status_f)
        current_model = os.path.join(output_dir, "current_model")
        model = BertForQuestionAnsweringWithMaskedLM.from_pretrained(current_model)
    else:
        origin_dir = os.path.join(args.output_dir, args.origin_model)
        model = BertForQuestionAnsweringWithMaskedLM.from_pretrained(origin_dir)
        status = {}
        status['best_epoch'] = 0
        status['best_EM'] = 0.0
        status['best_F1'] = 0.0
        status['current_epoch'] = 0

    model.to(device)
    # NOTE(review): amp.initialize is called here and again below with the
    # optimizer; Apex documents that it should be called only once - confirm
    # whether this first call can be removed.
    model = amp.initialize(model, opt_level="O1")

    # Prepare optimizer and schedule (linear warmup and decay)
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Parameters whose name contains one of these substrings get no weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    tr_loss = 0.0
    model.zero_grad()
    epochs_trained = 0
    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])

    # Train!
    # Randomly interleave MLM and MRC steps while keeping a fixed 2:1 ratio:
    # a 1 selects the masked-LM objective, a 0 the reading-comprehension one.
    if args.mlm:
        task_split = torch.cat((torch.ones(2 * len(train_dataloader)), torch.zeros(len(train_dataloader))))
        task_split = task_split[torch.randperm(task_split.size(0))]

    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        tr_loss = 0
        for step, batch in enumerate(epoch_iterator):
            model.train()
            # The schedule covers three epochs' worth of steps and is reused cyclically.
            if args.mlm and task_split[(epoch % 3) * len(train_dataloader) + step] == 1:
                input_ids, masked_lm_labels = mask_tokens(batch[0], tokenizer, args)
                masked_lm_labels = masked_lm_labels.to(device)
                input_ids = input_ids.to(device)
                batch = tuple(t.to(device) for t in batch)
                inputs = {
                    "input_ids": input_ids,
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    "masked_lm_labels": masked_lm_labels,
                }
            else:
                if args.acp:
                    answer_content_labels = make_answer_content(batch[0], batch[3], batch[4])
                    answer_content_labels = answer_content_labels.to(device)
                else:
                    answer_content_labels = None
                batch = tuple(t.to(device) for t in batch)
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    "start_positions": batch[3],
                    "end_positions": batch[4],
                    "answer_content_labels": answer_content_labels,
                }

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()

            # `step` is zero-based, so the last step of an epoch is len - 1;
            # comparing `step` itself to len would never match.
            if (step + 1) % args.check_loss_step == 0 or step + 1 == len(train_dataloader):
                avg_loss = tr_loss / (step + 1)
                logger.info("\t average_step_loss=%s @ step = %s on epoch = %s",
                            str(avg_loss), str(step + 1), str(epoch + 1))

        if args.do_eval:
            F1, EM = evaluate(args, model, tokenizer, device, data_type='dev',
                              prefix=args.dev_file.split('.')[0])
            logger.info("Dev F1 = %s, EM = %s on epoch %s", str(F1), str(EM), str(epoch + 1))

            # save the best model (F1/EM only exist when evaluation ran)
            if F1 > status['best_F1']:
                status['best_F1'] = F1
                status['best_EM'] = EM
                status['best_epoch'] = epoch
                # Only save the model it-self
                model_to_save = model.module if hasattr(model, 'module') else model
                best_model_dir = os.path.join(output_dir, "best_model")
                os.makedirs(best_model_dir, exist_ok=True)
                model_to_save.save_pretrained(best_model_dir)
                logger.info("best epoch %d has been saved to %s", epoch, best_model_dir)

        # save current model
        status['current_epoch'] = epoch
        # Only save the model it-self
        model_to_save = model.module if hasattr(model, 'module') else model
        current_model_dir = os.path.join(output_dir, "current_model")
        os.makedirs(current_model_dir, exist_ok=True)
        model_to_save.save_pretrained(current_model_dir)
        logger.info("epoch %d has been saved to %s", epoch, current_model_dir)

        # save status; the context manager flushes and closes the file
        status_dir = os.path.join(output_dir, "status.json")
        with open(status_dir, 'w', encoding='utf8') as status_f:
            json.dump(status, status_f)
# NOTE(review): the statements down to `return labels` are the tail of a
# function whose `def` line is not visible in this chunk.
    print('ENSEMBLE_DIR_LIST:{}'.format(ensemble_dir_list))
    # Strip surrounding whitespace from every configured model path.
    model_path_list = [x.strip() for x in ensemble_dir_list]
    print('model_path_list:{}'.format(model_path_list))

    # device = torch.device(f'cuda:{GPU_IDS[0]}')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # The incoming model is wrapped together with the listed model paths.
    model = EnsembleModel(model=model, model_path_list=model_path_list, device=device, lamb=lamb)
    labels = base_predict(test_dataset, model, id2label, ensemble=True, vote=True)
    return labels


# Script entry: run single-model, ensemble and raw-text prediction and print each result.
if __name__ == '__main__':
    args = Args().get_parser()
    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    processor = newsProcessor()
    # Despite its name, label2id holds whatever get_labels() returns;
    # id2label is built by enumerating it.
    args.label2id = processor.get_labels()
    args.id2label = {i: label for i, label in enumerate(args.label2id)}
    model, tokenizer = create_model(args)
    test_dataset = load_and_cache_examples(args, processor, tokenizer, mode='test')

    # Predictions from the single model.
    labels_list = single_predict(test_dataset, model, args.id2label)
    print(labels_list)

    # Predictions from the ensemble.
    labels_list = ensemble_predict(test_dataset, model, args.id2label)
    print(labels_list)

    # Prediction for one hard-coded sample text.
    text = ["对于我国的科技巨头华为而言,2019年注定是不平凡的一年,由于在5G领域遥遥领先于其他国家,华为遭到了不少方面的觊觎,并因此承受了太多不公平地对待,在零部件供应、核心技术研发、以及市场等多个领域受到了有意打压。但是华为并没有因此而一蹶不振,而是亮出了自己的一张又一张“底牌”,随着麒麟处理器、海思半导体以及鸿蒙操作系统的闪亮登场,华为也向世界证明了自己的实力,上演了一场几乎完美的绝地反击。"]
    label_list = text_predict(text, model, tokenizer, args.id2label)
    print(label_list)
# Script body: configure logging, build the model, then run the stages
# selected by args.do_train / args.do_eval / args.do_stack.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)

args = Args().get_parser()
args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
processor = newsProcessor()
# Despite its name, label2id holds whatever get_labels() returns;
# id2label is built by enumerating it.
args.label2id = processor.get_labels()
args.id2label = {i: label for i, label in enumerate(args.label2id)}
# Keep outputs of different BERT variants in separate sub-directories.
args.output_dir = os.path.join(args.output_dir, args.bert_type)

model, tokenizer = create_model(args)
model.to(args.device)

if args.do_train:
    train_dataset = load_and_cache_examples(args, processor, tokenizer, mode="train")
    train_loss = train(args, model, processor, tokenizer, train_dataset)
    logging.info("训练结束:loss {}".format(train_loss))

if args.do_eval:
    eval_dataset = load_and_cache_examples(args, processor, tokenizer, mode="dev")
    # Named eval_result rather than `eval` so the builtin is not shadowed.
    eval_result = evaluate(args, model, eval_dataset)
    logging.info("验证结束:{}".format(eval_result))

if args.do_stack:
    stacking(args, processor, tokenizer, model)