def run_test(args):
    from pybert.io.task_data import TaskData
    from pybert.test.predictor import Predictor
    data = TaskData()
    targets, sentences = data.read_data(raw_data_path=config['test_path'],
                                        preprocessor=EnglishPreProcessor(),
                                        is_train=False)
    lines = list(zip(sentences, targets))
    processor = BertProcessor(vocab_path=config['bert_vocab_path'],
                              do_lower_case=args.do_lower_case)
    label_list = processor.get_labels()
    id2label = {i: label for i, label in enumerate(label_list)}
    test_data = processor.get_test(lines=lines)
    test_examples = processor.create_examples(lines=test_data,
                                              example_type='test',
                                              cached_examples_file=config['data_dir'] / f"cached_test_examples_{args.arch}")
    test_features = processor.create_features(examples=test_examples,
                                              max_seq_len=args.eval_max_seq_len,
                                              cached_features_file=config['data_dir'] / "cached_test_features_{}_{}".format(args.eval_max_seq_len, args.arch))
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset,
                                 sampler=test_sampler,
                                 batch_size=args.train_batch_size)
    model = BertForMultiLable.from_pretrained(config['checkpoint_dir'], num_labels=len(label_list))

    # ----------- predicting
    logger.info('model predicting....')
    predictor = Predictor(model=model, logger=logger, n_gpu=args.n_gpu)
    result = predictor.predict(data=test_dataloader)
    print(result)
def main():
    # **************************** setup ***********************
    logger = init_logger(log_name=config['model']['arch'],
                         log_dir=config['output']['log_dir'])
    logger.info(f"seed is {config['train']['seed']}")
    device = 'cuda:%d' % config['train']['n_gpu'][0] if len(config['train']['n_gpu']) else 'cpu'
    seed_everything(seed=config['train']['seed'], device=device)
    logger.info('starting load data from disk')
    id2label = {value: key for key, value in config['label2id'].items()}

    # **************************** data generation ***********************
    DT = DataTransformer(logger=logger, seed=config['train']['seed'])
    # read the test set
    targets, sentences = DT.read_data(raw_data_path=config['data']['test_file_path'],
                                      preprocessor=EnglishPreProcessor(),
                                      is_train=False)
    tokenizer = BertTokenizer(vocab_file=config['pretrained']['bert']['vocab_path'],
                              do_lower_case=config['train']['do_lower_case'])
    # test dataset
    test_dataset = CreateDataset(data=list(zip(sentences, targets)),
                                 tokenizer=tokenizer,
                                 max_seq_len=config['train']['max_seq_len'],
                                 seed=config['train']['seed'],
                                 example_type='test')
    # test loader
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=config['train']['batch_size'],
                             num_workers=config['train']['num_workers'],
                             shuffle=False,
                             drop_last=False,
                             pin_memory=False)

    # **************************** model ***********************
    logger.info("initializing model")
    model = BertFine.from_pretrained(config['pretrained']['bert']['bert_model_dir'],
                                     cache_dir=config['output']['cache_dir'],
                                     num_classes=len(id2label))

    # **************************** predicting ***********************
    logger.info('model predicting....')
    predicter = Predicter(model=model,
                          logger=logger,
                          n_gpu=config['train']['n_gpu'],
                          model_path=config['output']['checkpoint_dir'] / f"best_{config['model']['arch']}_model.pth")
    # run prediction
    result = predicter.predict(data=test_loader)
    print(result)

    # release GPU memory
    if len(config['train']['n_gpu']) > 0:
        torch.cuda.empty_cache()
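# Minimal sketch of the id2label inversion used above, with a made-up label2id mapping:
#   config['label2id'] = {'toxic': 0, 'obscene': 1, 'insult': 2}
#   id2label           = {0: 'toxic', 1: 'obscene', 2: 'insult'}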
def main():
    parser = ArgumentParser()
    parser.add_argument("--arch", default='bert', type=str)
    parser.add_argument("--do_data", action='store_true')
    parser.add_argument("--do_train", action='store_true')
    parser.add_argument("--do_test", action='store_true')
    parser.add_argument("--save_best", action='store_true')
    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument('--data_name', default='kaggle', type=str)
    parser.add_argument("--epochs", default=6, type=int)
    parser.add_argument("--resume_path", default='', type=str)
    parser.add_argument("--mode", default='min', type=str)
    parser.add_argument("--monitor", default='valid_loss', type=str)
    parser.add_argument("--valid_size", default=0.2, type=float)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--sorted", default=1, type=int, help='1 : True  0 : False')
    parser.add_argument("--n_gpu", type=str, default='0', help='"0,1,.." or "0" or "" ')
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument("--train_batch_size", default=8, type=int)
    parser.add_argument('--eval_batch_size', default=8, type=int)
    parser.add_argument("--train_max_seq_len", default=256, type=int)
    parser.add_argument("--eval_max_seq_len", default=256, type=int)
    parser.add_argument('--loss_scale', type=float, default=0)
    parser.add_argument("--warmup_proportion", default=0.1, type=float)
    parser.add_argument("--weight_decay", default=0.01, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--grad_clip", default=1.0, type=float)
    parser.add_argument("--learning_rate", default=2e-5, type=float)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--fp16', action='store_true')
    parser.add_argument('--fp16_opt_level', type=str, default='O1')
    args = parser.parse_args()

    config['checkpoint_dir'] = config['checkpoint_dir'] / args.arch
    config['checkpoint_dir'].mkdir(exist_ok=True)
    # Good practice: save your training arguments together with the trained model
    torch.save(args, config['checkpoint_dir'] / 'training_args.bin')
    seed_everything(args.seed)
    init_logger(log_file=config['log_dir'] / f"{args.arch}.log")
    logger.info("Training/evaluation parameters %s", args)

    if args.do_data:
        from pybert.io.task_data import TaskData
        data = TaskData()
        targets, sentences = data.read_data(raw_data_path=config['raw_data_path'],
                                            preprocessor=EnglishPreProcessor(),
                                            is_train=True)
        data.train_val_split(X=sentences, y=targets,
                             shuffle=True, stratify=False,
                             valid_size=args.valid_size,
                             data_dir=config['data_dir'],
                             data_name=args.data_name)

    if args.do_train:
        run_train(args)

    if args.do_test:
        run_test(args)
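# Illustrative CLI usage (the script name run_bert.py is an assumption; the flag
# combinations below are examples only):
#   python run_bert.py --do_data                      # build the train/valid split from the raw data
#   python run_bert.py --do_train --save_best         # fine-tune and keep the best checkpoint
#   python run_bert.py --do_test --do_lower_case      # run prediction with the saved checkpoint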
def main():
    # **************************** log initial info ***********************
    logger = init_logger(log_name=config['model']['arch'],
                         log_dir=config['output']['log_dir'])
    logger.info(f"seed is {config['train']['seed']}")
    device = 'cuda:%d' % config['train']['n_gpu'][0] if len(config['train']['n_gpu']) else 'cpu'
    seed_everything(seed=config['train']['seed'], device=device)
    logger.info('starting load data from disk')
    id2label = {value: key for key, value in config['label2id'].items()}

    # **************************** data ***********************
    DT = DataTransformer(logger=logger, seed=config['train']['seed'])
    targets, sentences = DT.read_data(raw_data_path=config['data']['raw_data_path'],
                                      preprocessor=EnglishPreProcessor(),
                                      is_train=True)
    train, valid = DT.train_val_split(X=sentences, y=targets,
                                      save=True, shuffle=True, stratify=False,
                                      valid_size=config['train']['valid_size'],
                                      train_path=config['data']['train_file_path'],
                                      valid_path=config['data']['valid_file_path'])
    tokenizer = BertTokenizer(vocab_file=config['pretrained']['bert']['vocab_path'],
                              do_lower_case=config['train']['do_lower_case'])
    # train dataset
    train_dataset = CreateDataset(data=train,
                                  tokenizer=tokenizer,
                                  max_seq_len=config['train']['max_seq_len'],
                                  seed=config['train']['seed'],
                                  example_type='train')
    # valid dataset
    valid_dataset = CreateDataset(data=valid,
                                  tokenizer=tokenizer,
                                  max_seq_len=config['train']['max_seq_len'],
                                  seed=config['train']['seed'],
                                  example_type='valid')
    # train loader
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=config['train']['batch_size'],
                              num_workers=config['train']['num_workers'],
                              shuffle=True,
                              drop_last=False,
                              pin_memory=False)
    # validation set loader
    valid_loader = DataLoader(dataset=valid_dataset,
                              batch_size=config['train']['batch_size'],
                              num_workers=config['train']['num_workers'],
                              shuffle=False,
                              drop_last=False,
                              pin_memory=False)

    # **************************** initialize model ***********************
    logger.info("initializing model")
    model = BertFine.from_pretrained(config['pretrained']['bert']['bert_model_dir'],
                                     cache_dir=config['output']['cache_dir'],
                                     num_classes=len(id2label))

    # ************************** set params *************************
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    num_train_steps = int(len(train_dataset.examples)
                          / config['train']['batch_size']
                          / config['train']['gradient_accumulation_steps']
                          * config['train']['epochs'])
    # t_total: total number of training steps for the learning rate schedule
    # warmup: portion of t_total used for the warmup
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config['train']['learning_rate'],
                         warmup=config['train']['warmup_proportion'],
                         t_total=num_train_steps)

    # **************************** callbacks ***********************
    logger.info("initializing callbacks")
    # model checkpoint
    model_checkpoint = ModelCheckpoint(checkpoint_dir=config['output']['checkpoint_dir'],
                                       mode=config['callbacks']['mode'],
                                       monitor=config['callbacks']['monitor'],
                                       save_best_only=config['callbacks']['save_best_only'],
                                       arch=config['model']['arch'],
                                       logger=logger)
    # training monitor
    train_monitor = TrainingMonitor(file_dir=config['output']['figure_dir'],
                                    arch=config['model']['arch'])
    # learning rate scheduler
    lr_scheduler = BertLR(optimizer=optimizer,
                          learning_rate=config['train']['learning_rate'],
                          t_total=num_train_steps,
                          warmup=config['train']['warmup_proportion'])

    # **************************** training model ***********************
    logger.info('training model....')
    train_configs = {
        'model': model,
        'logger': logger,
        'optimizer': optimizer,
        'resume': config['train']['resume'],
        'epochs': config['train']['epochs'],
        'n_gpu': config['train']['n_gpu'],
        'gradient_accumulation_steps': config['train']['gradient_accumulation_steps'],
        'epoch_metrics': [F1Score(average='micro', task_type='binary')],
        'batch_metrics': [AccuracyThresh(thresh=0.5)],
        'criterion': BCEWithLogLoss(),
        'model_checkpoint': model_checkpoint,
        'training_monitor': train_monitor,
        'lr_scheduler': lr_scheduler,
        'early_stopping': None,
        'verbose': 1
    }
    trainer = Trainer(train_configs=train_configs)
    trainer.train(train_data=train_loader, valid_data=valid_loader)
    # release GPU memory
    if len(config['train']['n_gpu']) > 0:
        torch.cuda.empty_cache()
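# Worked example of the schedule length computed above (all numbers made up for illustration):
# with 16,000 training examples, batch_size=8, gradient_accumulation_steps=1 and epochs=6,
# num_train_steps = int(16000 / 8 / 1 * 6) = 12,000; with warmup_proportion=0.1 the learning
# rate warms up over the first 1,200 optimizer steps and decays afterwards.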
def run_test(args, test=False, k=7, med_map='pybert/dataset/med_map.csv'):
    from pybert.io.task_data import TaskData
    from pybert.test.predictor import Predictor
    data = TaskData()
    targets, sentences = data.read_data(raw_data_path=config['test_path'],
                                        preprocessor=EnglishPreProcessor(),
                                        is_train=test)
    print(f'-----------------------------------------\n'
          f'targets {targets}\n'
          f'---------------------------------------------------')
    lines = list(zip(sentences, targets))
    processor = BertProcessor(vocab_path=config['bert_vocab_path'],
                              do_lower_case=args.do_lower_case)
    label_list = processor.get_labels()
    id2label = {i: label for i, label in enumerate(label_list)}
    test_data = processor.get_test(lines=lines)
    test_examples = processor.create_examples(lines=test_data,
                                              example_type='test',
                                              cached_examples_file=config['data_dir'] / f"cached_test_examples_{args.arch}")
    test_features = processor.create_features(examples=test_examples,
                                              max_seq_len=args.eval_max_seq_len,
                                              cached_features_file=config['data_dir'] / "cached_test_features_{}_{}".format(args.eval_max_seq_len, args.arch))
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset,
                                 sampler=test_sampler,
                                 batch_size=args.train_batch_size)
    model = BertForMultiLable.from_pretrained(config['checkpoint_dir'])

    # ----------- predicting
    logger.info('model predicting....')
    predictor = Predictor(model=model, logger=logger, n_gpu=args.n_gpu, test=test)
    if test:
        # evaluation mode: score the predictions against the known targets
        results, targets = predictor.predict(data=test_dataloader)
        # print(f'results {results.shape}')
        # print(f'targets {targets.shape}')
        result = dict()
        metrics = [Recall(), Acc()]
        for metric in metrics:
            metric.reset()
            metric(logits=results, target=targets)
            value = metric.value()
            if value is not None:
                result[f'valid_{metric.name()}'] = value
        return result
    else:
        # inference mode: return the top-k labels per example, mapped through med_map
        results = predictor.predict(data=test_dataloader)
        pred = np.argsort(results)[:, -k:][:, ::-1]
        with open(med_map, mode='r') as infile:
            reader = csv.reader(infile)
            med_dict = {int(rows[0]): rows[1] for rows in reader}
        pred = np.vectorize(med_dict.get)(pred)
        return pred
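# Illustrative calls (assumes `args` comes from the ArgumentParser in main(); the return
# shapes follow directly from run_test itself):
#   metrics = run_test(args, test=True)    # dict of scores, e.g. {'valid_recall': ..., 'valid_acc': ...}
#   top_k   = run_test(args, test=False)   # (n_examples, k) array of label names mapped via med_map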
def main():
    # **************************** log ***********************
    logger = init_logger(log_name=config['model']['arch'],
                         log_dir=config['output']['log_dir'])
    logger.info(f"seed is {config['train']['seed']}")
    device = 'cuda:%d' % config['train']['n_gpu'][0] if len(config['train']['n_gpu']) else 'cpu'
    seed_everything(seed=config['train']['seed'], device=device)
    logger.info('starting load data from disk')
    id2label = {value: key for key, value in config['label2id'].items()}

    # **************************** data input ***********************
    DT = DataTransformer(logger=logger, seed=config['train']['seed'])
    # read test data
    targets, sentences = DT.read_data(raw_data_path=config['data']['test_file_path'],
                                      preprocessor=EnglishPreProcessor(),
                                      is_train=False)
    tokenizer = BertTokenizer(vocab_file=config['pretrained']['bert']['vocab_path'],
                              do_lower_case=config['train']['do_lower_case'])
    # prepare test dataset
    test_dataset = CreateDataset(data=list(zip(sentences, targets)),
                                 tokenizer=tokenizer,
                                 max_seq_len=config['train']['max_seq_len'],
                                 seed=config['train']['seed'],
                                 example_type='test')
    # pytorch dataloader
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=config['train']['batch_size'],
                             num_workers=config['train']['num_workers'],
                             shuffle=False,
                             drop_last=False,
                             pin_memory=False)

    # **************************** start model ***********************
    logger.info("initializing model")
    model = BertFine.from_pretrained(config['pretrained']['bert']['bert_model_dir'],
                                     cache_dir=config['output']['cache_dir'],
                                     num_classes=len(id2label))

    # **************************** predicting and scoring ***********************
    logger.info('model predicting....')
    predicter = Predicter(model=model,
                          logger=logger,
                          n_gpu=config['train']['n_gpu'],
                          model_path=config['output']['checkpoint_dir'] / f"best_{config['model']['arch']}_model.pth")
    # predict probabilities, then binarize at 0.5
    result = predicter.predict(data=test_loader)
    result = np.where(result > 0.5, 1, 0)
    print('accuracy score', accuracy_score(targets, result))
    # multi-label targets need an explicit averaging strategy for F1
    print('\nF1 score', f1_score(targets, result, average='micro'))
    print('\nclassification report', classification_report(targets, result))

    # empty cache after testing
    if len(config['train']['n_gpu']) > 0:
        torch.cuda.empty_cache()
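# Standalone sketch (not part of the pipeline above): how the 0.5 threshold and the
# micro-averaged F1 used in main() behave on toy data. All values are made up.
def _toy_metrics_demo():
    import numpy as np
    from sklearn.metrics import accuracy_score, f1_score
    probs = np.array([[0.9, 0.2, 0.7],
                      [0.1, 0.8, 0.4]])   # model outputs for 2 examples, 3 labels
    targets = np.array([[1, 0, 1],
                        [0, 1, 1]])
    preds = np.where(probs > 0.5, 1, 0)   # binarize at 0.5, as in main()
    # subset accuracy is 0.5 here: only the first row matches its target exactly
    print('accuracy:', accuracy_score(targets, preds))
    print('micro F1:', f1_score(targets, preds, average='micro'))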
def main():
    # **************************** setup ***********************
    logger = init_logger(log_name=config['model']['arch'],
                         log_dir=config['output']['log_dir'])
    logger.info(f"seed is {config['train']['seed']}")
    device = 'cuda:%d' % config['train']['n_gpu'][0] if len(config['train']['n_gpu']) else 'cpu'
    seed_everything(seed=config['train']['seed'], device=device)
    logger.info('starting load data from disk')
    id2label = {value: key for key, value in config['label2id'].items()}

    DT = DataTransformer(logger=logger, seed=config['train']['seed'])
    targets, sentences, ids = DT.read_data(raw_data_path=config['data']['test_file_path'],
                                           preprocessor=EnglishPreProcessor(),
                                           is_train=False)
    tokenizer = BertTokenizer(vocab_file=config['pretrained']['bert']['vocab_path'],
                              do_lower_case=config['train']['do_lower_case'])
    # test dataset
    test_dataset = CreateDataset(data=list(zip(sentences, targets)),
                                 tokenizer=tokenizer,
                                 max_seq_len=config['train']['max_seq_len'],
                                 seed=config['train']['seed'],
                                 example_type='test')
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=config['train']['batch_size'],
                             num_workers=config['train']['num_workers'],
                             shuffle=False,
                             drop_last=False,
                             pin_memory=False)

    # **************************** load pretrained model from cache ***********************
    logger.info("initializing model")
    model = BertFine.from_pretrained(config['pretrained']['bert']['bert_model_dir'],
                                     cache_dir=config['output']['cache_dir'],
                                     num_classes=len(id2label))

    # **************************** inference ***********************
    logger.info('model predicting....')
    predicter = Predicter(model=model,
                          logger=logger,
                          n_gpu=config['train']['n_gpu'],
                          model_path=config['output']['checkpoint_dir'] / f"best_{config['model']['arch']}_model.pth")
    # predict and write one tab-separated line per example: id, sentence, score
    result = predicter.predict(data=test_loader)
    with open(config['output']['inference_output_dir'], 'w') as f:
        for index, line, score in zip(ids, sentences, result):
            f.write(str(index) + '\t' + line + '\t' + str(score[0]) + '\n')

    if len(config['train']['n_gpu']) > 0:
        torch.cuda.empty_cache()
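# Each line of the inference output written above is tab-separated as "<id>\t<sentence>\t<score>";
# a made-up example line: 42	this is an example sentence	0.9731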
def main():
    # **************************** SETUP/READ FROM CONFIG ***********************
    logger = init_logger(log_name=config['model']['arch'],
                         log_dir=config['output']['log_dir'])
    logger.info(f"seed is {config['train']['seed']}")
    device = 'cuda:%d' % config['train']['n_gpu'][0] if len(config['train']['n_gpu']) else 'cpu'
    seed_everything(seed=config['train']['seed'], device=device)
    logger.info('starting load data from disk')
    id2label = {value: key for key, value in config['label2id'].items()}

    # **************************** PREPROCESSING ***********************
    DT = DataTransformer(logger=logger, seed=config['train']['seed'])
    targets, sentences = DT.read_data(raw_data_path=config['data']['test_file_path'],
                                      preprocessor=EnglishPreProcessor(),
                                      is_train=False)
    tokenizer = BertTokenizer(vocab_file=config['pretrained']['bert']['vocab_path'],
                              do_lower_case=config['train']['do_lower_case'])

    # **************************** TOKENIZING *********************************
    test_dataset = CreateDataset(data=list(zip(sentences, targets)),
                                 tokenizer=tokenizer,
                                 max_seq_len=config['train']['max_seq_len'],
                                 seed=config['train']['seed'],
                                 example_type='test')

    # *************************** DATALOADER ******************************
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=config['train']['batch_size'],
                             num_workers=config['train']['num_workers'],
                             shuffle=False,
                             drop_last=False,
                             pin_memory=False)

    # **************************** LOAD MODEL ***********************
    logger.info("initializing model")
    model = BertFine.from_pretrained(config['pretrained']['bert']['bert_model_dir'],
                                     cache_dir=config['output']['cache_dir'],
                                     num_classes=len(id2label))

    # **************************** RUNNING PREDICTIONS ***********************
    logger.info('model predicting....')
    predicter = Predicter(model=model,
                          logger=logger,
                          n_gpu=config['train']['n_gpu'],
                          model_path=config['output']['checkpoint_dir'] / f"best_{config['model']['arch']}_model.pth")

    # ************************* OUTPUT RESULTS TO CSV *************************
    result = predicter.predict(data=test_loader)
    print(result)
    df = pd.DataFrame(result)
    cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    df.columns = cols
    print(df.head())
    df.to_csv('pybert/output/result/result.csv')

    # ****************************** EMPTY GPU CACHE ******************************
    if len(config['train']['n_gpu']) > 0:
        torch.cuda.empty_cache()
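# For reference, the DataFrame written above has one score column per toxicity label; an
# illustrative (made-up) head() could look like:
#        toxic  severe_toxic  obscene  threat  insult  identity_hate
#   0     0.98          0.12     0.85    0.03    0.67           0.08
#   1     0.02          0.01     0.01    0.00    0.02           0.01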
                html.unescape(abstract)).lstrip(';\n\n')
            description = line[3]
            description = html2text.html2text(
                html.unescape(description)).lstrip(';\n\n')
            input = title + ';' + abstract + ';' + description
            labels = line[4:]
            # multi-hot encode the example's labels over the 645-way subclass vocabulary
            onehot_label = np.zeros(645)
            for label in labels:
                for i, subclass in enumerate(subclass_list):
                    if subclass == label:
                        onehot_label[i] = 1
                        break
            if preprocessor:
                input = preprocessor(input)
            if is_train:
                train_sentences.append(input)
                train_targets.append(onehot_label)
            else:
                val_sentences.append(input)
                val_targets.append(onehot_label)
        return train_targets, train_sentences, val_targets, val_sentences


if __name__ == "__main__":
    data = TaskData()
    train_targets, train_sentences, val_targets, val_sentences = data.read_data(
        config,
        raw_data_path="/Users/xiaohan/Desktop/bert_HMC/data/summary/summary_1574.csv",
        preprocessor=EnglishPreProcessor())