def run_test(args):
    from pybert.io.task_data import TaskData
    from pybert.test.predictor import Predictor
    data = TaskData()
    targets, sentences = data.read_data(raw_data_path=config['test_path'],
                                        preprocessor=EnglishPreProcessor(),
                                        is_train=False)
    lines = list(zip(sentences, targets))
    processor = BertProcessor(vocab_path=config['bert_vocab_path'], do_lower_case=args.do_lower_case)
    label_list = processor.get_labels()
    id2label = {i: label for i, label in enumerate(label_list)}
    test_data = processor.get_test(lines=lines)
    test_examples = processor.create_examples(lines=test_data,
                                              example_type='test',
                                              cached_examples_file=config['data_dir'] / f"cached_test_examples_{args.arch}")
    test_features = processor.create_features(examples=test_examples,
                                              max_seq_len=args.eval_max_seq_len,
                                              cached_features_file=config['data_dir'] / "cached_test_features_{}_{}".format(
                                                  args.eval_max_seq_len, args.arch))
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.train_batch_size)
    model = BertForMultiLable.from_pretrained(config['checkpoint_dir'], num_labels=len(label_list))
    # ----------- predicting
    logger.info('model predicting....')
    predictor = Predictor(model=model, logger=logger, n_gpu=args.n_gpu)
    result = predictor.predict(data=test_dataloader)
    print(result)
def run_test(args):
    from pybert.io.task_data import TaskData
    from pybert.test.predictor import Predictor
    data = TaskData()
    ids, targets, sentences = data.read_data(raw_data_path=config['test_path'],
                                             preprocessor=ChinesePreProcessor(),
                                             is_train=False)
    lines = list(zip(sentences, targets))
    # print(ids, sentences)
    processor = BertProcessor(vocab_path=config['bert_vocab_path'], do_lower_case=args.do_lower_case)
    label_list = processor.get_labels()
    id2label = {i: label for i, label in enumerate(label_list)}
    test_data = processor.get_test(lines=lines)
    test_examples = processor.create_examples(lines=test_data,
                                              example_type='test',
                                              cached_examples_file=config['data_dir'] / f"cached_test_examples_{args.arch}")
    test_features = processor.create_features(examples=test_examples,
                                              max_seq_len=args.eval_max_seq_len,
                                              cached_features_file=config['data_dir'] / "cached_test_features_{}_{}".format(
                                                  args.eval_max_seq_len, args.arch))
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.train_batch_size,
                                 collate_fn=collate_fn)
    model = BertForMultiLable.from_pretrained(config['checkpoint_dir'], num_labels=len(label_list))
    # ----------- predicting
    logger.info('model predicting....')
    predictor = Predictor(model=model, logger=logger, n_gpu=args.n_gpu)
    result = predictor.predict(data=test_dataloader)
    ids = np.array(ids)
    df1 = pd.DataFrame(ids, index=None)
    df2 = pd.DataFrame(result, index=None)
    all_df = pd.concat([df1, df2], axis=1)
    all_df.columns = ['id', 'sg', 'pj']
    all_df['sg'] = all_df['sg'].apply(lambda x: 1 if x > 0.5 else 0)
    all_df['pj'] = all_df['pj'].apply(lambda x: 1 if x > 0.5 else 0)
    # all_df['qs'] = all_df['qs'].apply(lambda x: 1 if x > 0.5 else 0)
    # all_df['tz'] = all_df['tz'].apply(lambda x: 1 if x > 0.5 else 0)
    # all_df['ggjc'] = all_df['ggjc'].apply(lambda x: 1 if x > 0.5 else 0)
    # all_df.columns = ['id', 'zy', 'gfgqzr', 'qs', 'tz', 'ggjc']
    # all_df['zy'] = all_df['zy'].apply(lambda x: 1 if x > 0.5 else 0)
    # all_df['gfgqzr'] = all_df['gfgqzr'].apply(lambda x: 1 if x > 0.5 else 0)
    # all_df['qs'] = all_df['qs'].apply(lambda x: 1 if x > 0.5 else 0)
    # all_df['tz'] = all_df['tz'].apply(lambda x: 1 if x > 0.5 else 0)
    # all_df['ggjc'] = all_df['ggjc'].apply(lambda x: 1 if x > 0.5 else 0)
    all_df.to_csv("/home/LAB/liqian/test/game/Fin/CCKS-Cls/test_output/cls_out.csv", index=False)
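# A vectorized sketch of the per-column thresholding above (assumes 'sg'/'pj'
# hold per-label sigmoid scores and the same 0.5 cutoff; demo_df is a
# hypothetical stand-in, not part of the original script):
import pandas as pd

demo_df = pd.DataFrame({'id': ['a', 'b'], 'sg': [0.7, 0.2], 'pj': [0.4, 0.9]})
demo_df[['sg', 'pj']] = (demo_df[['sg', 'pj']] > 0.5).astype(int)
print(demo_df)  # -> sg/pj binarized to {0, 1}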
def build_examples(file_path, max_seq_len, masked_lm_prob, max_predictions_per_seq, vocab_list):
    examples = []
    max_num_tokens = max_seq_len - 2
    with open(file_path, 'r') as f:
        lines = f.readlines()
    for line_cnt, line in enumerate(lines):
        if line_cnt % 50000 == 0:
            logger.info(f"Loading line {line_cnt}")
        example = {}
        guid = f'corpus-{line_cnt}'
        tokens_a = line.strip("\n").split(" ")[:max_num_tokens]
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0 for _ in range(len(tokens_a) + 2)]
        # skip samples that are too short
        if len(tokens_a) < 5:
            continue
        tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions(
            tokens, masked_lm_prob, max_predictions_per_seq, vocab_list)
        if line_cnt < 2:
            print("-------------------------Example-----------------------")
            print("guid: %s" % guid)
            print("tokens: %s" % " ".join([str(x) for x in tokens]))
            print("masked_lm_labels: %s" % " ".join([str(x) for x in masked_lm_labels]))
            print("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            print("masked_lm_positions: %s" % " ".join([str(x) for x in masked_lm_positions]))
        example['guid'] = guid
        example['tokens'] = tokens
        example['segment_ids'] = segment_ids
        example['masked_lm_positions'] = masked_lm_positions
        example['masked_lm_labels'] = masked_lm_labels
        examples.append(example)
    return examples
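# create_masked_lm_predictions is defined elsewhere in the repo; for reference,
# a minimal sketch of the conventional BERT 80/10/10 masking rule it implements
# (the repo's exact implementation may differ):
import random

def create_masked_lm_predictions_sketch(tokens, masked_lm_prob,
                                        max_predictions_per_seq, vocab_list):
    cand_idxs = [i for i, tok in enumerate(tokens) if tok not in ("[CLS]", "[SEP]")]
    random.shuffle(cand_idxs)
    num_to_mask = min(max_predictions_per_seq,
                      max(1, int(round(len(cand_idxs) * masked_lm_prob))))
    masked_positions = sorted(cand_idxs[:num_to_mask])
    output_tokens = list(tokens)
    masked_labels = []
    for pos in masked_positions:
        masked_labels.append(tokens[pos])  # the original token is the LM label
        r = random.random()
        if r < 0.8:
            output_tokens[pos] = "[MASK]"                    # 80%: replace with [MASK]
        elif r < 0.9:
            output_tokens[pos] = random.choice(vocab_list)   # 10%: random token
        # remaining 10%: keep the original token
    return output_tokens, masked_positions, masked_labels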
def main():
    parser = ArgumentParser()
    parser.add_argument("--arch", default='bert', type=str)
    parser.add_argument("--do_data", action='store_true')
    parser.add_argument("--do_train", action='store_true')
    parser.add_argument("--do_test", action='store_true')
    parser.add_argument("--save_best", action='store_true')
    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument('--data_name', default='train', type=str)
    parser.add_argument("--epochs", default=4, type=int)
    parser.add_argument("--resume_path", default='', type=str)
    parser.add_argument("--mode", default='max', type=str)
    parser.add_argument("--monitor", default='valid_f1', type=str)
    parser.add_argument("--valid_size", default=0.2, type=float)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--sorted", default=1, type=int, help='1 : True 0:False ')
    parser.add_argument("--n_gpu", type=str, default='0', help='"0,1,.." or "0" or "" ')
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument("--train_batch_size", default=8, type=int)
    parser.add_argument('--eval_batch_size', default=8, type=int)
    parser.add_argument("--train_max_seq_len", default=256, type=int)
    parser.add_argument("--eval_max_seq_len", default=256, type=int)
    parser.add_argument('--loss_scale', type=float, default=0)
    parser.add_argument("--warmup_proportion", default=0.1, type=float)
    parser.add_argument("--weight_decay", default=0.01, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--grad_clip", default=1.0, type=float)
    parser.add_argument("--learning_rate", default=2e-5, type=float)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--fp16', action='store_true')
    parser.add_argument('--fp16_opt_level', type=str, default='O1')
    args = parser.parse_args()
    config['checkpoint_dir'] = config['checkpoint_dir'] / args.arch
    config['checkpoint_dir'].mkdir(exist_ok=True)
    # Good practice: save your training arguments together with the trained model
    torch.save(args, config['checkpoint_dir'] / 'training_args.bin')
    seed_everything(args.seed)
    init_logger(log_file=config['log_dir'] / f"{args.arch}.log")
    logger.info("Training/evaluation parameters %s", args)
    if args.do_data:
        from pybert.io.task_data import TaskData
        processor = BertProcessor(vocab_path=config['bert_vocab_path'], do_lower_case=args.do_lower_case)
        label_list = processor.get_labels()
        label2id = {label: i for i, label in enumerate(label_list)}
        data = TaskData()
        targets, sentences = data.read_data(raw_data_path=config['raw_data_path'],
                                            preprocessor=None,
                                            is_train=True,
                                            label2id=label2id)
        data.train_val_split(X=sentences, y=targets, shuffle=True, stratify=targets,
                             valid_size=args.valid_size, data_dir=config['data_dir'],
                             data_name=args.data_name)
    if args.do_train:
        run_train(args)
    if args.do_test:
        run_test(args)
def run_test(args):
    from pybert.io.task_data import TaskData
    from pybert.test.predictor import Predictor
    import pickle
    import os
    processor = BertProcessor(vocab_path=config['bert_vocab_path'], do_lower_case=args.do_lower_case)
    label_list = processor.get_labels()
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}
    test_data = processor.get_train(config['data_dir'] / f"{args.data_name}.test.pkl")
    print("Test data is:")
    print(test_data)
    print("Label list is:")
    print(label_list)
    print("----------------------------------------")
    # test_data = processor.get_test(lines=lines)
    test_examples = processor.create_examples(lines=test_data,
                                              example_type='test',
                                              cached_examples_file=config['data_cache'] / f"cached_test_examples_{args.arch}")
    test_features = processor.create_features(examples=test_examples,
                                              max_seq_len=args.eval_max_seq_len,
                                              cached_features_file=config['data_cache'] / "cached_test_features_{}_{}".format(
                                                  args.eval_max_seq_len, args.arch))
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.train_batch_size)
    model = BertForMultiLable.from_pretrained(config['checkpoint_dir'], num_labels=len(label_list))
    # ----------- predicting
    logger.info('model predicting....')
    predictor = Predictor(model=model, logger=logger, n_gpu=args.n_gpu,
                          batch_metrics=[AccuracyThresh(thresh=0.5)],
                          epoch_metrics=[AUC(average='micro', task_type='binary'),
                                         MultiLabelReport(id2label=id2label)])
    result, test_predicted, test_true = predictor.predict(data=test_dataloader)
    pickle.dump(test_true, open(os.path.join(config["test/checkpoint_dir"], "test_true.p"), "wb"))
    pickle.dump(test_predicted, open(os.path.join(config["test/checkpoint_dir"], "test_predicted.p"), "wb"))
    pickle.dump(id2label, open(os.path.join(config["test/checkpoint_dir"], "id2label.p"), "wb"))
    print("Predictor results:")
    print(result)
    print("-----------------------------------------------")
def run_test(args):
    from pybert.io.task_data import TaskData
    from pybert.test.predictor import Predictor
    data = TaskData()
    # targets, sentences = data.read_data(raw_data_path=config['test_path'],
    #                                     preprocessor=EnglishPreProcessor(),
    #                                     is_train=False)
    _, _, targets, sentences = data.read_data(config, raw_data_path=config['test_path'], is_train=False)
    lines = list(zip(sentences, targets))
    # processor = BertProcessor(vocab_path=config['bert_vocab_path'], do_lower_case=args.do_lower_case)
    processor = BertProcessor()
    label_list = processor.get_labels()
    id2label = {i: label for i, label in enumerate(label_list)}
    test_data = processor.get_test(lines=lines)
    test_examples = processor.create_examples(lines=test_data,
                                              example_type='test',
                                              cached_examples_file=config['data_dir'] / f"cached_test_examples_{args.arch}")
    test_features = processor.create_features(examples=test_examples,
                                              max_seq_len=args.eval_max_seq_len,
                                              cached_features_file=config['data_dir'] / "cached_test_features_{}_{}".format(
                                                  args.eval_max_seq_len, args.arch))
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.train_batch_size,
                                 collate_fn=collate_fn)
    model = BertForMultiLable.from_pretrained(config['checkpoint_dir'], num_labels=len(label_list))
    # ----------- predicting
    logger.info('model predicting....')
    predictor = Predictor(model=model, logger=logger, n_gpu=args.n_gpu)
    result = predictor.predict(data=test_dataloader)
    result[result < 0.5] = 0
    result[result >= 0.5] = 1
    labels = []
    for i in range(result.shape[0]):
        ids = np.where(result[i] == 1)[0]
        each_patent_label = [id2label[id] for id in ids]
        labels.append(each_patent_label)
    if os.path.exists(config['predictions']):
        os.remove(config['predictions'])
    with open(config['test_path'], 'r') as f:
        reader = csv.reader(f)
        for j, line in enumerate(reader):
            id = line[0]
            with open(config['predictions'], 'a+') as g:
                g.write("{}\t".format(id))
                for label in labels[j]:
                    g.write("{}\t".format(label))
                g.write("\n")
def run_test(args):
    from pybert.test.predictor import Predictor
    processor = BertProcessor(vocab_path=config['bert_vocab_path'], do_lower_case=args.do_lower_case)
    test_data = processor.get_test(config['test_path'])
    test_examples = processor.create_examples(lines=test_data,
                                              example_type='test',
                                              cached_examples_file=config['data_dir'] / f"cached_test_examples_{args.arch}")
    test_features = processor.create_features(examples=test_examples,
                                              max_seq_len=args.eval_max_seq_len,
                                              cached_features_file=config['data_dir'] / "cached_test_features_{}_{}".format(
                                                  args.eval_max_seq_len, args.arch))
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.eval_batch_size)
    idx2word = {}
    for (w, i) in processor.tokenizer.vocab.items():
        idx2word[i] = w
    label_list = processor.get_labels(label_path=config['data_label_path'])
    idx2label = {i: label for i, label in enumerate(label_list)}
    if args.test_path:
        args.test_path = Path(args.test_path)
        model = BertForMultiLable.from_pretrained(args.test_path, num_labels=len(label_list))
    else:
        model = BertForMultiLable.from_pretrained(config['bert_model_dir'], num_labels=len(label_list))
    for p in model.bert.parameters():
        p.requires_grad = False  # freeze the BERT encoder
    # ----------- predicting -----------
    writer = SummaryWriter()
    logger.info('model predicting....')
    predictor = Predictor(model=model, logger=logger, n_gpu=args.n_gpu, i2w=idx2word, i2l=idx2label)
    result = predictor.predict(data=test_dataloader)
    if args.predict_labels:
        predictor.labels(result, args.predict_idx)
def __init__(self, training_path, file_id, tokenizer, reduce_memory=False):
    self.tokenizer = tokenizer
    self.file_id = file_id
    data_file = training_path / f"file_{self.file_id}.json"
    metrics_file = training_path / f"file_{self.file_id}_metrics.json"
    assert data_file.is_file() and metrics_file.is_file()
    metrics = json.loads(metrics_file.read_text())
    num_samples = metrics['num_training_examples']
    seq_len = metrics['max_seq_len']
    self.temp_dir = None
    self.working_dir = None
    if reduce_memory:
        self.temp_dir = TemporaryDirectory()
        self.working_dir = Path(self.temp_dir.name)
        input_ids = np.memmap(filename=self.working_dir / 'input_ids.memmap',
                              mode='w+', dtype=np.int32, shape=(num_samples, seq_len))
        input_masks = np.memmap(filename=self.working_dir / 'input_masks.memmap',
                                shape=(num_samples, seq_len), mode='w+', dtype=bool)
        segment_ids = np.memmap(filename=self.working_dir / 'segment_ids.memmap',
                                shape=(num_samples, seq_len), mode='w+', dtype=bool)
        lm_label_ids = np.memmap(filename=self.working_dir / 'lm_label_ids.memmap',
                                 shape=(num_samples, seq_len), mode='w+', dtype=np.int32)
        lm_label_ids[:] = -1
    else:
        input_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.int32)
        input_masks = np.zeros(shape=(num_samples, seq_len), dtype=bool)
        segment_ids = np.zeros(shape=(num_samples, seq_len), dtype=bool)
        lm_label_ids = np.full(shape=(num_samples, seq_len), dtype=np.int32, fill_value=-1)
    logger.info(f"Loading training examples for {str(data_file)}")
    with data_file.open() as f:
        for i, line in enumerate(f):
            line = line.strip()
            example = json.loads(line)
            features = convert_example_to_features(example, tokenizer, seq_len)
            input_ids[i] = features.input_ids
            segment_ids[i] = features.segment_ids
            input_masks[i] = features.input_mask
            lm_label_ids[i] = features.lm_label_ids
    assert i == num_samples - 1  # Assert that the sample count metric was true
    logger.info("Loading complete!")
    self.num_samples = num_samples
    self.seq_len = seq_len
    self.input_ids = input_ids
    self.input_masks = input_masks
    self.segment_ids = segment_ids
    self.lm_label_ids = lm_label_ids
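# PregeneratedDataset is consumed by a DataLoader in the training loop, so the
# class also needs __len__/__getitem__. A sketch of those companion methods,
# assuming the attribute names set in __init__ above and the tuple order the
# loop unpacks (input_ids, input_mask, segment_ids, lm_label_ids):
def __len__(self):
    return self.num_samples

def __getitem__(self, item):
    return (torch.tensor(self.input_ids[item].astype(np.int64)),
            torch.tensor(self.input_masks[item].astype(np.int64)),
            torch.tensor(self.segment_ids[item].astype(np.int64)),
            torch.tensor(self.lm_label_ids[item].astype(np.int64)))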
def run_test(args):
    import numpy as np
    from pybert.io.task_data import TaskData
    from pybert.test.predictor import Predictor
    data = TaskData()
    ids, targets, sentences = data.read_data(raw_data_path=config['test_path'],
                                             preprocessor=None,
                                             is_train=False)
    lines = list(zip(sentences, targets))
    processor = BertProcessor(vocab_path=config['bert_vocab_path'], do_lower_case=args.do_lower_case)
    label_list = processor.get_labels()
    id2label = {i: label for i, label in enumerate(label_list)}
    test_data = processor.get_test(lines=lines)
    test_examples = processor.create_examples(lines=test_data,
                                              example_type='test',
                                              cached_examples_file=config['data_dir'] / f"cached_test_examples_{args.arch}")
    test_features = processor.create_features(examples=test_examples,
                                              max_seq_len=args.eval_max_seq_len,
                                              cached_features_file=config['data_dir'] / "cached_test_features_{}_{}".format(
                                                  args.eval_max_seq_len, args.arch))
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.train_batch_size)
    model = BertForMultiClass.from_pretrained(config['checkpoint_dir'], num_labels=len(label_list))
    # ----------- predicting
    logger.info('model predicting....')
    predictor = Predictor(model=model, logger=logger, n_gpu=args.n_gpu)
    result = predictor.predict(data=test_dataloader)
    result = np.argmax(result, axis=1)
    with open('submit1.csv', 'w', encoding='utf-8') as f:
        for id, pre in zip(ids, result):
            f.write(id + ',' + str(pre) + '\n')
    print(result)
def main():
    parser = ArgumentParser()
    parser.add_argument("--arch", default='bert', type=str)
    parser.add_argument("--do_data", action='store_true')
    parser.add_argument("--do_train", action='store_true')
    parser.add_argument("--do_test", action='store_true')
    parser.add_argument("--save_best", action='store_true')
    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument('--data_name', default='kaggle', type=str)
    parser.add_argument("--mode", default='min', type=str)
    parser.add_argument("--monitor", default='valid_loss', type=str)
    parser.add_argument("--epochs", default=20, type=int)
    parser.add_argument("--resume_path", default='', type=str)
    parser.add_argument("--predict_checkpoints", type=int, default=0)
    parser.add_argument("--valid_size", default=0.2, type=float)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--sorted", default=1, type=int, help='1 : True 0:False ')
    parser.add_argument("--n_gpu", type=str, default='0', help='"0,1,.." or "0" or "" ')
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument("--train_batch_size", default=8, type=int)
    parser.add_argument('--eval_batch_size', default=8, type=int)
    parser.add_argument("--train_max_seq_len", default=256, type=int)
    parser.add_argument("--eval_max_seq_len", default=256, type=int)
    parser.add_argument('--loss_scale', type=float, default=0)
    parser.add_argument("--warmup_proportion", default=0.1, type=float)
    parser.add_argument("--weight_decay", default=0.01, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--grad_clip", default=1.0, type=float)
    parser.add_argument("--learning_rate", default=2e-5, type=float)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--fp16', action='store_true')
    parser.add_argument('--fp16_opt_level', type=str, default='O1')
    args = parser.parse_args()
    init_logger(log_file=config['log_dir'] / f'{args.arch}-{time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())}.log')
    config['checkpoint_dir'] = config['checkpoint_dir'] / args.arch
    config['checkpoint_dir'].mkdir(exist_ok=True)
    # Good practice: save your training arguments together with the trained model
    torch.save(args, config['checkpoint_dir'] / 'training_args.bin')
    seed_everything(args.seed)
    logger.info("Training/evaluation parameters %s", args)
    # hard-coded overrides of the parsed CLI flags
    args.save_best = False
    args.do_train = True
    args.resume_path = 'pybert/output/checkpoints/bert/checkpoint-epoch-3'
    args.do_lower_case = True
    if args.do_data:
        from pybert.io.task_data import TaskData
        data = TaskData()
        targets, sentences = data.read_data(raw_data_path=config['raw_data_path'],
                                            preprocessor=EnglishPreProcessor(),
                                            is_train=True)
        data.train_val_split(X=sentences, y=targets, shuffle=True, stratify=False,
                             valid_size=args.valid_size, data_dir=config['data_dir'],
                             data_name=args.data_name)
    if args.do_train:
        run_train(args)
    if args.do_test:
        run_test(args)
def run_test(args, test=False, k=7, med_map='pybert/dataset/med_map.csv'):
    from pybert.io.task_data import TaskData
    from pybert.test.predictor import Predictor
    data = TaskData()
    targets, sentences = data.read_data(raw_data_path=config['test_path'],
                                        preprocessor=EnglishPreProcessor(),
                                        is_train=test)
    print(f'-----------------------------------------\ntargets {targets}\n---------------------------------------------------')
    lines = list(zip(sentences, targets))
    processor = BertProcessor(vocab_path=config['bert_vocab_path'], do_lower_case=args.do_lower_case)
    label_list = processor.get_labels()
    id2label = {i: label for i, label in enumerate(label_list)}
    test_data = processor.get_test(lines=lines)
    test_examples = processor.create_examples(lines=test_data,
                                              example_type='test',
                                              cached_examples_file=config['data_dir'] / f"cached_test_examples_{args.arch}")
    test_features = processor.create_features(examples=test_examples,
                                              max_seq_len=args.eval_max_seq_len,
                                              cached_features_file=config['data_dir'] / "cached_test_features_{}_{}".format(
                                                  args.eval_max_seq_len, args.arch))
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.train_batch_size)
    model = BertForMultiLable.from_pretrained(config['checkpoint_dir'])
    # ----------- predicting
    logger.info('model predicting....')
    predictor = Predictor(model=model, logger=logger, n_gpu=args.n_gpu, test=test)
    if test:
        results, targets = predictor.predict(data=test_dataloader)
        # print(f'results {results.shape}')
        # print(f'targets {targets.shape}')
        result = dict()
        metrics = [Recall(), Acc()]
        for metric in metrics:
            metric.reset()
            metric(logits=results, target=targets)
            value = metric.value()
            if value is not None:
                result[f'valid_{metric.name()}'] = value
        return result
    else:
        results = predictor.predict(data=test_dataloader)
        pred = np.argsort(results)[:, -k:][:, ::-1]
        with open(med_map, mode='r') as infile:
            reader = csv.reader(infile)
            med_dict = {int(rows[0]): rows[1] for rows in reader}
        pred = np.vectorize(med_dict.get)(pred)
        return pred
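# This variant doubles as an evaluator and a predictor depending on the `test`
# flag; hypothetical call sites:
#
#   metrics = run_test(args, test=True)  # dict of valid_recall / valid_acc values
#   top_meds = run_test(args, k=7)       # top-k medication names per example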
def main():
    parser = ArgumentParser()
    parser.add_argument("--arch", default='bert', type=str)
    parser.add_argument("--do_data", action='store_true')
    parser.add_argument("--train", action='store_true')
    parser.add_argument("--test", action='store_true')
    parser.add_argument("--save_best", action='store_true')
    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument('--data_name', default='job_dataset', type=str)
    parser.add_argument("--epochs", default=10, type=int)
    parser.add_argument("--resume_path", default='', type=str)
    parser.add_argument("--test_path", default='', type=str)
    parser.add_argument("--mode", default='min', type=str)
    parser.add_argument("--monitor", default='valid_loss', type=str)
    parser.add_argument("--valid_size", default=0.05, type=float)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--sorted", default=1, type=int, help='1 : True 0:False ')
    parser.add_argument("--n_gpu", type=str, default='0', help='"0,1,.." or "0" or "" ')
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument("--train_batch_size", default=4, type=int)
    parser.add_argument('--eval_batch_size', default=4, type=int)
    parser.add_argument("--train_max_seq_len", default=256, type=int)
    parser.add_argument("--eval_max_seq_len", default=256, type=int)
    parser.add_argument('--loss_scale', type=float, default=0)
    parser.add_argument("--warmup_proportion", default=0.1, type=float)
    parser.add_argument("--weight_decay", default=0.01, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--grad_clip", default=1.0, type=float)
    parser.add_argument("--learning_rate", default=1.0e-4, type=float)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--fp16', action='store_true')
    parser.add_argument('--fp16_opt_level', type=str, default='O1')
    parser.add_argument('--predict_labels', action='store_true')
    parser.add_argument('--predict_idx', type=str, default="0", help=' "idx" or "start-end" or "all" ')
    args = parser.parse_args()
    config['checkpoint_dir'] = config['checkpoint_dir'] / args.arch
    config['checkpoint_dir'].mkdir(exist_ok=True)
    torch.save(args, config['checkpoint_dir'] / 'training_args.bin')
    seed_everything(args.seed)
    init_logger(log_file=config['log_dir'] / f"{args.arch}.log")
    logger.info("Training/evaluation parameters %s", args)
    if args.do_data:
        from pybert.io.task_data import TaskData
        data = TaskData()
        targets, sentences = data.read_data(raw_data_path=config['raw_data_path'],
                                            preprocessor=EnglishPreProcessor(),
                                            is_train=True)
        data.train_val_split(X=sentences, y=targets, shuffle=False, stratify=False,
                             valid_size=args.valid_size, data_dir=config['data_dir'],
                             data_name=args.data_name)
    if args.train:
        run_train(args)
    if args.test:
        run_test(args)
def main():
    parser = ArgumentParser()
    parser.add_argument("--do_data", default=False, action='store_true')
    parser.add_argument("--do_corpus", default=False, action='store_true')
    parser.add_argument("--do_vocab", default=False, action='store_true')
    parser.add_argument("--do_split", default=False, action='store_true')
    parser.add_argument('--seed', default=42, type=int)
    parser.add_argument("--line_per_file", default=1000000000, type=int)
    parser.add_argument("--file_num", type=int, default=10,
                        help="Number of dynamically masked files to pregenerate")
    parser.add_argument("--max_seq_len", type=int, default=128)
    parser.add_argument("--short_seq_prob", type=float, default=0.1,
                        help="Probability of making a short sentence as a training example")
    parser.add_argument("--masked_lm_prob", type=float, default=0.15,
                        help="Probability of masking each token for the LM task")
    parser.add_argument("--max_predictions_per_seq", type=int, default=20,
                        help="Maximum number of tokens to mask in each sequence")
    args = parser.parse_args()
    seed_everything(args.seed)
    if args.do_corpus:
        corpus = []
        train_path = str(config['data_dir'] / 'train.txt')
        with open(train_path, 'r') as fr:
            for ex_id, line in enumerate(fr):
                line = line.strip("\n")
                lines = [" ".join(x.split("/")[0].split("_")) for x in line.split(" ")]
                if ex_id == 0:
                    logger.info(f"Train example: {' '.join(lines)}")
                corpus.append(" ".join(lines))
        test_path = str(config['data_dir'] / 'test.txt')
        with open(test_path, 'r') as fr:
            for ex_id, line in enumerate(fr):
                line = line.strip("\n")
                lines = line.split("_")
                if ex_id == 0:
                    logger.info(f"Test example: {' '.join(lines)}")
                corpus.append(" ".join(lines))
        corpus_path = str(config['data_dir'] / 'corpus.txt')
        with open(corpus_path, 'r') as fr:
            for ex_id, line in enumerate(fr):
                line = line.strip("\n")
                lines = line.split("_")
                if ex_id == 0:
                    logger.info(f"Corpus example: {' '.join(lines)}")
                corpus.append(" ".join(lines))
        corpus = list(set(corpus))
        logger.info(f"corpus size: {len(corpus)}")
        random_order = list(range(len(corpus)))
        np.random.shuffle(random_order)
        corpus = [corpus[i] for i in random_order]
        new_corpus_path = config['data_dir'] / "corpus/corpus.txt"
        if not new_corpus_path.exists():
            new_corpus_path.parent.mkdir(exist_ok=True)
        with open(new_corpus_path, 'w') as fr:
            for line in corpus:
                fr.write(line + "\n")
    if args.do_split:
        new_corpus_path = config['data_dir'] / "corpus/corpus.txt"
        split_save_path = config['data_dir'] / "corpus/train"
        if not split_save_path.exists():
            split_save_path.mkdir(exist_ok=True)
        line_per_file = args.line_per_file
        command = f'split -a 4 -l {line_per_file} -d {new_corpus_path} {split_save_path}/shard_'
        os.system(f"{command}")
    if args.do_vocab:
        vocab = Vocabulary(min_freq=0, add_unused=True)
        vocab.read_data(data_path=config['data_dir'] / "corpus/train")
        vocab.build_vocab()
        vocab.save(file_path=config['data_dir'] / 'corpus/vocab_mapping.pkl')
        vocab.save_bert_vocab(file_path=config['bert_vocab_path'])
        logger.info(f"vocab size: {len(vocab)}")
        bert_base_config['vocab_size'] = len(vocab)
        save_json(data=bert_base_config, file_path=config['bert_config_file'])
    if args.do_data:
        vocab_list = load_vocab(config['bert_vocab_path'])
        data_path = config['data_dir'] / "corpus/train"
        files = sorted([f for f in data_path.iterdir() if f.exists() and "." not in str(f)])
        print(files)
        logger.info("--- pregenerate training data parameters ---")
        logger.info(f'max_seq_len: {args.max_seq_len}')
        logger.info(f"max_predictions_per_seq: {args.max_predictions_per_seq}")
        logger.info(f"masked_lm_prob: {args.masked_lm_prob}")
        logger.info(f"seed: {args.seed}")
        logger.info(f"file num : {args.file_num}")
        for idx in range(args.file_num):
            logger.info(f"pregenerate file_{idx}.json")
            save_filename = data_path / f"file_{idx}.json"
            num_instances = 0
            with save_filename.open('w') as fw:
                for file_idx in range(len(files)):
                    file_path = files[file_idx]
                    file_examples = build_examples(file_path,
                                                   max_seq_len=args.max_seq_len,
                                                   masked_lm_prob=args.masked_lm_prob,
                                                   max_predictions_per_seq=args.max_predictions_per_seq,
                                                   vocab_list=vocab_list)
                    file_examples = [json.dumps(instance) for instance in file_examples]
                    for instance in file_examples:
                        fw.write(instance + '\n')
                        num_instances += 1
            metrics_file = data_path / f"file_{idx}_metrics.json"
            print(f"num_instances: {num_instances}")
            with metrics_file.open('w') as mf:
                metrics = {
                    "num_training_examples": num_instances,
                    "max_seq_len": args.max_seq_len
                }
                mf.write(json.dumps(metrics))
def main():
    parser = ArgumentParser()
    parser.add_argument("--file_num", type=int, default=10,
                        help="Number of pregenerated files")
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Store training data as on-disc memmaps to massively reduce memory usage")
    parser.add_argument("--epochs", type=int, default=2,
                        help="Number of epochs to train for")
    parser.add_argument('--num_eval_steps', type=int, default=200)
    parser.add_argument('--num_save_steps', type=int, default=5000)
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--train_batch_size", default=24, type=int,
                        help="Total batch size for training.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 is set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate", default=1e-4, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    args = parser.parse_args()

    pregenerated_data = config['data_dir'] / "corpus/train"
    assert pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by prepare_lm_data_mask.py!"
    samples_per_epoch = 0
    for i in range(args.file_num):
        data_file = pregenerated_data / f"file_{i}.json"
        metrics_file = pregenerated_data / f"file_{i}_metrics.json"
        if data_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch += metrics['num_training_examples']
        else:
            if i == 0:
                exit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).")
            print("This script will loop over the available data, but training diversity may be negatively impacted.")
            break
    logger.info(f"samples_per_epoch: {samples_per_epoch}")
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        f"device: {device} n_gpu: {n_gpu}, distributed training: {bool(args.local_rank != -1)}, 16-bits training: {args.fp16}")
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            f"Invalid gradient_accumulation_steps parameter: {args.gradient_accumulation_steps}, should be >= 1")
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    seed_everything(args.seed)
    tokenizer = CustomTokenizer(vocab_file=config['bert_vocab_path'])
    total_train_examples = samples_per_epoch * args.epochs
    num_train_optimization_steps = int(
        total_train_examples / args.train_batch_size / args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
    args.warmup_steps = int(num_train_optimization_steps * args.warmup_proportion)

    # Prepare model
    with open(str(config['bert_config_file']), "r", encoding='utf-8') as reader:
        json_config = json.loads(reader.read())
    print(json_config)
    bert_config = BertConfig.from_json_file(str(config['bert_config_file']))
    model = BertForMaskedLM(config=bert_config)
    # model = BertForMaskedLM.from_pretrained(config['checkpoint_dir'] / 'checkpoint-580000')
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    # if args.fp16:
    #     try:
    #         from apex.optimizers import FP16_Optimizer
    #         from apex.optimizers import FusedAdam
    #     except ImportError:
    #         raise ImportError(
    #             "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
    #     optimizer = FusedAdam(optimizer_grouped_parameters,
    #                           lr=args.learning_rate,
    #                           bias_correction=False,
    #                           max_grad_norm=1.0)
    #     if args.loss_scale == 0:
    #         optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    #     else:
    #         optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    # else:
    #     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    #     scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps)

    global_step = 0
    metric = LMAccuracy()
    tr_acc = AverageMeter()
    tr_loss = AverageMeter()
    train_logs = {}
    logger.info("***** Running training *****")
    logger.info(f" Num examples = {total_train_examples}")
    logger.info(f" Batch size = {args.train_batch_size}")
    logger.info(f" Num steps = {num_train_optimization_steps}")
    logger.info(f" warmup_steps = {args.warmup_steps}")
    model.train()
    for epoch in range(args.epochs):
        for idx in range(args.file_num):
            epoch_dataset = PregeneratedDataset(file_id=idx,
                                                training_path=pregenerated_data,
                                                tokenizer=tokenizer,
                                                reduce_memory=args.reduce_memory)
            if args.local_rank == -1:
                train_sampler = RandomSampler(epoch_dataset)
            else:
                train_sampler = DistributedSampler(epoch_dataset)
            train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids = batch
                outputs = model(input_ids, segment_ids, input_mask, lm_label_ids)
                pred_output = outputs[1]
                loss = outputs[0]
                metric(logits=pred_output.view(-1, bert_config.vocab_size), target=lm_label_ids.view(-1))
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                nb_tr_steps += 1
                tr_acc.update(metric.value(), n=input_ids.size(0))
                tr_loss.update(loss.item(), n=1)
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # if args.fp16:
                    #     # modify learning rate with special warm up BERT uses
                    #     # if args.fp16 is False, BertAdam is used that handles this automatically
                    #     lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                    #     for param_group in optimizer.param_groups:
                    #         param_group['lr'] = lr_this_step
                    scheduler.step()  # Update learning rate schedule
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if global_step % args.num_eval_steps == 0:
                        train_logs['loss'] = tr_loss.avg
                        train_logs['acc'] = tr_acc.avg
                        show_info = f'\n[Training]:[{epoch}/{args.epochs}]{global_step}/{num_train_optimization_steps} ' + \
                                    "-".join([f' {key}: {value:.4f} ' for key, value in train_logs.items()])
                        logger.info(show_info)
                        tr_acc.reset()
                        tr_loss.reset()
                    if global_step % args.num_save_steps == 0:
                        if args.local_rank in [-1, 0] and args.num_save_steps > 0:
                            # Save model checkpoint
                            output_dir = config['checkpoint_dir'] / f'checkpoint-{global_step}'
                            if not output_dir.exists():
                                output_dir.mkdir()
                            # save model
                            model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                            model_to_save.save_pretrained(str(output_dir))
                            torch.save(args, str(output_dir / 'training_args.bin'))
                            logger.info("Saving model checkpoint to %s", output_dir)
                            # save config
                            output_config_file = output_dir / CONFIG_NAME
                            with open(str(output_config_file), 'w') as f:
                                f.write(model_to_save.config.to_json_string())
                            # save vocab
                            tokenizer.save_vocabulary(output_dir)
def main():
    parser = ArgumentParser()
    parser.add_argument("--arch", default='bert', type=str)  # pretrained language model to use
    parser.add_argument("--do_data", action='store_true')  # split the data
    parser.add_argument("--do_train", action='store_true')  # train the model
    parser.add_argument("--do_test", action='store_true')  # run model inference
    parser.add_argument("--save_best", action='store_true')  # keep only the best model
    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument('--data_name', default='ccks', type=str)  # dataset name
    parser.add_argument("--mode", default='min', type=str)  # direction (min/max) the monitor tracks
    parser.add_argument("--monitor", default='valid_loss', type=str)
    parser.add_argument("--task_type", default='base', type=str)
    parser.add_argument("--epochs", default=4, type=int)
    parser.add_argument("--resume_path", default='', type=str)  # resume path: load the model from a pretrained checkpoint
    parser.add_argument("--predict_checkpoints", type=int, default=0)
    parser.add_argument("--valid_size", default=0.2, type=float)  # size of the validation set
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--sorted", default=1, type=int, help='1 : True 0:False ')  # whether to sort by sequence length
    parser.add_argument("--n_gpu", type=str, default='0', help='"0,1,.." or "0" or "" ')
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)  # gradient accumulation steps, for fitting a large effective batch size into limited memory
    parser.add_argument("--train_batch_size", default=8, type=int)  # training batch size
    parser.add_argument('--eval_batch_size', default=8, type=int)  # evaluation batch size
    parser.add_argument("--train_max_seq_len", default=256, type=int)  # maximum sequence length for training
    parser.add_argument("--eval_max_seq_len", default=256, type=int)  # maximum sequence length for evaluation
    parser.add_argument('--loss_scale', type=float, default=0)  # TODO: understand what loss scale does
    parser.add_argument("--warmup_proportion", default=0.1, type=float)  # warmup proportion for the learning rate
    parser.add_argument("--weight_decay", default=0.01, type=float)  # TODO: understand weight decay
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)  # Adam optimizer epsilon
    parser.add_argument("--grad_clip", default=1.0, type=float)  # TODO: understand gradient clipping
    parser.add_argument("--learning_rate", default=2e-5, type=float)  # learning rate
    parser.add_argument('--seed', type=int, default=42)  # random seed
    parser.add_argument('--fp16', action='store_true')  # TODO: understand fp16
    parser.add_argument('--fp16_opt_level', type=str, default='O1')
    args = parser.parse_args()

    # initialize the logger
    config['log_dir'].mkdir(exist_ok=True)  # this line was not in the original source
    init_logger(
        log_file=config['log_dir'] / f'{args.arch}-{time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())}.log')
    config['checkpoint_dir'] = config['checkpoint_dir'] / args.arch / args.task_type  # redirect outputs per task type
    config['checkpoint_dir'].mkdir(exist_ok=True)
    BASE_DIR = Path('pybert')
    config['raw_data_path'] = BASE_DIR / f'dataset/train_{args.task_type}_sample.csv'
    config['test_path'] = BASE_DIR / f'dataset/test_{args.task_type}.csv'
    config['figure_dir'] = config['figure_dir'] / f'{args.task_type}'
    config['figure_dir'].mkdir(exist_ok=True)
    # rewrite file paths dynamically
    # BASE_DIR = Path('pybert')
    # if args.task_type == 'trans':
    #     config['raw_data_path'] = BASE_DIR / 'dataset/train_trans_sample.csv'
    #     config['test_path'] = BASE_DIR / 'dataset/test_trans.csv'
    #     config['figure_dir'] = config['figure_dir'] / f'{args.task_type}'
    #     config['figure_dir'].mkdir(exist_ok=True)
    # elif args.task_type == 'base':
    #     config['raw_data_path'] = BASE_DIR / 'dataset/train_base_sample.csv'
    #     config['test_path'] = BASE_DIR / 'dataset/test_base.csv'
    #     config['figure_dir'] = config['figure_dir'] / f'{args.task_type}'
    #     config['figure_dir'].mkdir(exist_ok=True)
    # else:
    #     raise ValueError(f"Invalid task_type {args.task_type}")

    # Good practice: save your training arguments together with the trained model
    torch.save(args, config['checkpoint_dir'] / 'training_args.bin')
    seed_everything(args.seed)  # set every seed in one call
    logger.info("Training/evaluation parameters %s", args)
    if args.do_data:
        from pybert.io.task_data import TaskData
        data = TaskData()
        ids, targets, sentences = data.read_data(raw_data_path=config['raw_data_path'],
                                                 preprocessor=ChinesePreProcessor(),
                                                 is_train=True)
        data.train_val_split(X=sentences, y=targets, shuffle=True, stratify=False,
                             valid_size=args.valid_size, data_dir=config['data_dir'],
                             data_name=args.data_name,
                             task_type=args.task_type)  # task_type parameter added
    if args.do_train:
        run_train(args)
    if args.do_test:
        run_test(args)
def run_train(args):
    # --------- data
    model_to_use = "roberta-large"
    processor = RobertaProcessor(model_type=model_to_use)
    # vocab_path=config['roberta_vocab_path'], merge_path=config['roberta_merge_path'])
    label_list = processor.get_labels()
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}
    train_data = processor.get_train(config['data_dir'] / f"{args.data_name}.train.pkl")
    train_examples = processor.create_examples(lines=train_data,
                                               example_type='train',
                                               cached_examples_file=config['data_dir'] / f"cached_train_examples_{args.arch}")
    train_features = processor.create_features(examples=train_examples,
                                               max_seq_len=args.train_max_seq_len,
                                               cached_features_file=config['data_dir'] / "cached_train_features_{}_{}".format(
                                                   args.train_max_seq_len, args.arch))
    train_dataset = processor.create_dataset(train_features, is_sorted=args.sorted)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,
                                  collate_fn=collate_fn)
    valid_data = processor.get_dev(config['data_dir'] / f"{args.data_name}.valid.pkl")
    valid_examples = processor.create_examples(lines=valid_data,
                                               example_type='valid',
                                               cached_examples_file=config['data_dir'] / f"cached_valid_examples_{args.arch}")
    valid_features = processor.create_features(examples=valid_examples,
                                               max_seq_len=args.eval_max_seq_len,
                                               cached_features_file=config['data_dir'] / "cached_valid_features_{}_{}".format(
                                                   args.eval_max_seq_len, args.arch))
    valid_dataset = processor.create_dataset(valid_features)
    valid_sampler = SequentialSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.eval_batch_size,
                                  collate_fn=collate_fn)
    # ------- model
    logger.info("initializing model")
    if args.resume_path:
        args.resume_path = Path(args.resume_path)
        model = RobertaForMultiLable.from_pretrained(args.resume_path, num_labels=len(label_list))
    else:
        model = RobertaForMultiLable.from_pretrained(model_to_use, num_labels=len(label_list))  # config['roberta_model_dir']
    print("""\n\nname module\n----------------------""")
    for name, module in model.named_children():
        if name == "roberta":
            for n, _ in module.named_children():
                print(f"{name}:{n}")
        else:
            print("{:15} {}".format(name, module))
    print()
    # return
    # print("================= train dataloader length is", len(train_dataloader), "=================\n")
    t_total = int(len(train_dataloader) / args.gradient_accumulation_steps * args.epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay) and 'classifier.weight' not in n],
         'weight_decay': args.weight_decay},
        # {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        {'params': model.classifier.weight, 'lr': 5e-4}  # best: 5e-4
        # {'params': model.classifier.bias, 'lr': 5e-4, 'weight_decay': 0.0}
    ]
    # model.parameters()
    warmup_steps = int(t_total * args.warmup_proportion)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    # ---- callbacks
    logger.info("initializing callbacks")
    train_monitor = TrainingMonitor(file_dir=config['figure_dir'], arch=args.arch)
    model_checkpoint = ModelCheckpoint(checkpoint_dir=config['checkpoint_dir'], mode=args.mode,
                                       monitor=args.monitor, arch=args.arch,
                                       save_best_only=args.save_best)
    # **************************** training model ***********************
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_examples))
    logger.info(" Num Epochs = %d", args.epochs)
    logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps * (
                    torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)
    trainer = Trainer(args=args, model=model, logger=logger, criterion=BCEWithLogLoss(),
                      optimizer=optimizer, scheduler=scheduler, early_stopping=None,
                      training_monitor=train_monitor, model_checkpoint=model_checkpoint,
                      batch_metrics=[AccuracyThresh(thresh=0.5)],
                      epoch_metrics=[AUC(average='weighted', task_type='binary'),  # average='micro'
                                     MultiLabelReport(id2label=id2label)])
    trainer.train(train_data=train_dataloader, valid_data=valid_dataloader)
def run_train(args):
    # --------- data
    processor = BertProcessor(vocab_path=config['bert_vocab_path'], do_lower_case=args.do_lower_case)
    label_list = processor.get_labels()
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}
    train_data = processor.get_train(config['data_dir'] / f"{args.data_name}.train.pkl")
    train_examples = processor.create_examples(lines=train_data,
                                               example_type='train',
                                               cached_examples_file=config['data_dir'] / f"cached_train_examples_{args.arch}")
    train_features = processor.create_features(examples=train_examples,
                                               max_seq_len=args.train_max_seq_len,
                                               cached_features_file=config['data_dir'] / "cached_train_features_{}_{}".format(
                                                   args.train_max_seq_len, args.arch))
    train_dataset = processor.create_dataset(train_features, is_sorted=args.sorted)
    if args.sorted:
        train_sampler = SequentialSampler(train_dataset)
    else:
        train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
    valid_data = processor.get_dev(config['data_dir'] / f"{args.data_name}.valid.pkl")
    valid_examples = processor.create_examples(lines=valid_data,
                                               example_type='valid',
                                               cached_examples_file=config['data_dir'] / f"cached_valid_examples_{args.arch}")
    valid_features = processor.create_features(examples=valid_examples,
                                               max_seq_len=args.eval_max_seq_len,
                                               cached_features_file=config['data_dir'] / "cached_valid_features_{}_{}".format(
                                                   args.eval_max_seq_len, args.arch))
    valid_dataset = processor.create_dataset(valid_features)
    valid_sampler = SequentialSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.eval_batch_size)
    # ------- model
    logger.info("initializing model")
    if args.resume_path:
        args.resume_path = Path(args.resume_path)
        model = BertForMultiClass.from_pretrained(args.resume_path, num_labels=len(label_list))
    else:
        model = BertForMultiClass.from_pretrained(config['bert_model_dir'], num_labels=len(label_list))
    t_total = int(len(train_dataloader) / args.gradient_accumulation_steps * args.epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    warmup_steps = int(t_total * args.warmup_proportion)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    # ---- callbacks
    logger.info("initializing callbacks")
    train_monitor = TrainingMonitor(file_dir=config['figure_dir'], arch=args.arch)
    model_checkpoint = ModelCheckpoint(checkpoint_dir=config['checkpoint_dir'], mode=args.mode,
                                       monitor=args.monitor, arch=args.arch,
                                       save_best_only=args.save_best)
    # **************************** training model ***********************
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_examples))
    logger.info(" Num Epochs = %d", args.epochs)
    logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps * (
                    torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)
    trainer = Trainer(n_gpu=args.n_gpu, model=model, epochs=args.epochs, logger=logger,
                      criterion=CrossEntropy(), optimizer=optimizer, lr_scheduler=lr_scheduler,
                      early_stopping=None, training_monitor=train_monitor, fp16=args.fp16,
                      resume_path=args.resume_path, grad_clip=args.grad_clip,
                      model_checkpoint=model_checkpoint,
                      gradient_accumulation_steps=args.gradient_accumulation_steps,
                      evaluate=F1Score(),
                      class_report=ClassReport(target_names=[id2label[x] for x in range(len(label2id))]))
    trainer.train(train_data=train_dataloader, valid_data=valid_dataloader, seed=args.seed)
def main():
    parser = ArgumentParser()
    parser.add_argument("--arch", default='bert', type=str)
    parser.add_argument("--do_data", action='store_true')
    parser.add_argument("--do_train", action='store_true')
    parser.add_argument("--do_test", action='store_true')
    parser.add_argument("--save_best", action='store_true')
    parser.add_argument("--do_lower_case", action='store_true')
    # parser.add_argument('--data_name', default='HPC', type=str)
    parser.add_argument("--mode", default='min', type=str)
    parser.add_argument("--monitor", default='valid_loss', type=str)
    parser.add_argument("--epochs", default=10, type=int)
    parser.add_argument("--resume_path", default='', type=str)
    parser.add_argument("--predict_checkpoints", type=int, default=0)
    parser.add_argument("--valid_size", default=0.2, type=float)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--sorted", default=1, type=int, help='1 : True 0:False ')
    parser.add_argument("--n_gpu", type=str, default='0', help='"0,1,.." or "0" or "" ')
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument("--train_batch_size", default=8, type=int)
    parser.add_argument('--eval_batch_size', default=8, type=int)
    parser.add_argument("--train_max_seq_len", default=256, type=int)
    parser.add_argument("--eval_max_seq_len", default=256, type=int)
    parser.add_argument('--loss_scale', type=float, default=0)
    parser.add_argument("--warmup_proportion", default=0.1, type=float)
    parser.add_argument("--weight_decay", default=0.01, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--grad_clip", default=1.0, type=float)
    parser.add_argument("--learning_rate", default=2e-5, type=float)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--fp16', action='store_true')
    parser.add_argument('--fp16_opt_level', type=str, default='O1')
    args = parser.parse_args()
    init_logger(log_file=config['log_dir'] / f'{args.arch}-{time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())}.log')
    config['checkpoint_dir'] = config['checkpoint_dir'] / args.arch
    config['checkpoint_dir'].mkdir(exist_ok=True)
    # Good practice: save your training arguments together with the trained model
    torch.save(args, config['checkpoint_dir'] / 'training_args.bin')
    seed_everything(args.seed)
    logger.info("Training/evaluation parameters %s", args)
    if args.do_data:
        data_names = []
        train_sentenses_all = []
        train_target_all = []
        from pybert.io.task_data import TaskData
        data = TaskData()
        total_valid = 0
        for filename in os.listdir(config['summary_path']):
            if filename == ".DS_Store" or filename == "summary":
                continue
            filename_int = int(filename.split('.')[0].split('_')[-1])
            if filename_int > 3500:
                try:
                    raw_data_path = os.path.join(config['summary_path'], filename)
                    # train_targets, train_sentences, val_targets, val_sentences = data.read_data(config,
                    #                                                                             raw_data_path=raw_data_path,
                    #                                                                             preprocessor=EnglishPreProcessor())
                    train_targets, train_sentences, val_targets, val_sentences = data.read_data(config,
                                                                                                raw_data_path=raw_data_path)
                    train_sentenses_all = train_sentenses_all + train_sentences
                    train_target_all = train_target_all + train_targets
                    total_valid = len(train_target_all)
                    print("valid number: ", total_valid)
                    # data.save_pickle(train_sentences, train_targets, data_dir=config['data_dir'],
                    #                  data_name=filename.split('.')[0].split('_')[-1], is_train=True)
                    # data.save_pickle(val_sentences, val_targets, data_dir=config['data_dir'],
                    #                  data_name=filename.split('.')[0].split('_')[-1], is_train=False)
                    # data_names.append(filename.split('.')[0].split('_')[-1])
                except Exception:
                    pass
        total_valid = len(train_target_all)
        print("valid number: ", total_valid)
        data.save_pickle(train_sentenses_all, train_target_all, data_dir=config['data_dir'],
                         data_name="all_valid", is_train=False)
        # with open(config['data_name'], 'w') as f:
        #     json.dump(data_names, f)
    with open(config['data_name'], 'r') as f:
        data_names = json.load(f)
    if args.do_train:
        run_train(args, data_names)
    if args.do_test:
        run_test(args)
def run_train(args, data_names):
    # --------- data
    # processor = BertProcessor(vocab_path=config['bert_vocab_path'], do_lower_case=args.do_lower_case)
    processor = BertProcessor()
    label_list = processor.get_labels()
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}

    # The per-chunk loaders built inside the training loop below replace this
    # single-dataset pipeline:
    # train_data = processor.get_train(config['data_dir'] / f"{data_name}.train.pkl")
    # train_examples = processor.create_examples(lines=train_data,
    #                                            example_type='train',
    #                                            cached_examples_file=config['data_dir'] / f"cached_train_examples_{args.arch}")
    # train_features = processor.create_features(examples=train_examples,
    #                                            max_seq_len=args.train_max_seq_len,
    #                                            cached_features_file=config['data_dir'] / "cached_train_features_{}_{}".format(
    #                                                args.train_max_seq_len, args.arch))
    # train_dataset = processor.create_dataset(train_features, is_sorted=args.sorted)
    # if args.sorted:
    #     train_sampler = SequentialSampler(train_dataset)
    # else:
    #     train_sampler = RandomSampler(train_dataset)
    # train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,
    #                               collate_fn=collate_fn)
    #
    # valid_data = processor.get_dev(config['data_dir'] / f"{data_name}.valid.pkl")
    # valid_examples = processor.create_examples(lines=valid_data,
    #                                            example_type='valid',
    #                                            cached_examples_file=config['data_dir'] / f"cached_valid_examples_{args.arch}")
    # valid_features = processor.create_features(examples=valid_examples,
    #                                            max_seq_len=args.eval_max_seq_len,
    #                                            cached_features_file=config['data_dir'] / "cached_valid_features_{}_{}".format(
    #                                                args.eval_max_seq_len, args.arch))
    # valid_dataset = processor.create_dataset(valid_features)
    # valid_sampler = SequentialSampler(valid_dataset)
    # valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.eval_batch_size,
    #                               collate_fn=collate_fn)

    # ------- model
    logger.info("initializing model")
    if args.resume_path:
        args.resume_path = Path(args.resume_path)
        model = BertForMultiLable.from_pretrained(args.resume_path, num_labels=len(label_list))
    else:
        # model = BertForMultiLable.from_pretrained(config['bert_model_dir'], num_labels=len(label_list))
        model = BertForMultiLable.from_pretrained("bert-base-multilingual-cased",
                                                  num_labels=len(label_list))

    # t_total = int(len(train_dataloader) / args.gradient_accumulation_steps * args.epochs)
    # The dataloaders are built lazily per chunk, so the exact step count is
    # unknown up front; a fixed optimization budget is used instead.
    t_total = 200000
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    warmup_steps = int(t_total * args.warmup_proportion)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # ---- callbacks
    logger.info("initializing callbacks")
    train_monitor = TrainingMonitor(file_dir=config['figure_dir'], arch=args.arch)
    model_checkpoint = ModelCheckpoint(checkpoint_dir=config['checkpoint_dir'],
                                       mode=args.mode,
                                       monitor=args.monitor,
                                       arch=args.arch,
                                       save_best_only=args.save_best)

    # **************************** training model ***********************
    logger.info("***** Running training *****")
    # logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Num Epochs = %d", args.epochs)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    trainer = Trainer(args=args, model=model, logger=logger, criterion=BCEWithLogLoss(),
                      optimizer=optimizer, scheduler=scheduler, early_stopping=None,
                      training_monitor=train_monitor, model_checkpoint=model_checkpoint,
                      batch_metrics=[AccuracyThresh(thresh=0.5)],
                      epoch_metrics=[AUC(average='micro', task_type='binary'),
                                     MultiLabelReport(id2label=id2label),
                                     F1Score(average='micro', task_type='binary')])
    trainer.model.zero_grad()
    seed_everything(trainer.args.seed)  # Added here for reproducibility (even between python 2 and 3)
    iter_num = 0
    valid_dataloader = get_valid_dataloader(args)
    for epoch in range(trainer.start_epoch, trainer.start_epoch + trainer.args.epochs):
        trainer.logger.info(f"Epoch {epoch}/{trainer.args.epochs}")
        update_epoch = True
        for i, data_name in enumerate(data_names):
            filename_int = int(data_name)
            if filename_int > 3500:
                continue
            trainer.logger.info(f"Epoch {epoch} - summary {i + 1}/{len(data_names)}"
                                f": summary_{data_name}")
            # train_dataloader, valid_dataloader = get_dataloader(args, data_name)
            train_dataloader = get_dataloader(args, data_name)
            # train_log, valid_log = trainer.train(train_data=train_dataloader, valid_data=valid_dataloader, epoch=update_epoch)
            train_log = trainer.train(train_data=train_dataloader, epoch=update_epoch)
            update_epoch = False
            # if train_log is None:
            #     continue
            iter_num += 1
            # logs = dict(train_log)
            # show_info = f'\nEpoch: {epoch} - ' + "-".join([f' {key}: {value:.4f} ' for key, value in logs.items()])
            # trainer.logger.info(show_info)
            if iter_num % 50 == 0:
                # Validate every 50 chunks rather than once per epoch.
                valid_log = trainer.valid_epoch(valid_dataloader)
                logs = dict(valid_log)
                show_info = f'\nEpoch: {epoch} - ' + "-".join(
                    [f' {key}: {value:.4f} ' for key, value in logs.items()])
                trainer.logger.info(show_info)
                # save training-curve data
                if trainer.training_monitor:
                    trainer.training_monitor.epoch_step(logs)
            # save model
            if trainer.model_checkpoint:
                if iter_num % 50 == 0:
                    # state = trainer.save_info(epoch, best=logs[trainer.model_checkpoint.monitor])
                    state = trainer.save_info(iter_num, best=logs[trainer.model_checkpoint.monitor])
                    trainer.model_checkpoint.bert_epoch_step(current=logs[trainer.model_checkpoint.monitor],
                                                             state=state)
            # early_stopping
            if trainer.early_stopping:
                trainer.early_stopping.epoch_step(epoch=epoch,
                                                  current=logs[trainer.early_stopping.monitor])
                if trainer.early_stopping.stop_training:
                    break

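# get_dataloader and get_valid_dataloader are referenced above but not defined
# in this section. The sketch below shows one plausible implementation for the
# chunked setup, reusing the processor pipeline from the commented-out block;
# the exact signature, pickle names, and cache-file names are assumptions, not
# the original helpers.
def get_dataloader(args, data_name):
    processor = BertProcessor()
    train_data = processor.get_train(config['data_dir'] / f"summary_{data_name}.train.pkl")
    examples = processor.create_examples(
        lines=train_data,
        example_type='train',
        cached_examples_file=config['data_dir'] / f"cached_train_examples_{args.arch}_{data_name}")
    features = processor.create_features(
        examples=examples,
        max_seq_len=args.train_max_seq_len,
        cached_features_file=config['data_dir'] /
        f"cached_train_features_{args.train_max_seq_len}_{args.arch}_{data_name}")
    dataset = processor.create_dataset(features, is_sorted=args.sorted)
    sampler = SequentialSampler(dataset) if args.sorted else RandomSampler(dataset)
    return DataLoader(dataset, sampler=sampler,
                      batch_size=args.train_batch_size, collate_fn=collate_fn)
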
def run_train(args):
    # --------- data
    processor = BertProcessor(vocab_path=config['bert_vocab_path'],
                              do_lower_case=args.do_lower_case)
    label_list = processor.get_labels(args.task_type)
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}

    train_data = processor.get_train(
        config['data_dir'] / f"{args.data_name}.train.{args.task_type}.pkl")
    train_examples = processor.create_examples(
        lines=train_data,
        example_type=f'train_{args.task_type}',
        cached_examples_file=config['data_dir'] /
        f"cached_train_{args.task_type}_examples_{args.arch}")
    train_features = processor.create_features(
        examples=train_examples,
        max_seq_len=args.train_max_seq_len,
        cached_features_file=config['data_dir'] /
        "cached_train_{}_features_{}_{}".format(
            args.task_type, args.train_max_seq_len, args.arch))
    train_dataset = processor.create_dataset(train_features, is_sorted=args.sorted)
    if args.sorted:
        train_sampler = SequentialSampler(train_dataset)
    else:
        train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate_fn)

    valid_data = processor.get_dev(
        config['data_dir'] / f"{args.data_name}.valid.{args.task_type}.pkl")
    valid_examples = processor.create_examples(
        lines=valid_data,
        example_type=f'valid_{args.task_type}',
        cached_examples_file=config['data_dir'] /
        f"cached_valid_{args.task_type}_examples_{args.arch}")
    valid_features = processor.create_features(
        examples=valid_examples,
        max_seq_len=args.eval_max_seq_len,
        cached_features_file=config['data_dir'] /
        "cached_valid_{}_features_{}_{}".format(
            args.task_type, args.eval_max_seq_len, args.arch))
    valid_dataset = processor.create_dataset(valid_features)
    valid_sampler = SequentialSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=valid_sampler,
                                  batch_size=args.eval_batch_size,
                                  collate_fn=collate_fn)

    # ------- model
    logger.info("initializing model")
    if args.resume_path:
        args.resume_path = Path(args.resume_path)
        model = BertForMultiLable.from_pretrained(args.resume_path,
                                                  num_labels=len(label_list))
    else:
        if args.task_type == 'trans':
            # Few-shot transfer: warm-start from the checkpoint of the base task.
            model = BertForMultiLable_Fewshot.from_pretrained(
                Path('pybert/output/checkpoints/bert/base'),
                num_labels=len(label_list))
            # model = BertForMultiLable.from_pretrained(config['bert_model_dir'], num_labels=len(label_list))
        else:
            model = BertForMultiLable.from_pretrained(config['bert_model_dir'],
                                                      num_labels=len(label_list))

    t_total = int(len(train_dataloader) / args.gradient_accumulation_steps * args.epochs)

    # Build the optimizer and learning-rate scheduler.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    warmup_steps = int(t_total * args.warmup_proportion)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # ---- callbacks
    logger.info("initializing callbacks")
    # TODO: understand train_monitor better; it appears to be a plotting helper
    # that records the metrics obtained in every epoch.
    train_monitor = TrainingMonitor(file_dir=config['figure_dir'], arch=args.arch)
    model_checkpoint = ModelCheckpoint(checkpoint_dir=config['checkpoint_dir'],
                                       mode=args.mode,
                                       monitor=args.monitor,
                                       arch=args.arch,
                                       save_best_only=args.save_best)

    # **************************** training model ***********************
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Num Epochs = %d", args.epochs)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    trainer = Trainer(
        args=args,
        model=model,
        logger=logger,
        criterion=BCEWithLogLoss(),
        optimizer=optimizer,
        scheduler=scheduler,
        early_stopping=None,
        training_monitor=train_monitor,
        model_checkpoint=model_checkpoint,
        # Batch-level metrics are computed after every loss.backward();
        # do not confuse them with the loss itself.
        batch_metrics=[AccuracyThresh(thresh=0.5)],
        # Epoch-level metrics are computed once per epoch.
        epoch_metrics=[
            AUC(average='micro', task_type='binary'),
            MultiLabelReport(id2label=id2label),
            F1Score(task_type='binary', average='micro', search_thresh=True)
        ])  # TODO: consider whether the F1-score should replace the monitored metric.
    trainer.train(train_data=train_dataloader, valid_data=valid_dataloader)

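# For reference, get_linear_schedule_with_warmup scales the base learning rate
# by the factor below: a linear ramp from 0 over the warmup steps, then linear
# decay to 0 at t_total. This is a standalone sketch of that factor, not code
# from this project.
def linear_warmup_factor(step, warmup_steps, t_total):
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (t_total - step) / max(1, t_total - warmup_steps))

# Example: warmup_proportion=0.1 with t_total=1000 gives warmup_steps=100;
# the factor reaches 1.0 at step 100 and decays to 0.0 at step 1000.
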
def run_train(args):
    # --------- data
    processor = BertProcessor(vocab_path=config['bert_vocab_path'],
                              do_lower_case=args.do_lower_case)
    label_list = processor.get_labels()
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}

    # Get the data for the supervised (soft-label) training task.
    train_data = processor.get_train(config['data_dir'] / f"{args.data_name}.label_train.pkl")
    print("Train data is:")
    print(train_data)
    train_examples = processor.create_examples(
        lines=train_data,
        example_type='train',
        cached_examples_file=config['data_cache'] /
        f"cached_train_label_examples_finetune{args.arch}")
    # print("Training examples are:")
    # print(train_examples)
    train_features = processor.create_features(
        examples=train_examples,
        max_seq_len=args.train_max_seq_len,
        cached_features_file=config['data_cache'] /
        "cached_train_label_features_finetune{}_{}".format(
            args.train_max_seq_len, args.arch))
    train_dataset = processor.create_dataset(train_features, is_sorted=args.sorted)
    if args.sorted:
        train_sampler = SequentialSampler(train_dataset)
    else:
        train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    ###########################################################################
    # Get data for the semi-supervised task.
    # processor_semi = BertProcessor_semi(vocab_path=config['bert_vocab_path'], do_lower_case=args.do_lower_case)
    # label_list = processor_semi.get_labels()
    # label2id = {label: i for i, label in enumerate(label_list)}
    # id2label = {i: label for i, label in enumerate(label_list)}
    train_data_semi = processor.get_train_semi(config['unlabel_data_path'])
    print("Unlabeled train data is:")
    print(train_data_semi)
    train_examples_semi = processor.create_examples_semi(
        lines=train_data_semi,
        example_type='train',
        cached_examples_file=config['data_cache'] /
        f"cached_train_unlabel_examples_finetune{args.arch}")
    train_features_semi = processor.create_features_semi(
        examples=train_examples_semi,
        max_seq_len=args.train_max_seq_len,
        cached_features_file=config['data_cache'] /
        "cached_train_unlabel_features_finetune{}_{}".format(
            args.train_max_seq_len, args.arch))
    train_dataset_semi = processor.create_dataset_semi(train_features_semi,
                                                       is_sorted=args.sorted)
    if args.sorted:
        train_sampler_semi = SequentialSampler(train_dataset_semi)
    else:
        train_sampler_semi = RandomSampler(train_dataset_semi)
    train_dataloader_semi = DataLoader(train_dataset_semi,
                                       sampler=train_sampler_semi,
                                       batch_size=args.train_batch_size)

    valid_data = processor.get_dev(config['data_dir'] / f"{args.data_name}.label_valid.pkl")
    valid_examples = processor.create_examples(
        lines=valid_data,
        example_type='valid',
        cached_examples_file=config['data_cache'] /
        f"cached_valid_examples_label_finetune{args.arch}")
    valid_features = processor.create_features(
        examples=valid_examples,
        max_seq_len=args.eval_max_seq_len,
        cached_features_file=config['data_cache'] /
        "cached_valid_features_label_finetune{}_{}".format(
            args.eval_max_seq_len, args.arch))
    valid_dataset = processor.create_dataset(valid_features)
    valid_sampler = SequentialSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=valid_sampler,
                                  batch_size=args.eval_batch_size)

    # ------- model
    logger.info("initializing model")
    if args.resume_path:
        args.resume_path = Path(args.resume_path)
        model = BertForMultiLable.from_pretrained(args.resume_path,
                                                  num_labels=len(label_list))
    else:
        print("Labels are:")
        print(label_list)
        # model = BertForMultiLable.from_pretrained(config['bert_model_dir'], num_labels=len(label_list))
        # model = BertForMultiLable.from_pretrained("pybert/output/checkpoints_label_finetune_soft_joint_corr_emotion/bert", num_labels=len(label_list))
        model = BertForMultiLable.from_pretrained("bert-base-uncased")
        # model = BertForMultiLable.from_pretrained("bert-base-uncased", num_labels=len(label_list))

    t_total = int(len(train_dataloader) / args.gradient_accumulation_steps * args.epochs)

    param_optimizer = list(model.named_parameters())
    # param_optimizer = list(filter(lambda named_param: named_param[1].requires_grad, model.named_parameters()))
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    warmup_steps = int(t_total * args.warmup_proportion)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    lr_scheduler = WarmupLinearSchedule(optimizer,
                                        warmup_steps=warmup_steps,
                                        t_total=t_total)

    # For semi-supervision.
    t_total_semi = int(len(train_dataloader_semi) / args.gradient_accumulation_steps * args.epochs)
    # The semi-supervised optimizer updates only the label-graph weight.
    # param_optimizer_semi = [(name, param) for (name, param) in list(model.named_parameters()) if "label_graph" not in name]
    param_optimizer_semi = [(name, param) for name, param in model.named_parameters()
                            if name == 'label_graph.weight']
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters_semi = [{
        'params': [p for n, p in param_optimizer_semi if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in param_optimizer_semi if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    warmup_steps_semi = int(t_total_semi * args.warmup_proportion)
    optimizer_semi = AdamW(optimizer_grouped_parameters_semi,
                           lr=args.learning_rate,
                           eps=args.adam_epsilon)
    # NOTE: t_total is passed here rather than t_total_semi; this may be unintended.
    lr_scheduler_semi = WarmupLinearSchedule(optimizer_semi,
                                             warmup_steps=warmup_steps_semi,
                                             t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # ---- callbacks
    logger.info("initializing callbacks")
    train_monitor = TrainingMonitor(file_dir=config['figure_dir'], arch=args.arch)
    model_checkpoint = ModelCheckpoint(checkpoint_dir=config['checkpoint_dir'],
                                       mode=args.mode,
                                       monitor=args.monitor,
                                       arch=args.arch,
                                       save_best_only=args.save_best)

    # **************************** training model ***********************
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Num Epochs = %d", args.epochs)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    trainer = Trainer(
        n_gpu=args.n_gpu,
        model=model,
        epochs=args.epochs,
        logger=logger,
        # criterion_hard=BCEWithLogLoss(),
        criterion=ContinuousBCEWithLogLoss(),
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        optimizer_semi=optimizer_semi,
        lr_scheduler_semi=lr_scheduler_semi,
        early_stopping=None,
        training_monitor=train_monitor,
        fp16=args.fp16,
        resume_path=args.resume_path,
        grad_clip=args.grad_clip,
        model_checkpoint=model_checkpoint,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        batch_metrics=[AccuracyThresh(thresh=0.5)],
        # Only look at the per-label report (and F1) at epoch level.
        epoch_metrics=[MultiLabelReport(id2label=id2label)])

    # embeddings_dict = pickle.load(open("/home/rgaonkar/context_home/rgaonkar/label_embeddings/code/Bert_Masked_LM/label_embeddings_dict.p", "rb"))
    # label_similarity_matrix = get_label_similarity_matrix(embeddings_dict, label_list)

    # Compute the label-correlation matrix from the true training labels.
    true_labels_matrix = [sample[-1].tolist() for sample in train_dataset]
    print("True train labels:")
    print(true_labels_matrix)
    train_label_corr = get_label_corr(true_labels_matrix)
    print("True train label correlations:")
    print(train_label_corr)
    # Save the correlation matrix of the true labels in the data cache folder.
    pickle.dump(train_label_corr, open(config['data_cache'] / "train_label_corr.p", "wb"))

    trainer.train(train_data=train_dataloader,
                  train_data_semi=train_dataloader_semi,
                  valid_data=valid_dataloader,
                  seed=args.seed,
                  prob_thresh=args.prob_thresh,
                  true_label_corr=train_label_corr,
                  tokenizer=processor.tokenizer,
                  args=args)

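# get_label_corr is not defined in this section. A minimal numpy sketch of what
# it plausibly computes -- a pairwise Pearson correlation matrix over the
# multi-hot label columns -- is given below; the actual helper may differ.
import numpy as np

def get_label_corr_sketch(true_labels_matrix):
    """true_labels_matrix: list of multi-hot label vectors, one per example."""
    labels = np.asarray(true_labels_matrix, dtype=float)  # (n_examples, n_labels)
    # np.corrcoef treats rows as variables, so pass the transpose.
    # Note: a label column that is constant across examples yields nan entries.
    return np.corrcoef(labels.T)
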
def run_test(args):
    # TODO: evaluate the results on the training set with the micro F1-score.
    from pybert.io.task_data import TaskData
    from pybert.test.predictor import Predictor
    data = TaskData()
    ids, targets, sentences = data.read_data(
        raw_data_path=config['test_path'],
        preprocessor=ChinesePreProcessor(),
        is_train=True)  # set to True so that targets are returned
    lines = list(zip(sentences, targets))
    # print(ids, sentences)
    processor = BertProcessor(vocab_path=config['bert_vocab_path'],
                              do_lower_case=args.do_lower_case)
    label_list = processor.get_labels(args.task_type)
    id2label = {i: label for i, label in enumerate(label_list)}
    test_data = processor.get_test(lines=lines)
    test_examples = processor.create_examples(
        lines=test_data,
        example_type=f'test_{args.task_type}',
        cached_examples_file=config['data_dir'] /
        f"cached_test_{args.task_type}_examples_{args.arch}")
    test_features = processor.create_features(
        examples=test_examples,
        max_seq_len=args.eval_max_seq_len,
        cached_features_file=config['data_dir'] /
        "cached_test_{}_features_{}_{}".format(
            args.task_type, args.eval_max_seq_len, args.arch))
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset,
                                 sampler=test_sampler,
                                 batch_size=args.train_batch_size,
                                 collate_fn=collate_fn)
    if args.task_type == 'base':
        model = BertForMultiLable.from_pretrained(config['checkpoint_dir'],
                                                  num_labels=len(label_list))
    else:
        # model = BertForMultiLable.from_pretrained(config['checkpoint_dir'], num_labels=len(label_list))
        model = BertForMultiLable_Fewshot.from_pretrained(config['checkpoint_dir'],
                                                          num_labels=len(label_list))

    # ----------- predicting
    logger.info('model predicting....')
    predictor = Predictor(model=model, logger=logger, n_gpu=args.n_gpu)
    result = predictor.predict(data=test_dataloader)  # a better name might be all_logits

    # TODO: compute the F1-score; this module still needs to be tested.
    f1_metric = F1Score(task_type='binary', average='micro', search_thresh=True)
    all_logits = torch.tensor(result, dtype=torch.float)  # convert to tensor
    all_labels = torch.tensor(targets, dtype=torch.long)  # convert to tensor
    f1_metric(all_logits, all_labels)  # prints the result automatically
    print(f1_metric.value())
    # Write the score to a log file.
    with open('test_output/test.log', 'a+') as f:
        f.write(str(f1_metric.value()) + "\n")
    thresh = f1_metric.thresh

    ids = np.array(ids)
    df1 = pd.DataFrame(ids, index=None)
    df2 = pd.DataFrame(result, index=None)
    all_df = pd.concat([df1, df2], axis=1)
    if args.task_type == 'base':
        all_df.columns = ['id', 'zy', 'gfgqzr', 'qs', 'tz', 'ggjc']
    else:
        all_df.columns = ['id', 'sg', 'pj', 'zb', 'qsht', 'db']
    # Binarize every label column with the threshold found by the F1 search.
    for column in all_df.columns[1:]:
        all_df[column] = all_df[column].apply(lambda x: 1 if x > thresh else 0)
    # all_df['zy'] = all_df['zy'].apply(lambda x: 1 if x > thresh else 0)
    # all_df['gfgqzr'] = all_df['gfgqzr'].apply(lambda x: 1 if x > thresh else 0)
    # all_df['qs'] = all_df['qs'].apply(lambda x: 1 if x > thresh else 0)
    # all_df['tz'] = all_df['tz'].apply(lambda x: 1 if x > thresh else 0)
    # all_df['ggjc'] = all_df['ggjc'].apply(lambda x: 1 if x > thresh else 0)
    all_df.to_csv(f"test_output/{args.task_type}/cls_out.csv", index=False)

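# F1Score(search_thresh=True) sweeps candidate thresholds and keeps the one
# that maximizes micro-F1. A standalone numpy sketch of that idea (the real
# metric class lives in pybert and may differ in details):
import numpy as np

def search_best_thresh(probs, labels, grid=np.arange(0.1, 0.9, 0.01)):
    """probs, labels: (n_examples, n_labels) arrays; returns (thresh, f1)."""
    best_t, best_f1 = 0.5, -1.0
    for t in grid:
        preds = (probs > t).astype(int)
        tp = (preds * labels).sum()          # true positives over all labels
        precision = tp / max(preds.sum(), 1)
        recall = tp / max(labels.sum(), 1)
        f1 = 2 * precision * recall / max(precision + recall, 1e-8)
        if f1 > best_f1:
            best_t, best_f1 = t, f1
    return best_t, best_f1
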
def run_train(args):
    # --------- data
    processor = BertProcessor(vocab_path=config['bert_vocab_path'],
                              do_lower_case=args.do_lower_case)
    label_list = processor.get_labels()
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}

    train_data = processor.get_train(config['data_dir'] / f"{args.data_name}.label_train.pkl")
    print("Train data is:")
    print(train_data)
    train_examples = processor.create_examples(
        lines=train_data,
        example_type='train',
        cached_examples_file=config['data_cache'] /
        f"cached_train_label_examples_finetune{args.arch}")
    # print("Training examples are:")
    # print(train_examples)
    train_features = processor.create_features(
        examples=train_examples,
        max_seq_len=args.train_max_seq_len,
        cached_features_file=config['data_cache'] /
        "cached_train_label_features_finetune{}_{}".format(
            args.train_max_seq_len, args.arch))
    train_dataset = processor.create_dataset(train_features, is_sorted=args.sorted)
    if args.sorted:
        train_sampler = SequentialSampler(train_dataset)
    else:
        train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    valid_data = processor.get_dev(config['data_dir'] / f"{args.data_name}.label_valid.pkl")
    valid_examples = processor.create_examples(
        lines=valid_data,
        example_type='valid',
        cached_examples_file=config['data_cache'] /
        f"cached_valid_examples_label_finetune{args.arch}")
    valid_features = processor.create_features(
        examples=valid_examples,
        max_seq_len=args.eval_max_seq_len,
        cached_features_file=config['data_cache'] /
        "cached_valid_features_label_finetune{}_{}".format(
            args.eval_max_seq_len, args.arch))
    valid_dataset = processor.create_dataset(valid_features)
    valid_sampler = SequentialSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=valid_sampler,
                                  batch_size=args.eval_batch_size)

    # ------- model
    logger.info("initializing model")
    if args.resume_path:
        args.resume_path = Path(args.resume_path)
        model = BertForMultiLable.from_pretrained(args.resume_path,
                                                  num_labels=len(label_list))
    else:
        print("Labels are:")
        print(label_list)
        # model = BertForMultiLable.from_pretrained(config['bert_model_dir'], num_labels=len(label_list))
        model = BertForMultiLable.from_pretrained("bert-base-uncased",
                                                  num_labels=len(label_list))

    t_total = int(len(train_dataloader) / args.gradient_accumulation_steps * args.epochs)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    warmup_steps = int(t_total * args.warmup_proportion)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    lr_scheduler = WarmupLinearSchedule(optimizer,
                                        warmup_steps=warmup_steps,
                                        t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # ---- callbacks
    logger.info("initializing callbacks")
    train_monitor = TrainingMonitor(file_dir=config['figure_dir'], arch=args.arch)
    model_checkpoint = ModelCheckpoint(checkpoint_dir=config['checkpoint_dir'],
                                       mode=args.mode,
                                       monitor=args.monitor,
                                       arch=args.arch,
                                       save_best_only=args.save_best)

    # **************************** training model ***********************
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Num Epochs = %d", args.epochs)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    trainer = Trainer(
        n_gpu=args.n_gpu,
        model=model,
        epochs=args.epochs,
        logger=logger,
        criterion=BCEWithLogLoss(),
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        early_stopping=None,
        training_monitor=train_monitor,
        fp16=args.fp16,
        resume_path=args.resume_path,
        grad_clip=args.grad_clip,
        model_checkpoint=model_checkpoint,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        batch_metrics=[AccuracyThresh(thresh=0.5)],
        epoch_metrics=[
            AUC(average='micro', task_type='binary'),
            MultiLabelReport(id2label=id2label)
        ])
    # embeddings_dict = pickle.load(open("/home/rgaonkar/context_home/rgaonkar/label_embeddings/code/Bert_Masked_LM/label_embeddings_dict.p", "rb"))
    # label_similarity_matrix = get_label_similarity_matrix(embeddings_dict, label_list)
    trainer.train(train_data=train_dataloader,
                  valid_data=valid_dataloader,
                  seed=args.seed)

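# The optimizer-grouping idiom above excludes biases and LayerNorm weights from
# weight decay by substring match on parameter names. A quick illustrative
# check of which names land in which group (the parameter names are typical of
# a BERT encoder, shown here only for demonstration):
no_decay = ['bias', 'LayerNorm.weight']
example_names = ['bert.encoder.layer.0.attention.self.query.weight',
                 'bert.encoder.layer.0.attention.self.query.bias',
                 'bert.encoder.layer.0.output.LayerNorm.weight']
for n in example_names:
    group = 'no decay' if any(nd in n for nd in no_decay) else 'decay'
    print(f"{n} -> {group}")
# query.weight gets weight decay; query.bias and LayerNorm.weight do not.
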
def main():
    parser = ArgumentParser()
    parser.add_argument("--arch", default='bert', type=str)
    parser.add_argument("--do_data", action='store_true')
    parser.add_argument("--do_train", action='store_true')
    parser.add_argument("--do_test", action='store_true')
    parser.add_argument("--save_best", action='store_true')
    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument('--data_name', default='kaggle', type=str)
    parser.add_argument("--epochs", default=6, type=int)
    parser.add_argument("--resume_path", default='', type=str)
    parser.add_argument("--mode", default='min', type=str)
    parser.add_argument("--monitor", default='valid_loss', type=str)
    parser.add_argument("--valid_size", default=0.2, type=float)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--sorted", default=1, type=int, help='1 : True  0 : False')
    parser.add_argument("--n_gpu", type=str, default='0', help='"0,1,.." or "0" or "" ')
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument("--train_batch_size", default=8, type=int)
    parser.add_argument('--eval_batch_size', default=8, type=int)
    parser.add_argument("--train_max_seq_len", default=256, type=int)
    parser.add_argument("--eval_max_seq_len", default=256, type=int)
    parser.add_argument('--loss_scale', type=float, default=0)
    # type was int in the original, which would truncate any value passed on
    # the command line; the warmup proportion is fractional, so float is correct.
    parser.add_argument("--warmup_proportion", default=0.1, type=float)
    parser.add_argument("--weight_decay", default=0.01, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--grad_clip", default=1.0, type=float)
    parser.add_argument("--learning_rate", default=2e-5, type=float)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--fp16', action='store_true')
    parser.add_argument('--fp16_opt_level', type=str, default='O1')
    parser.add_argument("--prob_thresh", default=0.5, type=float)
    args = parser.parse_args()

    config['checkpoint_dir'] = config['checkpoint_dir'] / args.arch
    config['checkpoint_dir'].mkdir(exist_ok=True)
    # Good practice: save your training arguments together with the trained model.
    torch.save(args, config['checkpoint_dir'] / 'training_args.bin')
    seed_everything(args.seed)
    init_logger(log_file=config['log_dir'] / f"{args.arch}.log")
    logger.info("Training/evaluation parameters %s", args)

    if args.do_data:
        from pybert.io.task_data_label import TaskData
        data = TaskData()
        print("Train data path:")
        print(config['raw_data_path'])
        targets, sentences_char = data.read_data(raw_data_path=config['raw_data_path'],
                                                 preprocessor=EnglishPreProcessor(),
                                                 is_train=True)
        print("Target:")
        print(targets)
        print(" ")
        print("Sentence:")
        print(sentences_char)
        print(" ")
        data.train_val_split(X=sentences_char,
                             y=targets,
                             valid_size=args.valid_size,
                             data_dir=config['data_dir'],
                             data_name=args.data_name)
        # Get the test data.
        targets_test, sentences_char_test = data.read_data(raw_data_path=config['test_path'],
                                                           preprocessor=EnglishPreProcessor(),
                                                           is_train=True)
        print(targets_test)
        data.save_test_data(X=sentences_char_test,
                            y=targets_test,
                            data_dir=config['data_dir'],
                            data_name=args.data_name)
    if args.do_train:
        run_train(args)
    if args.do_test:
        run_test(args)

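# Entry point plus typical invocations. The flag combinations below are
# illustrative, and the script name "run_bert.py" is an assumption:
if __name__ == '__main__':
    main()

# Prepare data, then train, then test:
#   python run_bert.py --do_data --data_name kaggle
#   python run_bert.py --do_train --save_best --epochs 6 --train_batch_size 8
#   python run_bert.py --do_test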