import torch
from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    # Build a PyTorch model from the JSON config, copy the TF checkpoint
    # weights into it, and save the resulting state dict.
    config = BertConfig.from_json_file(bert_config_file)
    model = BertForPreTraining(config)
    load_tf_weights_in_bert(model, config, tf_checkpoint_path)
    torch.save(model.state_dict(), pytorch_dump_path)
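
# Hedged usage sketch for the converter above. The three paths are placeholders
# (not files referenced anywhere in this code) and only illustrate the call.
if __name__ == "__main__":
    convert_tf_checkpoint_to_pytorch(
        tf_checkpoint_path="models/uncased_L-12_H-768_A-12/bert_model.ckpt",  # hypothetical path
        bert_config_file="models/uncased_L-12_H-768_A-12/bert_config.json",   # hypothetical path
        pytorch_dump_path="models/pytorch_model.bin",                         # hypothetical path
    )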
def get_model(self, is_predict=False):
    # Build the generation model from the BERT config; the token-type vocab is
    # enlarged to 3 and the [SEP] id is reused as the end-of-sequence token.
    bert_config = BertConfig.from_json_file(self.config_path)
    bert_config.type_vocab_size = 3
    bert_config.eos_token_id = self.tokenizer.token_to_id('[SEP]')
    model = GenLM(bert_config)
    if not is_predict:
        # Load the TF checkpoint weights only when not running pure inference.
        load_tf_weights_in_bert(model, self.checkpoint_path)
    # model = keras.models.Model(model.inputs, model.outputs)
    return model
def __init__(self):
    config = BertConfig.from_json_file(join(BERT_PATH, 'bert_config.json'))
    self.tokenizer = BertTokenizer(vocab_file=join(BERT_PATH, 'vocab.txt'))
    # Encoder only: the pooling layer is dropped and the weights come straight
    # from the original TF checkpoint.
    self.model = BertModel(config, add_pooling_layer=False)
    load_tf_weights_in_bert(self.model,
                            tf_checkpoint_path=join(BERT_PATH, 'bert_model.ckpt'),
                            strip_bert=True)
    self.model.to(PT_DEVICE)
    self.model.eval()
def __init__(self, is_predict=False):
    super().__init__()
    config = BertConfig.from_json_file(join(BERT_PATH, 'bert_config.json'))
    self.bert = BertModel(config, add_pooling_layer=True)
    self.tokenizer = self.get_tokenizer()
    if not is_predict:
        # Initialise the encoder from the TF checkpoint when training.
        load_tf_weights_in_bert(self.bert,
                                tf_checkpoint_path=join(BERT_PATH, 'bert_model.ckpt'),
                                strip_bert=True)
    self.cls = torch.nn.Linear(768, 2)
    self.save_dir = join(MODEL_PATH, 'consistent')
    if not os.path.isdir(self.save_dir):
        os.makedirs(self.save_dir)
    self.save_path = join(self.save_dir, 'trained.pt')
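
# A possible forward pass for the classifier above (not part of the original
# snippet): feed the pooled [CLS] representation into the two-way linear head.
# The argument names follow the transformers BertModel API; the method itself
# is an assumption about how this module is used.
def forward(self, input_ids, attention_mask=None, token_type_ids=None):
    outputs = self.bert(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)
    pooled = outputs.pooler_output  # [batch, 768], enabled by add_pooling_layer=True
    return self.cls(pooled)         # [batch, 2] logits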
def classify(fname: str, verbose: bool = False):
    '''
    Returns a 1-dimensional numpy array of predictions.
    Predictions 0, -1, 1 are indexed at 0, 1, 2, so when reading the returned
    array: 0 = 'Neutral', 1 = 'Deny', 2 = 'Favor'.
    '''
    tokenizer = BertTokenizer('../models/BERT-vocab1.dms')
    config = BertConfig.from_json_file('../models/BERT-config0.json')
    model = TFBertForSequenceClassification.from_pretrained(
        '../models/BERT-transfer1/', config=config)
    # BATCH_SIZE = 64
    feat_spec = {
        'idx': tf.io.FixedLenFeature([], tf.int64),
        'sentence': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64)
    }

    def parse_ex(ex_proto):
        return tf.io.parse_single_example(ex_proto, feat_spec)

    tweets = tf.data.TFRecordDataset(fname)
    tweets = tweets.map(parse_ex)
    # with open('data/tweet_info.json') as j_file:
    #     data_info = json.load(j_file)
    # num_samples = data_info['DF_length']
    eval_df = glue_convert_examples_to_features(examples=tweets, tokenizer=tokenizer,
                                                max_length=128, task='sst-2',
                                                label_list=['0', '-1', '1'])
    eval_df = eval_df.batch(64)
    y_preds = model.predict(eval_df, use_multiprocessing=True, verbose=verbose)
    y_preds_sm = tf.nn.softmax(y_preds)
    y_preds_argmax = tf.math.argmax(y_preds_sm, axis=1)
    return y_preds_argmax.numpy()
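
# Hedged usage sketch: 'example.tfrecord' is a placeholder file name, and the
# label mapping simply restates the docstring above.
preds = classify('example.tfrecord', verbose=True)
label_names = {0: 'Neutral', 1: 'Deny', 2: 'Favor'}
print([label_names[int(p)] for p in preds[:10]])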
def __init__(self, pretrained_model_dir, num_classes, segment_len=200,
             overlap=50, dropout_p=0.5, feature_extract=True):
    super(BertLSTMWithOverlap, self).__init__()
    self.seg_len = segment_len
    self.overlap = overlap
    self.config = BertConfig.from_json_file(pretrained_model_dir + 'bert_config.json')
    self.bert = BertModel.from_pretrained(pretrained_model_dir, config=self.config)
    if feature_extract:
        # Transfer learning: freeze BERT and use it as a feature extractor.
        for p in self.bert.parameters():
            p.requires_grad = False
    d_model = self.config.hidden_size  # 768
    self.bi_lstm2 = torch.nn.LSTM(input_size=d_model, hidden_size=d_model // 2,
                                  bidirectional=True, batch_first=True)
    self.attn_weights2 = torch.nn.Sequential(
        # sent_attn_energy: [b, num_seg, 768] => [b, num_seg, 768]
        torch.nn.Linear(d_model, d_model),
        torch.nn.Tanh(),
        # sent_attn_weights: [b, num_seg, 768] => [b, num_seg, 1]
        torch.nn.Linear(d_model, 1, bias=False),
        torch.nn.Softmax(dim=1),  # [b, num_seg, 1]
    )
    self.fc = torch.nn.Sequential(torch.nn.Dropout(p=dropout_p),
                                  torch.nn.Linear(d_model, num_classes))
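
# Hedged instantiation sketch for BertLSTMWithOverlap: the model directory and
# class count below are placeholders, not values taken from this code. Note the
# trailing slash, since the config path is built by string concatenation.
model = BertLSTMWithOverlap(pretrained_model_dir='./bert_base_pretrained/',  # hypothetical dir
                            num_classes=10,
                            segment_len=200,
                            overlap=50,
                            dropout_p=0.5)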
# In[ ]:


import numpy as np
import json
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features
from transformers.configuration_bert import BertConfig


# In[ ]:


tokenizer = BertTokenizer('../models/BERT-vocab1.dms')
config = BertConfig.from_json_file('../models/BERT-config0.json')
model = TFBertForSequenceClassification.from_pretrained(
    '../models/BERT-transfer1', config=config)


# In[ ]:


fname = '../data/prelabeled/test47_even.tfrecord'
# BATCH_SIZE = 64
feat_spec = {
    'idx': tf.io.FixedLenFeature([], tf.int64),
    'sentence': tf.io.FixedLenFeature([], tf.string),
    'label': tf.io.FixedLenFeature([], tf.int64)
}
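

# In[ ]:


# Hedged continuation of the notebook: parse the TFRecord with the feature spec
# defined above, mirroring the pipeline used in classify() earlier in this file.
def parse_ex(ex_proto):
    return tf.io.parse_single_example(ex_proto, feat_spec)


tweets = tf.data.TFRecordDataset(fname)
tweets = tweets.map(parse_ex)
eval_df = glue_convert_examples_to_features(examples=tweets, tokenizer=tokenizer,
                                            max_length=128, task='sst-2',
                                            label_list=['0', '-1', '1'])
eval_df = eval_df.batch(64)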
def main():
    parser = ArgumentParser()
    parser.add_argument("--model_name", type=str, required=True,
                        choices=["GMMBert", "LogBert", "ExpBert", "FlowBert", "DisBert"])
    parser.add_argument("--dataset", type=str, required=True,
                        choices=["fin-all", "fin-dol", "sci-doc"])
    parser.add_argument('--saved_checkpoint', type=str, default=None, required=False)
    parser.add_argument("--bert_model", type=str, default='bert-base-uncased',
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument('--do_lower_case', type=str_to_bool, default=True,
                        help="Lower case the text and model.")
    parser.add_argument('--do_pretrain', type=str_to_bool, default=True,
                        help="Use pretrained Bert parameters.")
    parser.add_argument('--do_pretrain_wpe', type=str_to_bool, default=True,
                        help="Use pretrained Bert parameters only for wpe embeddings.")
    parser.add_argument('--log_criterion', type=str, default='L1', choices=["L1", "L2", ''],
                        help="Loss function to use for LogBert.")
    parser.add_argument('--do_gmm', type=str_to_bool, default=False,
                        help="Use the Gaussian mixture model components.")
    parser.add_argument('--do_log', type=str_to_bool, default=False,
                        help="Do L2 over the numbers in log space.")
    parser.add_argument('--do_dis', type=str_to_bool, default=False,
                        help="Discriminative baseline.")
    parser.add_argument('--do_anomaly', type=str_to_bool, default=True,
                        help="Do anomaly evaluation.")
    parser.add_argument('--do_exp', type=str_to_bool, default=False,
                        help="Latent exponent model.")
    parser.add_argument('--exp_truncate', type=str_to_bool, default=True,
                        help="Use a truncated normal distribution.")
    parser.add_argument('--do_flow', type=str_to_bool, default=False,
                        help="Do flow over the numbers in log space.")
    parser.add_argument('--flow_criterion', type=str, default='L1', choices=["L1", "L2", ''],
                        help="Loss function to use for FlowBert.")
    parser.add_argument('--flow_v', type=str, default='', choices=['1a', '1b', '2a', '2b', ''],
                        help="Mode for FlowBert.")
    parser.add_argument('--flow_fix_mu', type=str_to_bool, default=False,
                        help="Use a fixed mu for the flow model.")
    parser.add_argument("--flow_scale", type=float, default=10.0)
    parser.add_argument("--exp_logvar_scale", type=float, default=-5.0)
    parser.add_argument("--exp_logvar", type=str_to_bool, default=False)
    parser.add_argument("--drop_rate", type=float, default=0.0,
                        help="Drop rate; 0 disables dropout override.")
    parser.add_argument("--do_eval", type=str_to_bool, default=False)
    parser.add_argument("--do_test", type=str_to_bool, default=False)
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Store training data as on-disc memmaps to massively reduce memory usage.")
    parser.add_argument("--patience", type=int, default=3,
                        help="Number of early-stopping epochs of patience.")
    parser.add_argument("--epochs", type=int, default=10,
                        help="Number of epochs to train for.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=512, type=int,
                        help="Total batch size for evaluation.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--lr_bert", default=3e-5, type=float,
                        help="The initial learning rate for Adam for BERT params.")
    parser.add_argument("--lr_mlp", default=3e-5, type=float)
    parser.add_argument("--weight_decay", default=0.01, type=float,
                        help="Adam's weight L2 regularization.")
    parser.add_argument("--clip_grad", default=5, type=float,
                        help="Gradient clipping threshold.")
    parser.add_argument('--seed', type=int, default=42,
                        help="Random seed for initialization.")
    parser.add_argument('--gmm_crossentropy', type=str_to_bool, default=False,
                        help="GMM cross-entropy.")
    parser.add_argument('--gmm_exponent', type=str_to_bool, default=True,
                        help="Instead of kernels, use powers of 10.")
    parser.add_argument('--gmm_nmix', type=int, default=31,
                        help="Number of mixtures, used only for GMM. [1,3,7,15,31,63,127,255,511]")
    parser.add_argument('--optim', type=str, default='sgd', choices=['sgd', 'adam'],
                        help="Optimizer to use.")
    parser.add_argument('--min_exponent', type=int, default=-1,
                        help="Min exponent size.")
    parser.add_argument('--max_exponent', type=int, default=16,
                        help="Max exponent size.")
    parser.add_argument('--n_exponent', type=int, default=17,
                        help="Sum of min and max exponent sizes.")
    parser.add_argument('--embed_exp', type=str_to_bool, default=False,
                        help="Learn an input exponent embedding.")
    parser.add_argument('--embed_exp_opt', type=str, default='high', choices=['low', 'high', ''],
                        help="High or low learning rate for embeddings.")
    parser.add_argument('--embed_digit', type=str_to_bool, default=False,
                        help="Learn an input embedding of numbers using an LSTM over digits.")
    parser.add_argument('--output_embed_exp', type=str_to_bool, default=False,
                        help="Learn an input embedding and attach it after Bert.")
    parser.add_argument('--zero_init', type=str_to_bool, default=False,
                        help="Start non-pretrained embeddings at zero.")
    parser.add_argument("--n_digits", type=int, default=14,
                        help="Size of the digit vocab; includes e, ., +, -.")
    parser.add_argument("--ez_digits", type=int, default=32,
                        help="Digit embedding size.")
    args = parser.parse_args()

    args.pregenerated_data = Path(PREGENERATED_DATA[args.dataset])
    args.output_dir = Path(f'{CHECKPOINT_PATH}/{args.dataset}')
    sanity_check(args)
    args.savepath = args.output_dir
    if args.saved_checkpoint is not None:
        args.output_dir = Path(args.saved_checkpoint)
        args.run_name = args.output_dir.stem
        num_data_epochs = 1
    else:
        args.output_dir, args.run_name = build_savepath(args)
    print('dataset', args.dataset)
    print('output_dir', args.output_dir)
    print('pregenerated_data', args.pregenerated_data)
    print('run_name', args.run_name)

    wandb.init(project="mnm-paper", name=f'{args.run_name}')
    wandb.config.update(args, allow_val_change=True)

    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"train_epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"train_epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                print(f'epoch_file:{epoch_file}')
                exit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).")
            print("This script will loop over the available data, but training diversity may be negatively impacted.")
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    logging.info("device: {} n_gpu: {}".format(device, n_gpu))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!")
    args.output_dir.mkdir(parents=True, exist_ok=True)

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]
    num_train_optimization_steps = int(
        total_train_examples / args.train_batch_size / args.gradient_accumulation_steps)

    # Prepare model
    NumberBertModel = get_model(args)

    if args.do_test:
        best_model, tokenizer, best_path = load_best(args)
        global_step = 0
        train_numbers = get_numbers_from_split(args, tokenizer, device, num_data_epochs, split='train')
        train_mean, train_median = np.mean(train_numbers), np.median(train_numbers)
        best_model.to(device)
        best_model.eval()
        if args.do_dis:
            test_metrics = evaluate_discriminative(args, best_model, tokenizer, device, global_step,
                                                   'test', train_mean, train_median, train_numbers)
        else:
            test_metrics = evaluation(args, best_model, tokenizer, device, global_step,
                                      'test', train_mean, train_median, train_numbers)
        save_results(best_path, test_metrics)
        save_args(best_path, args)
        return

    early_stopper = EarlyStopping('valid_one_loss', min_delta=0.0, patience=args.patience, monitor_mode='min')

    if args.saved_checkpoint is not None:
        print('args.saved_checkpoint', args.saved_checkpoint)
        tokenizer = BertNumericalTokenizer.from_pretrained(args.saved_checkpoint)
        model = NumberBertModel.from_pretrained(args.saved_checkpoint, args=args)
        train_numbers = get_numbers_from_split(args, tokenizer, device, num_data_epochs, split='train')
    else:
        tokenizer = BertNumericalTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
        train_numbers = get_numbers_from_split(args, tokenizer, device, num_data_epochs, split='train')
        # old_save_dir = None
        if args.do_pretrain:
            model = NumberBertModel.from_pretrained(args.bert_model, args=args)
        else:
            config = BertConfig.from_json_file('./bert-base-uncased-config.json')
            model = NumberBertModel(config, args)
            if args.do_pretrain_wpe:
                pre_model = NumberBertModel.from_pretrained(args.bert_model, args=args)
                pretrained_dict = pre_model.state_dict()
                # 1. filter the pretrained state dict down to embedding weights present in the new model
                pretrained_dict = {k: v for k, v in pretrained_dict.items() if 'embedding' in k}
                model_dict = model.state_dict()
                pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
                # 2. overwrite entries in the existing state dict
                model_dict.update(pretrained_dict)
                # 3. load the new state dict
                model.load_state_dict(model_dict)

    if args.do_gmm:
        kernel_locs, kernel_scales = get_gmm_components(args, train_numbers)
        model.set_kernel_locs(kernel_locs, kernel_scales)

    special_tokens_dict = {'additional_special_tokens': ('[UNK_NUM]',)}
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    print('We have added', num_added_toks, 'tokens')
    model.resize_token_embeddings(len(tokenizer))
    # model.set_params(args)

    def set_dropout(model, drop_rate):
        # Recursively override the dropout probability of every Dropout module.
        for name, child in model.named_children():
            if isinstance(child, torch.nn.Dropout):
                child.p = drop_rate
            set_dropout(child, drop_rate=drop_rate)

    set_dropout(model, drop_rate=args.drop_rate)

    wandb.watch(model, log="all")
    model.to(device)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = set_lr(args, param_optimizer)
    if args.optim == 'sgd':
        optimizer = torch.optim.SGD(optimizer_grouped_parameters, lr=args.lr_bert)
    elif args.optim == 'adam':
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.lr_bert, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)

    train_mean, train_median = np.mean(train_numbers), np.median(train_numbers)

    if args.do_eval:
        model.eval()
        if args.do_dis:
            train_epoch_metrics = evaluate_discriminative(args, model, tokenizer, device, global_step,
                                                          'train', train_mean, train_median, train_numbers)
            valid_epoch_metrics = evaluate_discriminative(args, model, tokenizer, device, global_step,
                                                          'valid', train_mean, train_median, train_numbers)
        else:
            # evaluation(args, model, tokenizer, device, global_step, 'train', train_mean, train_median, train_numbers)
            # valid_epoch_metrics = evaluation(args, model, tokenizer, device, global_step, 'valid', train_mean, train_median, train_numbers)
            # EMNLP FINAL
            test_metrics = evaluation(args, model, tokenizer, device, global_step,
                                      'test', train_mean, train_median, train_numbers)
        return

    model.train()
    global_step = train_loop(args, model, optimizer, scheduler, tokenizer, device,
                             optimizer_grouped_parameters, early_stopper, train_numbers,
                             train_mean, train_median, global_step, n_gpu, num_data_epochs)
    del model

    best_model, tokenizer, best_path = load_best(args)
    best_model.to(device)
    best_model.eval()
    if args.do_dis:
        test_metrics = evaluate_discriminative(args, best_model, tokenizer, device, global_step,
                                               'test', train_mean, train_median, train_numbers)
    else:
        test_metrics = evaluation(args, best_model, tokenizer, device, global_step,
                                  'test', train_mean, train_median, train_numbers)
    save_results(best_path, test_metrics)
    save_args(best_path, args)
    # flush check
    wandb.log({})
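
# Hedged entry-point sketch: the snippet ends inside main(), so this guard is an
# assumption about how the training script is invoked from the command line.
if __name__ == '__main__':
    main()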