Example no. 1
import torch
from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    # Build the PyTorch model from the original BERT config
    config = BertConfig.from_json_file(bert_config_file)
    model = BertForPreTraining(config)

    # Copy the TensorFlow checkpoint weights into the PyTorch model, then save them
    load_tf_weights_in_bert(model, config, tf_checkpoint_path)

    torch.save(model.state_dict(), pytorch_dump_path)
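
A minimal usage sketch for the converter above; the checkpoint, config, and output paths are hypothetical placeholders.

# Hypothetical paths -- point these at an actual TF BERT checkpoint.
convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="uncased_L-12_H-768_A-12/bert_model.ckpt",
    bert_config_file="uncased_L-12_H-768_A-12/bert_config.json",
    pytorch_dump_path="bert_base_uncased_pytorch.bin",
)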
Example no. 2
 def get_model(self, is_predict=False):
     bert_config = BertConfig.from_json_file(self.config_path)
     bert_config.type_vocab_size = 3
     bert_config.eos_token_id = self.tokenizer.token_to_id('[SEP]')
     model = GenLM(bert_config)
     if not is_predict:
         load_tf_weights_in_bert(model, self.checkpoint_path)
     # model = keras.models.Model(model.inputs, model.outputs)
     return model
Example no. 3
 def __init__(self):
     config = BertConfig.from_json_file(join(BERT_PATH, 'bert_config.json'))
     self.tokenizer = BertTokenizer(vocab_file=join(BERT_PATH, 'vocab.txt'))
     self.model = BertModel(config, add_pooling_layer=False)
     load_tf_weights_in_bert(self.model,
                             tf_checkpoint_path=join(
                                 BERT_PATH, 'bert_model.ckpt'),
                             strip_bert=True)
     self.model.to(PT_DEVICE)
     self.model.eval()
 def __init__(self, is_predict=False):
     super().__init__()
     config = BertConfig.from_json_file(join(BERT_PATH, 'bert_config.json'))
     self.bert = BertModel(config, add_pooling_layer=True)
     self.tokenizer = self.get_tokenizer()
     if not is_predict:
         load_tf_weights_in_bert(self.bert,
                                 tf_checkpoint_path=join(
                                     BERT_PATH, 'bert_model.ckpt'),
                                 strip_bert=True)
     self.cls = torch.nn.Linear(768, 2)
     self.save_dir = join(MODEL_PATH, 'consistent')
     if not os.path.isdir(self.save_dir):
         os.makedirs(self.save_dir)
     self.save_path = join(self.save_dir, 'trained.pt')
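
The forward pass of this consistency classifier is not part of the snippet. A minimal sketch of how it might look, assuming standard input_ids/attention_mask inputs and feeding the pooled [CLS] output into the two-way head; the method below is an illustration, not the author's code.

 def forward(self, input_ids, attention_mask=None, token_type_ids=None):
     # Hypothetical forward pass: pooled [CLS] representation -> two-way head
     outputs = self.bert(input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids)
     pooled = outputs[1]  # pooler_output, shape [batch_size, 768]
     return self.cls(pooled)  # logits, shape [batch_size, 2]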
Example no. 5
def classify(fname: str, verbose: bool = False):
    '''
    Returns a 1-dimensional numpy array of predictions.
    The original labels 0, -1, 1 are mapped to indices 0, 1, 2 respectively,
    so when reading the returned array:
    0 = 'Neutral', 1 = 'Deny', 2 = 'Favor'
    '''
    tokenizer = BertTokenizer('../models/BERT-vocab1.dms')
    config = BertConfig.from_json_file('../models/BERT-config0.json')
    model = TFBertForSequenceClassification.from_pretrained(
        '../models/BERT-transfer1/', config=config)

    # BATCH_SIZE = 64
    feat_spec = {
        'idx': tf.io.FixedLenFeature([], tf.int64),
        'sentence': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64)
    }

    def parse_ex(ex_proto):
        return tf.io.parse_single_example(ex_proto, feat_spec)

    tweets = tf.data.TFRecordDataset(fname)
    tweets = tweets.map(parse_ex)

    # with open('data/tweet_info.json')as j_file:
    #     data_info = json.load(j_file)
    #     num_samples = data_info['DF_length']

    eval_df = glue_convert_examples_to_features(examples=tweets,
                                                tokenizer=tokenizer,
                                                max_length=128,
                                                task='sst-2',
                                                label_list=['0', '-1', '1'])
    eval_df = eval_df.batch(64)

    y_preds = model.predict(eval_df, use_multiprocessing=True, verbose=verbose)
    y_preds_sm = tf.nn.softmax(y_preds)
    y_preds_argmax = tf.math.argmax(y_preds_sm, axis=1)
    return y_preds_argmax.numpy()
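
A short usage sketch for classify, mapping the returned indices back to the label names described in its docstring; the TFRecord path is taken from the notebook cells in Example no. 7 below.

import numpy as np

label_names = ['Neutral', 'Deny', 'Favor']  # index order per the docstring above
preds = classify('../data/prelabeled/test47_even.tfrecord')
for name, count in zip(label_names, np.bincount(preds, minlength=3)):
    print(f'{name}: {count}')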
    def __init__(self,
                 pretrained_model_dir,
                 num_classes,
                 segment_len=200,
                 overlap=50,
                 dropout_p=0.5,
                 feature_extract=False):
        super(BertLSTMWithOverlap, self).__init__()

        self.seg_len = segment_len
        self.overlap = overlap

        self.config = BertConfig.from_json_file(pretrained_model_dir +
                                                'bert_config.json')
        self.bert = BertModel.from_pretrained(pretrained_model_dir,
                                              config=self.config)

        if feature_extract:
            for p in self.bert.parameters():  # transfer learning: use BERT as a frozen feature extractor
                p.requires_grad = False

        d_model = self.config.hidden_size  # 768

        self.bi_lstm2 = torch.nn.LSTM(input_size=d_model,
                                      hidden_size=d_model // 2,
                                      bidirectional=True,
                                      batch_first=True)
        self.attn_weights2 = torch.nn.Sequential(
            torch.nn.Linear(
                d_model,
                d_model),  # sent_attn_energy [b,num_seg,768]=>[b,num_seg,768]
            torch.nn.Tanh(),
            torch.nn.Linear(
                d_model, 1, bias=False
            ),  # sent_attn_weights [b,num_seg,768]=>[b,num_seg,1]
            torch.nn.Softmax(dim=1),  # [b,num_seg,1]
        )

        self.fc = torch.nn.Sequential(torch.nn.Dropout(p=dropout_p),
                                      torch.nn.Linear(d_model, num_classes))
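
The class name and the segment_len/overlap arguments suggest that long documents are split into overlapping token segments before being encoded by BERT; the splitting itself is not shown above. A minimal sketch of one way to build such segments, assuming a stride of segment_len - overlap; the helper name split_into_segments is hypothetical.

def split_into_segments(token_ids, segment_len=200, overlap=50):
    # Slide a window of `segment_len` tokens with a stride of `segment_len - overlap`,
    # so consecutive segments share `overlap` tokens.
    stride = segment_len - overlap
    segments = []
    for start in range(0, max(len(token_ids) - overlap, 1), stride):
        segments.append(token_ids[start:start + segment_len])
    return segments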
# In[ ]:

import numpy as np
import json
from sklearn.model_selection import train_test_split

import tensorflow as tf
from transformers import (BertConfig, BertTokenizer, TFBertForSequenceClassification,
                          glue_convert_examples_to_features)

# In[ ]:

tokenizer = BertTokenizer('../models/BERT-vocab1.dms')

config = BertConfig.from_json_file('../models/BERT-config0.json')

model = TFBertForSequenceClassification.from_pretrained(
    '../models/BERT-transfer1', config=config)

# In[ ]:

fname = '../data/prelabeled/test47_even.tfrecord'
# BATCH_SIZE = 64
feat_spec = {
    'idx': tf.io.FixedLenFeature([], tf.int64),
    'sentence': tf.io.FixedLenFeature([], tf.string),
    'label': tf.io.FixedLenFeature([], tf.int64)
}
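
The cell stops after defining feat_spec; a minimal sketch of how the evaluation might continue, mirroring the parsing and prediction steps already shown in Example no. 5 above (nothing beyond that example is assumed).

# In[ ]:

def parse_ex(ex_proto):
    return tf.io.parse_single_example(ex_proto, feat_spec)

tweets = tf.data.TFRecordDataset(fname).map(parse_ex)

eval_df = glue_convert_examples_to_features(examples=tweets,
                                            tokenizer=tokenizer,
                                            max_length=128,
                                            task='sst-2',
                                            label_list=['0', '-1', '1'])
eval_df = eval_df.batch(64)

y_preds = model.predict(eval_df)
y_preds_argmax = tf.math.argmax(tf.nn.softmax(y_preds), axis=1)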

Example no. 8
def main():
    parser = ArgumentParser()
    parser.add_argument("--model_name", type=str, required=True,
                        choices=["GMMBert", "LogBert", "ExpBert", "FlowBert", "DisBert"])

    
    parser.add_argument("--dataset", type=str, required=True,
                        choices=["fin-all", "fin-dol", "sci-doc"])
    
    parser.add_argument('--saved_checkpoint', type=str, default=None, required=False)

    parser.add_argument("--bert_model", type=str, default='bert-base-uncased', 
                            help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")

    parser.add_argument('--do_lower_case', type=str_to_bool, default=True, help="Lower case the text and model.")

    parser.add_argument('--do_pretrain', type=str_to_bool, default=True, help="Use pretrained BERT parameters.")
    parser.add_argument('--do_pretrain_wpe', type=str_to_bool, default=True, help="Use pretrained BERT parameters only for the word-piece (wpe) embeddings")
    

    parser.add_argument('--log_criterion', type=str, default='L1',  choices=["L1", "L2", ''], help="Loss function to use for LogBert")

    parser.add_argument('--do_gmm', type=str_to_bool, default=False, help="Use the Gaussian mixture model components.")
    parser.add_argument('--do_log', type=str_to_bool, default=False, help="Do L2 over the numbers in logspace")
    parser.add_argument('--do_dis', type=str_to_bool, default=False, help="Discriminative baseline")
    parser.add_argument('--do_anomaly', type=str_to_bool, default=True, help="Do anomaly evaluation")

    parser.add_argument('--do_exp', type=str_to_bool, default=False, help="Latent Exponent Model")
    parser.add_argument('--exp_truncate', type=str_to_bool, default=True, help="Use a truncated normal distribution.")
    
    
    parser.add_argument('--do_flow', type=str_to_bool, default=False, help="Do flow over the numbers in logspace")
    parser.add_argument('--flow_criterion', type=str, default='L1',  choices=["L1", "L2", ''], help="Loss function to use for 'Flow'Bert")
    parser.add_argument('--flow_v', type=str, default='',  choices=['1a', '1b', '2a', '2b', ''], help="Mode for 'Flow'Bert")
    parser.add_argument('--flow_fix_mu', type=str_to_bool, default=False, help="Use a fixed mu for flow model")
    parser.add_argument("--flow_scale", type=float, default=10.0)

    parser.add_argument("--exp_logvar_scale", type=float, default=-5.0)
    parser.add_argument("--exp_logvar", type=str_to_bool, default=False)

    parser.add_argument("--drop_rate", type=float, default=0.0, help='Droprate of 0 is no droprate')

    parser.add_argument("--do_eval", type=str_to_bool, default=False)
    parser.add_argument("--do_test", type=str_to_bool, default=False)

    parser.add_argument("--reduce_memory", action="store_true",
                        help="Store training data as on-disc memmaps to massively reduce memory usage")

    parser.add_argument("--patience", type=int, default=3, help="Number of early stop epochs patience ")
    parser.add_argument("--epochs", type=int, default=10, help="Number of epochs to train for")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=512,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    
    parser.add_argument("--lr_bert", default=3e-5, type=float, help="The initial learning rate for Adam for bert params")
    parser.add_argument("--lr_mlp", default=3e-5, type=float)

    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="Adam's weight l2 regularization")
    parser.add_argument("--clip_grad",
                        default=5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")


    parser.add_argument('--gmm_crossentropy', type=str_to_bool, default=False, help="GMM Crossentropy.")
    parser.add_argument('--gmm_exponent', type=str_to_bool, default=True, help="Instead of Kernels use powers of 10")
    parser.add_argument('--gmm_nmix',
                        type=int,
                        default=31,
                        help="number of mixtures used only for gmm. [1,3,7,15,31,63,127,255,511]")
    parser.add_argument('--optim', type=str, default='sgd', choices=['sgd', 'adam'], help="Optimizer to use")
    
    parser.add_argument('--min_exponent', type=int, default=-1, help="min exponent size")
    parser.add_argument('--max_exponent', type=int, default=16, help="max exponent size")
    parser.add_argument('--n_exponent', type=int, default=17, help="total number of exponent values (max_exponent - min_exponent)")
    
    parser.add_argument('--embed_exp', type=str_to_bool, default=False, help="Learn an input exponent embedding")
    parser.add_argument('--embed_exp_opt', type=str, default='high', choices=['low', 'high', ''], help="high or low learning rate for embeddings")

    parser.add_argument('--embed_digit', type=str_to_bool, default=False, help="Learn an input embedding of numbers using an LSTM over digits")
    parser.add_argument('--output_embed_exp', type=str_to_bool, default=False, help="Learn an input embedding and attach it after BERT")
    parser.add_argument('--zero_init', type=str_to_bool, default=False, help="Start non-pretrained embeddings at zero")

    
    parser.add_argument("--n_digits", type=int, default=14, help="Size of digit vocab includes e.+-")
    parser.add_argument("--ez_digits", type=int, default=32, help="Digit embedding size")


    args = parser.parse_args()

    args.pregenerated_data = Path(PREGENERATED_DATA[args.dataset])
    args.output_dir = Path(f'{CHECKPOINT_PATH}/{args.dataset}')
    
    sanity_check(args)

    args.savepath = args.output_dir
    
    if args.saved_checkpoint is not None:
        args.output_dir = Path(args.saved_checkpoint)
        args.run_name = args.output_dir.stem
        num_data_epochs = 1
    else:
        args.output_dir, args.run_name = build_savepath(args)

    print('dataset', args.dataset)
    print('output_dir', args.output_dir)
    print('pregenerated_data', args.pregenerated_data)
    print('run_name', args.run_name)
    
    wandb.init(project="mnm-paper", name=f'{args.run_name}')
    wandb.config.update(args, allow_val_change=True)

    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"train_epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"train_epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                print(f'epoch_file:{epoch_file}')
                exit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).")
            print("This script will loop over the available data, but training diversity may be negatively impacted.")
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    
    logging.info("device: {} n_gpu: {}".format(
        device, n_gpu))

    
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)


    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!")
    args.output_dir.mkdir(parents=True, exist_ok=True)


    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(
        total_train_examples / args.train_batch_size / args.gradient_accumulation_steps)

    # Prepare model
    NumberBertModel = get_model(args)


    if args.do_test:
        best_model, tokenizer, best_path = load_best(args)    
        global_step = 0
        train_numbers = get_numbers_from_split(args, tokenizer, device, num_data_epochs, split='train')
        train_mean, train_median = np.mean(train_numbers), np.median(train_numbers)
        
        best_model.to(device)
        best_model.eval()
        
        if args.do_dis:
            test_metrics = evaluate_discriminative(args, best_model, tokenizer, device, global_step, 'test', train_mean, train_median, train_numbers)
        else:
            test_metrics = evaluation(args, best_model, tokenizer, device, global_step, 'test', train_mean, train_median, train_numbers)
        save_results(best_path, test_metrics)
        save_args(best_path, args)
        return

    early_stopper = EarlyStopping('valid_one_loss', min_delta=0.0,
                                patience=args.patience, monitor_mode='min')

    if args.saved_checkpoint is not None:
        print('args.saved_checkpoint', args.saved_checkpoint)
        tokenizer = BertNumericalTokenizer.from_pretrained(args.saved_checkpoint)
        model = NumberBertModel.from_pretrained(args.saved_checkpoint, args=args)
        #uncomment this
        train_numbers = get_numbers_from_split(args, tokenizer, device, num_data_epochs, split='train')
    else:
        tokenizer = BertNumericalTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
        train_numbers = get_numbers_from_split(args, tokenizer, device, num_data_epochs, split='train')
        # old_save_dir = None

        if args.do_pretrain:
            model = NumberBertModel.from_pretrained(args.bert_model, args=args)
        else:
            config = BertConfig.from_json_file('./bert-base-uncased-config.json')
            model = NumberBertModel(config, args)

            if args.do_pretrain_wpe:
                pre_model = NumberBertModel.from_pretrained(args.bert_model, args=args)
                pretrained_dict = pre_model.state_dict()
                # print('pretrained_dict', pretrained_dict)
                
                pretrained_dict = {k: v for k, v in pretrained_dict.items() if 'embedding' in k}

                model_dict = model.state_dict()

                # 1. filter to keys that exist in the current model
                pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
                # 2. overwrite entries in the existing state dict
                model_dict.update(pretrained_dict) 
                # 3. load the new state dict
                model.load_state_dict(model_dict)

        
        if args.do_gmm:
            kernel_locs, kernel_scales = get_gmm_components(args, train_numbers)
            model.set_kernel_locs(kernel_locs, kernel_scales)

        special_tokens_dict = {'additional_special_tokens': ('[UNK_NUM]',)}
        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        print('We have added', num_added_toks, 'tokens')
        model.resize_token_embeddings(len(tokenizer))
        # model.set_params(args)

    def set_dropout(model, drop_rate):
        for name, child in model.named_children():
            if isinstance(child, torch.nn.Dropout):
                child.p = drop_rate
            set_dropout(child, drop_rate=drop_rate)
    set_dropout(model, drop_rate=args.drop_rate)


    wandb.watch(model, log="all")
    model.to(device)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    optimizer_grouped_parameters = set_lr(args, param_optimizer)
    
    if args.optim == 'sgd':
        optimizer = torch.optim.SGD(optimizer_grouped_parameters, lr=args.lr_bert)
    elif args.optim == 'adam':
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.lr_bert, eps=args.adam_epsilon)
    
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                 num_training_steps=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)

    train_mean, train_median = np.mean(train_numbers), np.median(train_numbers)
    
    if args.do_eval:
        model.eval()
        if args.do_dis:
            train_epoch_metrics = evaluate_discriminative(args, model, tokenizer, device, global_step, 'train', train_mean, train_median, train_numbers)
            valid_epoch_metrics = evaluate_discriminative(args, model, tokenizer, device, global_step, 'valid', train_mean, train_median, train_numbers)
        else:
            # evaluation(args, model, tokenizer, device, global_step, 'train', train_mean, train_median, train_numbers)
            # valid_epoch_metrics = evaluation(args, model, tokenizer, device, global_step, 'valid', train_mean, train_median, train_numbers)
            
            #EMNLP FINAL
            test_metrics = evaluation(args, model, tokenizer, device, global_step, 'test', train_mean, train_median, train_numbers)
        return



    model.train()
    global_step = train_loop(args, model, optimizer, scheduler, tokenizer, device, optimizer_grouped_parameters, early_stopper,
        train_numbers, train_mean, train_median, global_step, n_gpu,
        num_data_epochs)

    del model
    best_model, tokenizer, best_path = load_best(args)    
    best_model.to(device)
    best_model.eval()
    if args.do_dis:
        test_metrics = evaluate_discriminative(args, best_model, tokenizer, device, global_step, 'test', train_mean, train_median, train_numbers)
    else:
        test_metrics = evaluation(args, best_model, tokenizer, device, global_step, 'test', train_mean, train_median, train_numbers)
    save_results(best_path, test_metrics)
    save_args(best_path, args)

    #flush check
    wandb.log({})