Example 1
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default="data",
        type=str,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument("--saved_model",
                        default=None,
                        type=str,
                        help="Fine-tuned model to load weights from")

    parser.add_argument("--config_file",
                        default=None,
                        type=str,
                        help="File to load config from")

    ## Other parameters
    parser.add_argument(
        "--tensorboard_logdir",
        default="runs",
        type=str,
        required=False,
        help="The output directory where the tensorboard event files are saved."
    )
    parser.add_argument(
        "--cache_dir",
        default="~/local_model_cache",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=50,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--train_frac",
                        default=1.0,
                        type=float,
                        help="What percentage of the training data to use")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.",
                        default=True)
    parser.add_argument(
        "--q_relevance",
        action='store_true',
        help="Whether to classify questions as confused or not.")
    parser.add_argument(
        "--r_relevance",
        action='store_true',
        help="Whether to classify responses as confused or not.")
    parser.add_argument("--answer_extraction",
                        action='store_true',
                        help="Whether to extract answers")
    parser.add_argument("--answer_verification",
                        action='store_true',
                        help="Whether to verify answers",
                        default=False)
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.",
                        default=True)
    parser.add_argument("--do_mini",
                        action='store_true',
                        help="Whether not to mini version of the data")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=128,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=128,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=10.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--attention_dropout",
                        default=0.1,
                        type=float,
                        help="Percent dropout at attention layers")
    parser.add_argument("--hidden_dropout",
                        default=0.1,
                        type=float,
                        help="Percent dropout at hidden layers")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    log_dir = os.path.join(
        args.tensorboard_logdir,
        datetime.now().strftime('%Y-%m-%d--%H-%M-%S') + '_' +
        os.path.basename(args.output_dir[:-1] if args.output_dir[-1] ==
                         '/' else args.output_dir))
    os.mkdir(log_dir)
    fh = logging.FileHandler(log_dir + '/run.log')
    fh.setLevel(logging.DEBUG)

    tb_writer = SummaryWriter(logdir=log_dir)

    def get_free_gpu():
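        # pick the GPU with the most free memory by parsing nvidia-smi output written to a temp file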
        os.system('nvidia-smi -q -d Memory |grep -A4 GPU|grep Free >tmp')
        memory_available = [
            int(x.split()[2]) for x in open('tmp', 'r').readlines()
        ]
        os.remove('tmp')
        return np.argmax(memory_available)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device(f"cuda:{get_free_gpu()}" if torch.cuda.
                              is_available() and not args.no_cuda else "cpu")
        n_gpu = 1
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    args.output_dir = args.output_dir
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = VQRProcessor(args.do_mini, args.q_relevance, args.r_relevance,
                             args.train_frac)
    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            np.ceil(
                len(train_examples) / args.train_batch_size /
                args.gradient_accumulation_steps)) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )
        train_features, train_token_mappings = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        train_dataloader = get_train_dataloader(train_features, args)

    if args.do_eval:
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_dataloader, eval_token_mappings = get_eval_dataloader(
            args, eval_examples, label_list, tokenizer)

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(
            args.local_rank))

    if args.saved_model is not None:
        print("Now loading from", args.saved_model)

        # Load a trained model and config that you have fine-tuned
        config = BertConfig(args.config_file)
        model = BertForVQR(config,
                           num_labels=2,
                           binary_only=args.binary_only,
                           answer_extraction_only=args.answer_extraction_only,
                           answer_verification=args.answer_verification)
        model.load_state_dict(torch.load(args.saved_model))
    else:
        config = BertConfig.from_pretrained(args.config_file)
        model = BertForVQR.from_pretrained(
            args.bert_model,
            cache_dir=cache_dir,
            config=config,
            num_labels=num_labels,
            q_relevance=args.q_relevance,
            r_relevance=args.r_relevance,
            answer_extraction=args.answer_extraction,
            answer_verification=args.answer_verification)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        train(device, model, optimizer, train_dataloader, args,
              eval_dataloader, tb_writer, train_examples, train_token_mappings,
              eval_examples, eval_token_mappings)

    if not args.do_train and args.do_eval and (
            args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval(device, model, eval_dataloader, args)
    tb_writer.close()
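WarmupLinearSchedule above comes from the older pytorch-transformers package; current transformers exposes the same linear warmup-then-decay schedule as a helper. A minimal sketch of the equivalent call (to replace the scheduler line inside main), assuming the same optimizer, warmup_steps and t_total:

from transformers import get_linear_schedule_with_warmup

# Linear warmup for warmup_steps, then linear decay to zero over t_total steps.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=args.warmup_steps,
                                            num_training_steps=t_total)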
Example 2
import torch
import torch.nn.functional as F
from torch import nn
from transformers import BertConfig, BertTokenizer
from transformers.modeling_bert import BertLayerNorm

from .adaptive_span import AdaptiveSpan
from .entmax import EntmaxAlpha
from .layerdrop import LayerDrop_Bert, LayerDrop_Cross
from .lxmert_utils import (VISUAL_CONFIG, BertPreTrainedModel, InputFeatures,
                           convert_sents_to_features, set_visual_config)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")
MAX_VQA_LENGTH = 20
bert_config = BertConfig()


class GeLU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return F.gelu(x)


## BertEmbeddings
class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.
    """
    def __init__(self, config):
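        # The example is cut off here. For reference, the standard layout of this
        # constructor in transformers' modeling_bert is roughly the following
        # (a sketch, not necessarily this file's exact code):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size,
                                            config.hidden_size,
                                            padding_idx=0)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
                                                  config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size,
                                       eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)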
Example 3
# filepath = 'data/match/pretrain_wo_aug.txt'
filepath = 'data/match/pretrain.txt'  # the 150-epoch version, without the dual (paired) data augmentation
vocab_file_dir = './output_bert/vocab.txt'
tokenizer = BertTokenizer.from_pretrained(vocab_file_dir)


dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path = filepath,
    block_size=32 # 32
)

config = BertConfig(
    vocab_size=23737,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    max_position_embeddings=512,
)

model = BertForMaskedLM(config)
# model = BertForMaskedLM.from_pretrained(pretrained_model_name_or_path='chinese-bert-wwm')
# model.resize_token_embeddings(new_num_tokens=23737)  # set this to the current number of tokens

# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15) #0.15
data_collator = DataCollatorForNgramMask(tokenizer=tokenizer, mlm=True, mlm_probability=0.15) #0.15

training_args = TrainingArguments(
    output_dir='output_bert_wwm',
    overwrite_output_dir=True,
    num_train_epochs=100,
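    # The snippet is cut off above; closing the call and wiring up the Trainer below is
    # a hedged sketch -- the remaining argument values are assumptions, not the original code.
    per_device_train_batch_size=64,
    save_steps=10_000,
)

# Trainer is assumed to be imported from transformers alongside TrainingArguments.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()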
Example 4
    if len(ck_list) == 0:
        return None

    if len(ck_list) == 1:
        return os.path.join(output_path, ck_list[0])
    else:
        return os.path.join(output_path, ck_list[-1])


latest_model_path = load_latest_path(output_path)

print(f"restore {latest_model_path}")
""" Set Config
BERT Base
"""
config = BertConfig(
    vocab_size=32_000,
    attention_probs_dropout_prob=0.1,
    directionality="bidi",
    gradient_checkpointing=False,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    hidden_size=768,
    initializer_range=0.02,
    intermediate_size=3072,
    layer_norm_eps=1e-12,
    max_position_embeddings=512,
    model_type="bert",
    num_attention_heads=12,
    num_hidden_layers=12,
    pad_token_id=0,
    pooler_fc_size=768,
Example 5
def build_enc_dec_tokenizers(config):

    src_tokenizer = BertTokenizer.from_pretrained(
        'bert-base-multilingual-cased')
    tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    tgt_tokenizer.bos_token = '<s>'
    tgt_tokenizer.eos_token = '</s>'

    # hidden_size and intermediate_size are both wrt all the attention heads.
    # Should be divisible by num_attention_heads
    encoder_config = BertConfig(
        vocab_size=src_tokenizer.vocab_size,
        hidden_size=config.hidden_size,
        num_hidden_layers=config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        hidden_act=config.hidden_act,
        hidden_dropout_prob=config.dropout_prob,
        attention_probs_dropout_prob=config.dropout_prob,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12)

    decoder_config = BertConfig(
        vocab_size=tgt_tokenizer.vocab_size,
        hidden_size=config.hidden_size,
        num_hidden_layers=config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        hidden_act=config.hidden_act,
        hidden_dropout_prob=config.dropout_prob,
        attention_probs_dropout_prob=config.dropout_prob,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        is_decoder=False)  # CHANGE is_decoder=True

    # Create encoder and decoder embedding layers.
    encoder_embeddings = torch.nn.Embedding(
        src_tokenizer.vocab_size,
        config.hidden_size,
        padding_idx=src_tokenizer.pad_token_id)
    decoder_embeddings = torch.nn.Embedding(
        tgt_tokenizer.vocab_size,
        config.hidden_size,
        padding_idx=tgt_tokenizer.pad_token_id)

    encoder = BertModel(encoder_config)
    # encoder.set_input_embeddings(encoder_embeddings.cuda())
    encoder.set_input_embeddings(encoder_embeddings)  # 1

    decoder = BertForMaskedLM(decoder_config)
    # decoder.set_input_embeddings(decoder_embeddings.cuda())
    decoder.set_input_embeddings(decoder_embeddings)  # 2

    # model.cuda()

    tokenizers = ED({'src': src_tokenizer, 'tgt': tgt_tokenizer})
    return encoder, decoder, tokenizers
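A minimal usage sketch for the function above; the SimpleNamespace stand-in for `config` and the hyperparameter values are assumptions (hidden_size must stay divisible by num_attention_heads):

import torch
from types import SimpleNamespace

hparams = SimpleNamespace(hidden_size=512,
                          num_hidden_layers=6,
                          num_attention_heads=8,
                          intermediate_size=2048,
                          hidden_act='gelu',
                          dropout_prob=0.1)
encoder, decoder, tokenizers = build_enc_dec_tokenizers(hparams)

# Encode a source sentence and run it through the encoder.
src_ids = torch.tensor([tokenizers.src.encode("Hallo Welt")])
memory = encoder(src_ids)[0]  # (1, seq_len, hidden_size) hidden states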
Example 6
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model.resize_token_embeddings(len(tokenizer))
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    if train:
        train_dataset = datasets.Dataset.load_from_disk(os.path.join(data_dir, "lm_train"))


elif model_type == "bert":
    dataset_properties = json.load(open(os.path.join(data_dir, "dataset_properties.json")))
    special_tokens = dataset_properties["special_tokens"]
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})

    config = BertConfig()
    config.vocab_size = len(tokenizer)

    model = AutoModelForMaskedLM.from_config(config)
    model.resize_token_embeddings(len(tokenizer))
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

    # the NL inputs for the train dataset are the same for BERT and GPT-2 models, but they are tokenized
    # differently (using the corresponding BERT and GPT-2 tokenizers, respectively). The standard training
    # set is already tokenized with the BERT tokenizer, so we can reuse that set here.
    if train:
        train_dataset = datasets.Dataset.load_from_disk(os.path.join(data_dir, "arsenal_train"))


else:
    raise("unknown model type")
import os
import re
import json
import string
import argparse
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
from models import CABert
import pandas as pd
from metrics import evaluate

max_len = 512
configuration = BertConfig()  # default parameters and configuration for BERT

# Save the slow pretrained tokenizer
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt",
                                   lowercase=True)

if __name__ == '__main__':
    my_parser = argparse.ArgumentParser(
        description='List the content of a folder')

    # Add the arguments
    my_parser.add_argument('train',
Example 8
def build_model(config, train_loader, eval_loader):
    
    src_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    tgt_tokenizer.bos_token = '<s>'
    tgt_tokenizer.eos_token = '</s>'
    
    #hidden_size and intermediate_size are both wrt all the attention heads. 
    #Should be divisible by num_attention_heads
    encoder_config = BertConfig(vocab_size=src_tokenizer.vocab_size,
                                hidden_size=config.hidden_size,
                                num_hidden_layers=config.num_hidden_layers,
                                num_attention_heads=config.num_attention_heads,
                                intermediate_size=config.intermediate_size,
                                hidden_act=config.hidden_act,
                                hidden_dropout_prob=config.dropout_prob,
                                attention_probs_dropout_prob=config.dropout_prob,
                                max_position_embeddings=512,
                                type_vocab_size=2,
                                initializer_range=0.02,
                                layer_norm_eps=1e-12)

    decoder_config = BertConfig(vocab_size=tgt_tokenizer.vocab_size,
                                hidden_size=config.hidden_size,
                                num_hidden_layers=config.num_hidden_layers,
                                num_attention_heads=config.num_attention_heads,
                                intermediate_size=config.intermediate_size,
                                hidden_act=config.hidden_act,
                                hidden_dropout_prob=config.dropout_prob,
                                attention_probs_dropout_prob=config.dropout_prob,
                                max_position_embeddings=512,
                                type_vocab_size=2,
                                initializer_range=0.02,
                                layer_norm_eps=1e-12,
                                is_decoder=True)

    #Create encoder and decoder embedding layers.
    encoder_embeddings = torch.nn.Embedding(src_tokenizer.vocab_size, config.hidden_size, padding_idx=src_tokenizer.pad_token_id)
    decoder_embeddings = torch.nn.Embedding(tgt_tokenizer.vocab_size, config.hidden_size, padding_idx=tgt_tokenizer.pad_token_id)

    encoder = BertModel(encoder_config)
    encoder.set_input_embeddings(encoder_embeddings.cpu())
    
    decoder = BertForMaskedLM(decoder_config)
    decoder.set_input_embeddings(decoder_embeddings.cpu())
    """
    input_dirs = config.model_output_dirs
    if(os.listdir(input_dirs['decoder']) and os.listdir(input_dirs['encoder'])):
        suffix = "pytorch_model.bin"
        decoderPath = os.path.join(input_dirs['decoder'], suffix)
        encoderPath = os.path.join(input_dirs['encoder'], suffix)
        
        decoder_state_dict = torch.load(decoderPath)
        encoder_state_dict = torch.load(encoderPath)
        decoder.load_state_dict(decoder_state_dict)
        encoder.load_state_dict(encoder_state_dict)
        model = TranslationModel(encoder, decoder, train_loader, eval_loader, tgt_tokenizer, config)
        model.cpu()
        return model
    """
    #model = TranslationModel(encoder, decoder)
    model = TranslationModel(encoder, decoder, train_loader, eval_loader, tgt_tokenizer, config)
    model.cpu()


    tokenizers = ED({'src': src_tokenizer, 'tgt': tgt_tokenizer})
    #return model, tokenizers
    return model
Example 9
    def __init__(self,
                 args,
                 device,
                 checkpoint=None,
                 bert_from_extractive=None):
        super(AbsSummarizer, self).__init__()
        self.args = args
        self.device = device
        self.bert = Bert(args.large, args.temp_dir, args.finetune_bert)

        if bert_from_extractive is not None:
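            # keys in the extractive checkpoint look like "bert.model.<param>"; n[11:] strips the "bert.model." prefix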
            self.bert.model.load_state_dict(
                dict([(n[11:], p) for n, p in bert_from_extractive.items()
                      if n.startswith("bert.model")]),
                strict=True,
            )

        if args.encoder == "baseline":
            bert_config = BertConfig(
                self.bert.model.config.vocab_size,
                hidden_size=args.enc_hidden_size,
                num_hidden_layers=args.enc_layers,
                num_attention_heads=8,
                intermediate_size=args.enc_ff_size,
                hidden_dropout_prob=args.enc_dropout,
                attention_probs_dropout_prob=args.enc_dropout,
            )
            self.bert.model = BertModel(bert_config)

        if args.max_pos > 512:
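            # extend position embeddings beyond BERT's 512 positions: keep the 512 learned rows
            # and repeat the last learned row for the extra positions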
            my_pos_embeddings = nn.Embedding(
                args.max_pos, self.bert.model.config.hidden_size)
            my_pos_embeddings.weight.data[:
                                          512] = self.bert.model.embeddings.position_embeddings.weight.data
            my_pos_embeddings.weight.data[
                512:] = self.bert.model.embeddings.position_embeddings.weight.data[
                    -1][None, :].repeat(args.max_pos - 512, 1)
            self.bert.model.embeddings.position_embeddings = my_pos_embeddings
        self.vocab_size = self.bert.model.config.vocab_size
        tgt_embeddings = nn.Embedding(self.vocab_size,
                                      self.bert.model.config.hidden_size,
                                      padding_idx=0)
        if self.args.share_emb:
            tgt_embeddings.weight = copy.deepcopy(
                self.bert.model.embeddings.word_embeddings.weight)

        # add a decoder structure on top of the BERT encoder
        self.decoder = TransformerDecoder(
            self.args.dec_layers,
            self.args.dec_hidden_size,
            heads=self.args.dec_heads,
            d_ff=self.args.dec_ff_size,
            dropout=self.args.dec_dropout,
            embeddings=tgt_embeddings,
        )

        self.generator = get_generator(self.vocab_size,
                                       self.args.dec_hidden_size, device)
        self.generator[0].weight = self.decoder.embeddings.weight

        if checkpoint is not None:
            self.load_state_dict(checkpoint["model"], strict=True)
        else:
            for module in self.decoder.modules():
                if isinstance(module, (nn.Linear, nn.Embedding)):
                    module.weight.data.normal_(mean=0.0, std=0.02)
                elif isinstance(module, nn.LayerNorm):
                    module.bias.data.zero_()
                    module.weight.data.fill_(1.0)
                if isinstance(module, nn.Linear) and module.bias is not None:
                    module.bias.data.zero_()
            for p in self.generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
                else:
                    p.data.zero_()
            if args.use_bert_emb:
                tgt_embeddings = nn.Embedding(
                    self.vocab_size,
                    self.bert.model.config.hidden_size,
                    padding_idx=0)
                tgt_embeddings.weight = copy.deepcopy(
                    self.bert.model.embeddings.word_embeddings.weight)
                self.decoder.embeddings = tgt_embeddings
                self.generator[0].weight = self.decoder.embeddings.weight

        self.to(device)
Example 10
def train(args):

    if not os.path.exists(args.save_dir): os.mkdir(args.save_dir)

    if args.gpu != '-1' and torch.cuda.is_available():
        device = torch.device('cuda')
        torch.cuda.set_rng_state(torch.cuda.get_rng_state())
        torch.backends.cudnn.deterministic = True
    else:
        device = torch.device('cpu')

    config = {
        'train': {
            'unchanged_variable_weight': 0.1,
            'buffer_size': 5000
        },
        'encoder': {
            'type': 'SequentialEncoder'
        },
        'data': {
            'vocab_file': 'data/vocab.bpe10000/vocab'
        }
    }

    train_set = Dataset('data/preprocessed_data/train-shard-*.tar')
    dev_set = Dataset('data/preprocessed_data/dev.tar')

    vocab = Vocab.load('data/vocab.bpe10000/vocab')

    if args.decoder:
        vocab_size = len(vocab.all_subtokens) + 1
    else:
        vocab_size = len(vocab.source_tokens) + 1

    max_iters = args.max_iters
    lr = args.lr
    warm_up = args.warm_up

    batch_size = 4096
    effective_batch_size = args.batch_size

    max_embeds = 1000 if args.decoder else 512

    bert_config = BertConfig(vocab_size=vocab_size,
                             max_position_embeddings=max_embeds,
                             num_hidden_layers=6,
                             hidden_size=256,
                             num_attention_heads=4)
    model = BertForPreTraining(bert_config)

    if args.restore:
        state_dict = torch.load(os.path.join(args.save_dir, args.res_name))
        model.load_state_dict(state_dict['model'])
        batch_count = state_dict['step']
        epoch = state_dict['epoch']

    model.train()
    model.to(device)

    if len(args.gpu) > 1 and device == torch.device('cuda'):
        model = nn.DataParallel(model)

    def lr_func(step):
        if step > warm_up:
            return (max_iters - step) / (max_iters - warm_up)
        else:
            return (step / warm_up)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 eps=1e-6,
                                 weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                  lr_lambda=lr_func,
                                                  last_epoch=-1)
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='none')

    if args.restore:
        optimizer.load_state_dict(state_dict['optim'])
        scheduler.load_state_dict(state_dict['scheduler'])

    batch_count = 0
    epoch = 0
    cum_loss = 0.0

    while True:
        # load training dataset, which is a collection of ASTs and maps of gold-standard renamings
        train_set_iter = train_set.batch_iterator(
            batch_size=batch_size,
            return_examples=False,
            config=config,
            progress=True,
            train=True,
            max_seq_len=512,
            num_readers=args.num_readers,
            num_batchers=args.num_batchers)
        epoch += 1
        print("Epoch {}".format(epoch))

        loss = 0
        num_seq = 0

        optimizer.zero_grad()

        for batch in train_set_iter:
            if args.decoder:
                input_ids = batch.tensor_dict['prediction_target'][
                    'src_with_true_var_names']
            else:
                input_ids = batch.tensor_dict['src_code_tokens']

            attention_mask = torch.ones_like(input_ids)
            attention_mask[input_ids == 0] = 0.0

            assert torch.max(input_ids) < vocab_size
            assert torch.min(input_ids) >= 0

            if input_ids.shape[0] > max_embeds:
                print(
                    "Warning - length {} is greater than max length {}. Skipping."
                    .format(input_ids.shape[0], max_embeds))
                continue

            input_ids, labels = mask_tokens(inputs=input_ids,
                                            mask_token_id=vocab_size - 1,
                                            vocab_size=vocab_size,
                                            mlm_probability=0.15)

            input_ids[attention_mask == 0] = 0
            labels[attention_mask == 0] = -100

            if torch.cuda.is_available():
                input_ids = input_ids.cuda()
                labels = labels.cuda()
                attention_mask = attention_mask.cuda()

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            masked_lm_labels=labels)

            unreduced_loss = loss_fn(
                outputs[0].view(-1, bert_config.vocab_size),
                labels.view(-1)).reshape(labels.shape) / (
                    torch.sum(labels != -100, axis=1).unsqueeze(1) + 1e-7)
            loss += unreduced_loss.sum()
            num_seq += input_ids.shape[0]

            if num_seq > effective_batch_size:
                batch_count += 1
                loss /= num_seq
                cum_loss += loss.item()

                if batch_count % 20 == 0:
                    print("{} batches, Loss : {:.4}, LR : {:.6}".format(
                        batch_count, cum_loss / 20,
                        scheduler.get_lr()[0]))
                    cum_loss = 0.0

                if batch_count % 10000 == 0:
                    fname1 = os.path.join(
                        args.save_dir, 'bert_{}_step_{}.pth'.format(
                            ('decoder' if args.decoder else 'encoder'),
                            batch_count))
                    fname2 = os.path.join(
                        args.save_dir, 'bert_{}.pth'.format(
                            ('decoder' if args.decoder else 'encoder'),
                            batch_count))

                    state = {
                        'epoch': epoch,
                        'step': batch_count,
                        'model': model.module.state_dict(),
                        'optim': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict()
                    }

                    torch.save(state, fname1)
                    torch.save(state, fname2)

                    print("Saved file to path {}".format(fname1))
                    print("Saved file to path {}".format(fname2))

                loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

                loss = 0
                num_seq = 0

            if batch_count == max_iters:
                print(f'[Learner] Reached max iters', file=sys.stderr)
                exit()

        print("Max_len = {}".format(max_len))
        break
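mask_tokens is provided elsewhere in that repository and is not shown above. A minimal sketch of such a helper following the standard BERT 80/10/10 masking recipe; only the signature is taken from the call above, the body is an assumption:

def mask_tokens(inputs, mask_token_id, vocab_size, mlm_probability=0.15):
    """Mask ~mlm_probability of the tokens: 80% become mask_token_id,
    10% become a random token, 10% stay unchanged; labels are -100 elsewhere."""
    labels = inputs.clone()
    probability_matrix = torch.full(labels.shape, mlm_probability)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # loss is only computed on masked positions

    # 80% of the masked positions -> mask token
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = mask_token_id

    # 10% of the masked positions -> random token (the remaining 10% stay unchanged)
    indices_random = (torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
                      & masked_indices & ~indices_replaced)
    random_words = torch.randint(vocab_size, labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]
    return inputs, labels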
Example 11
def load_bert(bert_path, device):
    bert_config_path = os.path.join(bert_path, 'config.json')
    bert = BertModel(BertConfig(**load_json(bert_config_path))).to(device)
    bert_model_path = os.path.join(bert_path, 'model.bin')
    bert.load_state_dict(clean_state_dict(torch.load(bert_model_path)))
    return bert
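A brief usage sketch reusing the snippet's imports; the checkpoint directory (containing config.json and model.bin) and its path below are assumptions:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert = load_bert('checkpoints/bert_base', device)  # hypothetical checkpoint path
bert.eval()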
Example 12
from transformers import BertConfig

config = BertConfig(
    vocab_size=21_128,
    max_position_embeddings=512,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=2,
)

from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("./model/wpe", max_len=512)

from transformers import BertForMaskedLM

model = BertForMaskedLM(config=config)

print(model.num_parameters())
model.resize_token_embeddings(len(tokenizer))

from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./data/data_train.csv",
    block_size=128,
)

from transformers import DataCollatorForLanguageModeling
Example 13
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
#decoder = BertGenerationDecoder.from_pretrained("bert-base-uncased",
#    add_cross_attention=True, is_decoder=True,
#    bos_token_id=decoder_tokenizer.vocab["[CLS]"],
#    eos_token_id=decoder_tokenizer.vocab["[SEP]"],
#    )
#decoder.resize_token_embeddings(len(decoder_tokenizer))

# Fresh decoder config.
decoder_config = BertConfig(
    is_decoder=True,
    add_cross_attention=True,
    vocab_size=len(decoder_tokenizer),
    # Set required tokens.
    unk_token_id=decoder_tokenizer.vocab["[UNK]"],
    sep_token_id=decoder_tokenizer.vocab["[SEP]"],
    pad_token_id=decoder_tokenizer.vocab["[PAD]"],
    cls_token_id=decoder_tokenizer.vocab["[CLS]"],
    mask_token_id=decoder_tokenizer.vocab["[MASK]"],
    bos_token_id=decoder_tokenizer.vocab["[BOS]"],
    eos_token_id=decoder_tokenizer.vocab["[EOS]"],
)
# AutoConfig.from_pretrained("bert-base-uncased")
#decoder_config = BertGenerationDecoderConfig()

# From: https://github.com/huggingface/transformers/blob/master/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py#L464
#>>> model.config.decoder_start_token_id = tokenizer.cls_token_id
#>>> model.config.pad_token_id = tokenizer.pad_token_id
#>>> model.config.vocab_size = model.config.decoder.vocab_size
#decoder_config.decoder_start_token_id = decoder_tokenizer.vocab["[CLS]"]
# decoder_config.pad_token_type_id = 0 ?
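A hedged sketch of how a decoder config like this is usually paired with an encoder through transformers' EncoderDecoderModel; the encoder checkpoint and the choice of BertLMHeadModel are assumptions, not taken from the snippet above:

from transformers import BertModel, BertLMHeadModel, EncoderDecoderModel

encoder = BertModel.from_pretrained("bert-base-uncased")
decoder = BertLMHeadModel(decoder_config)  # LM head on the cross-attending decoder stack
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)

# Token ids the seq2seq wrapper needs for generation.
model.config.decoder_start_token_id = decoder_tokenizer.vocab["[CLS]"]
model.config.pad_token_id = decoder_tokenizer.vocab["[PAD]"]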
Example 14
OUTPUT_DIRECTORY = "./synthBERT"

# load vocab file to list
with open(VOCAB_PATH, "r") as vocab_file:
    vocab = vocab_file.read().split("\n")

# use our pretrained tokenizer and model
tokenizer = BertTokenizer(VOCAB_PATH,
                          do_basic_tokenize=True,
                          additional_special_tokens=vocab)
#tokenizer.add_tokens(vocab)

# set BERT model parameters
config = BertConfig(
    vocab_size=141,
    max_position_embeddings=50,
    num_attention_heads=12,
    num_hidden_layers=6,
)

# instantiate the model
# transformers has some built specifically for masked language modeling
model = BertForMaskedLM(config=config)

# resize the model embedding to fit our own vocab
model.resize_token_embeddings(len(tokenizer))

# put corpus into a dataset helper
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=CORPUS_PATH,
    block_size=128,
Example 15
def main():
    parser = argparse.ArgumentParser(
        description='Train the individual Transformer model')
    parser.add_argument('--dataset_folder', type=str, default='datasets')
    parser.add_argument('--dataset_name', type=str, default='zara1')
    parser.add_argument('--obs', type=int, default=8)
    parser.add_argument('--preds', type=int, default=12)
    parser.add_argument('--emb_size', type=int, default=1024)
    parser.add_argument('--heads', type=int, default=8)
    parser.add_argument('--layers', type=int, default=6)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--cpu', action='store_true')
    parser.add_argument('--output_folder', type=str, default='Output')
    parser.add_argument('--val_size', type=int, default=50)
    parser.add_argument('--gpu_device', type=str, default="0")
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--max_epoch', type=int, default=100)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--validation_epoch_start', type=int, default=30)
    parser.add_argument('--resume_train', action='store_true')
    parser.add_argument('--delim', type=str, default='\t')
    parser.add_argument('--name', type=str, default="zara1")

    args = parser.parse_args()
    model_name = args.name

    # create the output/model directories if they do not exist yet
    os.makedirs('models', exist_ok=True)
    os.makedirs('output', exist_ok=True)
    os.makedirs('output/BERT', exist_ok=True)
    os.makedirs('models/BERT', exist_ok=True)
    os.makedirs(f'output/BERT/{args.name}', exist_ok=True)
    os.makedirs(f'models/BERT/{args.name}', exist_ok=True)

    log = SummaryWriter('logs/BERT_%s' % model_name)

    log.add_scalar('eval/mad', 0, 0)
    log.add_scalar('eval/fad', 0, 0)

    os.makedirs(args.name, exist_ok=True)

    device = torch.device("cuda")
    if args.cpu or not torch.cuda.is_available():
        device = torch.device("cpu")

    args.verbose = True

    ## creation of the dataloaders for train and validation
    train_dataset, _ = baselineUtils.create_dataset(args.dataset_folder,
                                                   args.dataset_name,
                                                   0,
                                                   args.obs,
                                                   args.preds,
                                                   delim=args.delim,
                                                   train=True,
                                                   verbose=args.verbose)
    val_dataset, _ = baselineUtils.create_dataset(args.dataset_folder,
                                                  args.dataset_name,
                                                  0,
                                                  args.obs,
                                                  args.preds,
                                                  delim=args.delim,
                                                  train=False,
                                                  verbose=args.verbose)
    test_dataset, _ = baselineUtils.create_dataset(args.dataset_folder,
                                                   args.dataset_name,
                                                   0,
                                                   args.obs,
                                                   args.preds,
                                                   delim=args.delim,
                                                   train=False,
                                                   eval=True,
                                                   verbose=args.verbose)

    from transformers import BertModel, BertConfig

    config = BertConfig(vocab_size=30522,
                        hidden_size=768,
                        num_hidden_layers=12,
                        num_attention_heads=12,
                        intermediate_size=3072,
                        hidden_act='relu',
                        hidden_dropout_prob=0.1,
                        attention_probs_dropout_prob=0.1,
                        max_position_embeddings=512,
                        type_vocab_size=2,
                        initializer_range=0.02,
                        layer_norm_eps=1e-12)
    model = BertModel(config).to(device)

    from individual_TF import LinearEmbedding as NewEmbed, Generator as GeneratorTS
    a = NewEmbed(3, 768).to(device)
    model.set_input_embeddings(a)
    generator = GeneratorTS(768, 2).to(device)
    # model.set_output_embeddings(GeneratorTS(1024,2))

    tr_dl = torch.utils.data.DataLoader(train_dataset,
                                        batch_size=args.batch_size,
                                        shuffle=True,
                                        num_workers=0)
    val_dl = torch.utils.data.DataLoader(val_dataset,
                                         batch_size=args.batch_size,
                                         shuffle=True,
                                         num_workers=0)
    test_dl = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          num_workers=0)

    # optim = SGD(list(a.parameters())+list(model.parameters())+list(generator.parameters()),lr=0.01)
    # sched=torch.optim.lr_scheduler.StepLR(optim,0.0005)
    optim = NoamOpt(
        768, 0.1, len(tr_dl),
        torch.optim.Adam(list(a.parameters()) + list(model.parameters()) +
                         list(generator.parameters()),
                         lr=0,
                         betas=(0.9, 0.98),
                         eps=1e-9))
    # optim=Adagrad(list(a.parameters())+list(model.parameters())+list(generator.parameters()),lr=0.01,lr_decay=0.001)
    epoch = 0

    # the "* 0" and "* 0 + 1" effectively disable normalization (mean forced to 0, std to 1)
    mean = train_dataset[:]['src'][:, :, 2:4].mean((0, 1)) * 0
    std = train_dataset[:]['src'][:, :, 2:4].std((0, 1)) * 0 + 1

    while epoch < args.max_epoch:
        epoch_loss = 0
        model.train()

        for id_b, batch in enumerate(tr_dl):
            optim.optimizer.zero_grad()
            r = 0
            rot_mat = np.array([[np.cos(r), np.sin(r)],
                                [-np.sin(r), np.cos(r)]])

            inp = ((batch['src'][:, :, 2:4] - mean) / std).to(device)
            inp = torch.matmul(inp,
                               torch.from_numpy(rot_mat).float().to(device))
            trg_masked = torch.zeros((inp.shape[0], args.preds, 2)).to(device)
            inp_cls = torch.ones(inp.shape[0], inp.shape[1], 1).to(device)
            trg_cls = torch.zeros(trg_masked.shape[0], trg_masked.shape[1],
                                  1).to(device)
            inp_cat = torch.cat((inp, trg_masked), 1)
            cls_cat = torch.cat((inp_cls, trg_cls), 1)
            net_input = torch.cat((inp_cat, cls_cat), 2)

            position = torch.arange(0, net_input.shape[1]).repeat(
                inp.shape[0], 1).long().to(device)
            token = torch.zeros(
                (inp.shape[0], net_input.shape[1])).long().to(device)
            attention_mask = torch.ones(
                (inp.shape[0], net_input.shape[1])).long().to(device)

            out = model(input_ids=net_input,
                        position_ids=position,
                        token_type_ids=token,
                        attention_mask=attention_mask)

            pred = generator(out[0])

            loss = F.pairwise_distance(
                pred[:, :].contiguous().view(-1, 2),
                torch.matmul(
                    torch.cat(
                        (batch['src'][:, :, 2:4], batch['trg'][:, :, 2:4]),
                        1).contiguous().view(-1, 2).to(device),
                    torch.from_numpy(rot_mat).float().to(device))).mean()
            loss.backward()
            optim.step()
            print("epoch %03i/%03i  frame %04i / %04i loss: %7.4f" %
                  (epoch, args.max_epoch, id_b, len(tr_dl), loss.item()))
            epoch_loss += loss.item()
        # sched.step()
        log.add_scalar('Loss/train', epoch_loss / len(tr_dl), epoch)
        with torch.no_grad():
            model.eval()

            gt = []
            pr = []
            val_loss = 0
            for batch in val_dl:
                inp = ((batch['src'][:, :, 2:4] - mean) / std).to(device)
                trg_masked = torch.zeros(
                    (inp.shape[0], args.preds, 2)).to(device)
                inp_cls = torch.ones(inp.shape[0], inp.shape[1], 1).to(device)
                trg_cls = torch.zeros(trg_masked.shape[0], trg_masked.shape[1],
                                      1).to(device)
                inp_cat = torch.cat((inp, trg_masked), 1)
                cls_cat = torch.cat((inp_cls, trg_cls), 1)
                net_input = torch.cat((inp_cat, cls_cat), 2)

                position = torch.arange(0, net_input.shape[1]).repeat(
                    inp.shape[0], 1).long().to(device)
                token = torch.zeros(
                    (inp.shape[0], net_input.shape[1])).long().to(device)
                attention_mask = torch.zeros(
                    (inp.shape[0], net_input.shape[1])).long().to(device)

                out = model(input_ids=net_input,
                            position_ids=position,
                            token_type_ids=token,
                            attention_mask=attention_mask)

                pred = generator(out[0])

                loss = F.pairwise_distance(
                    pred[:, :].contiguous().view(-1, 2),
                    torch.cat(
                        (batch['src'][:, :, 2:4], batch['trg'][:, :, 2:4]),
                        1).contiguous().view(-1, 2).to(device)).mean()
                val_loss += loss.item()

                gt_b = batch['trg'][:, :, 0:2]
                preds_tr_b = pred[:, args.obs:].cumsum(1).to(
                    'cpu').detach() + batch['src'][:, -1:, 0:2]
                gt.append(gt_b)
                pr.append(preds_tr_b)

            gt = np.concatenate(gt, 0)
            pr = np.concatenate(pr, 0)
            mad, fad, errs = baselineUtils.distance_metrics(gt, pr)
            log.add_scalar('validation/loss', val_loss / len(val_dl), epoch)
            log.add_scalar('validation/mad', mad, epoch)
            log.add_scalar('validation/fad', fad, epoch)

            model.eval()

            gt = []
            pr = []
            for batch in test_dl:
                inp = ((batch['src'][:, :, 2:4] - mean) / std).to(device)
                trg_masked = torch.zeros(
                    (inp.shape[0], args.preds, 2)).to(device)
                inp_cls = torch.ones(inp.shape[0], inp.shape[1], 1).to(device)
                trg_cls = torch.zeros(trg_masked.shape[0], trg_masked.shape[1],
                                      1).to(device)
                inp_cat = torch.cat((inp, trg_masked), 1)
                cls_cat = torch.cat((inp_cls, trg_cls), 1)
                net_input = torch.cat((inp_cat, cls_cat), 2)

                position = torch.arange(0, net_input.shape[1]).repeat(
                    inp.shape[0], 1).long().to(device)
                token = torch.zeros(
                    (inp.shape[0], net_input.shape[1])).long().to(device)
                attention_mask = torch.zeros(
                    (inp.shape[0], net_input.shape[1])).long().to(device)

                out = model(input_ids=net_input,
                            position_ids=position,
                            token_type_ids=token,
                            attention_mask=attention_mask)

                pred = generator(out[0])

                gt_b = batch['trg'][:, :, 0:2]
                preds_tr_b = pred[:, args.obs:].cumsum(1).to(
                    'cpu').detach() + batch['src'][:, -1:, 0:2]
                gt.append(gt_b)
                pr.append(preds_tr_b)

            gt = np.concatenate(gt, 0)
            pr = np.concatenate(pr, 0)
            mad, fad, errs = baselineUtils.distance_metrics(gt, pr)

            torch.save(model.state_dict(),
                       "models/BERT/%s/ep_%03i.pth" % (args.name, epoch))
            torch.save(generator.state_dict(),
                       "models/BERT/%s/gen_%03i.pth" % (args.name, epoch))
            torch.save(a.state_dict(),
                       "models/BERT/%s/emb_%03i.pth" % (args.name, epoch))

            log.add_scalar('eval/mad', mad, epoch)
            log.add_scalar('eval/fad', fad, epoch)

        epoch += 1

    ab = 1
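NoamOpt and the embedding/generator classes come from that repository's own modules and are not shown here. The optimizer wrapper presumably follows the Annotated-Transformer-style Noam schedule; a sketch of that implementation, offered as an assumption:

class NoamOpt:
    """Optimizer wrapper implementing the Noam schedule:
    linear warmup followed by inverse-square-root decay."""
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self.model_size = model_size
        self.factor = factor
        self.warmup = warmup
        self._step = 0

    def rate(self, step=None):
        step = self._step if step is None else step
        return self.factor * (self.model_size ** (-0.5) *
                              min(step ** (-0.5), step * self.warmup ** (-1.5)))

    def step(self):
        self._step += 1
        lr = self.rate()
        for group in self.optimizer.param_groups:
            group['lr'] = lr
        self.optimizer.step()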
Example 16
def model_setting(args):
    loader, tokenizer = get_loader(args)

    if args.text_processor == 'roberta':
        config = RobertaConfig()
        roberta = RobertaModel(config)
        # text_processor = roberta.from_pretrained('roberta-base')
        ## try splitting the text and feeding it into RoBERTa in parts
        if args.dataset == 'MissO_split' or args.dataset == 'TVQA_split':
            text_processor_que = roberta.from_pretrained('roberta-base')
            text_processor_utt = roberta.from_pretrained('roberta-base')
        elif args.eval == 'True':
            memory_processor = roberta.from_pretrained('roberta-base')
            logic_processor = roberta.from_pretrained('roberta-base')
        else:
            text_processor = roberta.from_pretrained('roberta-base')

    elif args.text_processor == 'bert':
        config = BertConfig()
        bert = BertModel(config)
        text_processor = bert.from_pretrained('bert-base-uncased')
    else:
        text_processor = None

    if args.eval == 'False':
        if args.only_text_input == 'True':
            model = QuestionLevelDifficultyOT(args, tokenizer, text_processor)
        else:
            if args.dataset == 'MissO_split' or args.dataset == 'TVQA_split':
                model = QuestionLevelDifficulty_M_split(
                    args, tokenizer, text_processor_que, text_processor_utt)
            else:
                model = QuestionLevelDifficulty_M(args, tokenizer,
                                                  text_processor)

        criterion = get_loss_func(tokenizer)
        optimizer = get_optim(args, model)
        scheduler = get_scheduler(optimizer, args, loader['train'])

        model.to(args.device)
        criterion.to(args.device)

        config = {
            'loader': loader,
            'optimizer': optimizer,
            'criterion': criterion,
            'scheduler': scheduler,
            'tokenizer': tokenizer,
            'args': args,
            'model': model
        }
    else:
        memory_model = QuestionLevelDifficulty_M(args, tokenizer,
                                                 memory_processor)
        logic_model = QuestionLevelDifficulty_L(args, tokenizer,
                                                logic_processor)

        memory_model.to(args.device)
        logic_model.to(args.device)

        config = {
            'loader': loader,
            'tokenizer': tokenizer,
            'args': args,
            'memory_model': memory_model,
            'logic_model': logic_model
        }

    return config
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--relation',
        '-r',
        type=str,
        required=True,
        help=
        f'relation type that is trained on. Available: {", ".join(config.supported_relations)}'
    )
    parser.add_argument('--dataset_name',
                        '-d',
                        required=True,
                        type=str,
                        help='dataset used for train, eval and vocab')
    parser.add_argument('--output_model_name',
                        '-o',
                        type=str,
                        default='',
                        help='Defaults to dataset_name if not stated.')
    parser.add_argument('--epochs',
                        type=int,
                        default='2000',
                        help='Default is 2000 epochs')
    parser.add_argument('--batch_size',
                        type=int,
                        default='1024',
                        help='Default is a batch size of 1024')
    parser.add_argument('--logging_steps',
                        type=int,
                        default='200',
                        help='After how many batches metrics are logged')
    parser.add_argument(
        "--mlm_probability",
        type=float,
        default=0.15,
        help="Ratio of tokens to mask for masked language modeling loss")

    parser.add_argument(
        "--block_size",
        default=-1,
        type=int,
        help="Optional input sequence length after tokenization."
        "The training datasets will be truncated in block of this size for training."
        "Default to the model max input length for single sentence inputs (take into account special tokens).",
    )

    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate",
                        default=6e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=500,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--save_total_limit",
        type=int,
        default=2,
        help="Saves this many checkpoints and deletes older ones",
    )
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument("--gpu_device", type=int, default=0, help="gpu number")

    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_device)
    if args.output_model_name == '':
        args.output_model_name = args.dataset_name

    data_dir = Path('data') / args.relation / 'datasets' / args.dataset_name
    args.train_data_file = data_dir / 'train.txt'
    args.tokenizer_name = f'data/{args.relation}/vocab/{args.dataset_name}/'
    args.output_dir = f'output/models/{args.relation}/{args.output_model_name}'

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and not args.overwrite_output_dir:
        raise ValueError(
            f"Output directory ({args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup CUDA, GPU & distributed training
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
    model_config = BertConfig(vocab_size=tokenizer.vocab_size)

    if args.block_size <= 0:
        args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        args.block_size = min(args.block_size, tokenizer.max_len)

    # gold (subject, relation) -> object mappings for evaluation and training
    with open(data_dir / 'subject_relation2object_eval.json', 'r') as f:
        eval_corrects = json.load(f)
    with open(data_dir / 'subject_relation2object_train.json', 'r') as f:
        train_corrects = json.load(f)
    corrects = {"eval": eval_corrects, "train": train_corrects}

    for eval_type, d in corrects.items():
        corrects[eval_type] = batchify_dict(d, args, tokenizer)

    logger.info("Training new model from scratch")
    model = BertForMaskedLM(config=model_config)
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    train_dataset = LineByLineTextDataset(tokenizer,
                                          args,
                                          args.train_data_file,
                                          block_size=args.block_size)
    # train
    global_step, tr_loss = train(args, train_dataset, corrects, model,
                                 tokenizer)  # TRAIN
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using
    # from_pretrained()
    os.makedirs(args.output_dir, exist_ok=True)

    logger.info("Saving model checkpoint to %s", args.output_dir)
    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`
    # Take care of distributed/parallel training
    model_to_save = model.module if hasattr(model, "module") else model
    model_to_save.save_pretrained(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)

    # Good practice: save your training arguments together with the trained model
    torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Load a trained model and vocabulary that you have fine-tuned
    model = BertForMaskedLM.from_pretrained(args.output_dir)
    tokenizer = BertTokenizer.from_pretrained(args.output_dir)
    model.to(args.device)

    # Evaluation
    results = {}
    checkpoints = [args.output_dir]
    logger.info("Evaluate the following checkpoints: %s", checkpoints)
    for checkpoint in checkpoints:
        global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
        prefix = checkpoint.split(
            "/")[-1] if checkpoint.find("checkpoint") != -1 else ""

        model = BertForMaskedLM.from_pretrained(checkpoint)
        model.to(args.device)
        result = evaluate(args, corrects, model, tokenizer, prefix=prefix)
        result = dict(
            (k + "_{}".format(global_step), v) for k, v in result.items())
        results.update(result)

    return results
#
# How to train a language model: highlights all the steps to effectively train a Transformer model on custom data
# - Colab (ipynb) version: https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb
# - MD version: https://github.com/huggingface/blog/blob/master/how-to-train.md
#
# Pretrain Longformer: how to build a "long" version of existing pretrained models (Iz Beltagy)
# - https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb
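#
# A minimal end-to-end sketch in the spirit of the "How to train" post above
# (an illustration, not part of the original notebook): it assumes a
# transformers version that still ships LineByLineTextDataset, and
# "corpus.txt" / "./mlm-scratch" are placeholder paths.
from transformers import (BertConfig, BertForMaskedLM, BertTokenizerFast,
                          DataCollatorForLanguageModeling,
                          LineByLineTextDataset, Trainer, TrainingArguments)

lm_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
lm_model = BertForMaskedLM(BertConfig(vocab_size=lm_tokenizer.vocab_size))
lm_dataset = LineByLineTextDataset(tokenizer=lm_tokenizer,
                                   file_path="corpus.txt",
                                   block_size=128)
lm_collator = DataCollatorForLanguageModeling(tokenizer=lm_tokenizer,
                                              mlm=True,
                                              mlm_probability=0.15)
lm_trainer = Trainer(model=lm_model,
                     args=TrainingArguments(output_dir="./mlm-scratch",
                                            num_train_epochs=1,
                                            per_device_train_batch_size=16,
                                            save_steps=500),
                     data_collator=lm_collator,
                     train_dataset=lm_dataset)
lm_trainer.train()
lm_trainer.save_model("./mlm-scratch")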


from transformers import BertForMaskedLM, BertConfig

configuration = BertConfig(
    vocab_size=80000,
#     max_position_embeddings=512, # 512 + 2 more special tokens
#     num_attention_heads=12,
#     num_hidden_layers=12,
#     type_vocab_size=1,
)
# configuration.vocab_size = 20000

model = BertForMaskedLM(config=configuration)
# model = RobertaForMaskedLM.from_pretrained('./Roberta/checkpoint-200000')

# Accessing the model configuration
model.config

# # Initializing Tokenizer

# ## Rewrite Tokenizer of bert_itos_80k with special tokens in front
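#
# The example is cut off here; as a rough sketch (an assumption, not the
# original code), the 80k WordPiece vocab with the special tokens placed at
# the front could be built with the `tokenizers` library. "itos_corpus.txt"
# and "tokenizer_80k" are placeholder names.
import os
from tokenizers import BertWordPieceTokenizer

wp_tokenizer = BertWordPieceTokenizer(lowercase=True)
wp_tokenizer.train(
    files=["itos_corpus.txt"],
    vocab_size=80000,  # matches BertConfig(vocab_size=80000) above
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],  # ids 0-4
)
os.makedirs("tokenizer_80k", exist_ok=True)
wp_tokenizer.save_model("tokenizer_80k")  # writes vocab.txt into the directory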
Example n. 19
 def __init__(self, bert_path):
     super().__init__()
     config = BertConfig()
     config.output_hidden_states = True
     self.bert_layer = BertModel.from_pretrained(bert_path, config=config)
Example n. 20
from transformers import BertTokenizer, BertConfig, BertModel
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import torch.utils.data as data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

logger = create_logger(__name__)  # project-local logging helper

# bert parameters
MAX_LEN = 256  # max is 512 for BERT
config = BertConfig(
    vocab_size=32000,  # `vocab_size_or_config_json_file` is the old pytorch-pretrained-bert name
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
)


def load_train_test_sets(filepath: str = "IMDB Dataset.csv", test_size: float = 0.10, random_state: int = 42):
    data = pd.read_csv(filepath)
    # Sentiment score must be numeric
    data["sent_score"] = 1
    data.loc[data.sentiment == "negative", "sent_score"] = 0

    X, y = data["review"], data["sent_score"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test
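

# Usage sketch (an illustration, not from the original snippet): assumes an
# "IMDB Dataset.csv" with 'review' and 'sentiment' columns as above, and a
# transformers version where the tokenizer is directly callable.
if __name__ == "__main__":
    X_train, X_test, y_train, y_test = load_train_test_sets()
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    enc = tokenizer(list(X_train[:8]),
                    max_length=MAX_LEN,
                    truncation=True,
                    padding="max_length",
                    return_tensors="pt")
    print("input_ids shape:", tuple(enc["input_ids"].shape))  # e.g. (8, 256)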
Example n. 21
 def __init__(self):
     # BertConfig("bert-base-chinese") would misread the string as vocab_size;
     # load the pretrained config instead
     self.config = BertConfig.from_pretrained("bert-base-chinese")
     self.model = BertForQuestionAnswering.from_pretrained(
         "bert-base-chinese")
     self.data_processor = DataProcessor(
         max_len=self.config.max_position_embeddings)
Example n. 22
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from Squad_example import SquadExample, ExactMatch
from utils import create_inputs_targets, normalize_text, create_squad_examples, create_model
from transformers import BertTokenizer, TFBertModel, BertConfig
from constant import max_len

configuration = BertConfig()

slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)


train_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
train_path = keras.utils.get_file("train.json", train_data_url)
eval_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
eval_path = keras.utils.get_file("eval.json", eval_data_url)


with open(train_path) as f:
    raw_train_data = json.load(f)
Example n. 23
    args = get_eval_args()

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    res_file = os.path.join(args.output_dir, "raw_res.csv")

    cache_dir = os.path.join(args.data_dir, "cache")
    cached_file = os.path.join(cache_dir, "test_examples_cache.dat")

    logging.basicConfig(level='INFO')
    logger = logging.getLogger(__name__)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    model = TBertT(BertConfig(), args.code_bert)
    if args.model_path and os.path.exists(args.model_path):
        model_path = os.path.join(args.model_path, MODEL_FNAME)
        model.load_state_dict(torch.load(model_path))

    logger.info("model loaded")
    start_time = time.time()
    test_examples = load_examples(args.data_dir,
                                  data_type="test",
                                  model=model,
                                  overwrite=args.overwrite,
                                  num_limit=args.test_num)
    test_examples.update_embd(model)
    m = test(args, model, test_examples, "cached_twin_test")
    exe_time = time.time() - start_time
    m.write_summary(exe_time)
Example n. 24
        return np.stack(layer_vectors), np.stack(layer_attns)
    else:
        return np.stack(layer_vectors)


#%%
# random_model = True
random_model = False

# dep_tree = True
dep_tree = False

if random_model:
    # note: cache_dir is a from_pretrained() argument, not a BertConfig field
    model = BertModel(
        BertConfig(output_hidden_states=True, output_attentions=True))
else:
    model = BertModel.from_pretrained('bert-base-cased',
                                      output_hidden_states=True,
                                      output_attentions=True)
    # config = AutoConfig.from_pretrained('bert-base-cased', output_hidden_states=True,
    #                                 output_attentions=True,
    #                                 cache_dir='pretrained_models')
    # model = AutoModel.from_config(config)
# tokenizer = AutoTokenizer.from_pretrained('bert-base-cased', cache_dir='pretrained_models')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

if dep_tree:
    dfile = SAVE_DIR + 'dependency_train_bracketed.txt'
    idx = np.load(SAVE_DIR + 'const_in_dep.npy').astype(int)
Example n. 25
    # full_vecs = np.array(layer_full_vectors).squeeze().transpose((0,2,1))
    return np.stack(layer_Z_vectors), np.array(layer_att), np.array(
        layer_full_vectors)


#%%
random_model = False
# random_model = True

if random_model:
    # config = AutoConfig.from_pretrained(pretrained_weights, output_hidden_states=True,
    #                                 output_attentions=args.attention,
    #                                 cache_dir='pretrained_models')
    # model = AutoModel.from_config(config)
    model = BertModel(
        BertConfig(output_hidden_states=True, output_attentions=True))
else:
    model = BertModel.from_pretrained('bert-base-cased',
                                      output_hidden_states=True,
                                      output_attentions=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

dfile = SAVE_DIR + 'train_bracketed.txt'

# with open(SAVE_DIR+'permuted_data.pkl', 'rb') as dfile:
#     dist = pkl.load(dfile)

#%%
max_num = 200
these_bounds = [0, 1, 2, 3, 4, 5, 6]
Example n. 26
def train(args):
    # load the review data used for BERT post-training
    
    args.train_batch_size=int(args.train_batch_size / args.gradient_accumulation_steps)
    
    review_train_examples=np.load(os.path.join(args.review_data_dir, "data.npz") )
    
    num_train_steps = args.num_train_steps
    bar = tqdm(total=num_train_steps)
    
    # load bert pre-train data.
    review_train_data = TensorDataset(
        torch.from_numpy(review_train_examples["input_ids"]),
        torch.from_numpy(review_train_examples["segment_ids"]),
        torch.from_numpy(review_train_examples["input_mask"]),           
        torch.from_numpy(review_train_examples["masked_lm_ids"]),
        torch.from_numpy(review_train_examples["next_sentence_labels"]) )
    
    review_train_dataloader = DataLoader(review_train_data, sampler=RandomSampler(review_train_data), batch_size=args.train_batch_size , drop_last=True)
    
    # we do not have any validation set for this post-training stage
    model = BertForPreTraining.from_pretrained(modelconfig.MODEL_ARCHIVE_MAP[args.bert_model], cache_dir='../cache', config=BertConfig())
    model.train()
    model = BertForMTPostTraining(model, BertConfig())
    
    model.cuda()

    # Prepare optimizer
    param_optimizer = [(k, v) for k, v in model.named_parameters() if v.requires_grad]
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    t_total = num_train_steps
        
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(args.warmup_proportion * t_total), num_training_steps=t_total)
    global_step=0
    step=0
    batch_loss=0.
    model.train()
    model.zero_grad()
    
    training=True
    
    review_iter=iter(review_train_dataloader)
    model_dir = os.path.join(args.output_dir, "saved_model")
    os.makedirs(model_dir, exist_ok=True)
    while training:
        try:
            batch = next(review_iter)
        except StopIteration:
            # restart the dataloader once it is exhausted
            review_iter = iter(review_train_dataloader)
            batch = next(review_iter)
            
        batch = tuple(t.cuda() for t in batch)
        
        input_ids, segment_ids, input_mask, masked_lm_ids, next_sentence_labels = batch
        
        review_loss = model("review", input_ids=input_ids.long(), token_type_ids=segment_ids.long(), attention_mask=input_mask.long(), masked_lm_labels=masked_lm_ids.long(), next_sentence_label=next_sentence_labels.long())
        
        loss = review_loss

        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
        batch_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        if (step + 1) % args.gradient_accumulation_steps == 0:
            # modify learning rate with special warm up BERT uses
            lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1
            bar.update(1)
            if global_step % 50 ==0:
                logging.info("step %d batch_loss %f ", global_step, batch_loss)
            batch_loss=0.

            if global_step % args.save_checkpoints_steps == 0:
                model.float()
                print('Saving model..')
                model.model.save_pretrained(model_dir + f"-{global_step}")
            if global_step>=num_train_steps:
                training=False
                break
        step+=1
    model.float()
    print('Saving model..')
    model.model.save_pretrained(model_dir + f"-{global_step}")
Example n. 27
    print('max_len of tokenized texts:',
          max([len(sent) for sent in tokenized_texts]))

    print("Tokenize the first sentence:")
    print(tokenized_texts[0])

    # construct the vocabulary
    vocab = list(set([w for sent in tokenized_texts for w in sent]))
    # index the input words
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

    input_ids = pad_or_truncate(input_ids, MAX_LEN)

    bert_config = BertConfig(vocab_size_or_config_json_file=len(vocab))

    heads = config["heads"]
    heads_mwe = config["heads_mwe"]

    all_test_indices = []
    all_predictions = []
    all_folds_labels = []
    recorded_results_per_fold = []
    splits = train_test_loader(input_ids, labels, A, A_MWE,
                               target_token_idices, K, BATCH_TRAIN, BATCH_TEST)

    for i, (train_dataloader, test_dataloader) in enumerate(splits):
        model = BertWithGCNAndMWE(MAX_LEN, bert_config, heads, heads_mwe,
                                  dropout)
        model.to(device)
Example n. 28
from transformers import BertConfig, BertForQuestionAnswering


def bert_model():
    config = BertConfig()
    model = BertForQuestionAnswering(config=config)
    return model
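

# Usage sketch (an illustration, not from the original snippet): bert_model()
# returns an *untrained* QA model from a default BertConfig; the dummy batch
# below only checks output shapes.
import torch

qa_model = bert_model()
dummy_ids = torch.randint(0, qa_model.config.vocab_size, (1, 16))
outputs = qa_model(input_ids=dummy_ids)
print(outputs[0].shape, outputs[1].shape)  # start/end logits: (1, 16) each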