Example #1
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file,
                                     pytorch_dump_path):
    config_path = os.path.abspath(bert_config_file)
    tf_path = os.path.abspath(tf_checkpoint_path)
    print("Converting TensorFlow checkpoint from {} with config at {}".format(
        tf_path, config_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    for name, array in zip(names, arrays):
        name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
        # which are not required for using the pretrained model
        if any(n in ["adam_v", "adam_m"] for n in name):
            print("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel' or l[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias' or l[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
Example #2
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)
    # Load weights from tf checkpoint
    load_tf_weights_in_bert(model, tf_checkpoint_path)
    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
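For reference, a minimal usage sketch for the two converters above; the paths are placeholders rather than values from the original examples, and the BERT classes are assumed to be imported as in the surrounding code:

if __name__ == "__main__":
    # Hypothetical paths: a TF checkpoint prefix, its JSON config, and the output file
    convert_tf_checkpoint_to_pytorch(
        tf_checkpoint_path="/path/to/bert_model.ckpt",
        bert_config_file="/path/to/bert_config.json",
        pytorch_dump_path="/path/to/pytorch_model.bin")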
Example #3
def test1():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    processor = SentProcessor()
    bert_path = '/home/liuxg/.pytorch_pretrained_bert'
    tokenizer = BertTokenizer.from_pretrained(bert_path, do_lower_case=True)

    # Prepare model
    model = BertForPreTraining.from_pretrained(bert_path)
    model.to(device)
    model.eval()

    sents = ['which computer do you like', 'what app are you most using']
    label_list = processor.get_labels()
    eval_examples = processor.get_dev_examples(sents=sents)
    # for e in eval_examples:
    #     print('----------------', e.text_a)
    max_seq_length = 15
    eval_features = convert_examples_to_features(eval_examples, label_list,
                                                 max_seq_length, tokenizer)
    input_ids = torch.tensor([f.input_ids for f in eval_features],
                             dtype=torch.long)
    input_mask = torch.tensor([f.input_mask for f in eval_features],
                              dtype=torch.long)
    segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                               dtype=torch.long)
    label_ids = torch.tensor([f.label_id for f in eval_features],
                             dtype=torch.long)

    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    ans = model(input_ids, segment_ids, input_mask)[0]  # masked-LM prediction scores: (batch, seq_len, vocab_size)
    print(ans)
Example #4
def bertForPreTraining(*args, **kwargs):
    """
    BERT model with pre-training heads.
    This module comprises the BERT model followed by the two pre-training heads
        - the masked language modeling head, and
        - the next sentence classification head.
    """
    model = BertForPreTraining.from_pretrained(*args, **kwargs)
    return model
Example #5
    def convert_all_tensorflow_bert_weights_to_pytorch(self, input_folder: str) -> None:
        """
        Tensorflow to Pytorch weight conversion based on huggingface's library

        Parameters
        ----------
        input_folder: `str`, required
            The folder containing the tensorflow files
        """
        files = [e for e in os.listdir(input_folder) if os.path.isfile(os.path.join(input_folder, e))]
        folders = [os.path.join(input_folder, e) for e in os.listdir(input_folder) if
                   os.path.isdir(os.path.join(input_folder, e))]

        flag = -4
        for file in files:
            if file == 'vocab.txt' or \
                    file.endswith('.data-00000-of-00001') or \
                    file.endswith('.index') or \
                    file.endswith('.meta') or \
                    file.endswith('.json'):
                flag += 1
                if file.endswith('.json'):
                    config_file = file

        if flag > 0:
            assert type(config_file) == str, "no valid config file, but is attempting to convert"
            pytorch_path = os.path.join(input_folder, 'pytorch')
            tensorflow_path = os.path.join(input_folder, 'tensorflow')

            force_folder_to_exist(pytorch_path)
            force_folder_to_exist(tensorflow_path)

            os.system('mv ' + os.path.join(input_folder, '*.*') + ' ' + tensorflow_path)
            os.system('cp ' + os.path.join(tensorflow_path, '*.txt') + ' ' + pytorch_path)
            os.system('cp ' + os.path.join(tensorflow_path, '*.json') + ' ' + pytorch_path)

            config = BertConfig.from_json_file(os.path.join(tensorflow_path, config_file))
            model = BertForPreTraining(config)
            load_tf_weights_in_bert(model=model, tf_checkpoint_path=os.path.join(tensorflow_path, 'bert_model.ckpt'))
            torch.save(model.state_dict(), os.path.join(pytorch_path, 'pytorch_model.bin'))

        else:
            for folder in folders:
                self.convert_all_tensorflow_bert_weights_to_pytorch(input_folder=folder)
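A minimal usage sketch of the recursive converter above; `converter` stands in for an instance of the enclosing class, and the folder layout is assumed to match what the method expects (checkpoint files plus a .json config per leaf folder):

# Hypothetical call; each matching folder gains pytorch/ and tensorflow/ sub-folders,
# with pytorch_model.bin written into pytorch/.
converter.convert_all_tensorflow_bert_weights_to_pytorch(input_folder='/path/to/bert_checkpoints')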
Example #6
def test_BertForPreTraining():
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
    config = BertConfig(vocab_size_or_config_json_file=32000,
                        hidden_size=768,
                        num_hidden_layers=12,
                        num_attention_heads=12,
                        intermediate_size=3072)
    model = BertForPreTraining(config)
    print(model(input_ids, token_type_ids, input_mask))
Example #7
def load_checkpoint(filename, device, n_gpu):
    logger.info('Loading model %s' % filename)
    saved_params = torch.load(filename,
                              map_location=lambda storage, loc: storage)
    args = saved_params['args']
    global_step = saved_params['global_step']
    model_dict = saved_params['model_dict']
    optimizer_dict = saved_params['optimizer']
    iter_id = saved_params['iter_id']

    model = BertForPreTraining.from_pretrained(args.bert_model,
                                               state_dict=model_dict)
    return args, global_step, iter_id, model, optimizer_dict
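The checkpoint layout load_checkpoint expects is implicit in the keys it reads; below is a sketch of a matching save side. save_checkpoint is not part of the original example, and args, global_step, iter_id, model and optimizer are assumed to be in scope:

import torch

def save_checkpoint(filename, args, global_step, iter_id, model, optimizer):
    # Persist exactly the keys that load_checkpoint reads back out
    torch.save({
        'args': args,
        'global_step': global_step,
        'iter_id': iter_id,
        'model_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }, filename)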
Example #8
def load_bert(bert_model_name, bert_load_mode, all_state,
              bert_config_json_path=None):
    if bert_config_json_path is None:
        bert_config_json_path = os.path.join(get_bert_config_path(bert_model_name), "bert_config.json")
    if bert_load_mode == "model_only":
        state_dict = all_state
    elif bert_load_mode in ["state_model_only", "state_all", "state_full_model"]:
        state_dict = all_state["model"]
    else:
        raise KeyError(bert_load_mode)

    if bert_load_mode == "state_full_model":
        model = BertForPreTraining.from_state_dict_full(
            config_file=bert_config_json_path,
            state_dict=state_dict,
        )
    else:
        model = BertForPreTraining.from_state_dict(
            config_file=bert_config_json_path,
            state_dict=state_dict,
        )
    return model
Example #9
    def __init__(self, config, num_labels, model_param_fp=None):
        super(Model, self).__init__(config)
        self.num_labels = num_labels

        if model_param_fp is not None:
            # load_state_dict() returns a result object, not the model, so load
            # the pre-training weights in two steps and keep only the encoder
            pretraining_model = BertForPreTraining(config)
            pretraining_model.load_state_dict(torch.load(open(model_param_fp, 'rb')))
            self.bert = pretraining_model.bert
        else:
            self.bert = BertModel(config)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)
Example #10
    def __init__(self,
                 vocab: Vocabulary,
                 bert_model_name: str,
                 remap_segment_embeddings: int = None,
                 regularizer: RegularizerApplicator = None):
        super().__init__(vocab, regularizer)

        pretrained_bert = BertForPreTraining.from_pretrained(bert_model_name)
        self.pretraining_heads = pretrained_bert.cls
        self.bert = pretrained_bert

        self.remap_segment_embeddings = remap_segment_embeddings
        if remap_segment_embeddings is not None:
            new_embeddings = self._remap_embeddings(
                self.bert.bert.embeddings.token_type_embeddings.weight)
            if new_embeddings is not None:
                del self.bert.bert.embeddings.token_type_embeddings
                self.bert.bert.embeddings.token_type_embeddings = new_embeddings
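_remap_embeddings is referenced above but not shown; a plausible sketch follows, assuming it copies the pretrained token-type rows into a larger nn.Embedding and returns None when no resizing is needed (torch is assumed to be imported as elsewhere in these examples):

    def _remap_embeddings(self, token_type_embeddings):
        # token_type_embeddings: the pretrained (num_old, hidden_size) weight matrix
        num_old, hidden_size = token_type_embeddings.shape
        if self.remap_segment_embeddings == num_old:
            return None  # already the requested size, nothing to remap
        new_embeddings = torch.nn.Embedding(self.remap_segment_embeddings, hidden_size)
        rows_to_copy = min(num_old, self.remap_segment_embeddings)
        # keep the pretrained rows; any extra rows stay randomly initialised
        new_embeddings.weight.data[:rows_to_copy] = token_type_embeddings.data[:rows_to_copy]
        return new_embeddings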
Example #11
 def load_bert(self):
     with torch.no_grad():
         model = BertForPreTraining.from_pretrained('bert-base-uncased').to(
             'cuda')
     '''if args.fp16:
         model.half()
     model.to(device)
     if args.local_rank != -1:
         try:
             from apex.parallel import DistributedDataParallel as DDP
         except ImportError:
             raise ImportError(
                 "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
         model = DDP(model)
     elif n_gpu > 1:
         model = torch.nn.DataParallel(model)'''
     model.eval()
     return model
Example #12
    def __init__(self,
                 featQty,
                 headFeatQty,
                 useTfIdfTransform=False,
                 useWordFeatures=True,
                 useBERT=False,
                 useHeadBERT=False,
                 bertModelPath=None,
                 torch_device='cuda',
                 bertModelType='bert-base-uncased'):

        self.useWordFeatures = useWordFeatures
        if self.useWordFeatures:
            self.featQty = featQty
            self.countVect = CountVectorizer(ngram_range=(1, 1))
            self.tfidf = TfidfTransformer() if useTfIdfTransform else None

            self.headFeatQty = headFeatQty
            self.headCountVect = CountVectorizer(ngram_range=(1, 1))
            self.headTfidf = TfidfTransformer() if useTfIdfTransform else None

        self.useBERT = useBERT
        self.useHeadBERT = useHeadBERT
        if useBERT or useHeadBERT:
            self.torch_device = torch.device(torch_device)
            if bertModelPath is not None:
                print('Loading fine-tuned model from file:', bertModelPath)
                self.bertModelWrapper = BertForPreTraining.from_pretrained(
                    bertModelType)
                self.bertModelWrapper.load_state_dict(
                    torch.load(bertModelPath))
            else:
                print('Loading standard pre-trained model')
                self.bertModelWrapper = BertForMaskedLM.from_pretrained(
                    bertModelType)

            self.bertModelWrapper.eval()
            self.bertModelWrapper.to(torch_device)
            self.bert_tokenizer = BertTokenizer.from_pretrained(
                bertModelType, do_lower_case=True)
Example #13
def bertForPreTraining(*args, **kwargs):
    """
    BERT model with pre-training heads.
    This module comprises the BERT model followed by the two pre-training heads
        - the masked language modeling head, and
        - the next sentence classification head.

    Example:
        # Load the tokenizer
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertForPreTraining
        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForPreTraining', 'bert-base-cased')
        >>> masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
    """
    model = BertForPreTraining.from_pretrained(*args, **kwargs)
    return model
Example #14
def main():
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument(
        "--bert_model",
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument(
        "--reduce_memory",
        action="store_true",
        help=
        "Store training data as on-disc memmaps to massively reduce memory usage"
    )

    parser.add_argument("--epochs",
                        type=int,
                        default=3,
                        help="Number of epochs to train for")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logging.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples /
                                       args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
        warmup_linear = WarmupLinearSchedule(
            warmup=args.warmup_proportion,
            t_total=num_train_optimization_steps)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=args.reduce_memory)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                loss = model(input_ids, segment_ids, input_mask, lm_label_ids,
                             is_next)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used, which handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model
    logging.info("** ** * Saving fine-tuned model ** ** * ")
    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model itself

    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

    torch.save(model_to_save.state_dict(), output_model_file)
    model_to_save.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(args.output_dir)
Example #15
def main():
    """
        main function that saves embeddings of each article with their id as individual joblib pickle files
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_file",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .csv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--model_file",
        default=None,
        type=str,
        required=True,
        help="Where the pre-trained/fine-tuned model is stored for loading.")
    parser.add_argument("--override_features",
                        default=False,
                        type=bool,
                        required=True,
                        help="Override pickled feature files.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the output files will be written.")
    ## Other parameters

    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be split into two, then combined by averaging; \n"
        "sequences shorter than this will be padded.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--tuned',
        action='store_true',
        help="Whether to use fine-tuned BERT on finance articles")
    args = parser.parse_args()

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    model = BertForPreTraining.from_pretrained(args.bert_model)
    if args.tuned:
        model.load_state_dict(torch.load(args.model_file))
        print('Loaded model')

    if args.fp16:
        model.half()
    model.to(device)

    # Prepare optimizer
    processor = SamplesProcessor()
    if not os.path.isfile('eval_features.gz'):
        # save processed articles into features
        eval_examples = processor.get_dev_examples(args.data_file)
        eval_features = convert_examples_to_features(eval_examples,
                                                     args.max_seq_length,
                                                     tokenizer)
        dump(eval_features, 'eval_features.gz')
    else:
        if not args.override_features:
            eval_features = load('eval_features.gz')
        else:
            # override processed articles into features
            eval_examples = processor.get_dev_examples(args.data_file)
            eval_features = convert_examples_to_features(
                eval_examples, args.max_seq_length, tokenizer)
            dump(eval_features, 'eval_features.gz')

    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_features))

    model.eval()
    for eval_count, eval_feature in enumerate(eval_features):
        if os.path.isfile(args.output_dir + '/embedding_' +
                          str(eval_feature.text_id) + '.gz'):
            continue
        para_embed_list = []
        for para in range(len(eval_feature.input_ids)):
            #  if segment has no overlap
            if eval_feature.overlap[para] == 0:
                encoded_layer_array = np.zeros((0, args.max_seq_length, 1024))
                input_ids = torch.tensor(eval_feature.input_ids[para]).view(
                    1, -1)
                input_mask = torch.tensor(eval_feature.input_mask[para]).view(
                    1, -1)
                segment_ids = torch.tensor(
                    eval_feature.segment_ids[para]).view(1, -1)
                para_len = np.sum(np.array(eval_feature.input_mask[para]) != 0)
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                encoded_layers, _ = model.bert.forward(input_ids, segment_ids,
                                                       input_mask)
                for encoded_layer in encoded_layers:
                    encoded_layer_array = np.concatenate(
                        (encoded_layer.detach().cpu().numpy(),
                         encoded_layer_array))
                encoded_layers = encoded_layer_array[-2, 1:para_len - 1, :]
            else:
                #  if segment has overlap
                encoded_layer_array_1 = np.zeros(
                    (0, args.max_seq_length, 1024))
                encoded_layer_array_2 = np.zeros(
                    (0, args.max_seq_length, 1024))
                para_len_1 = np.sum(
                    np.array(eval_feature.input_mask[para][0]) != 0)
                para_len_2 = np.sum(
                    np.array(eval_feature.input_mask[para][1]) != 0)
                input_ids_1 = torch.tensor(
                    eval_feature.input_ids[para][0]).view(1, -1)
                input_mask_1 = torch.tensor(
                    eval_feature.input_mask[para][0]).view(1, -1)
                segment_ids_1 = torch.tensor(
                    eval_feature.segment_ids[para][0]).view(1, -1)
                input_ids_1 = input_ids_1.to(device)
                input_mask_1 = input_mask_1.to(device)
                segment_ids_1 = segment_ids_1.to(device)
                encoded_layers_1, _ = model.bert.forward(
                    input_ids_1, segment_ids_1, input_mask_1)
                for encoded_layer in encoded_layers_1:
                    encoded_layer_array_1 = np.concatenate(
                        (encoded_layer.detach().cpu().numpy(),
                         encoded_layer_array_1))
                encoded_layers_1 = encoded_layer_array_1[-2,
                                                         1:para_len_1 - 1, :]

                input_ids_2 = torch.tensor(
                    eval_feature.input_ids[para][1]).view(1, -1)
                input_mask_2 = torch.tensor(
                    eval_feature.input_mask[para][1]).view(1, -1)
                segment_ids_2 = torch.tensor(
                    eval_feature.segment_ids[para][1]).view(1, -1)
                input_ids_2 = input_ids_2.to(device)
                input_mask_2 = input_mask_2.to(device)
                segment_ids_2 = segment_ids_2.to(device)
                encoded_layers_2, _ = model.bert.forward(
                    input_ids_2, segment_ids_2, input_mask_2)
                for encoded_layer in encoded_layers_2:
                    encoded_layer_array_2 = np.concatenate(
                        (encoded_layer.detach().cpu().numpy(),
                         encoded_layer_array_2))
                encoded_layers_2 = encoded_layer_array_2[-2,
                                                         1:para_len_2 - 1, :]
                # average the overlapped portion
                overlap = eval_feature.overlap[para]
                encoded_overlap = (encoded_layers_1[-overlap:, :] +
                                   encoded_layers_2[:overlap, :]) / 2
                encoded_layers = np.concatenate(
                    (encoded_layers_1[:-overlap, :], encoded_overlap,
                     encoded_layers_2[overlap:, :]))
            para_embed_list.append(encoded_layers)
        dump(
            para_embed_list, args.output_dir + '/embedding_' +
            str(eval_feature.text_id) + '.gz')
Example #16
def main():
    local_rank = -1
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", "-c", type=str, required=True)
    args, _ = parser.parse_known_args()
    options = argconf.options_from_json("confs/options.json")
    config = argconf.config_from_json(args.config)
    args = edict(argconf.parse_args(options, config))
    args.local_rank = local_rank
    args.on_memory = True

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`."
        )

    if os.path.exists(args.workspace) and os.listdir(args.workspace):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.workspace))
    if not os.path.exists(args.workspace):
        os.makedirs(args.workspace)

    tokenizer = BertTokenizer.from_pretrained(args.model_file,
                                              do_lower_case=True)

    #train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_file)
        train_dataset = BERTDataset(args.train_file,
                                    tokenizer,
                                    seq_len=args.max_seq_length,
                                    corpus_lines=None,
                                    on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.model_file)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids = batch
                prediction_scores, _ = model(input_ids, segment_ids,
                                             input_mask, lm_label_ids)
                loss = loss_fct(
                    prediction_scores.view(-1, model.module.config.vocab_size),
                    lm_label_ids.view(-1)).mean()
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used, which handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model
        logger.info("** ** * Saving fine-tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.workspace, "pytorch_model.bin")
        if args.do_train:
            torch.save(model_to_save.state_dict(), output_model_file)
Example #17
def main():

    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)

    parser.add_argument('--dist_url',
                        type=str,
                        default="tcp://172.31.38.122:23456")
    parser.add_argument('--rank', type=int, default=0)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument('--use_all_gpus', action="store_true")
    parser.add_argument('--world_size', type=int, default=1)

    parser.add_argument('--output_file', type=str, default="pytorch_model.bin")

    parser.add_argument(
        "--bert_model",
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )

    parser.add_argument("--no_sentence_loss",
                        action="store_true",
                        help="Whether not to use sentence level loss.")

    parser.add_argument('--tokeniser', type=str, default="vocab.txt")

    parser.add_argument("--do_lower_case", action="store_true")

    parser.add_argument(
        "--reduce_memory",
        action="store_true",
        help=
        "Store training data as on-disc memmaps to massively reduce memory usage"
    )

    parser.add_argument("--epochs",
                        type=int,
                        default=3,
                        help="Number of epochs to train for")

    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")

    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )

    parser.add_argument('--training',
                        type=bool,
                        default=True,
                        help="Whether to train the model or not")

    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")

    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")

    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")

    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument('--verbose',
                        action='store_true',
                        help="Whether to print more details along the way")

    parser.add_argument('--tensorboard',
                        action='store_true',
                        help="Whether to use Tensorboard ")

    parser.add_argument('--save',
                        type=bool,
                        default=True,
                        help="Whether to save")
    parser.add_argument(
        '--bert_finetuned',
        type=str,
        default=None,
        help='Model finetuned to use instead of pretrained models')

    args = parser.parse_args()

    # if args.tensorboard :
    #     from modeling import BertForPreTraining

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    if args.local_rank == -1 or args.no_cuda:
        n_gpu = torch.cuda.device_count()
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")

    else:
        if args.use_all_gpus:
            device = torch.device("cuda")
            n_gpu = torch.cuda.device_count()

            dp_device_ids = list(range(min(n_gpu, args.train_batch_size)))

        else:
            torch.cuda.set_device(args.local_rank)
            device = torch.device("cuda", args.local_rank)
            dp_device_ids = [args.local_rank]
            n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        print("Initialize Process Group...")
        # torch.distributed.init_process_group(backend='nccl')
        # Number of distributed processes
        world_size = args.world_size

        # Distributed backend type

        dist_backend = 'gloo'
        start = time.time()
        torch.distributed.init_process_group(backend=dist_backend,
                                             init_method=args.dist_url,
                                             rank=args.rank,
                                             world_size=world_size)
        end = time.time()
        print('done within :', end - start)
    logging.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    ##### How many GPUs to use

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    tokenizer = BertTokenizer.from_pretrained('vocab.txt',
                                              do_lower_case=args.do_lower_case)
    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples /
                                       args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )
        # num_train_optimization_steps = num_train_optimization_steps // n_gpu
    # Prepare model

    try:
        model = BertForPreTraining.from_pretrained(
            args.bert_model,
            verbose=args.verbose,
            tokeniser=args.tokeniser,
            train_batch_size=args.train_batch_size,
            device=device)
    except:
        model = BertForPreTraining.from_pretrained(args.bert_model)

    if args.bert_finetuned is not None:
        model_dict = torch.load(args.bert_finetuned)
        model.load_state_dict(model_dict)

    if args.fp16:
        model.half()

    if args.local_rank != -1:
        # try:
        #     from apex.parallel import DistributedDataParallel as DDP
        # except ImportError:
        #     raise ImportError(
        #         "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        print("Initialize Model...")
        # model = DDP(model, device_ids = dp_device_ids,output_device=args.local_rank)
        model.to(device)
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=dp_device_ids, output_device=args.local_rank)
        n_gpu_used = model.device_ids
        print('number gpu used', n_gpu_used)

    elif n_gpu > 1:

        # torch.cuda.set_device(list(range(min(args.train_batch_size, n_gpu))))

        model = torch.nn.DataParallel(
            model
        )  #, device_ids = list(range(min(args.train_batch_size, n_gpu))))
        n_gpu_used = model.device_ids
        print('number gpu used', n_gpu_used)
    elif n_gpu == 1:
        print("Only 1 GPU used")
        n_gpu_used = [1]
    model.to(device)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      num_workers=0,
                                      batch_size=args.train_batch_size,
                                      pin_memory=False)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                if args.training:
                    model.train()
                    batch = tuple(
                        t.to(device, non_blocking=True) for t in batch)
                    input_ids, input_mask, segment_ids, lm_label_ids, is_next, mask_index = batch
                    if args.no_sentence_loss:
                        is_next = None
                    loss = model(input_ids, segment_ids, input_mask,
                                 lm_label_ids, is_next, mask_index)
                    if args.verbose:
                        # print('input_ids : ', input_ids)
                        #
                        # print('input_mask : ', input_mask)
                        # print('segment_ids : ', segment_ids)
                        # print('lm_label_ids : ', lm_label_ids)
                        # print('is_next : ', is_next)
                        print('loss : ', loss)

                    # if n_gpu > 1:
                    if len(n_gpu_used) > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    if args.fp16:
                        optimizer.backward(loss)
                    else:
                        # print('backwards')
                        loss.backward()
                    # print('backwards done')
                    tr_loss += loss.item()
                    nb_tr_examples += input_ids.size(0)
                    nb_tr_steps += 1
                    pbar.update(1)
                    mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                    pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        if args.fp16:
                            # modify learning rate with special warm up BERT uses
                            # if args.fp16 is False, BertAdam is used, which handles this automatically
                            lr_this_step = args.learning_rate * warmup_linear(
                                global_step / num_train_optimization_steps,
                                args.warmup_proportion)
                            for param_group in optimizer.param_groups:
                                param_group['lr'] = lr_this_step
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1
                else:
                    with torch.no_grad():
                        model.eval()
                        batch = tuple(t.to(device) for t in batch)
                        input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                        loss = model(input_ids, segment_ids, input_mask,
                                     lm_label_ids, is_next)
                        if args.verbose:
                            print('input_ids : ', input_ids)
                            print('input_mask : ', input_mask)
                            print('segment_ids : ', segment_ids)
                            print('lm_label_ids : ', lm_label_ids)
                            print('is_next : ', is_next)
                            print('loss : ', loss)
                        if n_gpu > 1:
                            loss = loss.mean()  # mean() to average on multi-gpu.
                        if args.gradient_accumulation_steps > 1:
                            loss = loss / args.gradient_accumulation_steps

                        tr_loss += loss.item()
                        nb_tr_examples += input_ids.size(0)
                        nb_tr_steps += 1
                        pbar.update(1)
                        mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                        pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                        if (step + 1) % args.gradient_accumulation_steps == 0:
                            # evaluation branch: no optimizer update here, just keep
                            # global_step in sync with the training branch
                            global_step += 1

    # Save a trained model
    if args.save:
        # pickle.dump(model.df,open('results.p','wb'))
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        output_model_file = args.output_dir / args.output_file
        torch.save(model_to_save.state_dict(), str(output_model_file))
Beispiel #18
0
  dev_label_features = data_loader.convert_examples_to_features(dev_label_examples, label_list, MAX_SEQ_LEN, tokenizer, "classification",all_name_array)
  dev_label_dataloader = data_loader.make_data_loader (dev_label_features,batch_size=args.batch_size_label-2,fp16=args.fp16, sampler='sequential',metric_option=args.metric_option)
  # torch.save( dev_label_dataloader, os.path.join( args.qnli_dir, "dev_label_dataloader"+name_add_on+".pytorch") )
  print ('\ndev_label_examples {}'.format(len(dev_label_examples))) # dev_label_examples 7661


## **** make model ****

# bert model

bert_config = BertConfig( os.path.join(args.bert_model,"bert_config.json") )

cache_dir = args.cache_dir if args.cache_dir else os.path.join(
  str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))

bert_lm_sentence = BertForPreTraining.from_pretrained(args.bert_model,cache_dir=cache_dir)  # @num_labels is yes/no
if args.fp16:
  bert_lm_sentence.half() ## don't send to cuda, we will send to cuda with the joint model


# entailment model
ent_model = entailment_model.entailment_model(num_labels, bert_config.hidden_size, args.def_emb_dim, weight=torch.FloatTensor([1.5, .75]))

# cosine model
# **** when using the cosine model we do not use the asymmetric training samples (A->B but B not->A), since cosine distance is symmetric
other = {'metric_option':args.metric_option}
cosine_loss = encoder_model.cosine_distance_loss(bert_config.hidden_size,args.def_emb_dim, args)

metric_pass_to_joint_model = {'entailment':ent_model, 'cosine':cosine_loss}

def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--on_memory",
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)
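    # gradient accumulation restores the effective batch size: each optimizer
    # update still covers the originally requested train_batch_size examples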

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_file)
        train_dataset = BERTDataset(args.train_file,
                                    tokenizer,
                                    seq_len=args.max_seq_length,
                                    corpus_lines=None,
                                    on_memory=args.on_memory)
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    #model = BertForPreTraining.from_pretrained(args.bert_model)
    bert_config = BertConfig.from_json_file('bert_config.json')
    model = BertForPreTraining(bert_config)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
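    # two parameter groups: 0.01 weight decay for regular weights, none for
    # biases and LayerNorm parameters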
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on file.__next__
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                loss = model(input_ids, segment_ids, input_mask, lm_label_ids,
                             is_next)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / num_train_steps, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model
        logger.info("** ** * Saving fine - tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        if args.do_train:
            torch.save(model_to_save.state_dict(), output_model_file)
Beispiel #20
0
def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("-train_file",
                        required=True,
                        help="The input train corpus.")

    parser.add_argument(
        "-output_dir",
        default='../trained_model',
        help="The output directory where the model checkpoints will be written."
    )
    parser.add_argument(
        "-bert_model",
        default='bert-base-cased',
        help="bert-base-uncased, bert-large-uncased, bert-base-cased")

    parser.add_argument("-max_seq_length",
                        default=128,
                        type=int,
                        help="sequence length after WordPiece tokenization. \
                                                                            longer will be truncated, shorter will be padded."
                        )
    parser.add_argument("-train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("-learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("-num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "-warmup_proportion",
        default=0.1,
        type=float,
        help="Proportion of training to perform linear lr rate warmup. ")

    parser.add_argument("-local_rank",
                        default=-1,
                        type=int,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('-seed',
                        default=42,
                        type=int,
                        help="random seed for initialization")
    parser.add_argument('-gradient_accumulation_steps',
                        default=1,
                        type=int,
                        help="")
    parser.add_argument('-loss_scale', default=0, type=float, help="")

    parser.add_argument("-no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "-on_memory",
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument("-do_lower_case",
                        action='store_true',
                        help="Whether to lower case the input text.")
    parser.add_argument("-do_train",
                        action='store_true',
                        help="Whether to run training.")

    args = parser.parse_args()

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {} n_gpu: {}, distributed training: {}".format(
        device, n_gpu, bool(args.local_rank != -1)))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0: torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`."
        )

    if not os.path.exists(args.output_dir):
        logger.info('creating model output dir {}'.format(args.output_dir))
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    # import pdb
    # pdb.set_trace()

    #train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_file)
        train_dataset = BERTDataset(args.train_file,
                                    tokenizer,
                                    seq_len=args.max_seq_length,
                                    corpus_lines=None,
                                    on_memory=args.on_memory)

        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.bert_model)

    model.to(device)
    if n_gpu > 1:  # true most of the time
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                loss, encoded_seq = model(
                    input_ids,
                    segment_ids,
                    input_mask,
                    lm_label_ids,
                    is_next,
                    get_encoded_seq=True)  # Language modeling loss

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model
        logger.info("** ** * Saving fine - tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(
            args.output_dir, "pytorch_model_{}.bin".format(
                time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())))
        if args.do_train:
            torch.save(model_to_save.state_dict(), output_model_file)
Beispiel #21
0
def main(dn, dev, batch_size, epochs):
    pregenerated_data = Dir(f'data/{dn}.pretrain.temp')
    output_dir = Dir(f'temp/{dn}.bert.pt')
    bert_model = 'bert-base-uncased'
    do_lower_case = True
    reduce_memory = True
    epochs = epochs
    local_rank = -1
    no_cuda = (dev == 'cpu')
    gradient_accumulation_steps = 1
    train_batch_size = batch_size
    fp16 = False
    loss_scale = 0
    warmup_proportion = 0.1
    learning_rate = 3e-5
    seed = 42

    samples_per_epoch = []
    for i in range(epochs):
        epoch_file = pregenerated_data / f'epoch_{i}.json'
        metrics_file = pregenerated_data / f'epoch_{i}_metrics.json'
        if epoch_file.isFile() and metrics_file.isFile():
            metrics = json.loads(metrics_file.file().read())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0: exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        num_data_epochs = epochs

    if no_cuda: device, n_gpu = 'cpu', 0
    elif local_rank == -1: device, n_gpu = 'cuda', torch.cuda.device_count()
    else:
        torch.cuda.set_device(local_rank)
        device, n_gpu = f'cuda:{local_rank}', 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    pr(device=device,
       n_gpu=n_gpu,
       distributed=(local_rank != -1),
       float16=fp16)

    train_batch_size = train_batch_size // gradient_accumulation_steps

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0: torch.cuda.manual_seed_all(seed)

    tokenizer = BertTokenizer.from_pretrained(bert_model,
                                              do_lower_case=do_lower_case)

    total_train_examples = 0
    for i in range(epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(
        total_train_examples / train_batch_size / gradient_accumulation_steps)
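    # with distributed training each process handles 1/world_size of the batches,
    # so scale the per-process optimizer step count down accordingly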
    if local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    # Prepare model
    model = BertForPreTraining.from_pretrained(bert_model)
    if fp16: model.half()
    model.to(device)
    if local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
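    # an explicit warmup schedule is only needed on the fp16 path below; BertAdam
    # applies learning-rate warmup internally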
    warmup_linear = None
    if fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)
        warmup_linear = WarmupLinearSchedule(
            warmup=warmup_proportion, t_total=num_train_optimization_steps)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=learning_rate,
                             warmup=warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    pr('***** Running training *****')
    pr(num_examples=total_train_examples)
    pr(batch_size=train_batch_size)
    pr(num_steps=num_train_optimization_steps)
    model.train()
    for epoch in range(epochs):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=reduce_memory,
        )
        if local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(total=len(train_dataloader), desc=f"epoch-{epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                loss = model(input_ids, segment_ids, input_mask, lm_label_ids,
                             is_next)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps
                if fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss * gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                if (step + 1) % gradient_accumulation_steps == 0:
                    if fp16:
                        # modify learning rate with special warm up BERT uses
                        # if fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = learning_rate * warmup_linear.get_lr(
                            global_step / num_train_optimization_steps,
                            warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model
    pr('***** Saving fine-tuned model *****')
    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model itself
    output_model_file = output_dir.add().div('pytorch_model.bin').file()
    torch.save(model_to_save.state_dict(), output_model_file.pathstr())
Beispiel #22
0
                                                     len(quant75)))

    other_params['GoCount'] = GO_counter
    other_params['quant25'] = quant25
    other_params['quant75'] = quant75
    other_params['betweenQ25Q75'] = betweenQ25Q75

## **** make BERT model

# bert language mask + next sentence model

bert_config = BertConfig(os.path.join(args.bert_model, "bert_config.json"))
cache_dir = args.cache_dir if args.cache_dir else os.path.join(
    str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(
        args.local_rank))
bert_lm_sentence = BertForPreTraining.from_pretrained(args.bert_model,
                                                      cache_dir=cache_dir)

# cosine model
# **** when using the cosine model we do not use the asymmetric training samples (A->B but B not->A), since cosine distance is symmetric
cosine_loss = BERT_encoder_model.cosine_distance_loss(bert_config.hidden_size,
                                                      args.def_emb_dim, args)
metric_pass_to_joint_model = {'entailment': None, 'cosine': cosine_loss}

#* **** add yes/no classifier to BERT ****
## init joint model
GOEncoder = BERT_encoder_model.encoder_model(
    bert_lm_sentence, metric_pass_to_joint_model[args.metric_option], args,
    tokenizer, **other_params)

if args.go_enc_model_load is not None:
    print('\n\nload back best model for GO encoder {}'.format(
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the data files for the task.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory is where the model checkpoints will be saved.")

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=10,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--optimize_on_cpu',
        default=False,
        action='store_true',
        help=
        "Whether to perform optimization and keep the optimizer averages on CPU"
    )
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=128,
        help=
        'Loss scaling, positive power of 2 values can improve fp16 convergence.'
    )

    args = parser.parse_args()

    processors = {"semeval": SemevalProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info(
                "16-bits training currently not supported in distributed training"
            )
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size //
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = processor.get_train_examples(args.data_dir)
    num_train_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForPreTraining.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
        'distributed_{}'.format(args.local_rank))
    model_path = Path(args.output_dir, "model.pth")

    if model_path.exists():
        model.load_state_dict(torch.load(model_path))

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if args.fp16:
        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
                            for n, param in model.named_parameters()]
    elif args.optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                            for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
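    # 'gamma' and 'beta' are the TensorFlow-style names for the LayerNorm weight and bias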
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    train_data = PretrainingDataset(train_examples, args.max_seq_length,
                                    tokenizer)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  num_workers=0,
                                  pin_memory=True)

    model.train()
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(train_dataloader, desc="Iteration") as pbar:
            for step, batch in enumerate(pbar):
                batch = tuple(t.to(device) for t in batch)  # move the batch to the training device
                input_ids, input_mask, segment_ids, masked_lm_labels, next_sentence_labels = batch
                loss = model(input_ids, segment_ids, input_mask,
                             masked_lm_labels, next_sentence_labels)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16 or args.optimize_on_cpu:
                        if args.fp16 and args.loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                if param.grad is not None:
                                    param.grad.data = param.grad.data / args.loss_scale
                        is_nan = set_optimizer_params_grad(
                            param_optimizer,
                            model.named_parameters(),
                            test_nan=True)
                        if is_nan:
                            logger.info(
                                "FP16 TRAINING: Nan in gradients, reducing loss scaling"
                            )
                            args.loss_scale = args.loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(
                            model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    pbar.set_postfix(loss="%.3f" % loss.item())

    if n_gpu > 1:
        torch.save(model.module.state_dict(), model_path)
    else:
        torch.save(model.state_dict(), model_path)
Beispiel #24
0
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if n_gpu > 0:
    torch.cuda.manual_seed_all(42)

total_train_examples = 0
for i in range(epochs):
    # The modulo takes into account the fact that we may loop over limited epochs of data
    total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

num_train_optimization_steps = int(total_train_examples / train_batch_size)

# Prepare model
model = BertForPreTraining.from_pretrained(
    pretrained_model_name_or_path=bert_model_path, cache_dir=bert_data_path)
model.to(device)
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params':
    [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay':
    0.01
}, {
    'params':
    [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
def main():
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_training_data',
                        type=Path,
                        required=True)
    parser.add_argument('--pregenerated_dev_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument(
        '--bert_model',
        type=str,
        required=True,
        help='Bert pre-trained model selected in the list: bert-base-uncased, '
        'bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.'
    )
    parser.add_argument('--do_lower_case', action='store_true')
    parser.add_argument(
        '--reduce_memory',
        action='store_true',
        help=
        'Store training data as on-disc memmaps to massively reduce memory usage'
    )

    parser.add_argument('--epochs',
                        type=int,
                        default=3,
                        help='Number of epochs to train for')
    parser.add_argument('--local_rank',
                        type=int,
                        default=-1,
                        help='local_rank for distributed training on gpus')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        help='Whether not to use CUDA when available')
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        'Number of update steps to accumulate before performing a backward/update pass.'
    )
    parser.add_argument('--train_batch_size',
                        default=32,
                        type=int,
                        help='Total batch size for training.')
    parser.add_argument(
        '--fp16',
        action='store_true',
        help='Whether to use 16-bit float precision instead of 32-bit')
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        'Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n'
        '0 (default value): dynamic loss scaling.\n'
        'Positive power of 2: static loss scaling value.\n')
    parser.add_argument(
        '--warmup_proportion',
        default=0.1,
        type=float,
        help=
        'Proportion of training to perform linear learning rate warmup for. '
        'E.g., 0.1 = 10%% of training.')
    parser.add_argument('--learning_rate',
                        default=3e-5,
                        type=float,
                        help='The initial learning rate for Adam.')
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help='random seed for initialization')
    args = parser.parse_args()

    assert args.pregenerated_training_data.is_dir(), \
        '--pregenerated_training_data should point to the folder of files made by pregenerate_training_data.py!'

    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_training_data / f'epoch_{i}.json'
        metrics_file = args.pregenerated_training_data / f'epoch_{i}_metrics.json'
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit('No training data was found!')
            print(
                f'Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).'
            )
            print(
                'This script will loop over the available data, but training diversity may be negatively impacted.'
            )
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device('cuda' if torch.cuda.is_available()
                              and not args.no_cuda else 'cpu')
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda', args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logging.info(
        'device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}'.
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            'Invalid gradient_accumulation_steps parameter: {}, should be >= 1'
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f'Output directory ({args.output_dir}) already exists and is not empty!'
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples /
                                       args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                'Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.'
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                'Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.'
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
        warmup_linear = WarmupLinearSchedule(
            warmup=args.warmup_proportion,
            t_total=num_train_optimization_steps)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    # Track loss
    train_loss_history = list()
    dev_loss_history = list()

    # Start training
    global_step = 0
    logging.info('***** Running training *****')
    logging.info(f'  Num examples = {total_train_examples}')
    logging.info(f'  Batch size = {args.train_batch_size}')
    logging.info(f'  Num steps = {num_train_optimization_steps} \n')
    for epoch in range(args.epochs):
        # Train model
        model.train()
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_training_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            train_or_dev='train',
            reduce_memory=args.reduce_memory)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(total=len(train_dataloader),
                  desc=f'Epoch {epoch}') as train_pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                loss = model(input_ids, segment_ids, input_mask, lm_label_ids,
                             is_next)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                train_pbar.update(1)
                mean_train_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
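                # record the running mean training loss every 10 steps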
                if step % 10 == 0:
                    train_loss_history.append((epoch, mean_train_loss))
                train_pbar.set_postfix_str(f'Loss: {mean_train_loss:.5f}')
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Evaluate dev loss
        model.eval()
        dev_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_dev_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            train_or_dev='dev',
            reduce_memory=args.reduce_memory)
        if args.local_rank == -1:
            dev_sampler = RandomSampler(dev_dataset)
        else:
            dev_sampler = DistributedSampler(dev_dataset)
        dev_dataloader = DataLoader(dev_dataset,
                                    sampler=dev_sampler,
                                    batch_size=args.train_batch_size)
        dev_loss = 0
        nb_dev_examples, nb_dev_steps = 0, 0
        with tqdm(total=len(dev_dataloader),
                  desc=f'Epoch {epoch}') as dev_pbar:
            for step, batch in enumerate(dev_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                # evaluation only: run the forward pass without autograd and skip
                # the backward pass so dev batches do not leak gradients into the
                # next training update
                with torch.no_grad():
                    loss = model(input_ids, segment_ids, input_mask,
                                 lm_label_ids, is_next)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                dev_loss += loss.item()
                nb_dev_examples += input_ids.size(0)
                nb_dev_steps += 1
                dev_pbar.update(1)
                mean_dev_loss = dev_loss * args.gradient_accumulation_steps / nb_dev_steps
                dev_pbar.set_postfix_str(f'Loss: {mean_dev_loss:.5f}')
        dev_loss_history.append(
            (epoch, mean_dev_loss))  # Only collect final mean dev loss

        # Save training progress with optimizer
        logging.info('** ** * Saving training progress * ** **')
        Path(args.output_dir / f'{epoch}/').mkdir(exist_ok=True)

        output_model_file = args.output_dir / f'{epoch}/model_and_opt.bin'
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': tr_loss,
            }, str(output_model_file))

        # Save easily-loadable model module
        logging.info(f'** ** * Saving fine-tuned model {epoch} * ** ** \n')
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself

        output_model_file = args.output_dir / f'{epoch}/{WEIGHTS_NAME}'
        output_config_file = args.output_dir / f'{epoch}/{CONFIG_NAME}'

        torch.save(model_to_save.state_dict(), str(output_model_file))
        model_to_save.config.to_json_file(str(output_config_file))
        tokenizer.save_vocabulary(args.output_dir)

        # Save loss history after every epoch
        with open(args.output_dir / f'{epoch}/loss_history.json', 'a') as h:
            hist = {'dev': dev_loss_history, 'train': train_loss_history}
            h.write(f'{json.dumps(hist)}\n')
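
The per-epoch checkpoint above stores the epoch index, model weights, optimizer state and running training loss in one dict. A minimal resume sketch (the checkpoint path is illustrative, and model and optimizer are assumed to be rebuilt exactly as during training):

import torch

checkpoint = torch.load('output_dir/3/model_and_opt.bin', map_location='cpu')  # hypothetical path
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch'] + 1    # continue with the next epoch
last_train_loss = checkpoint['loss']     # running training loss at save time
model.train()
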
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--dataset_name",
                        default="top300_kl", 
                        type=str, 
                        required=True, 
                        help="The name of dataset to inference (without extention ex) top300_kl)")
    parser.add_argument("--model_type",
                        default="baseline_tfidf", 
                        type=str, 
                        required=True, 
                        help="baseline, baseline_tfidf, ir-v0, ir-v1")
    parser.add_argument("--model_path",
                        default=None, 
                        type=str, 
                        required=True, 
                        help="path to model dir")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="save_path")

    ## Other parameters
    parser.add_argument("--bert_model",
                        default="bert-base-multilingual-cased",
                        type=str,
                        help="Default: bert-base-multilingual-cased" 
                         "Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--model_file",
                        default="pytorch_model.bin",
                        type=str,
                        help="The file of model (.bin), default is pytorhc_model.bin,\n" 
                             "특정 파일이 필요시 이름 설정 필요")
    parser.add_argument("--max_seq_length",
                        default=384,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    processor = IRProcessor()

    label_list = processor.get_labels()
    num_labels = len(label_list)

    print("model:", args.model_type)
    if args.model_type == "baseline": # load model (finetuned baseline on IR)
        tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=False)
        config = BertConfig(os.path.join(args.model_path, "bert_config.json"))
        model = BertForPreTraining(config)
        model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file)))
    elif args.model_type == "baseline_tfidf": # load model (baseline_tfidf)
        tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False, do_tf_idf=True)
        TFIDFconfig = modeling.BertConfig(os.path.join(args.model_path, "bert_config.json"))
        model = modeling.BertTFIDFForPreTraining(TFIDFconfig)
        model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file)))
    elif args.model_type == "ir-v0": # load model (*-head)
        tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False, do_tf_idf=True)
        head_config = modeling_ir.BertForIRConfig(os.path.join(args.model_path, "bert_config.json"))
        model = modeling_ir.BertForIRForPreTraining(head_config)
        model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file)))
    elif args.model_type == "ir-v1": # load model (*-head)
        tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False, do_tf_idf=True)
        head_config = modeling_ir_2.BertForIRConfig(os.path.join(args.model_path, "bert_config.json"))
        model = modeling_ir_2.BertForIRForPreTraining(head_config)
        model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file)))

    if args.fp16:
        model.half()
    model.to(device)

    tfidf_dict = pickle_load(os.path.join(args.data_dir, args.dataset_name + '_tfidf.pkl'))

    results_logit = dict()
    results_softmax = dict()

    eval_set, documents, queries = processor.make_eval_set(args.data_dir, args.dataset_name)
    logger.info("***** Running evaluation *****")
    logger.info("  Batch size = %d", args.eval_batch_size)
    for q_num, query in tqdm(enumerate(queries), total=len(queries), desc="Evaluating"):
    # for query in queries[0:1]: # for testing

        logger.info(f"Current Query Num : {q_num}")
        eval_examples = processor._create_examples(eval_set, query, documents)
        # logger.info("  Num examples = %d", len(eval_examples))
        if args.model_type == "baseline": # baseline or baseline_finetuned
            eval_features = convert_examples_to_features_for_vanilla(
                eval_examples, label_list, args.max_seq_length, tokenizer)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []

            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Query"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    _, logits = model(input_ids, segment_ids, input_mask)

                # loss_fct = CrossEntropyLoss()
                # tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                
                # eval_loss += tmp_eval_loss.mean().item()
                # nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                else:
                    preds[0] = np.append(
                        preds[0], logits.detach().cpu().numpy(), axis=0)
        else: # baseline_tfidf or *-head model
            eval_data = LazyDatasetClassifier(eval_examples, label_list, args.max_seq_length, tokenizer, tfidf_dict)
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []
                
            for batch in tqdm(eval_dataloader, desc="Query"):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_weights, input_mask, segment_ids, label_ids = batch

                with torch.no_grad():
                    _, logits = model(input_ids, input_weights, segment_ids, input_mask)
                
                # loss_fct = CrossEntropyLoss()
                # tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                
                # eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                else:
                    preds[0] = np.append(
                        preds[0], logits.detach().cpu().numpy(), axis=0)


        # eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]

        # Rank by raw logit score (mirrors the softmax ranking below)
        results_logit[query] = []
        for i, pred in enumerate(preds):
            pair = dict()
            pair["score"] = pred[1]
            pair["doc_id"] = list(documents.keys())[i]
            results_logit[query].append(pair)
        results_logit[query].sort(reverse=True, key=lambda x: x["score"])

        # Rank by softmax probability
        results_softmax[query] = []
        for i, pred in enumerate(softmax(preds)):  # using softmax
            pair = dict()
            pair["score"] = pred[1]
            pair["doc_id"] = list(documents.keys())[i]
            results_softmax[query].append(pair)
        results_softmax[query].sort(reverse=True, key=lambda x: x["score"])

        ranked_doc_list = []
        for doc in results_logit[query]:
            ranked_doc_list.append(doc["doc_id"])
        results_logit[query] = ranked_doc_list

        ranked_doc_list = []
        for doc in results_softmax[query]:
            ranked_doc_list.append(doc["doc_id"])
        results_softmax[query] = ranked_doc_list

    save_name2 = args.model_path.split('/')[0] + '_' + args.model_file.split('.')[0] \
                 + '_' + args.dataset_name + '_output.json'
    path2 = os.path.join(args.output_dir,
                         save_name2)

    with open(path2, 'w', encoding="utf8") as f:
        json.dump(results_softmax, f, indent=4, sort_keys=True, ensure_ascii=False)
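
The JSON written above maps each query string to a list of doc_ids already sorted by descending softmax score. A minimal sketch of reading it back (the file name below is illustrative):

import json

with open('output_dir/ir-v0_pytorch_model_top300_kl_output.json', encoding='utf8') as f:
    ranked = json.load(f)

for query, doc_ids in ranked.items():
    print(query, doc_ids[:10])  # top-10 ranked documents per query
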
Beispiel #27
0
cosine_loss = encoder_model.cosine_distance_loss(
    args.def_emb_dim, args.def_emb_dim,
    args)  ## remember to turn on reduce flag ???

# entailment model
# ent_model = entailment_model.entailment_model (num_labels,args.gcnn_dim,args.def_emb_dim,weight=torch.FloatTensor([1.5,.75])) # torch.FloatTensor([1.5,.75])

metric_pass_to_joint_model = {'entailment': None, 'cosine': cosine_loss}

## NEED TO MAKE THE BERT MODEL

# use BERT tokenizer
bert_config = BertConfig(os.path.join(args.bert_model, "bert_config.json"))

other = {'metric_option': args.metric_option}
bert_lm_sentence = BertForPreTraining.from_pretrained(args.bert_model)
bert_lm_ent_model = BERT_encoder_model.encoder_model(
    bert_lm_sentence, metric_pass_to_joint_model[args.metric_option], args,
    tokenizer, **other)

## make GCN model

model = encoder_model.encoder_with_bert(
    args, bert_lm_ent_model, metric_pass_to_joint_model[args.metric_option],
    **other_params)

print('\nmodel is\n')
print(model)
if args.use_cuda:
    print('\n\n send model to gpu\n\n')
    model.cuda()
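
encoder_model.cosine_distance_loss is project-specific and not shown in this snippet. A rough stand-in, under the assumption that it wraps a standard cosine loss over pairs of definition embeddings (the batch size and embedding dimension 300 are only examples):

import torch
import torch.nn as nn

cosine_criterion = nn.CosineEmbeddingLoss(margin=0.0, reduction='mean')  # 'mean' plays the role of the reduce flag

emb_a = torch.randn(8, 300)   # batch of definition embeddings (size is an assumption)
emb_b = torch.randn(8, 300)
target = torch.ones(8)        # +1 for matching pairs, -1 for mismatched pairs
loss = cosine_criterion(emb_a, emb_b, target)
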
Beispiel #28
0
def main():
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument("--bert_model", type=str, required=True,
                        choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased",
                                 "bert-base-multilingual-cased", "bert-base-chinese"])
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Store training data as on-disc memmaps to massively reduce memory usage")

    parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed',
                        type=int,
                        default=None,
                        help="random seed for initialization")
    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).")
            print("This script will loop over the available data, but training diversity may be negatively impacted.")
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs
    print(samples_per_epoch)

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    n_gpu = torch.cuda.device_count()
    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        if n_gpu > 0:
            torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!")
    args.output_dir.mkdir(parents=True, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(
        total_train_examples / args.train_batch_size / args.gradient_accumulation_steps)

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.bert_model)
    model = torch.nn.DataParallel(model)

    # Prepare optimizer
    optimizer = BertAdam  # the optimizer class itself (not an instance); handed to the fastai Learner below

    train_dataloader = DataLoader(
        PregeneratedData(args.pregenerated_data, tokenizer, args.epochs, args.train_batch_size),
        batch_size=args.train_batch_size,
    )

    data = DataBunch(train_dataloader, train_dataloader)  # the train loader is reused as the validation set
    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    def loss(x, y):
        # the model output is already a loss tensor; the targets are ignored
        return x.mean()

    learn = Learner(data, model, optimizer,
                    loss_func=loss,
                    true_wd=False,
                    path='learn',
                    layer_groups=bert_layer_list(model),
    )

    lr = args.learning_rate
    layers = len(bert_layer_list(model))
    lrs = learn.lr_range(slice(lr/(2.6**4), lr))
    for epoch in range(args.epochs):
        learn.fit_one_cycle(1, lrs, wd=0.01)
        # save model at half way point
        if epoch == args.epochs//2:
            savem = learn.model.module.bert if hasattr(learn.model, 'module') else learn.model.bert
            output_model_file = args.output_dir / (f"pytorch_fastai_model_{args.bert_model}_{epoch}.bin")
            torch.save(savem.state_dict(), str(output_model_file))
            print(f'Saved bert to {output_model_file}')

    savem = learn.model.module.bert if hasattr(learn.model, 'module') else learn.model.bert
    output_model_file = args.output_dir / (f"pytorch_fastai_model_{args.bert_model}_{args.epochs}.bin")
    torch.save(savem.state_dict(), str(output_model_file))
    print(f'Saved bert to {output_model_file}')
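
bert_layer_list is not shown in this snippet; it is used above to build fastai layer groups so that lr_range can assign discriminative learning rates. A hypothetical sketch of such a helper:

def bert_layer_list(model):
    # Hypothetical helper: split a (possibly DataParallel-wrapped) BertForPreTraining
    # into layer groups: embeddings, each transformer block, then the pooler.
    bert = model.module.bert if hasattr(model, 'module') else model.bert
    groups = [bert.embeddings]
    groups.extend(bert.encoder.layer)
    groups.append(bert.pooler)
    return groups
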
Beispiel #29
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")

    parser.add_argument(
        "--output_dir",
        default='../trained_model',
        type=str,
        help="The output directory where the model checkpoints will be written."
    )
    parser.add_argument(
        "--bert_model",
        default='bert-base-cased',
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")

    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")

    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--on_memory",
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`."
        )

    if not os.path.exists(args.output_dir):
        logger.info('creating model output dir {}'.format(args.output_dir))
        os.makedirs(args.output_dir)

    dataset = args.train_file.split('/')[-3]
    voc_fname = '/'.join(args.train_file.split('/')[:-2] + ['vocab.txt'])
    tokenizer = BertTokenizer(voc_fname, do_lower_case=args.do_lower_case)

    num_train_optimization_steps = None
    print("Loading Train Dataset", args.train_file)
    train_dataset = BERTDataset(args.train_file,
                                tokenizer,
                                seq_len=args.max_seq_length,
                                corpus_lines=None,
                                on_memory=args.on_memory)
    num_train_optimization_steps = int(
        len(train_dataset) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs

    # Prepare model. TODO: change to create fresh model.
    vocab = utils.read(voc_fname)
    config = BertConfig(vocab_size_or_config_json_file=len(vocab),
                        hidden_size=768,
                        num_hidden_layers=12,
                        num_attention_heads=12,
                        intermediate_size=3072)
    model = BertForPreTraining(config)

    model.to(device)
    if n_gpu > 1:  # true most of the time
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        # true most of the time
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                loss = model(input_ids, segment_ids, input_mask, lm_label_ids,
                             is_next)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)  # batch_size
                nb_tr_steps += 1

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model
        logger.info("** ** * Saving fine - tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(
            args.output_dir, "pretraining_{}.bin".format(
                time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())))

        torch.save(model_to_save.state_dict(), output_model_file)
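
The state_dict saved above can be loaded back into a model built with the same configuration. A minimal reload sketch, assuming the same vocab file and config values used in this script:

    vocab = utils.read(voc_fname)
    config = BertConfig(vocab_size_or_config_json_file=len(vocab),
                        hidden_size=768,
                        num_hidden_layers=12,
                        num_attention_heads=12,
                        intermediate_size=3072)
    model = BertForPreTraining(config)
    model.load_state_dict(torch.load(output_model_file, map_location='cpu'))
    model.eval()
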
Beispiel #30
0
def create_from_pretrained(bert_model_name, cache_dir):
    model = BertForPreTraining.from_pretrained(
        pretrained_model_name_or_path=bert_model_name,
        cache_dir=cache_dir,
    )
    return model
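
Example usage (the model name and cache directory are arbitrary choices):

model = create_from_pretrained("bert-base-uncased", cache_dir="/tmp/bert_cache")
model.eval()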