Code Example #1
    def test_data_is_not_parallelized_when_model_is_parallel(self):
        model = RegressionModel()
        # Make the Trainer believe it's a parallelized model
        model.is_parallelizable = True
        model.model_parallel = True
        # Per-device batch size 16 so the batch-size assertions below hold
        args = TrainingArguments(output_dir="tmp_trainer",  # placeholder output dir
                                 per_device_train_batch_size=16,
                                 per_device_eval_batch_size=16)
        trainer = Trainer(model=model,
                          args=args,
                          train_dataset=RegressionDataset(),
                          eval_dataset=RegressionDataset())
        # Check the Trainer was fooled
        self.assertTrue(trainer.is_model_parallel)

        # The batch size of the training and evaluation dataloaders should be 16, not 16 * n_gpu
        self.assertEqual(trainer.get_train_dataloader().batch_size, 16)
        self.assertEqual(len(trainer.get_train_dataloader()), 64 // 16)
        self.assertEqual(trainer.get_eval_dataloader().batch_size, 16)
        self.assertEqual(len(trainer.get_eval_dataloader()), 64 // 16)
Code Example #2
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = BertConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = BertConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = BertTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = BertTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = BertForTagRankingLate.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = BertForTagRanking.from_config(config)

    # add vocab for special tokens and hashtags
    special_tokens = ['<img>', '<loc>', '<time>']
    num_added_special_toks = tokenizer.add_tokens(special_tokens)
    print('We have added', num_added_special_toks, 'special tokens')
    tokenizer.img_token = '<img>'
    tokenizer.loc_token = '<loc>'
    tokenizer.time_token = '<time>'
    print(tokenizer.convert_tokens_to_ids(special_tokens))
    assert tokenizer.img_token == '<img>'
    assert tokenizer.loc_token == '<loc>'
    assert tokenizer.time_token == '<time>'

    with open(data_args.tag_list) as f:
        tag_list = f.readlines()
        tag_list = ' '.join(tag_list).replace('\n', '').split()
    num_added_toks = tokenizer.add_tokens(tag_list)
    print('tag_list:', data_args.tag_list)
    print('We have added', num_added_toks, 'tokens for hashtags')
    print('total vocab_size:', len(tokenizer))
    model.resize_token_embeddings(len(tokenizer))

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = get_dataset(
        data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        evaluate=True) if training_args.do_eval else None
    data_collator = DataCollatorForTagGeneration(config.vocab_size)

    training_args.per_device_eval_batch_size = 1  # force the eval batch size to 1
    # Initialize our Trainer
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      data_collator=data_collator)

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        dataloader = trainer.get_eval_dataloader(eval_dataset)
        # multi-gpu eval
        if training_args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        description = "Evaluation"
        batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", len(dataloader.dataset))
        logger.info("  Batch size = %d", batch_size)
        model.eval()

        if is_torch_tpu_available():
            dataloader = pl.ParallelLoader(
                dataloader,
                [training_args.device]).per_device_loader(training_args.device)

        results = {}
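        # Convert each example to feature tensors, run the model, and keep the
        # top-10 predicted hashtag tokens keyed by post id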
        for eid, example in enumerate(tqdm(dataloader, desc=description)):
            feature = convert_example_to_feature(example, tokenizer,
                                                 data_args.block_size)
            image_ids = torch.tensor([feature['image_ids']],
                                     dtype=torch.long).to(training_args.device)
            location_ids = torch.tensor([feature['location_ids']],
                                        dtype=torch.long).to(
                                            training_args.device)
            time_ids = torch.tensor([feature['time_ids']],
                                    dtype=torch.long).to(training_args.device)
            text_ids = torch.tensor([feature['text_ids']],
                                    dtype=torch.long).to(training_args.device)
            pid = feature['pid']
            inputs = {
                'image_ids': image_ids,
                'location_ids': location_ids,
                'time_ids': time_ids,
                'text_ids': text_ids
            }
            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs[0]

                logit_for_cls = logits[0]

                # Mask out the original BERT vocab and the added special tokens
                # so that only the appended hashtag tokens can be ranked
                orig_vocab_size = 30522
                added_special_toks_size = 3  # <img>, <loc>, <time>
                logit_for_cls[:orig_vocab_size +
                              added_special_toks_size] = -float('inf')

                probabilities = F.softmax(logit_for_cls, 0).detach().cpu()

                probs, predicted_indices = torch.topk(probabilities, k=10)

                predicted_tokens = tokenizer.convert_ids_to_tokens(
                    predicted_indices)

                # Disambiguate duplicate post ids so earlier predictions are not overwritten
                while pid in results:
                    pid = pid + '_'
                results[pid] = predicted_tokens

        results_save_path = os.path.join(training_args.output_dir,
                                         'results.json')
        with open(results_save_path, 'w') as f:
            logger.info("saved results.json into %s", training_args.output_dir)
            json.dump(results, f)
Code Example #3
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Model parameters %s", model_args)
    logger.info("Data parameters %s", data_args)
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = BertConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = BertConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = BertConfig()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    config.loss_fct = model_args.loss_fct

    if model_args.tokenizer_name:
        tokenizer = BertTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = BertTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = BertForTagGeneration.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir)
    else:
        logger.info("Training new model from scratch")
        model = BertForTagGeneration.from_config(config)

    # add vocab for special tokens and hashtags
    special_tokens = ['<img>', '<loc>', '<time>']
    num_added_special_toks = tokenizer.add_tokens(special_tokens)
    print('We have added', num_added_special_toks, 'special tokens')
    tokenizer.img_token = '<img>'
    tokenizer.loc_token = '<loc>'
    tokenizer.time_token = '<time>'
    print(tokenizer.convert_tokens_to_ids(special_tokens))
    assert tokenizer.img_token == '<img>'
    assert tokenizer.loc_token == '<loc>'
    assert tokenizer.time_token == '<time>'

    with open(data_args.tag_list) as f:
        tag_list = f.readlines()
        tag_list = ' '.join(tag_list).replace('\n', '').split()
    num_added_toks = tokenizer.add_tokens(tag_list)
    print('tag_list:', data_args.tag_list)
    print('We have added', num_added_toks, 'tokens for hashtags')
    print('total vocab_size:', len(tokenizer))
    model.resize_token_embeddings(len(tokenizer))

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    neptune_project_name = 'junmokang/bertinsta'
    neptune_experiment_name = 'bertinsta-generation'

    # For training runs, create a Neptune experiment; on TPU only the master ordinal logs
    if not training_args.do_eval:
        if is_torch_tpu_available():
            if xm.get_ordinal() == 0:
                neptune.init(neptune_project_name)
                neptune.create_experiment(name=neptune_project_name,
                                          params=training_args.__dict__)
        else:
            neptune.init(neptune_project_name)
            neptune.create_experiment(name=neptune_project_name,
                                      params=training_args.__dict__)

    # Get datasets
    train_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        loss_fct=model_args.loss_fct) if training_args.do_train else None
    eval_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        evaluate=True) if training_args.do_eval else None
    data_collator = DataCollatorForTagGeneration(config.vocab_size,
                                                 loss_fct=model_args.loss_fct)

    training_args.per_device_eval_batch_size = 1  # force the eval batch size to 1
    # Initialize our Trainer
    trainer = Trainer(model=model,
                      args=training_args,
                      neptune=neptune,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      data_collator=data_collator)

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        dataloader = trainer.get_eval_dataloader(eval_dataset)
        # multi-gpu eval
        if training_args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        description = "Evaluation"
        batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", len(dataloader.dataset))
        logger.info("  Batch size = %d", batch_size)
        model.eval()

        if is_torch_tpu_available():
            dataloader = pl.ParallelLoader(
                dataloader,
                [training_args.device]).per_device_loader(training_args.device)

        results = {}
        grouping_results = {}
        # interaction_matrix = np.zeros((6, 6)) # feature interaction
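        # Decoding hyperparameters passed to beam_decode (beam width 1, top 10 tags per example)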
        beam_width = 1
        top_k = 10

        # tag to contexts mapping
        context_list = [
            'emotion', 'mood', 'location', 'time', 'object', 'activity',
            'event', 'others'
        ]
        context2ids = {c: [] for c in context_list}
        if data_args.tag2contexts:
            with open(data_args.tag2contexts) as f:
                tag2contexts = json.load(f)
                for tag, contexts in tag2contexts.items():
                    for c in contexts:
                        context2ids[c].append(tag)
                for c in context_list:
                    context2ids[c] = tokenizer.convert_tokens_to_ids(
                        context2ids[c])

        # Generate tags for every example: unconstrained ('all') and, when a
        # tag2contexts mapping is provided, restricted to each context's tags
        for eid, example in enumerate(tqdm(dataloader, desc=description)):
            generated_tags = beam_decode(beam_width, top_k, model, example,
                                         tokenizer, data_args.block_size,
                                         training_args.device)
            # generated_tags = beam_decode(beam_width, top_k, model, example, tokenizer, data_args.block_size, training_args.device, None, interaction_matrix) # feature interaction
            results[example['pid']] = generated_tags
            grouping_results[example['pid']] = {}
            grouping_results[example['pid']]['all'] = generated_tags
            # print('all:', str(generated_tags))

            # diverse generation (according to context)
            if data_args.tag2contexts:
                for context in context_list:
                    generated_tags = beam_decode(beam_width, top_k, model,
                                                 example, tokenizer,
                                                 data_args.block_size,
                                                 training_args.device,
                                                 context2ids[context])
                    grouping_results[example['pid']][context] = generated_tags
                    # print(context, ':', str(generated_tags))

        # with np.printoptions(precision=2, suppress=True): # feature interaction
        #         print(interaction_matrix)
        #         print(interaction_matrix.sum(1))
        #         print(interaction_matrix / interaction_matrix.sum(1))

        results_save_path = os.path.join(training_args.output_dir,
                                         'results.json')
        with open(results_save_path, 'w') as f:
            logger.info("saved results.json into %s", training_args.output_dir)
            json.dump(results, f)

        grouping_results_save_path = os.path.join(training_args.output_dir,
                                                  'grouping_results.json')
        with open(grouping_results_save_path, 'w') as f:
            logger.info("saved grouping_results.json into %s",
                        training_args.output_dir)
            json.dump(grouping_results, f)
Code Example #4
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = AutoModelForPreTraining.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"
                             ] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling).")

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets

    train_dataset = get_dataset(
        data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        evaluate=True) if training_args.do_eval else None
    data_collator = DataCollatorForLanguageModelingAVSD(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
        mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        #prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    avsd_eval = True
    if avsd_eval:
        outputs = []
        model = trainer.model
        data_loader = trainer.get_eval_dataloader(eval_dataset)
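        # Score each eval batch with the model's next-sentence-prediction head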
        for batch in tqdm(data_loader):

            for k, v in batch.items():
                batch[k] = v.to(training_args.device)

            with torch.no_grad():
                output = model(**batch)
                nsp_scores = output[1]
                #nsp_probs = F.softmax(nsp_scores, dim=1)
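                # Keep the raw score for class index 0 of the NSP head for each example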
                nsp_scores = nsp_scores[:, 0].detach().cpu().tolist()
                outputs.extend(nsp_scores)

        results['avsd_train_set'] = outputs
        with open(
                '/home/halamri/summer2020/avsd-transofrmers/mlmAVSD/bertPredicitonResults_June15_corrected.txt',
                'w') as f:
            json.dump(results, f)

    print('Done....')