Example #1
from transformers import (BertForSequenceClassification, BertTokenizer,
                          PretrainedConfig, Trainer, TrainingArguments)

def load_classification_model():
    global trainer
    global tokenizer
    mod = 'mtn_models/pytorch_model.bin'
    tok = 'mtn_models/vocab.txt'
    conf = 'mtn_models/config.json'
    # `never_split_tokens` is assumed to be defined elsewhere in the module
    tokenizer = BertTokenizer.from_pretrained(tok,
                                              do_lower_case=False,
                                              do_basic_tokenize=True,
                                              never_split=never_split_tokens,
                                              truncation=True)
    config = PretrainedConfig.from_pretrained(conf, num_labels=6)
    model = BertForSequenceClassification.from_pretrained(mod, config=config)

    training_args = TrainingArguments("./train")

    training_args.do_train = True
    training_args.evaluate_during_training = True
    training_args.adam_epsilon = 1e-8
    training_args.learning_rate = 2e-5
    training_args.warmup_steps = 0
    training_args.per_gpu_train_batch_size = 16
    training_args.per_gpu_eval_batch_size = 16
    training_args.num_train_epochs = 3
    #training_args.logging_steps = (len(train_features) - 1) // training_args.per_gpu_train_batch_size + 1
    training_args.save_steps = training_args.logging_steps
    training_args.seed = 42

    trainer = Trainer(model=model, args=training_args)
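
Example #1 only constructs the global tokenizer and trainer; how they are used afterwards is not part of the snippet. A minimal, hypothetical inference helper built on those globals (the function name and everything inside it are assumptions, not the original code):

import torch

def classify_sentence(text):
    # assumes load_classification_model() has been called, so the module-level
    # globals `tokenizer` and `trainer` exist
    inputs = tokenizer.encode_plus(text, return_tensors="pt")
    with torch.no_grad():
        outputs = trainer.model(**inputs)
    logits = outputs[0]  # works whether a tuple or a ModelOutput is returned
    return int(torch.argmax(logits, dim=-1))
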
Example #2
    def fit(self, train_df, dev_df):
        """
        Fit the model on the train set; validation is done using the dev set.

        Parameters
        ----------
        :param train_df: dataframe
            a pandas dataframe containing data to be trained on

        :param dev_df: dataframe
            a pandas dataframe containing data to validate on

        :return: None
            all relevant results are saved under the location provided for saving the model.
            A prediction can be run afterwards.
        """
        train_labels = Counter(train_df[self.label_col_name]).keys()
        num_labels = len(train_labels)
        dev_labels = Counter(dev_df[self.label_col_name]).keys()
        if num_labels != len(dev_labels):
            raise IOError("train and dev datasets contain different number of labels")
        # creating a DF for train/test with relevant columns.
        # Not clear why the 'alpha' column is needed, but as written here
        # (https://towardsdatascience.com/https-medium-com-chaturangarajapakshe-text-classification-with-transformer-models-d370944b50ca) - it is required
        train_df = pd.DataFrame({
            'id': range(len(train_df)),
            'label': train_df[self.label_col_name],
            'alpha': ['a'] * train_df.shape[0],
            'text': train_df["text"].replace(r'\n', ' ', regex=True)
        })

        dev_df = pd.DataFrame({
            'id': range(len(dev_df)),
            'label': dev_df[self.label_col_name],
            'alpha': ['a'] * dev_df.shape[0],
            'text': dev_df["text"].replace(r'\n', ' ', regex=True)
        })
        # saving the DF to the new/old folder
        train_df.to_csv(os.path.join(self.saving_data_folder, "train.tsv"),
                        index=False, columns=train_df.columns, sep='\t', header=False)
        dev_df.to_csv(os.path.join(self.saving_data_folder, "dev.tsv"),
                      index=False, columns=dev_df.columns, sep='\t', header=False)

        config = AutoConfig.from_pretrained(self.model_name, num_labels=num_labels,
                                            output_attentions=True)  # needed for the visualizations
        # loading the actual model to memory
        model = BertForSequenceClassification.from_pretrained(self.model_name, config=config)

        # Now we need to convert the examples in the dataset to features that the model can understand
        # this is a ready-made class provided by HuggingFace
        train_dataset = SingleSentenceClassificationProcessor(mode='classification')
        dev_dataset = SingleSentenceClassificationProcessor(mode='classification')

        # now adding examples (from the DFs we created earlier) to the processor objects created above
        _ = train_dataset.add_examples(texts_or_text_and_labels=train_df['text'], labels=train_df[self.label_col_name],
                                       overwrite_examples=True)
        _ = dev_dataset.add_examples(texts_or_text_and_labels=dev_df['text'], labels=dev_df[self.label_col_name],
                                     overwrite_examples=True)

        train_features = train_dataset.get_features(tokenizer=self.tokenizer, max_length=self.max_length)
        test_features = dev_dataset.get_features(tokenizer=self.tokenizer, max_length=self.max_length)
        training_args = TrainingArguments("./train")

        training_args.do_train = True
        # setting the params of the BERT classifier
        for cur_param in self.bert_model_params.keys():
            try:
                # values stored as strings are eval'ed to their Python type;
                # already-typed values raise TypeError and are assigned as-is below
                training_args.__dict__[cur_param] = eval(self.bert_model_params[cur_param])
            except TypeError:
                training_args.__dict__[cur_param] = self.bert_model_params[cur_param]
        training_args.logging_steps = (len(train_features) - 1) // training_args.per_gpu_train_batch_size + 1
        training_args.save_steps = training_args.logging_steps
        training_args.output_dir = self.saving_model_folder
        training_args.eval_steps = 100
        # training_args.logging_dir can also be a "gs://" path, since torch.utils.tensorboard's SummaryWriter supports Google Cloud Storage

        trainer = Trainer(model=model,
                          args=training_args,
                          train_dataset=train_features,
                          eval_dataset=test_features,
                          compute_metrics=self.compute_metrics)
        trainer.train()
        # saving the model
        self.save_model(model=trainer.model, folder_name='bert_based_model')
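
The Trainer above receives a self.compute_metrics callback that is not shown in the snippet. A minimal sketch of such a callback, assuming an accuracy-only metric (only the name matches the attribute used above; the body is an assumption):

import numpy as np
from sklearn.metrics import accuracy_score
from transformers import EvalPrediction

def compute_metrics(p: EvalPrediction):
    # p.predictions holds the raw logits; argmax over the label axis gives class ids
    preds = np.argmax(p.predictions, axis=1)
    return {"acc": accuracy_score(p.label_ids, preds)}
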
Example #3
def generate_training_args(args, inoculation_step):
    training_args = TrainingArguments("tmp_trainer")
    training_args.no_cuda = args.no_cuda
    training_args.seed = args.seed
    training_args.do_train = args.do_train
    training_args.do_eval = args.do_eval
    training_args.output_dir = os.path.join(args.output_dir, str(inoculation_step)+"-sample")
    training_args.evaluation_strategy = args.evaluation_strategy # evaluation is done after each epoch
    training_args.metric_for_best_model = args.metric_for_best_model
    training_args.greater_is_better = args.greater_is_better
    training_args.logging_dir = args.logging_dir
    training_args.task_name = args.task_name
    training_args.learning_rate = args.learning_rate
    training_args.per_device_train_batch_size = args.per_device_train_batch_size
    training_args.per_device_eval_batch_size = args.per_device_eval_batch_size
    training_args.num_train_epochs = args.num_train_epochs # this is the maximum num_train_epochs, we set this to be 100.
    training_args.eval_steps = args.eval_steps
    training_args.logging_steps = args.logging_steps
    training_args.load_best_model_at_end = args.load_best_model_at_end
    if args.save_total_limit != -1:
        # only set if it is specified
        training_args.save_total_limit = args.save_total_limit
    import datetime
    date_time = "{}-{}".format(datetime.datetime.now().month, datetime.datetime.now().day)
    run_name = "{0}_{1}_{2}_{3}_mlen_{4}_lr_{5}_seed_{6}_metrics_{7}".format(
        args.run_name,
        args.task_name,
        args.model_type,
        date_time,
        args.max_seq_length,
        args.learning_rate,
        args.seed,
        args.metric_for_best_model
    )
    training_args.run_name = run_name
    training_args_dict = training_args.to_dict()
    # rename the private "_n_gpu" key so the dict can be re-parsed by HfArgumentParser below
    _n_gpu = training_args_dict["_n_gpu"]
    del training_args_dict["_n_gpu"]
    training_args_dict["n_gpu"] = _n_gpu
    hf_parser = HfArgumentParser(TrainingArguments)
    training_args = hf_parser.parse_dict(training_args_dict)[0]

    if args.model_path == "":
        args.model_path = args.model_type
        if args.model_type == "":
            assert False # you have to provide one of them.
    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )

    # Set the verbosity of the Transformers logger to INFO (on the main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")
    return training_args
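
generate_training_args reads a long list of attributes from the args object it receives, and relies on module-level helpers (set_seed, is_main_process, logger) being imported; how that args object is built is not shown. A hypothetical call, using argparse.Namespace purely for illustration (all values are placeholders):

from argparse import Namespace

args = Namespace(
    no_cuda=False, seed=42, do_train=True, do_eval=True,
    output_dir="outputs", evaluation_strategy="epoch",
    metric_for_best_model="accuracy", greater_is_better=True,
    logging_dir="logs", task_name="sst2", learning_rate=2e-5,
    per_device_train_batch_size=16, per_device_eval_batch_size=16,
    num_train_epochs=100, eval_steps=100, logging_steps=100,
    load_best_model_at_end=True, save_total_limit=-1,
    run_name="baseline", model_type="bert-base-uncased", model_path="",
    max_seq_length=128,
)
training_args = generate_training_args(args, inoculation_step=1000)
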
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                for key in sorted(result.keys()):
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results


if __name__ == "__main__":
    outputDir = 'data/output/temp'
    trainFile = 'data/pmt.sample.lm'

    modelArgs = ModelArguments()
    modelArgs.model_name_or_path = 'gpt2'
    modelArgs.model_type = 'gpt2'
    dataArgs = DataTrainingArguments()
    dataArgs.train_data_file = trainFile
    dataArgs.line_by_line = True
    trainArgs = TrainingArguments(output_dir=outputDir)
    trainArgs.do_train = True
    trainArgs.per_device_train_batch_size = 1
    trainArgs.save_total_limit = 5
    trainArgs.num_train_epochs = 1
    process(modelArgs, dataArgs, trainArgs)
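
The process() function called here is not part of the snippet. A rough sketch of what it might do, assuming a run_language_modeling-style recipe; every name inside the function body is an assumption, not the original implementation:

from transformers import (AutoModelWithLMHead, AutoTokenizer,
                          DataCollatorForLanguageModeling,
                          LineByLineTextDataset, Trainer)

def process(model_args, data_args, training_args):
    # hypothetical re-implementation: load model and tokenizer, build the
    # line-by-line dataset, then train a causal LM with the given arguments
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelWithLMHead.from_pretrained(model_args.model_name_or_path)
    dataset = LineByLineTextDataset(tokenizer=tokenizer,
                                    file_path=data_args.train_data_file,
                                    block_size=128)
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    trainer = Trainer(model=model, args=training_args,
                      data_collator=collator, train_dataset=dataset)
    trainer.train()
    trainer.save_model()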