Example #1
def load_classification_model():
    global trainer
    global tokenizer
    mod = 'mtn_models/pytorch_model.bin'
    tok = 'mtn_models/vocab.txt'
    conf = 'mtn_models/config.json'
    tokenizer = BertTokenizer.from_pretrained(tok,
                                              do_lower_case=False,
                                              do_basic_tokenize=True,
                                              never_split=never_split_tokens,
                                              truncation=True)
    config = PretrainedConfig.from_pretrained(conf, num_labels=6)
    model = BertForSequenceClassification.from_pretrained(mod, config=config)

    training_args = TrainingArguments("./train")

    training_args.do_train = True
    training_args.evaluate_during_training = True
    training_args.adam_epsilon = 1e-8
    training_args.learning_rate = 2e-5
    training_args.warmup_steps = 0
    training_args.per_gpu_train_batch_size = 16
    training_args.per_gpu_eval_batch_size = 16
    training_args.num_train_epochs = 3
    #training_args.logging_steps = (len(train_features) - 1) // training_args.per_gpu_train_batch_size + 1
    training_args.save_steps = training_args.logging_steps
    training_args.seed = 42

    trainer = Trainer(model=model, args=training_args)
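
The function above relies on imports and a never_split_tokens list defined elsewhere in its module; a minimal sketch of what it assumes (the token list here is only a placeholder):

from transformers import (BertForSequenceClassification, BertTokenizer,
                          PretrainedConfig, Trainer, TrainingArguments)

# Placeholder for the tokens the tokenizer must never split; the real list
# comes from the surrounding module, not from this example.
never_split_tokens = ["[CLS]", "[SEP]", "[MASK]"]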
Example #2
 def init_training_args(self, model_path: str) -> TrainingArguments:
     r"""
     Construct the training arguments.
     """
     training_args = TrainingArguments(output_dir=model_path)
     training_args.logging_steps = 5000
     training_args.save_steps = 5000
     training_args.learning_rate = 2e-5
     training_args.num_train_epochs = 3
     training_args.per_device_train_batch_size = 32
     training_args.fp16 = self.fp16
     training_args.fp16_opt_level = "O1"
     return training_args
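
Passing the same fields to the TrainingArguments constructor is an equivalent, slightly more idiomatic form of this configuration (a sketch; the output path and fp16=False are placeholders):

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./output/my_model",   # placeholder path
    logging_steps=5000,
    save_steps=5000,
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    fp16=False,                       # stands in for self.fp16
    fp16_opt_level="O1",
)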
Example #3
    def fit(self, train_df, dev_df):
        """
        fitting the model based on the train set. validation is done using the dev set

        Parameters
        ----------
        :param train_df: dataframe
            a pandas dataframe containing data to be trained on

        :param dev_df: dataframe
            a pandas dataframe containing data to validate on

        :return: None
            all relevant results are saved under the location provided for saving the model.
            A prediction can then be made.
        """
        train_labels = Counter(train_df[self.label_col_name]).keys()
        num_labels = len(train_labels)
        dev_labels = Counter(dev_df[self.label_col_name]).keys()
        if num_labels != len(dev_labels):
            raise IOError("train and dev datasets contain different number of labels")
        # creating a DF for train/test with relevant columns.
        # Not clear why the 'alpha' column is needed, but as written here
        # (https://towardsdatascience.com/https-medium-com-chaturangarajapakshe-text-classification-with-transformer-models-d370944b50ca) - it is required
        train_df = pd.DataFrame({
            'id': range(len(train_df)),
            'label': train_df[self.label_col_name],
            'alpha': ['a'] * train_df.shape[0],
            'text': train_df["text"].replace(r'\n', ' ', regex=True)
        })

        dev_df = pd.DataFrame({
            'id': range(len(dev_df)),
            'label': dev_df[self.label_col_name],
            'alpha': ['a'] * dev_df.shape[0],
            'text': dev_df["text"].replace(r'\n', ' ', regex=True)
        })
        # saving the DF to the new/old folder
        train_df.to_csv(os.path.join(self.saving_data_folder, "train.tsv"),
                        index=False, columns=train_df.columns, sep='\t', header=False)
        dev_df.to_csv(os.path.join(self.saving_data_folder, "dev.tsv"),
                      index=False, columns=dev_df.columns, sep='\t', header=False)

        config = AutoConfig.from_pretrained(self.model_name, num_labels=num_labels,
                                            output_attentions=True)  ##needed for the visualizations
        # loading the actual model to memory
        model = BertForSequenceClassification.from_pretrained(self.model_name, config=config)

        # Now we need to convert the examples in the dataset to features that the model can understand
        # this is a ready made class, provided by HuggingFace
        train_dataset = SingleSentenceClassificationProcessor(mode='classification')
        dev_dataset = SingleSentenceClassificationProcessor(mode='classification')

        # now adding examples (from the DF we created earlier) to the processor objects created above
        _ = train_dataset.add_examples(texts_or_text_and_labels=train_df['text'], labels=train_df[self.label_col_name],
                                       overwrite_examples=True)
        _ = dev_dataset.add_examples(texts_or_text_and_labels=dev_df['text'], labels=dev_df[self.label_col_name],
                                     overwrite_examples=True)

        train_features = train_dataset.get_features(tokenizer=self.tokenizer, max_length=self.max_length)
        test_features = dev_dataset.get_features(tokenizer=self.tokenizer, max_length=self.max_length)
        training_args = TrainingArguments("./train")

        training_args.do_train = True
        # setting the params of the BERT classifier
        for cur_param in self.bert_model_params.keys():
            try:
                training_args.__dict__[cur_param] = eval(self.bert_model_params[cur_param])
            except TypeError:
                training_args.__dict__[cur_param] = self.bert_model_params[cur_param]
        training_args.logging_steps = (len(train_features) - 1) // training_args.per_gpu_train_batch_size + 1
        training_args.save_steps = training_args.logging_steps
        training_args.output_dir = self.saving_model_folder
        training_args.eval_steps = 100
        # training_args.logging_dir can point to a "gs://" path; torch.utils.tensorboard's SummaryWriter supports Google Cloud Storage

        trainer = Trainer(model=model,
                          args=training_args,
                          train_dataset=train_features,
                          eval_dataset=test_features,
                          compute_metrics=self.compute_metrics)
        trainer.train()
        # saving the model
        self.save_model(model=trainer.model, folder_name='bert_based_model')
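
The example passes self.compute_metrics to the Trainer without showing it; a minimal sketch of such a function, assuming scikit-learn metrics (the metric names are illustrative):

import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    # eval_pred carries the raw predictions and the gold label ids;
    # with output_attentions=True the logits are the first element of a tuple
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, preds),
            "macro_f1": f1_score(labels, preds, average="macro")}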
Example #4
    def fit(self, train_df, dev_df):
        """
        fitting the model based on the train set. validation is done using the dev set

        Parameters
        ----------
        :param train_df: dataframe
            a pandas dataframe containing data to be trained on

        :param dev_df: dataframe
            a pandas dataframe containing data to validate on

        :return: None
            all relevant results are saved under the location provided for saving the model.
            A prediction can then be made.
        """
        train_labels = Counter(train_df[self.label_col_name]).keys()
        num_labels = len(train_labels)
        dev_labels = Counter(dev_df[self.label_col_name]).keys()
        if num_labels != len(dev_labels):
            raise IOError(
                "train and dev datasets contain different number of labels")
        # creating a DF for train/test with relevant columns.
        # Not clear why the 'alpha' column is needed, but as written here
        # (https://towardsdatascience.com/https-medium-com-chaturangarajapakshe-text-classification-with-transformer-models-d370944b50ca) - it is required
        train_df = pd.DataFrame({
            'id': range(len(train_df)),
            'label': train_df[self.label_col_name],
            'alpha': ['a'] * train_df.shape[0],
            'text': train_df["text"].replace(r'\n', ' ', regex=True)
        })

        dev_df = pd.DataFrame({
            'id': range(len(dev_df)),
            'label': dev_df[self.label_col_name],
            'alpha': ['a'] * dev_df.shape[0],
            'text': dev_df["text"].replace(r'\n', ' ', regex=True)
        })
        # saving the DF to the new/old folder
        train_df.to_csv(os.path.join(self.saving_data_folder, "train.tsv"),
                        index=False,
                        columns=train_df.columns,
                        sep='\t',
                        header=False)
        dev_df.to_csv(os.path.join(self.saving_data_folder, "dev.tsv"),
                      index=False,
                      columns=dev_df.columns,
                      sep='\t',
                      header=False)

        config = AutoConfig.from_pretrained(
            self.model_name, num_labels=num_labels,
            output_attentions=True)  ##needed for the visualizations
        # loading the actual model to memory
        model = BertForSequenceClassification.from_pretrained(self.model_name,
                                                              config=config)

        # Now we need to convert the examples in the dataset to features that the model can understand
        # this is a ready made class, provided by HuggingFace
        train_dataset = SingleSentenceClassificationProcessor(
            mode='classification')
        dev_dataset = SingleSentenceClassificationProcessor(
            mode='classification')

        # now adding examples (from the DF we created earlier) to the processor objects created above
        _ = train_dataset.add_examples(
            texts_or_text_and_labels=train_df['text'],
            labels=train_df[self.label_col_name],
            overwrite_examples=True)
        _ = dev_dataset.add_examples(texts_or_text_and_labels=dev_df['text'],
                                     labels=dev_df[self.label_col_name],
                                     overwrite_examples=True)

        train_features = train_dataset.get_features(tokenizer=self.tokenizer,
                                                    max_length=self.max_length)
        dev_features = dev_dataset.get_features(tokenizer=self.tokenizer,
                                                max_length=self.max_length)

        # idea about a self-trainer is taken from here - https://huggingface.co/transformers/main_classes/trainer.html
        class MyTrainer(Trainer):
            def __init__(self, loss_func=torch.nn.CrossEntropyLoss(),
                         **kwargs):
                self.loss_func = loss_func
                super().__init__(**kwargs)

            def compute_loss(self, model, inputs):
                labels = inputs.pop("labels")
                outputs = model(**inputs)
                logits = outputs[0]
                return self.loss_func(logits, labels)

        class FocalLoss(nn.modules.loss._WeightedLoss):
            def __init__(self, weight=None, gamma=2, reduction='mean'):
                super(FocalLoss, self).__init__(weight, reduction=reduction)
                self.gamma = gamma
                self.weight = weight  # weight parameter will act as the alpha parameter to balance class weights

            def forward(self, input, target):
                # per-sample cross entropy so the focal term weights each example,
                # then reduce according to self.reduction
                ce_loss = F.cross_entropy(input,
                                          target,
                                          reduction='none',
                                          weight=self.weight)
                pt = torch.exp(-ce_loss)
                focal_loss = (1 - pt) ** self.gamma * ce_loss
                if self.reduction == 'mean':
                    return focal_loss.mean()
                elif self.reduction == 'sum':
                    return focal_loss.sum()
                return focal_loss

        class_weights = compute_class_weight(class_weight='balanced',
                                             classes=np.unique(
                                                 list(train_labels)),
                                             y=train_df['label'])
        #my_loss_func = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights, dtype=torch.float))
        my_loss_func = FocalLoss(
            weight=torch.tensor(class_weights, dtype=torch.float))

        # how to define a trainer and all its arguments is taken from here - https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb
        args = TrainingArguments(
            "arabic_nlp_model",
            evaluation_strategy="epoch",
            #learning_rate=1e-5,
            learning_rate=1e-4,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=8,
            num_train_epochs=5,
            weight_decay=0.01,
            load_best_model_at_end=True,
            #metric_for_best_model="macro_f1_PN",
        )

        # setting the params of the BERT classifier
        for cur_param in self.bert_model_params.keys():
            try:
                args.__dict__[cur_param] = eval(
                    self.bert_model_params[cur_param])
            except TypeError:
                args.__dict__[cur_param] = self.bert_model_params[cur_param]
        args.logging_steps = (len(train_features) -
                              1) // args.per_device_train_batch_size + 1
        args.save_steps = args.logging_steps
        args.output_dir = self.saving_model_folder
        #training_args.compute_metrics = f1_score
        #training_args.compute_metrics = self.compute_metrics
        # training_args.logging_dir can point to a "gs://" path; torch.utils.tensorboard's SummaryWriter supports Google Cloud Storage

        trainer = MyTrainer(loss_func=my_loss_func,
                            model=model,
                            args=args,
                            train_dataset=train_features,
                            eval_dataset=dev_features,
                            compute_metrics=self.compute_metrics)

        #trainer = Trainer(model=model,
        #                  args=args,
        #                  train_dataset=train_features,
        #                 eval_dataset=dev_features,
        #                  #compute_metrics = compute_metrics)
        #                  compute_metrics=self.compute_metrics)
        trainer.train()
        # saving the model
        self.save_model(model=trainer.model)
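
A self-contained sketch of the class-weight step used above, with an illustrative label array standing in for train_df['label']:

import numpy as np
import torch
from sklearn.utils.class_weight import compute_class_weight

labels = np.array([0, 0, 0, 1, 2, 2])   # illustrative, imbalanced labels
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(labels),
                                     y=labels)
weight_tensor = torch.tensor(class_weights, dtype=torch.float)
print(weight_tensor)                     # rarer classes get larger weights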
            per_device_eval_batch_size=8,
            num_train_epochs=5,
            weight_decay=0.01,
            load_best_model_at_end=True,
            metric_for_best_model="macro_f1_PN",
        )

        # setting the params of the BERT classifier
        bert_model_params = config_dict['bert_model_params']
        bert_model_params['seed'] = config_dict['random_seed']
        for cur_param in bert_model_params.keys():
            try:
                args.__dict__[cur_param] = eval(bert_model_params[cur_param])
            except TypeError:
                args.__dict__[cur_param] = bert_model_params[cur_param]
        args.save_steps = args.logging_steps
        trainer = MyTrainer(
            model=loaded_model,
            args=args,
            train_dataset=dev_features,
            eval_dataset=dev_features,
            compute_metrics=BertBasedSentimentAnalyser.compute_metrics)
        # splitting the prediction set into bulks so we do not run into memory errors
        pred_bulks = 1001
        all_predictions = list()
        for cur_idx in range(int(len(dev_features) / pred_bulks) + 1):
            cur_bulk = dev_features[pred_bulks * cur_idx: pred_bulks * (cur_idx + 1)]
            trainer_predictions = trainer.predict(cur_bulk)
            all_predictions.append(trainer_predictions.predictions[0])
        # creating the predictions (proba)
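
A sketch of how the collected logit bulks might be turned into class probabilities; np.vstack and scipy's softmax are assumptions, not part of the original:

import numpy as np
from scipy.special import softmax

logits = np.vstack(all_predictions)      # (n_samples, num_labels)
probabilities = softmax(logits, axis=1)  # each row sums to 1
predicted_labels = probabilities.argmax(axis=1)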