def train(cfg):
    SEED = cfg.values.seed
    MODEL_NAME = cfg.values.model_name
    USE_KFOLD = cfg.values.val_args.use_kfold
    TRAIN_ONLY = cfg.values.train_only

    seed_everything(SEED)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # model_config_module = getattr(import_module('transformers'), cfg.values.model_arc + 'Config')
    model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 42

    whole_df = load_data("/opt/ml/input/data/train/train.tsv")
    additional_df = load_data("/opt/ml/input/data/train/additional_train.tsv")

    whole_label = whole_df['label'].values
    # additional_label = additional_df['label'].values

    if cfg.values.tokenizer_arc:
        tokenizer_module = getattr(import_module('transformers'),
                                   cfg.values.tokenizer_arc)
        tokenizer = tokenizer_module.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    early_stopping = EarlyStoppingCallback(early_stopping_patience=9999999,
                                           early_stopping_threshold=0.001)

    training_args = TrainingArguments(
        output_dir=cfg.values.train_args.output_dir,  # output directory
        save_total_limit=cfg.values.train_args.save_total_limit,  # maximum number of saved checkpoints
        save_steps=cfg.values.train_args.save_steps,  # model saving step
        num_train_epochs=cfg.values.train_args.num_epochs,  # total number of training epochs
        learning_rate=cfg.values.train_args.lr,  # learning rate
        per_device_train_batch_size=cfg.values.train_args.train_batch_size,  # batch size per device during training
        per_device_eval_batch_size=cfg.values.train_args.eval_batch_size,  # batch size for evaluation
        warmup_steps=cfg.values.train_args.warmup_steps,  # number of warmup steps for the learning rate scheduler
        weight_decay=cfg.values.train_args.weight_decay,  # strength of weight decay
        max_grad_norm=cfg.values.train_args.max_grad_norm,
        logging_dir=cfg.values.train_args.logging_dir,  # directory for storing logs
        logging_steps=cfg.values.train_args.logging_steps,  # log saving step
        evaluation_strategy=cfg.values.train_args.evaluation_strategy,  # evaluation strategy to adopt during training
        # `no`: no evaluation during training.
        # `steps`: evaluate every `eval_steps`.
        # `epoch`: evaluate at the end of each epoch.
        eval_steps=cfg.values.train_args.eval_steps,  # evaluation step
        dataloader_num_workers=4,
        seed=SEED,
        label_smoothing_factor=cfg.values.train_args.label_smoothing_factor,
        load_best_model_at_end=True,
        # metric_for_best_model='accuracy'
    )

    if USE_KFOLD:
        kfold = StratifiedKFold(n_splits=cfg.values.val_args.num_k)

        k = 1
        for train_idx, val_idx in kfold.split(whole_df, whole_label):
            print('\n')
            cpprint('=' * 15 + f'{k}-Fold Cross Validation' + '=' * 15)
            train_df = whole_df.iloc[train_idx]
            # train_df = pd.concat((train_df, additional_df))
            val_df = whole_df.iloc[val_idx]

            if cfg.values.model_arc == 'Roberta':
                tokenized_train = roberta_tokenized_dataset(
                    train_df, tokenizer)
                tokenized_val = roberta_tokenized_dataset(val_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(train_df, tokenizer)
                tokenized_val = tokenized_dataset(val_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train,
                                          train_df['label'].values)
            RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

            try:
                if cfg.values.model_name == 'Bert':
                    model = BertForSequenceClassification.from_pretrained(
                        MODEL_NAME, config=model_config)
                else:
                    model = AutoModelForSequenceClassification.from_pretrained(
                        MODEL_NAME, config=model_config)
            except Exception:  # fall back to the architecture-specific class
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(
                    import_module('transformers'),
                    cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME,
                                                     config=model_config)

            model.to(device)

            training_args.output_dir = cfg.values.train_args.output_dir + f'/{k}fold'
            training_args.logging_dir = cfg.values.train_args.output_dir + f'/{k}fold'

            optimizer = MADGRAD(model.parameters(),
                                lr=training_args.learning_rate)
            total_step = (len(RE_train_dataset) /
                          training_args.per_device_train_batch_size *
                          training_args.num_train_epochs)
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=RE_train_dataset,  # training dataset
                eval_dataset=RE_val_dataset,  # evaluation dataset
                compute_metrics=compute_metrics,  # define metrics function
                optimizers=optimizers,
                # callbacks=[early_stopping]
            )
            k += 1
            # train model
            trainer.train()

    else:
        cpprint('=' * 20 + 'START TRAINING' + '=' * 20)
        if not TRAIN_ONLY:
            train_df, val_df = train_test_split(
                whole_df,
                test_size=cfg.values.val_args.test_size,
                random_state=SEED)
            # train_df = pd.concat((train_df, additional_df))

            if cfg.values.model_arc == 'Roberta':
                tokenized_train = roberta_tokenized_dataset(
                    train_df, tokenizer)
                tokenized_val = roberta_tokenized_dataset(val_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(train_df, tokenizer)
                tokenized_val = tokenized_dataset(val_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train,
                                          train_df['label'].values)
            RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

            try:
                if cfg.values.model_name == 'Bert':
                    model = BertForSequenceClassification.from_pretrained(
                        MODEL_NAME, config=model_config)
                else:
                    model = AutoModelForSequenceClassification.from_pretrained(
                        MODEL_NAME, config=model_config)
            except Exception:  # fall back to the architecture-specific class
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(
                    import_module('transformers'),
                    cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME,
                                                     config=model_config)

            model.to(device)

            optimizer = transformers.AdamW(model.parameters(),
                                           lr=training_args.learning_rate)
            total_step = (len(RE_train_dataset) /
                          training_args.per_device_train_batch_size *
                          training_args.num_train_epochs)
            # scheduler = transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=total_step)
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=RE_train_dataset,  # training dataset
                eval_dataset=RE_val_dataset,  # evaluation dataset
                compute_metrics=compute_metrics,  # define metrics function
                optimizers=optimizers,
                callbacks=[early_stopping])

            # train model
            trainer.train()

        else:
            training_args.evaluation_strategy = 'no'

            if cfg.values.model_arc == 'Roberta':
                print('Roberta')
                tokenized_train = roberta_tokenized_dataset(
                    whole_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(whole_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train,
                                          whole_df['label'].values)

            try:
                model = AutoModelForSequenceClassification.from_pretrained(
                    MODEL_NAME, config=model_config)
            except Exception:  # fall back to the architecture-specific class
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(
                    import_module('transformers'),
                    cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME,
                                                     config=model_config)

            model.to(device)

            training_args.output_dir = cfg.values.train_args.output_dir + '/only_train'
            training_args.logging_dir = cfg.values.train_args.output_dir + '/only_train'

            optimizer = AdamP(model.parameters(),
                              lr=training_args.learning_rate)
            total_step = (len(RE_train_dataset) /
                          training_args.per_device_train_batch_size *
                          training_args.num_train_epochs)
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=RE_train_dataset,  # training dataset
                optimizers=optimizers,
                # callbacks=[early_stopping]
            )

            # train model
            trainer.train()
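# --- Usage sketch (not part of the original snippet) ---
# A minimal sketch of the config object train(cfg) expects. The real project
# presumably loads this from a YAML file; the nested layout below is
# reconstructed from the cfg.values.* accesses above, and every concrete value
# is a placeholder rather than the authors' setting.
from types import SimpleNamespace

cfg_sketch = SimpleNamespace(values=SimpleNamespace(
    seed=42,
    model_name='bert-base-multilingual-cased',  # placeholder checkpoint
    model_arc='Bert',
    tokenizer_arc=None,          # falsy -> falls back to AutoTokenizer
    train_only=False,
    val_args=SimpleNamespace(use_kfold=True, num_k=5, test_size=0.2),
    train_args=SimpleNamespace(
        output_dir='./results',
        save_total_limit=3,
        save_steps=500,
        num_epochs=4,
        lr=5e-5,
        train_batch_size=16,
        eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        max_grad_norm=1.0,
        logging_dir='./logs',
        logging_steps=100,
        evaluation_strategy='steps',
        eval_steps=100,
        label_smoothing_factor=0.0,
    ),
))
# train(cfg_sketch) would then run 5-fold cross-validated training with these placeholders.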
Example #2
    def fit(self, train_df, dev_df):
        """
        Fit the model on the train set; validation is done using the dev set.

        Parameters
        ----------
        :param train_df: dataframe
            a pandas dataframe containing data to be trained on

        :param dev_df: dataframe
            a pandas dataframe containing data to validate on

        :return: None
            all relevant results are saved under the location provided for saving the model.
            A prediction can then be made.
        """
        train_labels = Counter(train_df[self.label_col_name]).keys()
        num_labels = len(train_labels)
        dev_labels = Counter(dev_df[self.label_col_name]).keys()
        if num_labels != len(dev_labels):
            raise IOError("train and dev datasets contain different number of labels")
        # creating a DF for train/test with relevant columns.
        # Not clear why the 'alpha' column is needed, but as written here
        # (https://towardsdatascience.com/https-medium-com-chaturangarajapakshe-text-classification-with-transformer-models-d370944b50ca) - it is required
        train_df = pd.DataFrame({
            'id': range(len(train_df)),
            'label': train_df[self.label_col_name],
            'alpha': ['a'] * train_df.shape[0],
            'text': train_df["text"].replace(r'\n', ' ', regex=True)
        })

        dev_df = pd.DataFrame({
            'id': range(len(dev_df)),
            'label': dev_df[self.label_col_name],
            'alpha': ['a'] * dev_df.shape[0],
            'text': dev_df["text"].replace(r'\n', ' ', regex=True)
        })
        # saving the DF to the new/old folder
        train_df.to_csv(os.path.join(self.saving_data_folder, "train.tsv"),
                        index=False, columns=train_df.columns, sep='\t', header=False)
        dev_df.to_csv(os.path.join(self.saving_data_folder, "dev.tsv"),
                      index=False, columns=dev_df.columns, sep='\t', header=False)

        config = AutoConfig.from_pretrained(self.model_name, num_labels=num_labels,
                                            output_attentions=True)  # needed for the visualizations
        # loading the actual model to memory
        model = BertForSequenceClassification.from_pretrained(self.model_name, config=config)

        # Now we need to convert the examples in the dataset to features that the model can understand
        # this is a ready made class, provided by HuggingFace
        train_dataset = SingleSentenceClassificationProcessor(mode='classification')
        dev_dataset = SingleSentenceClassificationProcessor(mode='classification')

        # now add examples (from the DataFrames created earlier) to the processor objects created above
        _ = train_dataset.add_examples(texts_or_text_and_labels=train_df['text'], labels=train_df[self.label_col_name],
                                       overwrite_examples=True)
        _ = dev_dataset.add_examples(texts_or_text_and_labels=dev_df['text'], labels=dev_df[self.label_col_name],
                                     overwrite_examples=True)

        train_features = train_dataset.get_features(tokenizer=self.tokenizer, max_length=self.max_length)
        test_features = dev_dataset.get_features(tokenizer=self.tokenizer, max_length=self.max_length)
        training_args = TrainingArguments("./train")

        training_args.do_train = True
        # setting the params of the BERT classifier
        for cur_param in self.bert_model_params.keys():
            try:
                training_args.__dict__[cur_param] = eval(self.bert_model_params[cur_param])
            except TypeError:
                training_args.__dict__[cur_param] = self.bert_model_params[cur_param]
        training_args.logging_steps = (len(train_features) - 1) // training_args.per_gpu_train_batch_size + 1
        training_args.save_steps = training_args.logging_steps
        training_args.output_dir = self.saving_model_folder
        training_args.eval_steps = 100
        # training_args.logging_dir = "gs://" from torch.utils.tensorboard import SummaryWriter supports google cloud storage

        trainer = Trainer(model=model,
                          args=training_args,
                          train_dataset=train_features,
                          eval_dataset=test_features,
                          compute_metrics=self.compute_metrics)
        trainer.train()
        # saving the model
        self.save_model(model=trainer.model, folder_name='bert_based_model')
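# --- Input sketch (not part of the original snippet) ---
# A toy illustration of the DataFrames fit() expects: a "text" column plus a
# label column whose name is stored in self.label_col_name (assumed to be
# 'label' here; the wrapper class that owns fit() is not shown in this excerpt).
import pandas as pd

toy_train_df = pd.DataFrame({
    'label': ['pos', 'neg', 'pos', 'neg'],
    'text': ['good movie', 'bad movie', 'great plot', 'weak plot'],
})
toy_dev_df = pd.DataFrame({
    'label': ['pos', 'neg'],
    'text': ['nice film', 'poor film'],
})
# classifier.fit(toy_train_df, toy_dev_df) would write train.tsv / dev.tsv under
# classifier.saving_data_folder and fine-tune the BERT model ("classifier" being
# a hypothetical instance of the unshown wrapper class).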
def generate_training_args(args, inoculation_step):
    training_args = TrainingArguments("tmp_trainer")
    training_args.no_cuda = args.no_cuda
    training_args.seed = args.seed
    training_args.do_train = args.do_train
    training_args.do_eval = args.do_eval
    training_args.output_dir = os.path.join(args.output_dir, str(inoculation_step)+"-sample")
    training_args.evaluation_strategy = args.evaluation_strategy # evaluation is done after each epoch
    training_args.metric_for_best_model = args.metric_for_best_model
    training_args.greater_is_better = args.greater_is_better
    training_args.logging_dir = args.logging_dir
    training_args.task_name = args.task_name
    training_args.learning_rate = args.learning_rate
    training_args.per_device_train_batch_size = args.per_device_train_batch_size
    training_args.per_device_eval_batch_size = args.per_device_eval_batch_size
    training_args.num_train_epochs = args.num_train_epochs # this is the maximum num_train_epochs, we set this to be 100.
    training_args.eval_steps = args.eval_steps
    training_args.logging_steps = args.logging_steps
    training_args.load_best_model_at_end = args.load_best_model_at_end
    if args.save_total_limit != -1:
        # only set if it is specified
        training_args.save_total_limit = args.save_total_limit
    import datetime
    date_time = "{}-{}".format(datetime.datetime.now().month, datetime.datetime.now().day)
    run_name = "{0}_{1}_{2}_{3}_mlen_{4}_lr_{5}_seed_{6}_metrics_{7}".format(
        args.run_name,
        args.task_name,
        args.model_type,
        date_time,
        args.max_seq_length,
        args.learning_rate,
        args.seed,
        args.metric_for_best_model
    )
    training_args.run_name = run_name
    training_args_dict = training_args.to_dict()
    # for PR
    _n_gpu = training_args_dict["_n_gpu"]
    del training_args_dict["_n_gpu"]
    training_args_dict["n_gpu"] = _n_gpu
    HfParser = HfArgumentParser((TrainingArguments))
    training_args = HfParser.parse_dict(training_args_dict)[0]

    if args.model_path == "":
        args.model_path = args.model_type
        if args.model_type == "":
            raise ValueError("you have to provide either model_path or model_type")
    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")
    return training_args
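# --- Usage sketch (not part of the original snippet) ---
# A minimal argparse-style namespace matching the args.* fields that
# generate_training_args() reads above. All values are placeholders, not the
# authors' experiment settings.
from argparse import Namespace

args_sketch = Namespace(
    no_cuda=False, seed=42, do_train=True, do_eval=True,
    output_dir='./outputs', evaluation_strategy='epoch',
    metric_for_best_model='accuracy', greater_is_better=True,
    logging_dir='./logs', task_name='sst2',   # placeholder task name
    learning_rate=2e-5, per_device_train_batch_size=32,
    per_device_eval_batch_size=32, num_train_epochs=100,
    eval_steps=500, logging_steps=500, load_best_model_at_end=True,
    save_total_limit=-1,   # -1 keeps the Trainer default
    run_name='baseline', model_type='bert-base-uncased',
    max_seq_length=128, model_path='',
)
# training_args = generate_training_args(args_sketch, inoculation_step=1000)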
Example #4
def main(model_args, data_args, inf_args):
    model_name = model_args.model_name_or_path
    if model_name is None:
        model_name, model_args.model_name_or_path = get_recent_model()
    else:
        model_name = model_name.replace('/', '_')
    output_dir = f'./submit/{model_name}{model_args.suffix}/'
    # logging_dir = f'./logs/{model_name}{model_args.suffix}/'
    training_args = TrainingArguments(
        output_dir=output_dir,  # output directory
        do_predict=True,
        seed=42,
    )
    i = 0
    while os.path.exists(training_args.output_dir):
        training_args.output_dir = f'./submit/{model_name}{model_args.suffix}_{i}/'
        training_args.logging_dir = f'./logs/{model_name}{model_args.suffix}_{i}/'
        i += 1

    print(f"training Data : {training_args}")
    print(f"model Data : {model_args}")
    print(f"data : {data_args}")
    print(f"inference setting : {inf_args}")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    # Log the training/evaluation parameters
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)
    if training_args.do_predict:
        data_args.dataset_name = './data/test_dataset'

    datasets = load_from_disk(data_args.dataset_name)
    print(datasets)
    # Load pretrained model and tokenizer
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path, )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        use_fast=True,
    )
    model = AutoModelForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
    )

    # run passage retrieval if true
    if data_args.eval_retrieval:
        datasets = run_sparse_retrieval(datasets, training_args, inf_args)

    # eval or predict mrc model
    if training_args.do_eval or training_args.do_predict:
        run_mrc(data_args, training_args, model_args, datasets, tokenizer,
                model)
def main():
    parser = HfArgumentParser(
        (TrainingArgumentsInputs, DirectoryArgumentsInputs,
         TokenizerArgumentsInputs))
    train_args, dir_args, token_args = parser.parse_args_into_dataclasses()

    # Setup CUDA, GPU & distributed training
    if train_args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(train_args.local_rank)
        device = torch.device("cuda", train_args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        # n_gpu = 1

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO
        if train_args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, distributed training: %s, 16-bits training: %s",
        train_args.local_rank, device, bool(train_args.local_rank != -1),
        train_args.fp16)

    # Set seed
    set_seed(train_args.seed)

    # set output_dir
    output_dir = dir_args.output_dir + "/" + \
        dir_args.model_dir_or_name.replace('/', '_') + dir_args.suffix
    i = 0
    model_name = dir_args.model_dir_or_name.replace('/', '_')
    while os.path.exists(output_dir):
        output_dir = f'{dir_args.output_dir}/{model_name}{dir_args.suffix}_{i}/'
        i += 1
    training_args = TrainingArguments(
        output_dir=output_dir,
        save_total_limit=train_args.save_total_limit,
        # total number of training epochs
        num_train_epochs=train_args.epochs,
        learning_rate=train_args.learning_rate,
        per_device_train_batch_size=train_args.per_device_batch_size,
        per_device_eval_batch_size=train_args.per_device_batch_size,
        warmup_ratio=train_args.warmup_ratio,
        weight_decay=train_args.weight_decay,  # strength of weight decay
        evaluation_strategy='steps',  # evaluation strategy to adopt during training
        adam_epsilon=train_args.adam_epsilon,
        eval_steps=train_args.evaluation_step_ratio *
        train_args.per_device_batch_size,
        dataloader_num_workers=4,
        load_best_model_at_end=True,  # save_strategy, save_steps will be ignored
        metric_for_best_model="exact_match",  # eval_accuracy
        greater_is_better=True,  # set True if metric isn't loss
        label_smoothing_factor=0.5,
        fp16=train_args.fp16,
        fp16_opt_level=train_args.fp16_opt_level,
        do_train=True,
        do_eval=True,
        seed=train_args.seed,
        gradient_accumulation_steps=train_args.gradient_accumulation_steps,
        max_grad_norm=train_args.max_grad_norm,
        local_rank=train_args.local_rank,
        report_to=[])
    if dir_args.data_dir == "korquad":
        datasets = load_dataset('squad_kor_v1')
    else:
        datasets = load_from_disk(dir_args.data_dir)
    # slicing the dataset and then feeding it in raises an error -> a bug in datasets itself
    dataset_list = []
    if train_args.k_fold > 1:
        dataset_len = len(datasets)
        for i in range(train_args.k_fold):
            validation = datasets.select(
                range(int(dataset_len * (i / train_args.k_fold)),
                      int(dataset_len * ((i + 1) / train_args.k_fold))))
            dataset_train = pd.concat([
                pd.DataFrame(
                    datasets.select(
                        range(0, int(dataset_len * (i / train_args.k_fold))))),
                pd.DataFrame(
                    datasets.select(
                        range(
                            int(dataset_len * ((i + 1) / train_args.k_fold)),
                            dataset_len)))
            ], ignore_index=True)
            train = Dataset.from_pandas(dataset_train)
            dataset = DatasetDict({'train': train, 'validation': validation})
            dataset_list.append(dataset)

    elif 'validation' not in datasets.column_names:
        datasets = datasets.train_test_split(test_size=0.1)
        datasets = DatasetDict({
            'train': datasets['train'],
            'validation': datasets['test']
        })
        dataset_list.append(datasets)

    else:
        dataset_list.append(datasets)

    config = AutoConfig.from_pretrained(
        dir_args.config_dir
        if dir_args.config_dir else dir_args.model_dir_or_name, )
    tokenizer = AutoTokenizer.from_pretrained(
        dir_args.vocab_dir
        if dir_args.vocab_dir else dir_args.model_dir_or_name,
        use_fast=True,
    )

    model = AutoModelForQuestionAnswering.from_pretrained(
        dir_args.model_dir_or_name,
        from_tf=bool(".ckpt" in dir_args.model_dir_or_name),
        config=config,
    )

    print("Train Arguments :")
    print(training_args)

    print("Directory Arguments:")
    print(dir_args)

    print("Tokenizer Arguments:")
    print(token_args)

    root_dir = output_dir
    for idx, dataset in enumerate(dataset_list):
        print(f"processing {idx}-fold")
        training_args.output_dir = root_dir + f'/{idx}'
        run_mrc(training_args, dir_args, token_args, dataset, tokenizer, model)
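# --- Fold-index sketch (not part of the original snippet) ---
# A standalone illustration of the k-fold slicing used in main() above: for fold i
# out of k, rows [i*n/k, (i+1)*n/k) become the validation split and the remaining
# rows the train split. The toy Dataset below is illustrative only.
from datasets import Dataset

toy = Dataset.from_dict({'idx': list(range(10))})
k_fold, n = 5, len(toy)
for i in range(k_fold):
    lo, hi = int(n * i / k_fold), int(n * (i + 1) / k_fold)
    val_idx = toy.select(range(lo, hi))['idx']
    train_idx = toy.select(list(range(0, lo)) + list(range(hi, n)))['idx']
    print(f"fold {i}: train={train_idx}, validation={val_idx}")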
    def fit(self, train_df, dev_df):
        """
        Fit the model on the train set; validation is done using the dev set.

        Parameters
        ----------
        :param train_df: dataframe
            a pandas dataframe containing data to be trained on

        :param dev_df: dataframe
            a pandas dataframe containing data to validate on

        :return: None
            all relevant results are saved under the location provided for saving the model.
            A prediction can then be made.
        """
        train_labels = Counter(train_df[self.label_col_name]).keys()
        num_labels = len(train_labels)
        dev_labels = Counter(dev_df[self.label_col_name]).keys()
        if num_labels != len(dev_labels):
            raise IOError(
                "train and dev datasets contain different number of labels")
        # creating a DF for train/test with relevant columns.
        # Not clear why the 'alpha' column is needed, but as written here
        # (https://towardsdatascience.com/https-medium-com-chaturangarajapakshe-text-classification-with-transformer-models-d370944b50ca) - it is required
        train_df = pd.DataFrame({
            'id': range(len(train_df)),
            'label': train_df[self.label_col_name],
            'alpha': ['a'] * train_df.shape[0],
            'text': train_df["text"].replace(r'\n', ' ', regex=True)
        })

        dev_df = pd.DataFrame({
            'id': range(len(dev_df)),
            'label': dev_df[self.label_col_name],
            'alpha': ['a'] * dev_df.shape[0],
            'text': dev_df["text"].replace(r'\n', ' ', regex=True)
        })
        # saving the DF to the new/old folder
        train_df.to_csv(os.path.join(self.saving_data_folder, "train.tsv"),
                        index=False,
                        columns=train_df.columns,
                        sep='\t',
                        header=False)
        dev_df.to_csv(os.path.join(self.saving_data_folder, "dev.tsv"),
                      index=False,
                      columns=dev_df.columns,
                      sep='\t',
                      header=False)

        config = AutoConfig.from_pretrained(
            self.model_name, num_labels=num_labels,
            output_attentions=True)  # needed for the visualizations
        # loading the actual model to memory
        model = BertForSequenceClassification.from_pretrained(self.model_name,
                                                              config=config)

        # Now we need to convert the examples in the dataset to features that the model can understand
        # this is a ready made class, provided by HuggingFace
        train_dataset = SingleSentenceClassificationProcessor(
            mode='classification')
        dev_dataset = SingleSentenceClassificationProcessor(
            mode='classification')

        # now add examples (from the DataFrames created earlier) to the processor objects created above
        _ = train_dataset.add_examples(
            texts_or_text_and_labels=train_df['text'],
            labels=train_df[self.label_col_name],
            overwrite_examples=True)
        _ = dev_dataset.add_examples(texts_or_text_and_labels=dev_df['text'],
                                     labels=dev_df[self.label_col_name],
                                     overwrite_examples=True)

        train_features = train_dataset.get_features(tokenizer=self.tokenizer,
                                                    max_length=self.max_length)
        dev_features = dev_dataset.get_features(tokenizer=self.tokenizer,
                                                max_length=self.max_length)

        # the idea of subclassing Trainer with a custom loss is taken from here - https://huggingface.co/transformers/main_classes/trainer.html
        class MyTrainer(Trainer):
            def __init__(self, loss_func=torch.nn.CrossEntropyLoss(),
                         **kwargs):
                self.loss_func = loss_func
                super().__init__(**kwargs)

            def compute_loss(self, model, inputs):
                labels = inputs.pop("labels")
                outputs = model(**inputs)
                logits = outputs[0]
                return self.loss_func(logits, labels)

        class FocalLoss(nn.modules.loss._WeightedLoss):
            def __init__(self, weight=None, gamma=2, reduction='mean'):
                super(FocalLoss, self).__init__(weight, reduction=reduction)
                self.gamma = gamma
                self.weight = weight  # weight parameter will act as the alpha parameter to balance class weights

            def forward(self, input, target):
                ce_loss = F.cross_entropy(input,
                                          target,
                                          reduction=self.reduction,
                                          weight=self.weight)
                pt = torch.exp(-ce_loss)
                focal_loss = ((1 - pt)**self.gamma * ce_loss).mean()
                return focal_loss

        class_weights = compute_class_weight(class_weight='balanced',
                                             classes=np.unique(
                                                 list(train_labels)),
                                             y=train_df['label'])
        #my_loss_func = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights, dtype=torch.float))
        my_loss_func = FocalLoss(
            weight=torch.tensor(class_weights, dtype=torch.float))

        # how to define a trainer and all its arguments is taken from here - https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb
        args = TrainingArguments(
            "arabic_nlp_model",
            evaluation_strategy="epoch",
            #learning_rate=1e-5,
            learning_rate=1e-4,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=8,
            num_train_epochs=5,
            weight_decay=0.01,
            load_best_model_at_end=True,
            #metric_for_best_model="macro_f1_PN",
        )

        # setting the params of the BERT classifier
        for cur_param in self.bert_model_params.keys():
            try:
                args.__dict__[cur_param] = eval(
                    self.bert_model_params[cur_param])
            except TypeError:
                args.__dict__[cur_param] = self.bert_model_params[cur_param]
        args.logging_steps = (len(train_features) -
                              1) // args.per_device_train_batch_size + 1
        args.save_steps = args.logging_steps
        args.output_dir = self.saving_model_folder
        #training_args.compute_metrics = f1_score
        #training_args.compute_metrics = self.compute_metrics
        # training_args.logging_dir = "gs://" from torch.utils.tensorboard import SummaryWriter supports google cloud storage

        trainer = MyTrainer(loss_func=my_loss_func,
                            model=model,
                            args=args,
                            train_dataset=train_features,
                            eval_dataset=dev_features,
                            compute_metrics=self.compute_metrics)

        #trainer = Trainer(model=model,
        #                  args=args,
        #                  train_dataset=train_features,
        #                 eval_dataset=dev_features,
        #                  #compute_metrics = compute_metrics)
        #                  compute_metrics=self.compute_metrics)
        trainer.train()
        # saving the model
        self.save_model(model=trainer.model)
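# --- Numeric sketch (not part of the original snippet) ---
# A toy illustration of the focal-loss computation defined inside fit() above
# (gamma=2, cross-entropy reduced with 'mean', pt = exp(-CE)); the logits and
# labels below are arbitrary example values.
import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 0.5], [0.2, 1.5]])
labels = torch.tensor([0, 1])
gamma = 2
ce_loss = F.cross_entropy(logits, labels, reduction='mean')
pt = torch.exp(-ce_loss)
focal_loss = ((1 - pt) ** gamma * ce_loss).mean()
print(ce_loss.item(), focal_loss.item())  # the focal term down-weights well-classified examples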