def train(cfg):
    SEED = cfg.values.seed
    MODEL_NAME = cfg.values.model_name
    USE_KFOLD = cfg.values.val_args.use_kfold
    TRAIN_ONLY = cfg.values.train_only

    seed_everything(SEED)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # model_config_module = getattr(import_module('transformers'), cfg.values.model_arc + 'Config')
    model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 42

    whole_df = load_data("/opt/ml/input/data/train/train.tsv")
    additional_df = load_data("/opt/ml/input/data/train/additional_train.tsv")
    whole_label = whole_df['label'].values
    # additional_label = additional_df['label'].values

    if cfg.values.tokenizer_arc:
        tokenizer_module = getattr(import_module('transformers'), cfg.values.tokenizer_arc)
        tokenizer = tokenizer_module.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    early_stopping = EarlyStoppingCallback(early_stopping_patience=9999999,
                                           early_stopping_threshold=0.001)

    training_args = TrainingArguments(
        output_dir=cfg.values.train_args.output_dir,                          # output directory
        save_total_limit=cfg.values.train_args.save_total_limit,              # number of saved checkpoints to keep
        save_steps=cfg.values.train_args.save_steps,                          # model saving step
        num_train_epochs=cfg.values.train_args.num_epochs,                    # total number of training epochs
        learning_rate=cfg.values.train_args.lr,                               # learning rate
        per_device_train_batch_size=cfg.values.train_args.train_batch_size,   # batch size per device during training
        per_device_eval_batch_size=cfg.values.train_args.eval_batch_size,     # batch size for evaluation
        warmup_steps=cfg.values.train_args.warmup_steps,                      # number of warmup steps for the learning rate scheduler
        weight_decay=cfg.values.train_args.weight_decay,                      # strength of weight decay
        max_grad_norm=cfg.values.train_args.max_grad_norm,
        logging_dir=cfg.values.train_args.logging_dir,                        # directory for storing logs
        logging_steps=cfg.values.train_args.logging_steps,                    # log saving step
        evaluation_strategy=cfg.values.train_args.evaluation_strategy,
        # evaluation strategy to adopt during training:
        #   `no`: no evaluation during training
        #   `steps`: evaluate every `eval_steps`
        #   `epoch`: evaluate at the end of each epoch
        eval_steps=cfg.values.train_args.eval_steps,                          # evaluation step
        dataloader_num_workers=4,
        seed=SEED,
        label_smoothing_factor=cfg.values.train_args.label_smoothing_factor,
        load_best_model_at_end=True,
        # metric_for_best_model='accuracy'
    )

    if USE_KFOLD:
        kfold = StratifiedKFold(n_splits=cfg.values.val_args.num_k)
        k = 1
        for train_idx, val_idx in kfold.split(whole_df, whole_label):
            print('\n')
            cpprint('=' * 15 + f'{k}-Fold Cross Validation' + '=' * 15)
            train_df = whole_df.iloc[train_idx]
            # train_df = pd.concat((train_df, additional_df))
            val_df = whole_df.iloc[val_idx]

            if cfg.values.model_arc == 'Roberta':
                tokenized_train = roberta_tokenized_dataset(train_df, tokenizer)
                tokenized_val = roberta_tokenized_dataset(val_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(train_df, tokenizer)
                tokenized_val = tokenized_dataset(val_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train, train_df['label'].values)
            RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

            try:
                if cfg.values.model_name == 'Bert':
                    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
                else:
                    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
            except Exception:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(import_module('transformers'),
                                       cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME, config=model_config)
            model.to(device)

            training_args.output_dir = cfg.values.train_args.output_dir + f'/{k}fold'
            training_args.logging_dir = cfg.values.train_args.output_dir + f'/{k}fold'

            optimizer = MADGRAD(model.parameters(), lr=training_args.learning_rate)
            total_step = len(RE_train_dataset) / training_args.per_device_train_batch_size * training_args.num_train_epochs
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,                      # the instantiated 🤗 Transformers model to be trained
                args=training_args,               # training arguments, defined above
                train_dataset=RE_train_dataset,   # training dataset
                eval_dataset=RE_val_dataset,      # evaluation dataset
                compute_metrics=compute_metrics,  # metrics function
                optimizers=optimizers,
                # callbacks=[early_stopping]
            )
            k += 1

            # train model
            trainer.train()
    else:
        cpprint('=' * 20 + 'START TRAINING' + '=' * 20)
        if not TRAIN_ONLY:
            train_df, val_df = train_test_split(whole_df,
                                                test_size=cfg.values.val_args.test_size,
                                                random_state=SEED)
            # train_df = pd.concat((train_df, additional_df))

            if cfg.values.model_arc == 'Roberta':
                tokenized_train = roberta_tokenized_dataset(train_df, tokenizer)
                tokenized_val = roberta_tokenized_dataset(val_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(train_df, tokenizer)
                tokenized_val = tokenized_dataset(val_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train, train_df['label'].values)
            RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

            try:
                if cfg.values.model_name == 'Bert':
                    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
                else:
                    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
            except Exception:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(import_module('transformers'),
                                       cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME, config=model_config)
            model.to(device)

            optimizer = transformers.AdamW(model.parameters(), lr=training_args.learning_rate)
            total_step = len(RE_train_dataset) / training_args.per_device_train_batch_size * training_args.num_train_epochs
            # scheduler = transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=total_step)
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,                      # the instantiated 🤗 Transformers model to be trained
                args=training_args,               # training arguments, defined above
                train_dataset=RE_train_dataset,   # training dataset
                eval_dataset=RE_val_dataset,      # evaluation dataset
                compute_metrics=compute_metrics,  # metrics function
                optimizers=optimizers,
                callbacks=[early_stopping])

            # train model
            trainer.train()
        else:
            training_args.evaluation_strategy = 'no'

            if cfg.values.model_arc == 'Roberta':
                print('Roberta')
                tokenized_train = roberta_tokenized_dataset(whole_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(whole_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train, whole_df['label'].values)

            try:
                model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
            except Exception:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(import_module('transformers'),
                                       cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME, config=model_config)
            model.to(device)

            training_args.output_dir = cfg.values.train_args.output_dir + '/only_train'
            training_args.logging_dir = cfg.values.train_args.output_dir + '/only_train'

            optimizer = AdamP(model.parameters(), lr=training_args.learning_rate)
            total_step = len(RE_train_dataset) / training_args.per_device_train_batch_size * training_args.num_train_epochs
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,                     # the instantiated 🤗 Transformers model to be trained
                args=training_args,              # training arguments, defined above
                train_dataset=RE_train_dataset,  # training dataset
                optimizers=optimizers,
                # callbacks=[early_stopping]
            )

            # train model
            trainer.train()
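
# A minimal sketch of the `compute_metrics` callback passed to the Trainer
# above; it is not defined in this snippet, so the accuracy-only version
# below is an assumption rather than the original implementation.
import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    # Trainer passes an EvalPrediction with .predictions and .label_ids
    preds = np.argmax(eval_pred.predictions, axis=-1)
    return {'accuracy': accuracy_score(eval_pred.label_ids, preds)}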
def fit(self, train_df, dev_df):
    """
    Fit the model on the train set; validation is done using the dev set.

    Parameters
    ----------
    :param train_df: dataframe
        a pandas dataframe containing data to be trained on
    :param dev_df: dataframe
        a pandas dataframe containing data to validate on
    :return: None
        all relevant results are saved under the location provided to save
        the model in; a prediction can be done next
    """
    train_labels = Counter(train_df[self.label_col_name]).keys()
    num_labels = len(train_labels)
    dev_labels = Counter(dev_df[self.label_col_name]).keys()
    if num_labels != len(dev_labels):
        raise IOError("train and dev datasets contain different number of labels")

    # creating a DF for train/test with relevant columns.
    # Not clear why the 'alpha' column is needed, but as written here
    # (https://towardsdatascience.com/https-medium-com-chaturangarajapakshe-text-classification-with-transformer-models-d370944b50ca) - it is required
    train_df = pd.DataFrame({
        'id': range(len(train_df)),
        'label': train_df[self.label_col_name],
        'alpha': ['a'] * train_df.shape[0],
        'text': train_df["text"].replace(r'\n', ' ', regex=True)
    })
    dev_df = pd.DataFrame({
        'id': range(len(dev_df)),
        'label': dev_df[self.label_col_name],
        'alpha': ['a'] * dev_df.shape[0],
        'text': dev_df["text"].replace(r'\n', ' ', regex=True)
    })

    # saving the DFs to the new/old folder
    train_df.to_csv(os.path.join(self.saving_data_folder, "train.tsv"),
                    index=False, columns=train_df.columns, sep='\t', header=False)
    dev_df.to_csv(os.path.join(self.saving_data_folder, "dev.tsv"),
                  index=False, columns=dev_df.columns, sep='\t', header=False)

    config = AutoConfig.from_pretrained(self.model_name, num_labels=num_labels,
                                        output_attentions=True)  # needed for the visualizations
    # loading the actual model to memory
    model = BertForSequenceClassification.from_pretrained(self.model_name, config=config)

    # Now we need to convert the examples in the dataset to features that the model can understand.
    # This is a ready-made class provided by HuggingFace.
    train_dataset = SingleSentenceClassificationProcessor(mode='classification')
    dev_dataset = SingleSentenceClassificationProcessor(mode='classification')

    # now adding examples (from the DFs we created earlier) to the objects created above
    _ = train_dataset.add_examples(texts_or_text_and_labels=train_df['text'],
                                   labels=train_df[self.label_col_name],
                                   overwrite_examples=True)
    _ = dev_dataset.add_examples(texts_or_text_and_labels=dev_df['text'],
                                 labels=dev_df[self.label_col_name],
                                 overwrite_examples=True)
    train_features = train_dataset.get_features(tokenizer=self.tokenizer, max_length=self.max_length)
    test_features = dev_dataset.get_features(tokenizer=self.tokenizer, max_length=self.max_length)

    training_args = TrainingArguments("./train")
    training_args.do_train = True
    # setting the params of the BERT classifier
    for cur_param in self.bert_model_params.keys():
        try:
            training_args.__dict__[cur_param] = eval(self.bert_model_params[cur_param])
        except TypeError:
            training_args.__dict__[cur_param] = self.bert_model_params[cur_param]
    training_args.logging_steps = (len(train_features) - 1) // training_args.per_gpu_train_batch_size + 1
    training_args.save_steps = training_args.logging_steps
    training_args.output_dir = self.saving_model_folder
    training_args.eval_steps = 100
    # training_args.logging_dir = "gs://"  # torch.utils.tensorboard's SummaryWriter supports google cloud storage

    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_features,
                      eval_dataset=test_features,
                      compute_metrics=self.compute_metrics)
    trainer.train()
    # saving the model
    self.save_model(model=trainer.model, folder_name='bert_based_model')
def generate_training_args(args, inoculation_step):
    training_args = TrainingArguments("tmp_trainer")
    training_args.no_cuda = args.no_cuda
    training_args.seed = args.seed
    training_args.do_train = args.do_train
    training_args.do_eval = args.do_eval
    training_args.output_dir = os.path.join(args.output_dir, str(inoculation_step) + "-sample")
    training_args.evaluation_strategy = args.evaluation_strategy  # evaluation is done after each epoch
    training_args.metric_for_best_model = args.metric_for_best_model
    training_args.greater_is_better = args.greater_is_better
    training_args.logging_dir = args.logging_dir
    training_args.task_name = args.task_name
    training_args.learning_rate = args.learning_rate
    training_args.per_device_train_batch_size = args.per_device_train_batch_size
    training_args.per_device_eval_batch_size = args.per_device_eval_batch_size
    training_args.num_train_epochs = args.num_train_epochs  # this is the maximum num_train_epochs; we set this to 100
    training_args.eval_steps = args.eval_steps
    training_args.logging_steps = args.logging_steps
    training_args.load_best_model_at_end = args.load_best_model_at_end
    if args.save_total_limit != -1:
        # only set if it is specified
        training_args.save_total_limit = args.save_total_limit

    import datetime
    date_time = "{}-{}".format(datetime.datetime.now().month, datetime.datetime.now().day)
    run_name = "{0}_{1}_{2}_{3}_mlen_{4}_lr_{5}_seed_{6}_metrics_{7}".format(
        args.run_name, args.task_name, args.model_type, date_time,
        args.max_seq_length, args.learning_rate, args.seed, args.metric_for_best_model)
    training_args.run_name = run_name

    training_args_dict = training_args.to_dict()  # for PR
    _n_gpu = training_args_dict["_n_gpu"]
    del training_args_dict["_n_gpu"]
    training_args_dict["n_gpu"] = _n_gpu
    HfParser = HfArgumentParser((TrainingArguments))
    training_args = HfParser.parse_dict(training_args_dict)[0]

    if args.model_path == "":
        args.model_path = args.model_type
        if args.model_type == "":
            assert False  # you have to provide one of them

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )
    # Log a small summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity of the Transformers logger to info (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")
    return training_args
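
# A standalone sketch (assumed, not taken from the function above) of
# HfArgumentParser.parse_dict, which the function uses to rebuild a
# TrainingArguments object from a plain dict of field values.
from transformers import HfArgumentParser, TrainingArguments

rebuilt_args = HfArgumentParser(TrainingArguments).parse_dict(
    {"output_dir": "tmp_trainer", "seed": 123, "learning_rate": 3e-5})[0]
assert rebuilt_args.seed == 123 and rebuilt_args.learning_rate == 3e-5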
def main(model_args, data_args, inf_args):
    model_name = model_args.model_name_or_path
    if model_name is None:
        model_name, model_args.model_name_or_path = get_recent_model()
    else:
        model_name = model_name.replace('/', '_')

    output_dir = f'./submit/{model_name}{model_args.suffix}/'
    # logging_dir = f'./logs/{model_name}{model_args.suffix}/'

    training_args = TrainingArguments(
        output_dir=output_dir,  # output directory
        do_predict=True,
        seed=42,
    )

    i = 0
    while os.path.exists(training_args.output_dir):
        training_args.output_dir = f'./submit/{model_name}{model_args.suffix}_{i}/'
        training_args.logging_dir = f'./logs/{model_name}{model_args.suffix}_{i}/'
        i += 1

    print(f"training Data : {training_args}")
    print(f"model Data : {model_args}")
    print(f"data : {data_args}")
    print(f"inference setting : {inf_args}")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    if training_args.do_predict:
        data_args.dataset_name = './data/test_dataset'
    datasets = load_from_disk(data_args.dataset_name)
    print(datasets)

    # Load pretrained model and tokenizer
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        use_fast=True,
    )
    model = AutoModelForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
    )

    # run passage retrieval if enabled
    if data_args.eval_retrieval:
        datasets = run_sparse_retrieval(datasets, training_args, inf_args)

    # eval or predict with the MRC model
    if training_args.do_eval or training_args.do_predict:
        run_mrc(data_args, training_args, model_args, datasets, tokenizer, model)
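
# Hypothetical sketch of the get_recent_model() helper referenced above (its
# real implementation is not shown in this snippet); the './results' root and
# the (name, path) return shape are assumptions for illustration only.
import os
from glob import glob

def get_recent_model(root='./results'):
    # pick the most recently modified checkpoint directory under `root`
    candidates = sorted(glob(os.path.join(root, '*')), key=os.path.getmtime)
    latest = candidates[-1]
    return os.path.basename(latest), latest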
def main():
    parser = HfArgumentParser(
        (TrainingArgumentsInputs, DirectoryArgumentsInputs, TokenizerArgumentsInputs))
    train_args, dir_args, token_args = parser.parse_args_into_dataclasses()

    # Setup CUDA, GPU & distributed training
    if train_args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(train_args.local_rank)
        device = torch.device("cuda", train_args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        # n_gpu = 1

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if train_args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, distributed training: %s, 16-bits training: %s",
        train_args.local_rank, device, bool(train_args.local_rank != -1), train_args.fp16)

    # Set seed
    set_seed(train_args.seed)

    # set output_dir
    output_dir = dir_args.output_dir + "/" + \
        dir_args.model_dir_or_name.replace('/', '_') + dir_args.suffix
    i = 0
    model_name = dir_args.model_dir_or_name.replace('/', '_')
    while os.path.exists(output_dir):
        output_dir = f'{dir_args.output_dir}/{model_name}{dir_args.suffix}_{i}/'
        i += 1

    training_args = TrainingArguments(
        output_dir=output_dir,
        save_total_limit=train_args.save_total_limit,
        num_train_epochs=train_args.epochs,               # total number of training epochs
        learning_rate=train_args.learning_rate,
        per_device_train_batch_size=train_args.per_device_batch_size,
        per_device_eval_batch_size=train_args.per_device_batch_size,
        warmup_ratio=train_args.warmup_ratio,
        weight_decay=train_args.weight_decay,             # strength of weight decay
        evaluation_strategy='steps',                      # evaluation strategy to adopt during training
        adam_epsilon=train_args.adam_epsilon,
        eval_steps=train_args.evaluation_step_ratio * train_args.per_device_batch_size,
        dataloader_num_workers=4,
        load_best_model_at_end=True,                      # save_strategy, save_steps will be ignored
        metric_for_best_model="exact_match",              # eval_accuracy
        greater_is_better=True,                           # set True if the metric isn't a loss
        label_smoothing_factor=0.5,
        fp16=train_args.fp16,
        fp16_opt_level=train_args.fp16_opt_level,
        do_train=True,
        do_eval=True,
        seed=train_args.seed,
        gradient_accumulation_steps=train_args.gradient_accumulation_steps,
        max_grad_norm=train_args.max_grad_norm,
        local_rank=train_args.local_rank,
        report_to=[])

    if dir_args.data_dir == "korquad":
        datasets = load_dataset('squad_kor_v1')
    else:
        datasets = load_from_disk(dir_args.data_dir)

    # Slicing the dataset and then wrapping it causes an error -- this is an issue in `datasets` itself.
    dataset_list = []
    if train_args.k_fold > 1:
        dataset_len = len(datasets)
        for i in range(train_args.k_fold):
            validation = datasets.select(
                range(int(dataset_len * (i / train_args.k_fold)),
                      int(dataset_len * ((i + 1) / train_args.k_fold))))
            dataset_train = pd.concat([
                pd.DataFrame(
                    datasets.select(
                        range(0, int(dataset_len * (i / train_args.k_fold))))),
                pd.DataFrame(
                    datasets.select(
                        range(int(dataset_len * ((i + 1) / train_args.k_fold)),
                              dataset_len)))
            ], ignore_index=True)
            train = Dataset.from_pandas(dataset_train)
            dataset = DatasetDict({'train': train, 'validation': validation})
            dataset_list.append(dataset)
    elif 'validation' not in datasets.column_names:
        datasets = datasets.train_test_split(test_size=0.1)
        datasets = DatasetDict({
            'train': datasets['train'],
            'validation': datasets['test']
        })
        dataset_list.append(datasets)
    else:
        dataset_list.append(datasets)

    config = AutoConfig.from_pretrained(
        dir_args.config_dir if dir_args.config_dir else dir_args.model_dir_or_name,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        dir_args.vocab_dir if dir_args.vocab_dir else dir_args.model_dir_or_name,
        use_fast=True,
    )
    model = AutoModelForQuestionAnswering.from_pretrained(
        dir_args.model_dir_or_name,
        from_tf=bool(".ckpt" in dir_args.model_dir_or_name),
        config=config,
    )

    print("Train Arguments :")
    print(training_args)
    print("Directory Arguments:")
    print(dir_args)
    print("Tokenizer Arguments:")
    print(token_args)

    root_dir = output_dir
    for idx, dataset in enumerate(dataset_list):
        print(f"processing {idx}-fold")
        training_args.output_dir = root_dir + f'/{idx}'
        run_mrc(training_args, dir_args, token_args, dataset, tokenizer, model)
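
# A small sanity sketch (not from the snippet above) of the fold boundaries
# used in the k-fold branch: int(len * i/k) .. int(len * (i+1)/k) partitions
# the index range with no gaps or overlaps.
dataset_len, k_fold = 10, 3
bounds = [(int(dataset_len * (i / k_fold)), int(dataset_len * ((i + 1) / k_fold)))
          for i in range(k_fold)]
print(bounds)  # [(0, 3), (3, 6), (6, 10)]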
def fit(self, train_df, dev_df):
    """
    Fit the model on the train set; validation is done using the dev set.

    Parameters
    ----------
    :param train_df: dataframe
        a pandas dataframe containing data to be trained on
    :param dev_df: dataframe
        a pandas dataframe containing data to validate on
    :return: None
        all relevant results are saved under the location provided to save
        the model in; a prediction can be done next
    """
    train_labels = Counter(train_df[self.label_col_name]).keys()
    num_labels = len(train_labels)
    dev_labels = Counter(dev_df[self.label_col_name]).keys()
    if num_labels != len(dev_labels):
        raise IOError(
            "train and dev datasets contain different number of labels")

    # creating a DF for train/test with relevant columns.
    # Not clear why the 'alpha' column is needed, but as written here
    # (https://towardsdatascience.com/https-medium-com-chaturangarajapakshe-text-classification-with-transformer-models-d370944b50ca) - it is required
    train_df = pd.DataFrame({
        'id': range(len(train_df)),
        'label': train_df[self.label_col_name],
        'alpha': ['a'] * train_df.shape[0],
        'text': train_df["text"].replace(r'\n', ' ', regex=True)
    })
    dev_df = pd.DataFrame({
        'id': range(len(dev_df)),
        'label': dev_df[self.label_col_name],
        'alpha': ['a'] * dev_df.shape[0],
        'text': dev_df["text"].replace(r'\n', ' ', regex=True)
    })

    # saving the DFs to the new/old folder
    train_df.to_csv(os.path.join(self.saving_data_folder, "train.tsv"),
                    index=False, columns=train_df.columns, sep='\t', header=False)
    dev_df.to_csv(os.path.join(self.saving_data_folder, "dev.tsv"),
                  index=False, columns=dev_df.columns, sep='\t', header=False)

    config = AutoConfig.from_pretrained(
        self.model_name, num_labels=num_labels,
        output_attentions=True)  # needed for the visualizations
    # loading the actual model to memory
    model = BertForSequenceClassification.from_pretrained(self.model_name, config=config)

    # Now we need to convert the examples in the dataset to features that the model can understand.
    # This is a ready-made class provided by HuggingFace.
    train_dataset = SingleSentenceClassificationProcessor(mode='classification')
    dev_dataset = SingleSentenceClassificationProcessor(mode='classification')

    # now adding examples (from the DFs we created earlier) to the objects created above
    _ = train_dataset.add_examples(
        texts_or_text_and_labels=train_df['text'],
        labels=train_df[self.label_col_name],
        overwrite_examples=True)
    _ = dev_dataset.add_examples(texts_or_text_and_labels=dev_df['text'],
                                 labels=dev_df[self.label_col_name],
                                 overwrite_examples=True)
    train_features = train_dataset.get_features(tokenizer=self.tokenizer,
                                                max_length=self.max_length)
    dev_features = dev_dataset.get_features(tokenizer=self.tokenizer,
                                            max_length=self.max_length)

    # idea for a custom trainer is taken from here - https://huggingface.co/transformers/main_classes/trainer.html
    class MyTrainer(Trainer):
        def __init__(self, loss_func=torch.nn.CrossEntropyLoss(), **kwargs):
            self.loss_func = loss_func
            super().__init__(**kwargs)

        def compute_loss(self, model, inputs):
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            logits = outputs[0]
            return self.loss_func(logits, labels)

    class FocalLoss(nn.modules.loss._WeightedLoss):
        def __init__(self, weight=None, gamma=2, reduction='mean'):
            super(FocalLoss, self).__init__(weight, reduction=reduction)
            self.gamma = gamma
            self.weight = weight  # the weight parameter acts as the alpha parameter to balance class weights

        def forward(self, input, target):
            ce_loss = F.cross_entropy(input, target,
                                      reduction=self.reduction,
                                      weight=self.weight)
            pt = torch.exp(-ce_loss)
            focal_loss = ((1 - pt) ** self.gamma * ce_loss).mean()
            return focal_loss

    class_weights = compute_class_weight(class_weight='balanced',
                                         classes=np.unique(list(train_labels)),
                                         y=train_df['label'])
    # my_loss_func = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights, dtype=torch.float))
    my_loss_func = FocalLoss(weight=torch.tensor(class_weights, dtype=torch.float))

    # how to define a trainer and all its arguments is taken from here -
    # https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb
    args = TrainingArguments(
        "arabic_nlp_model",
        evaluation_strategy="epoch",
        # learning_rate=1e-5,
        learning_rate=1e-4,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        weight_decay=0.01,
        load_best_model_at_end=True,
        # metric_for_best_model="macro_f1_PN",
    )
    # setting the params of the BERT classifier
    for cur_param in self.bert_model_params.keys():
        try:
            args.__dict__[cur_param] = eval(self.bert_model_params[cur_param])
        except TypeError:
            args.__dict__[cur_param] = self.bert_model_params[cur_param]
    args.logging_steps = (len(train_features) - 1) // args.per_device_train_batch_size + 1
    args.save_steps = args.logging_steps
    args.output_dir = self.saving_model_folder
    # training_args.compute_metrics = f1_score
    # training_args.compute_metrics = self.compute_metrics
    # training_args.logging_dir = "gs://"  # torch.utils.tensorboard's SummaryWriter supports google cloud storage

    trainer = MyTrainer(loss_func=my_loss_func,
                        model=model,
                        args=args,
                        train_dataset=train_features,
                        eval_dataset=dev_features,
                        compute_metrics=self.compute_metrics)
    # trainer = Trainer(model=model,
    #                   args=args,
    #                   train_dataset=train_features,
    #                   eval_dataset=dev_features,
    #                   compute_metrics=self.compute_metrics)
    trainer.train()
    # saving the model
    self.save_model(model=trainer.model)
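
# A quick sanity sketch (an assumption, not part of the original code): if the
# FocalLoss class above were lifted out of fit() to module scope, setting
# gamma=0 should reduce it to plain mean cross-entropy.
import torch
import torch.nn.functional as F

logits = torch.randn(4, 3)
targets = torch.tensor([0, 2, 1, 1])
assert torch.isclose(FocalLoss(gamma=0)(logits, targets),
                     F.cross_entropy(logits, targets))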