def cross_validation(lang):
    """Run 10-fold cross-validation on the data of a single language.

    For every fold a fresh ``ClassificationModel`` is created from the
    'EMBEDDIA/crosloengual-bert' checkpoint, trained on the training split
    and scored on the held-out split. The mean accuracy and mean F1 over
    all folds are appended as one line to 'results_csebert.txt'.

    The frame returned by ``load_single_lang`` is indexed with the columns
    'text' and 'labels' below. Training and evaluation data must be pandas
    DataFrames with at least two columns: with a header, a 'text' and a
    'labels' column; without a header, the first column is the text (str)
    and the second column is the label (int).

    Args:
        lang: Language identifier, passed to ``load_single_lang`` and
            written as the first field of the results line.

    Returns:
        None. Results are printed and appended to the results file.
    """
    print(lang)
    checkpoint = 'EMBEDDIA/crosloengual-bert'

    logging.basicConfig(level=logging.INFO)
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # Hyperparameters applied to the fresh ClassificationArgs of every fold,
    # in this order. NOTE(review): the 1000000 step values presumably serve
    # to effectively switch off periodic logging/checkpointing -- confirm.
    hyperparameters = {
        'logging_steps': 1000000,
        'save_eval_checkpoints': False,
        'save_steps': 1000000,
        'no_cache': True,
        'save_model_every_epoch': False,
        'num_train_epochs': 1,
        'learning_rate': 2e-4,
        'train_batch_size': 32,
        'overwrite_output_dir': True,
    }
    # Disabled alternative (kept for reference): train only the
    # classifier.weight / classifier.bias parameters.
    #   model_args.train_custom_parameters_only = True
    #   model_args.custom_parameter_groups = [
    #       {"params": ["classifier.weight"], "lr": 2e-4},
    #       {"params": ["classifier.bias"], "lr": 2e-4, "weight_decay": 0.0},
    #   ]

    fold_accuracies = []
    fold_f1_scores = []

    data = load_single_lang(lang)
    splitter = KFold(n_splits=10)
    for train_rows, test_rows in splitter.split(data.index):
        fold_train = data.iloc[train_rows]
        fold_test = data.iloc[test_rows]

        fold_args = ClassificationArgs()
        for option, value in hyperparameters.items():
            setattr(fold_args, option, value)

        # A new model per fold, so no fold starts from weights that have
        # already been trained on another fold's data.
        model = ClassificationModel('bert', checkpoint, num_labels=3, args=fold_args)
        print(model.get_named_parameters())

        print('Training ...')
        model.train_model(fold_train)

        print('Evaluating ...')
        predictions, _ = model.predict(fold_test['text'].values)
        # NOTE(review): `eval` must resolve to the project's scoring helper
        # (shadowing the builtin); its result is indexed by 'acc' and
        # 'avg_f1' below.
        scores = eval(fold_test['labels'].values, predictions)
        fold_accuracies.append(scores['acc'])
        fold_f1_scores.append(scores['avg_f1'])

        # Drop the reference before the next fold builds its own model.
        del model

    mean_accuracy = statistics.mean(fold_accuracies)
    mean_f1 = statistics.mean(fold_f1_scores)
    with open('results_csebert.txt', 'a+') as results_file:
        results_file.write("{} {} {}\n".format(lang, mean_accuracy, mean_f1))
def fit_and_evaluate(train_langs, test_lang):
    """Train on the given languages and evaluate once on the test language.

    The splits come from ``load_dataset(train_langs, test_lang,
    train_on_test_lang=True)``. A ``ClassificationModel`` is created from
    the 'EMBEDDIA/crosloengual-bert' checkpoint, trained on the training
    split and used to predict the test split; the scores are appended as
    one line to 'results_csebert.txt'.

    The frames are indexed with the columns 'text' and 'labels' below.
    Training and evaluation data must be pandas DataFrames with at least
    two columns: with a header, a 'text' and a 'labels' column; without a
    header, the first column is the text (str) and the second column is
    the label (int).

    Args:
        train_langs: Training language(s), forwarded to ``load_dataset``
            and written as the first field of the results line.
        test_lang: Evaluation language, forwarded to ``load_dataset`` and
            written as the second field of the results line.

    Returns:
        None. Results are printed and appended to the results file.
    """
    print(train_langs, test_lang)
    checkpoint = 'EMBEDDIA/crosloengual-bert'

    logging.basicConfig(level=logging.INFO)
    logging.getLogger('transformers').setLevel(logging.WARNING)

    train_frame, test_frame = load_dataset(train_langs, test_lang, train_on_test_lang=True)

    # Hyperparameters, applied in this order. NOTE(review): the 1000000
    # step values presumably serve to effectively switch off periodic
    # logging/checkpointing -- confirm.
    hyperparameters = {
        'logging_steps': 1000000,
        'save_eval_checkpoints': False,
        'save_steps': 1000000,
        'no_cache': True,
        'save_model_every_epoch': False,
        'num_train_epochs': 1,
        'learning_rate': 2e-4,
        'train_batch_size': 32,
        'overwrite_output_dir': True,
    }
    run_args = ClassificationArgs()
    for option, value in hyperparameters.items():
        setattr(run_args, option, value)
    # Disabled alternative (kept for reference): train only the
    # classifier.weight / classifier.bias parameters.
    #   model_args.train_custom_parameters_only = True
    #   model_args.custom_parameter_groups = [
    #       {"params": ["classifier.weight"], "lr": 2e-4},
    #       {"params": ["classifier.bias"], "lr": 2e-4, "weight_decay": 0.0},
    #   ]

    model = ClassificationModel('bert', checkpoint, num_labels=3, args=run_args)
    print(model.get_named_parameters())

    print('Training ...')
    model.train_model(train_frame)

    print('Evaluating ...')
    predictions, _ = model.predict(test_frame['text'].values)
    # NOTE(review): `eval` must resolve to the project's scoring helper
    # (shadowing the builtin); its result is written out verbatim.
    scores = eval(test_frame['labels'].values, predictions)

    with open('results_csebert.txt', 'a+') as results_file:
        results_file.write("{} {} {}\n".format(train_langs, test_lang, scores))

    del model