Example #1
def finalmodel(outfolder):  # type: (str) -> None
    """
  Trains the BERT model using the parameters currently set in
  buildbertargs().  The parameters have been explored with a
  train/test split, so this training is with the full dataset.

  :param outfolder: the folder to write the model to
  :type outfolder: str
  """

    rawdata = helpers.refdf.copy(deep=True)
    print('Raw data: ' + str(rawdata.shape))
    rawdata.set_index('Clause ID', inplace=True)
    # sourcedata = helpers.dedupdf.copy(deep=True)
    # print('Deduped data: ' + str(sourcedata.shape))
    sourcedata = helpers.refdf.copy(deep=True)
    print('Source data: ' + str(sourcedata.shape))
    sourcedata = sourcedata[sourcedata['Clause Text'].map(helpers.goodsize)]
    print('Sized data: ' + str(sourcedata.shape))
    sourcedata.set_index('Clause ID', inplace=True)

    traindata = pd.DataFrame(
        {
            'text': sourcedata['Clause Text'],
            'labels': sourcedata['Classification']
        },
        index=sourcedata.index)

    evaldata = pd.DataFrame(
        {
            'text': rawdata['Clause Text'],
            'labels': rawdata['Classification']
        },
        index=rawdata.index)

    print('Data for BERT: ' + str(traindata.shape))

    accargs = buildbertargs()
    accargs.output_dir = outfolder
    accmodel = ClassificationModel('roberta',
                                   'roberta-base',
                                   args=accargs,
                                   weight=[2, 1])
    accmodel.train_model(traindata)

    print('---------------')
    print('Training Data Eval:')

    result, model_outputs, wrong_predictions = accmodel.eval_model(traindata)
    print(result)

    print('---------------')
    print('Full Data Eval:')

    result, model_outputs, wrong_predictions = accmodel.eval_model(evaldata)
    # {'mcc': 0.9062028924099057, 'tp': 4835, 'tn': 1368, 'fp': 74, 'fn': 140, 'eval_loss': 0.18330956540325125}
    print(result)
Example #2
def train():
    # Initialize a new wandb run
    wandb.init()

    # Create a TransformerModel
    model = ClassificationModel("roberta",
                                "roberta-base",
                                use_cuda=True,
                                args=model_args,
                                sweep_config=wandb.config)

    # Train the model
    model.train_model(train_df, eval_df=eval_df)

    # Evaluate the model
    model.eval_model(eval_df)

    # Sync wandb
    wandb.join()
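
The train() function above pulls its hyperparameters from wandb.config, which is only populated when the function runs under a sweep agent. A minimal launch sketch, assuming the standard wandb sweep API (the sweep parameters and project name are illustrative, not from the source):

sweep_config = {
    "method": "bayes",
    "metric": {"name": "eval_loss", "goal": "minimize"},
    "parameters": {"learning_rate": {"min": 1e-5, "max": 5e-5}},
}
sweep_id = wandb.sweep(sweep_config, project="clause-classification")
# Each agent run calls train(), which reads the wandb.config set by the sweep
wandb.agent(sweep_id, function=train)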
Example #3
def trainroberta():  # type: () -> None
    """
  Uses a default BERT language model to train a classifier on clauses
  provided in the training CSV
  """

    sourcedata = helpers.refdf.copy(deep=True)
    print('Raw data: ' + str(sourcedata.shape))
    # sourcedata['Clause Text'] = sourcedata['Clause Text'].str.lower()
    sourcedata = sourcedata[sourcedata['Clause Text'].map(helpers.goodsize)]
    print('Cleaned data: ' + str(sourcedata.shape))
    sourcedata.set_index('Clause ID', inplace=True)

    bertdata = pd.DataFrame(
        {
            'text': sourcedata['Clause Text'],
            'labels': sourcedata['Classification']
        },
        index=sourcedata.index)

    traindf, testdf = train_test_split(bertdata,
                                       test_size=0.2,
                                       random_state=18)

    print('Data for BERT: ' + str(bertdata.shape))
    print('Data for training: ' + str(traindf.shape))
    print('Data for testing: ' + str(testdf.shape))

    accmodel = ClassificationModel('roberta',
                                   'roberta-base',
                                   args=buildbertargs(),
                                   weight=[2, 1])
    accmodel.train_model(traindf, eval_df=testdf)

    print('---------------')
    print('Test Data Eval:')

    result, model_outputs, wrong_predictions = accmodel.eval_model(testdf)
    print(result)

    #  model_outputs = [softmax(curclause, axis=1) for curclause in model_outputs]
    #  print(str(model_outputs))

    print('---------------')
    print('Full Data Eval:')

    result, model_outputs, wrong_predictions = accmodel.eval_model(bertdata)
    print(result)
Example #4
def fake_classify(train_set, eval_set, test_set, seed):

    # Create a TransformerModel

    model = ClassificationModel('bert',
                                'bert-base-multilingual-uncased',
                                args={
                                    'max_seq_length': 512,
                                    'num_train_epochs': 3,
                                    'overwrite_output_dir': True,
                                    'manual_seed': seed
                                },
                                use_cuda=True)
    print(model.args)

    # Train the model
    model.train_model(train_set)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_set,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score)
    #print('Evaluation results = ', results(results))

    return result, model_outputs, wrong_predictions
Example #5
def fake_classify(train_set, eval_set, test_set, seed):

    # Create a TransformerModel

    model = ClassificationModel('bert',
                                'bert-base-multilingual-uncased',
                                args={
                                    'num_train_epochs': 3,
                                    'overwrite_output_dir': True,
                                    'manual_seed': seed
                                },
                                use_cuda=True)
    print(model.args)
    # Train the model
    model.train_model(train_set)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_set,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score)
    #print('Evaluation results = ', results(results))

    #save the model

    #import torch
    #torch.save(model, path) --> no need to do this, model gets saved in output dir

    return result, model_outputs, wrong_predictions
Example #6
def main(path, valid_in_cat_path, valid_out_of_cat_path):
    steam_df = load_steam_data()
    i = 1
    print("starting training, using fold " + str(i))

    train, test = load_fold_data(path, i)
    # Train the model using roberta model
    args_dict = {'output_dir': '../../models/roberta-base-bs8-e6-fold' + str(i),
                 'use_cached_eval_features': False,
                 'reprocess_input_data': True,
                 'train_batch_size': 8,
                 'num_train_epochs': 6,
                 'fp16': False,
                 'overwrite_output_dir': True}
    model = ClassificationModel('roberta', 'roberta-base', num_labels=2, args=args_dict)
    model.train_model(train)
    print("done training model fold " + str(i))
    result, model_outputs, wrong_predictions = model.eval_model(test, acc=accuracy_score, f1=f1_score)
    acc = result['acc']
    f1 = result['f1']
    print(f"acc: {acc} , f1: {f1}")

    # Make predictions with the model
    save_path = '../../reports/steam-prediction.csv'
    print("predicting...")
    predictions, raw_outputs = model.predict(steam_df["sentence"].tolist())
    print(f"predicting finished - saved to {save_path}" )
    steam_df['prediction'] = predictions
    steam_df.to_csv(save_path, index=False)
Example #7
def transformer(train_df, eval_df, datafile):

    #tokenizer = BertTokenizer.from_pretrained("bert-base-dutch-cased")
    model = ClassificationModel(
        "bert", "bert-base-dutch-cased", use_cuda=False, num_labels=2
    )  # You can set class weights by using the optional weight argument

    # Train the model
    model.train_model(train_df)

    result, model_outputs, wrong_predictions = model.eval_model(eval_df)
    print(model_outputs)

    predlist = []
    model1_outputs = model_outputs.tolist()
    for output in model1_outputs:
        if output[0] > output[1]:
            prediction = 0
        else:
            prediction = 1
        predlist.append(prediction)

    labels = eval_df["labels"].tolist()
    print(labels)
    print(predlist)

    print(classification_report(labels, predlist))
    print(confusion_matrix(labels, predlist))
    print(accuracy_score(labels, predlist))
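
The comment above notes that class weights can be set with the optional weight argument; a minimal sketch (the weight values here are illustrative, not from the source):

# Hypothetical variant of the model above, upweighting class 0 relative to class 1
model = ClassificationModel(
    "bert", "bert-base-dutch-cased",
    use_cuda=False, num_labels=2, weight=[2.0, 1.0])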
Example #8
def main():
    # load train & test data
    df_train = pd.read_csv("sentiment_train.csv")
    df_test = pd.read_csv("sentiment_test.csv")

    # set random seed
    random_seed = 42

    # Train test split
    X_train, X_val, y_train, y_val = train_test_split(df_train['Sentence'],
                                                      df_train['Polarity'],
                                                      test_size=0.10,
                                                      random_state=random_seed)
    train_dataset = pd.concat([X_train, y_train], axis=1)
    val_dataset = pd.concat([X_val, y_val], axis=1)

    # Load a pre-trained model, and train it with our data | See all models available: https://huggingface.co/transformers/pretrained_models.html
    # Create model ... args = parameters
    args = {
        'reprocess_input_data': True,
        'max_seq_length': 300,
        'num_train_epochs': 1,
        'fp16': False,
        'train_batch_size': 4,
        'overwrite_output_dir': True
    }
    my_model = ClassificationModel('roberta',
                                   'distilroberta-base',
                                   num_labels=2,
                                   use_cuda=True,
                                   cuda_device=0,
                                   args=args)
    # Train the model
    my_model.train_model(train_dataset)

    # Evaluate the model
    result, model_outputs, wrong_predictions = my_model.eval_model(
        val_dataset, f1=f1_score)
    pred_val = np.argmax(model_outputs, axis=1).tolist()

    print("Results on evaluation:")
    print("----------------------")
    print("F1 Score = {:.6f}\n".format(
        f1_score(y_val, pred_val, average='micro') * 100))

    print(classification_report(y_val, pred_val))
    print(confusion_matrix(y_val, pred_val))

    # get results on test set
    pred_test, _ = my_model.predict(df_test['Sentence'].tolist())

    # print f1 score
    print(f1_score(df_test.Polarity, pred_test))

    # print accuracy score
    print(accuracy_score(df_test.Polarity, pred_test))

    # save input/ground truth/prediction as one csv
    df_test['prediction'] = pred_test
    df_test.to_csv('q3_ans.csv', index=False)
Example #9
def objective(args):
    pbar.update(1)
    try:
        # cast np values to python and convert list to dict
        args = list(map(int, args[:3])) + list(map(float, args[3:]))
        args = dict(
            zip([
                'train_batch_size', 'gradient_accumulation_steps',
                'weight_decay', 'learning_rate', 'learning_rate',
                'adam_epsilon', 'warmup_ratio', 'max_grad_norm'
            ], args))  # NOTE: 'learning_rate' is listed twice, so the later value silently overwrites the earlier one
        args['overwrite_output_dir'] = True
        args['eval_batch_size'] = args['train_batch_size']
        model = ClassificationModel('albert', 'albert-base-v1', num_labels=5)

        # train model, find reverse f1, force garbage collection
        model.train_model(train, args=args)
        result, *_ = model.eval_model(test,
                                      f1=f1_multiclass,
                                      acc=accuracy_score)
        del model
        return 1. - result['f1']
    except Exception:
        print('skip')
        return 1.
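
objective() above takes its hyperparameters as a positional list, which matches hyperopt's list-style search spaces. A sketch of driving it, assuming hyperopt is the optimizer in use and that pbar is the module-level progress bar it updates (the space bounds are illustrative):

from hyperopt import fmin, hp, tpe
from tqdm import tqdm

pbar = tqdm(total=50)  # objective() calls pbar.update(1) on each trial
space = [
    hp.quniform('train_batch_size', 8, 32, 8),  # first three values are cast to int
    hp.quniform('gradient_accumulation_steps', 1, 4, 1),
    hp.quniform('weight_decay', 0, 1, 1),
    hp.loguniform('learning_rate', -12, -9),
    hp.loguniform('learning_rate_dup', -12, -9),  # see the duplicate-key note above
    hp.loguniform('adam_epsilon', -20, -14),
    hp.uniform('warmup_ratio', 0.0, 0.2),
    hp.uniform('max_grad_norm', 0.5, 2.0),
]
best = fmin(objective, space, algo=tpe.suggest, max_evals=50)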
Example #10
def eval(model_path, our_gen_file, human_file):
    gen = open(our_gen_file, 'r').readlines()
    gen = [i.strip() for i in gen]
    human = open(human_file, 'r').readlines()
    human = [i.strip() for i in human]

    assert len(human) == len(gen), "please balance the eval file"

    test_df = pd.DataFrame(gen + human)  # NOTE: eval_model also expects a 'labels' column alongside the text
    test_input = test_df.sample(frac=1, random_state=123)

    train_args={
        'learning_rate':3e-5,
        'num_train_epochs': 5,
        'reprocess_input_data': True,
        'overwrite_output_dir': False,
        'process_count': 10,
        'train_batch_size': 4,
        'eval_batch_size': 400,
        'max_seq_length': 300,
        "fp16":False
    }

    model = ClassificationModel('roberta', model_path, num_labels=4, use_cuda=True, cuda_device=0, args=train_args)

    result, model_outputs, wrong_predictions = model.eval_model(test_input)
    print(result)
Example #11
def main():
    f_path = 'Breast Cancer(Raw_data_2_Classes).csv'
    data = loadDataAsDataFrame(f_path)
    X = data
    y = data['Class'].tolist()
    training_set_size = int(0.8 * len(X))
    training_rows, test_rows, training_classes, test_classes = train_test_split(
        X, y, train_size=training_set_size, random_state=42069)
    model_args = {'overwrite_output_dir': True}
    # Create a TransformerModel
    model = ClassificationModel('roberta',
                                'roberta-base',
                                use_cuda=False,
                                args=model_args)
    #model = ClassificationModel('roberta', 'roberta-base', use_cuda=True, args=model_args)

    #change our data into a format that simpletransformers can process
    training_rows['text'] = training_rows['Text']
    training_rows['labels'] = training_rows['Class']
    test_rows['text'] = test_rows['Text']
    test_rows['labels'] = test_rows['Class']

    # Train the model
    model.train_model(training_rows)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(test_rows)

    print("f1 score")
    precision = result['tp'] / (result['tp'] + result['fp'])
    recall = result['tp'] / (result['tp'] + result['fn'])
    f1score = 2 * precision * recall / (precision + recall)
    print(f1score)
Example #12
def test_binary_classification(model_type, model_name):
    # Train and Evaluation data needs to be in a Pandas Dataframe of two columns.
    # The first column is the text with type str, and the second column is the
    # label with type int.
    train_data = [
        ["Example sentence belonging to class 1", 1],
        ["Example sentence belonging to class 0", 0],
    ]
    train_df = pd.DataFrame(train_data)

    eval_data = [
        ["Example eval sentence belonging to class 1", 1],
        ["Example eval sentence belonging to class 0", 0],
    ]
    eval_df = pd.DataFrame(eval_data)

    # Create a ClassificationModel
    model = ClassificationModel(
        model_type,
        model_name,
        use_cuda=False,
        args={
            "reprocess_input_data": True,
            "overwrite_output_dir": True
        },
    )

    # Train the model
    model.train_model(train_df)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(eval_df)
Example #13
def get_evaluation_parameter(model):
    eval_df = pd.read_csv("data/reviews/new_test.csv", header=None)
    eval_df.columns = ["text", "labels"]

    best_model_path = f'outputs/{model}/best_model'
    model = ClassificationModel(model, best_model_path)
    result, model_outputs, wrong_predictions = model.eval_model(eval_df)
    print('Results:', result)
    print('Outputs:', model_outputs)

    plots = []
    differences = []
    max_difference = 0
    min_difference = 5
    for i in range(len(model_outputs)):
        value = round(abs(model_outputs[i] - eval_df['labels'][i]), 2)
        actual = round(eval_df['labels'][i], 2)
        plots.append([actual, model_outputs[i], value])

        if value > max_difference:
            max_difference = value
        if value < min_difference:
            min_difference = value

        differences.append(value)

    print('Max Difference:', max_difference)  # 3.8447265625
    print('Min Difference:', min_difference)  # 0.0

    parameter = sum(differences) / len(differences)
    print('Parameter:', parameter)  # 0.40202807008058644

    pd.DataFrame(differences).to_csv("test.csv", index=False)
    pd.DataFrame(plots).to_csv("plots.csv", index=False)
Example #14
def cross_pseudo_labeling(train, pseudo_test, test, params, n_folds,
                          model_name, model_type, lb_hack):
    splits = list(
        StratifiedKFold(n_splits=n_folds, shuffle=True,
                        random_state=1234).split(train["text"],
                                                 train["label"]))
    splits_test = list(
        KFold(n_splits=n_folds, shuffle=True,
              random_state=1234).split(test["jobflag"]))

    y_pred = np.zeros((test.shape[0], n_folds))
    oof = np.zeros(train.shape[0])
    oof_raw = np.zeros((train.shape[0], n_folds))
    weight = len(train) / train["label"].value_counts().sort_index().values

    f1_score = 0

    for fold, (train_idx, valid_idx) in enumerate(splits):
        X_train = pd.concat([train.iloc[train_idx], pseudo_test])
        X_valid = train.iloc[valid_idx]
        model = ClassificationModel(model_type=model_type,
                                    model_name=model_name,
                                    num_labels=4,
                                    args=params,
                                    use_cuda=True,
                                    weight=weight.tolist())

        model.train_model(X_train)

        result, model_outputs, wrong_predictions = model.eval_model(
            X_valid, f1=metric_f1)
        print(result)
        f1_score += result["f1"] / n_folds

        fold_pred, raw_outputs = model.predict(test["description"].values)
        # y_pred[:, fold] = hack(raw_outputs)
        y_pred += raw_outputs / n_folds  # accumulate fold-averaged raw outputs

        oof_pred, oof_outputs = model.predict(
            X_valid["text"].values)  # converted because a mysterious bug occurs otherwise
        oof[valid_idx] = oof_pred
        oof_raw[valid_idx, :] = oof_outputs
        # oof[valid_idx] = hack(oof_outputs)

    print(f"mean f1_score: {f1_score}")

    raw_pred = y_pred.copy()

    y_pred = hack(y_pred, lb_hack)

    # oof = hack(oof_raw)

    # y_pred = stats.mode(y_pred, axis=1)[0].flatten().astype(int)

    test_pred = pd.DataFrame(
        np.concatenate([y_pred.reshape(-1, 1), raw_pred], 1))
    oof_pred = pd.DataFrame(np.concatenate([oof.reshape(-1, 1), oof_raw], 1))

    return test_pred, f1_score, oof_pred
Example #15
def main(source=source,
         data_dir='data',
         checkpoint_dir="outputs/eval2/roberta_finetune_nogptneo",
         best_model_dir='outputs/eval2/best_model_roberta_finetune_nogptneo',
         n_train=240000,
         n_valid=4000,
         n_test=4000,
         n_epochs=10,
         learning_rate=4e-05,
         train_batch_size=64,
         eval_batch_size=64,
         evaluate_during_training=True,
         evaluate_during_training_steps=2000,
         reprocess_input=True,
         overwrite_output_dir=True,
         n_gpu=2):

    # import pdb; pdb.set_trace()
    train_df = data_loading.load_split(data_dir, source, 'train', n=n_train)
    valid_df = data_loading.load_split(data_dir,
                                       source_test,
                                       'valid',
                                       n=n_valid)
    test_df = data_loading.load_split(data_dir, source_test, 'test', n=n_test)

    # Optional model configuration
    model_args = ClassificationArgs(
        num_train_epochs=n_epochs,
        evaluate_during_training=evaluate_during_training,
        evaluate_during_training_steps=evaluate_during_training_steps,
        best_model_dir=best_model_dir,
        manual_seed=0,
        train_batch_size=train_batch_size,
        eval_batch_size=eval_batch_size,
        overwrite_output_dir=overwrite_output_dir,
        n_gpu=n_gpu,
        output_dir=checkpoint_dir,
        reprocess_input_data=reprocess_input,
        learning_rate=learning_rate)

    # Create a ClassificationModel
    model = ClassificationModel("roberta",
                                model_name="roberta-large",
                                args=model_args,
                                use_cuda=True)

    # Train the model
    model.train_model(train_df,
                      eval_df=valid_df,
                      f1=sklearn.metrics.f1_score,
                      acc=sklearn.metrics.accuracy_score,
                      eer=eer)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer)
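
The eer metric passed to train_model and eval_model above is defined elsewhere in the source file. A common equal-error-rate implementation, assuming it follows the (labels, predictions) signature simpletransformers uses for extra metrics:

import numpy as np
from sklearn.metrics import roc_curve

def eer(labels, preds):
    # Equal error rate: the operating point where FPR and FNR cross
    fpr, tpr, _ = roc_curve(labels, preds)
    fnr = 1 - tpr
    idx = np.nanargmin(np.abs(fnr - fpr))
    return (fpr[idx] + fnr[idx]) / 2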
Example #16
def main(argv):
    wandb.login()

    tasks, (train_df, valid_df,
            test_df), transformers = load_molnet_dataset(FLAGS.molnet_dataset,
                                                         tasks_wanted=None)

    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    model = ClassificationModel(FLAGS.model_type,
                                FLAGS.model_name,
                                args={
                                    'evaluate_each_epoch': True,
                                    'evaluate_during_training_verbose': True,
                                    'no_save': True,
                                    'num_train_epochs': FLAGS.num_train_epochs,
                                    'auto_weights': True
                                })
    # You can set class weights by using the optional weight argument

    # Check that the train and evaluation dataframes are set up properly:
    # there should only be two columns, the SMILES string and its label.
    print("Train Dataset: {}".format(train_df.shape))
    print("Eval Dataset: {}".format(valid_df.shape))
    print("TEST Dataset: {}".format(test_df.shape))

    model.train_model(train_df,
                      eval_df=valid_df,
                      output_dir=FLAGS.output_dir,
                      args={'wandb_project': 'project-name'})

    # accuracy
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df, acc=sklearn.metrics.accuracy_score)

    # ROC-PRC
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df, acc=sklearn.metrics.average_precision_score)

    # Lets input a molecule with a toxicity value of 1
    predictions, raw_outputs = model.predict(['C1=C(C(=O)NC(=O)N1)F'])
    print(predictions)
    print(raw_outputs)
Example #17
def main(
        source='xl-1542M-k40;xl-1542M',
        data_dir='data',
        load_model_dir="outputs/checkpoint-15626-epoch-2",
        checkpoint_dir="outputs",
        n_train=250000,
        n_valid=10000,
        n_test=np.inf,
        reprocess_input=False,
    ):

    transformers_logger.info(f'source: {source}, checkpoint_dir: {checkpoint_dir}')

    pristine_articles = extract_articles('data/pristine')
    manipulated_articles = extract_articles('data/Manipulated')

    test_texts = pristine_articles + manipulated_articles
    test_labels = [0]*len(pristine_articles) + [1]*len(manipulated_articles)

    print(f'Testing {len(test_texts)} articles, of which {len(pristine_articles)} are pristine and {len(manipulated_articles)} are manipulated')
    # Preparing test data
    test_data = {
        'text':test_texts,
        'labels':test_labels
    }
    test_df = pd.DataFrame(data=test_data)

    # Optional model configuration
    model_args = ClassificationArgs(
        output_dir=checkpoint_dir,
        num_train_epochs=2,
        evaluate_during_training=True,
        save_steps=25000,
        evaluate_during_training_steps=25000,
        manual_seed=0,
        train_batch_size=256,
        eval_batch_size=256,
        overwrite_output_dir=True,
        reprocess_input_data=reprocess_input,
        n_gpu=2,
    )

    # Create a ClassificationModel
    model = ClassificationModel(
        "roberta",
        load_model_dir,
        args=model_args,
        use_cuda=True
    )

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score
    )
Example #18
class FakeNewsClassifier:
    def __init__(self, model_type: str = 'bert', model_name: str = 'bert-base-uncased',
                 use_cuda: bool = False) -> None:
        self.model = ClassificationModel(model_type, model_name, num_labels=4,
                                         args=DEFAULT_ARGS, use_cuda=use_cuda)

    def finetune(self, train_df: pd.DataFrame, dev_df: pd.DataFrame = None,
                 evaluate_during_training: bool = False) -> None:

        train_df['labels'] = list(map(lambda x: CLASSES[x], train_df['labels']))

        if evaluate_during_training:
            self.model.train_model(train_df, eval_df=dev_df, multi_label=True, show_running_loss=True)
        else:
            self.model.train_model(train_df, multi_label=True, show_running_loss=True)

    def evaluate(self, test_df: pd.DataFrame) -> str:
        test_df['labels'] = list(map(lambda x: CLASSES[x], test_df['labels']))

        _, model_outputs_test, _ = self.model.eval_model(test_df)
        preds_test = np.argmax(model_outputs_test, axis=1)

        result_string = calculate_f1_scores(preds_test, test_df['labels'])

        conf_matrix = get_conf_matrix(preds_test, test_df['labels'])
        fnc_score = fake_news_score(preds_test, test_df['labels'])
        result_string += f'\nRelative FNC Score: {100 / 13204.75 * fnc_score:.3f}%\n'
        result_string += get_conf_matrix_string(conf_matrix)

        eval_report = classification_report(test_df['labels'], preds_test)
        result_string += '\nTest report\n'
        result_string += eval_report

        return result_string

    def predict(self, test_df: pd.DataFrame) -> List[str]:
        def get_label_name(label_int):
            return list(CLASSES.keys())[list(CLASSES.values()).index(label_int)]

        _, model_outputs_test, _ = self.model.eval_model(test_df)
        preds_test = np.argmax(model_outputs_test, axis=1)
        return [get_label_name(pred) for pred in preds_test]
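
A hypothetical driver for the class above, assuming CLASSES maps label names to integers and the CSVs follow the text/labels layout used by the other examples on this page:

import pandas as pd

clf = FakeNewsClassifier(model_type='bert', model_name='bert-base-uncased')
train_df = pd.read_csv('train.csv')  # columns: text, labels (label names)
test_df = pd.read_csv('test.csv')
clf.finetune(train_df)
print(clf.evaluate(test_df))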
Example #19
class Classifier:
    def __init__(self, model_type, model_name, use_cuda=True):
        logging.basicConfig(level=logging.INFO)
        transformers_logger = logging.getLogger("transformers")
        transformers_logger.setLevel(logging.WARNING)

        # Store the configuration; the ClassificationModel is created in train()
        self.model_type = model_type
        self.model_name = model_name
        self.use_cuda = use_cuda
        self.dat = {}
        self.rerun = False

    def add(self, X, Y):
        self.dat[Y] = X

    def train(self, split=0.7, num_epochs=10):
        self.le = preprocessing.LabelEncoder()
        print(list(self.dat.keys()))
        self.le.fit(list(self.dat.keys()))

        train_data = []
        eval_data = []
        for k, v in self.dat.items():
            len_train = int(round(len(v) * split))
            train_data.extend([[i, self.le.transform([k])[0]]
                               for i in v[:len_train]])

            eval_data.extend([[i, self.le.transform([k])[0]]
                              for i in v[len_train:]])

        print(train_data, eval_data)
        train_df = pd.DataFrame(train_data)
        eval_df = pd.DataFrame(eval_data)
        train_args = {
            'overwrite_output_dir': True,
            'num_train_epochs': num_epochs,
        }
        self.model = ClassificationModel(self.model_type,
                                         self.model_name,
                                         num_labels=len(list(self.dat.keys())),
                                         use_cuda=self.use_cuda,
                                         cuda_device=0,
                                         args=train_args)
        # Train the model
        self.model.train_model(train_df, eval_df=eval_df)

        # Evaluate the model
        result, model_outputs, wrong_predictions = self.model.eval_model(
            eval_df, acc=sklearn.metrics.accuracy_score)

    def predict(self, x):
        predictions, raw_outputs = self.model.predict(x)
        return self.le.inverse_transform(predictions)
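
A short usage sketch for the Classifier wrapper above (the texts and label names are illustrative):

clf = Classifier('roberta', 'roberta-base', use_cuda=False)
clf.add(['great product', 'love it', 'works well'], 'positive')
clf.add(['terrible', 'waste of money', 'broke fast'], 'negative')
clf.train(split=0.7, num_epochs=1)  # 70/30 train/eval split per label
print(clf.predict(['would buy again']))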
Example #20
def train():
    # Initialize a new wandb run
    wandb.init()

    # Create the ClassificationModel
    model = ClassificationModel(
        #'bert', 'bert-base-multilingual-cased',
        model_type='bert',
        model_name='dccuchile/bert-base-spanish-wwm-cased',
        num_labels=CANTIDAD_CLASES,
        use_cuda=False,
        args=train_args)

    # Train the model
    model.train_model(train_df, eval_df=test_df)

    # Evaluate the model
    model.eval_model(test_df)
    # Sync wandb
    wandb.join()
Example #21
    def run_trainer(self):
        logging.basicConfig(level=logging.INFO)
        transformers_logger = logging.getLogger("transformers")
        transformers_logger.setLevel(logging.WARNING)

        print('output dir: ' + self.output_dir)

        model_args = {
            'max_seq_length': self.max_seq_length,
            'learning_rate': 4e-5,
            'num_train_epochs': self.epochs,
            'reprocess_input_data': True,
            'overwrite_output_dir': True,
            'evaluate_during_training': True,
            'evaluate_during_training_steps': 800,  # 400
            'best_model_dir': '{}/best-models'.format(self.output_dir),
            'logging_steps': 100,  # 50
            'do_lower_case': True,
            'train_batch_size': self.batch_size,
            'use_batch_norm': False,
            'tensorboard_dir': '{}/runs'.format(self.output_dir),
            'early_stopping_patience': 1,
            'save_only_best': True,
            'overwrite_last_saved': True,
            'save_steps': 0,
            'wandb_project': 'gallery',
        }
        # Create a ClassificationModel
        model = ClassificationModel(self.model_name,
                                    self.model_name + "-base-uncased",
                                    num_labels=self.num_labels,
                                    args=model_args,
                                    use_cuda=self.use_cuda)

        # Train the model
        # model.train_model(self.train_df)
        # model.train_model(self.train_df, output_dir=output_dir, eval_df=test_x, acc=accuracy_score)
        model.train_model(self.train_df,
                          output_dir=self.output_dir,
                          eval_df=self.eval_df,
                          acc=accuracy_score)

        # Evaluate the model
        # eval_df, multi_label=False, output_dir=None, verbose=True, silent=False, wandb_log=True, **kwargs
        result, model_outputs, wrong_predictions = model.eval_model(
            eval_df=self.eval_df,
            multi_label=False,
            output_dir=self.output_dir,
            verbose=True,
            silent=False,
            wandb_log=True)

        print("result: ", result)
        return model
Example #22
def run(model_name=("distilbert", "distilbert-base-uncased")):
    # TODO: make directories in VM
    training_data, test_data = get_dataset()
    model = ClassificationModel(model_name[0], model_name[1])
    output_dir_train = "./saved_states/category3/" + model_name[0]
    output_dir_eval = "./results/category3/" + model_name[0]
    # create paths if they do not exist
    from pathlib import Path
    Path(output_dir_train).mkdir(parents=True, exist_ok=True)
    Path(output_dir_eval).mkdir(parents=True, exist_ok=True)
    model.train_model(training_data, args={"overwrite_output_dir": True}, output_dir=output_dir_train)
    result, model_outputs, wrong_predictions = model.eval_model(test_data, output_dir=output_dir_eval)
Example #23
def transformer(train_df, eval_df, architecture, model_type, args):
    model = ClassificationModel(architecture,
                                model_type,
                                use_cuda=True,
                                args=args)
    model.train_model(train_df)
    result, model_outputs, wrong_predictions = model.eval_model(
        eval_df, cr=classification_report, cm=confusion_matrix)
    for values in model_outputs:
        print("P:\t", values[0], "\tN:\t", values[1])
    print(result['cr'])  # Classification report
    print(result['cm'])  # Confusion matrix
Example #24
def main():
    script_info = pd.read_csv('./data/IMSDB/final_movie_budgets.csv', sep=',')
    script_info['Budget'] = [
        int(bud.replace(',', '')) for bud in script_info['Budget']
    ]  # reformatting budget

    # creating Budget Categories by median split (two quantile bins)
    script_info['Bud_Cat'] = pd.qcut(script_info['Budget'], 2, labels=[0, 1])

    # get list of scripts from data folder
    scripts = []
    for file in script_info['Filename']:
        with open(file, 'r') as txt:
            scripts.append(txt.read().replace('\n', ''))

    X_train, X_test, y_train, y_test = train_test_split(scripts,
                                                        script_info['Bud_Cat'],
                                                        test_size=0.2,
                                                        random_state=0)

    docs = [
        ' '.join(tokenize_script(script, stop_words=True))
        for script in X_train
    ]
    train_docs = [list(x) for x in zip(docs, y_train)]

    train_df = pd.DataFrame(train_docs)
    train_df.columns = ["text", "labels"]

    docs = [
        ' '.join(tokenize_script(script, stop_words=True)) for script in X_test
    ]
    test_docs = [list(x) for x in zip(docs, y_test)]

    test_df = pd.DataFrame(test_docs)
    test_df.columns = ["text", 'labels']

    model_args = ClassificationArgs(sliding_window=True,
                                    overwrite_output_dir=True,
                                    num_train_epochs=3)

    model = ClassificationModel("roberta",
                                "roberta-base",
                                args=model_args,
                                use_cuda=True)

    # Train the model
    model.train_model(train_df)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(test_df)

    print(result)
Example #25
def model(train, test, params, n_folds, model_name, model_type, lb_hack, prediction=False):
    kfold = StratifiedKFold(n_splits=n_folds)

    y_pred = np.zeros((test.shape[0], n_folds))
    oof = np.zeros(train.shape[0])
    oof_raw = np.zeros((train.shape[0], n_folds))
    weight = len(train) / train["label"].value_counts().sort_index().values

    f1_score = 0

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train["text"], train['label'])):
        args = params.copy()
        args["output_dir"] = params["output_dir"] + "_" + str(fold + 1)

        X_train = train.iloc[train_idx]
        X_valid = train.iloc[valid_idx]
        if prediction:
            model_name = args["output_dir"]

        model = ClassificationModel(model_type=model_type, model_name=model_name, num_labels=4,
                                    args=args, use_cuda=True, weight=weight.tolist())

        if not prediction:
            model.train_model(X_train)

        result, model_outputs, wrong_predictions = model.eval_model(X_valid, f1=metric_f1)
        print(result)
        f1_score += result["f1"] / n_folds

        fold_pred, raw_outputs = model.predict(test['description'])
        # y_pred[:, fold] = hack(raw_outputs)
        y_pred += raw_outputs / n_folds

        oof_pred, oof_outputs = model.predict(X_valid["text"].values)  # converted because a mysterious bug occurs otherwise
        oof[valid_idx] = oof_pred
        oof_raw[valid_idx, :] = oof_outputs
        # oof[valid_idx] = hack(oof_outputs)

    print(f"mean f1_score: {f1_score}")

    raw_pred = y_pred.copy()
    y_pred = hack(y_pred, lb_hack)

    # oof = hack(oof_raw)

    # y_pred = stats.mode(y_pred, axis=1)[0].flatten().astype(int)

    test_pred = pd.DataFrame(np.concatenate([y_pred.reshape(-1, 1), raw_pred], 1))
    oof_pred = pd.DataFrame(np.concatenate([oof.reshape(-1, 1), oof_raw], 1))

    return test_pred, f1_score, oof_pred
Example #26
def train_eval(train_df, eval_df, output_dirp):
    """
    Train and eval test a model
    :param train_df:
    :param eval_df:
    :param output_dirp:
    :return:
    """
    print(train_df.head())

    # Define model
    model = ClassificationModel(
        settings.MODEL_SETTINGS["model_type"],
        settings.MODEL_SETTINGS["model_name"],
        num_labels=4,
        args=settings.MODEL_SETTINGS["train_args"],
    )

    # Write train and eval
    Path(output_dirp).mkdir(parents=True, exist_ok=True)
    train_df.to_csv(Path(output_dirp) / "trainset.tsv", sep="\t", index=False)
    eval_df.to_csv(Path(output_dirp) / "testset.tsv", sep="\t", index=False)

    # # Reload train and eval for testing
    # train_df = pd.read_csv(Path(output_dirp) / "trainset.tsv", sep="\t", converters={"labels": literal_eval})
    # eval_df = pd.read_csv(Path(output_dirp) / "testset.tsv", sep="\t", converters={"labels": literal_eval})

    # Point tensorboard and cache dirs at this run's output dir
    model.args["tensorboard_dir"] = Path(output_dirp) / "tensorboard/"
    model.args["cache_dir"] = Path(output_dirp) / "cache/"  # to ensure no weights are shared
    model.args["output_dir"] = output_dirp  # redundant with the output_dir passed to train_model below

    # Train the model
    print(f"Training model with args: {model.args}")
    model.train_model(train_df, output_dir=output_dirp)

    # Evaluate the model on eval set
    result, model_outputs, _ = model.eval_model(eval_df)

    # Write model result and outputs
    eval_df["y_pred"] = model_outputs.tolist()
    predictions_fp = Path(output_dirp) / "testset_with_predictions.tsv"
    eval_df.to_csv(predictions_fp, sep="\t", index=False)

    with open(Path(output_dirp) / "result.json", "wt") as result_out:
        json.dump(result, result_out)

    return result, model_outputs
Example #27
def transformer(train_df, eval_df):
    model = ClassificationModel("bert",
                                "bert-base-dutch-cased",
                                use_cuda=False,
                                args={"overwrite_output_dir": True})
    model.train_model(train_df)

    result, model_outputs, wrong_predictions = model.eval_model(
        eval_df, cr=classification_report, cm=confusion_matrix)

    print(model_outputs)
    for i in model_outputs:
        print(i)
    print(result['cr'])  # Classification report
    print(result['cm'])  # Confusion matrix
Example #28
def main():
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    # Preparing train data
    train_df = pd.read_csv("data/train.csv")
    train_df = train_df[["comment_text", "target"]]
    train_df = clean_text(train_df, "comment_text")
    # train_df["target"] = class_labels(train_df["target"])
    train_df.columns = ["text", "labels"]
    # train_df["labels"] = train_df["labels"].astype(int)
    # Duplicate the data that is toxic
    train_df = pd.concat([train_df] + [train_df[train_df["labels"] > 0]] * 5)

    # Preparing eval data
    eval_df = pd.read_csv("data/test_public_expanded.csv")
    eval_df = eval_df[["comment_text", "toxicity"]]
    eval_df = clean_text(eval_df, "comment_text")
    # eval_df["toxicity"] = class_labels(eval_df["toxicity"])
    eval_df.columns = ["text", "labels"]

    train_df.to_csv("data/train_clean.csv", sep=",", index=False)
    eval_df.to_csv("data/eval_clean.csv", sep=",", index=False)

    # Optional model configuration
    model_args = ClassificationArgs(num_train_epochs=1,
                                    lazy_loading=True,
                                    lazy_labels_column=1,
                                    lazy_text_column=0,
                                    lazy_delimiter=',',
                                    regression=True)

    # Create a ClassificationModel
    model = ClassificationModel("roberta",
                                "roberta-base",
                                use_cuda=False,
                                args=model_args)

    # Train the model
    # pdb.set_trace()
    model.train_model("data/train_clean.csv")

    # Evaluate the model
    # pdb.set_trace()
    result, model_outputs, wrong_predictions = model.eval_model(
        "data/eval_clean.csv")
Example #29
def main(
        source=source,
        data_dir='data',
        load_model_dir="outputs/eval2/best_model_openai_finetune_1",
        checkpoint_dir="outputs/eval2/test_xl-1542M-nucleus_eval2_analytic",
        n_train=250000,
        n_valid=10000,
        n_test=np.inf,
        reprocess_input=True,
    ):

    transformers_logger.info(f'source: {source}, checkpoint_dir: {checkpoint_dir}')

    test_df = data_loading.load_split(data_dir, source, 'test', n=n_test)

    # Optional model configuration
    model_args = ClassificationArgs(
        output_dir=checkpoint_dir,
        num_train_epochs=2,
        evaluate_during_training=True,
        save_steps=25000,
        evaluate_during_training_steps=25000,
        manual_seed=0,
        train_batch_size=256,
        eval_batch_size=256,
        overwrite_output_dir=True,
        reprocess_input_data=reprocess_input,
        n_gpu=2,
        no_cache=True,
    )

    # Create a ClassificationModel
    model = ClassificationModel(
        "roberta",
        # model_name="roberta-large-openai-detector",
        load_model_dir,
        args=model_args,
        use_cuda=True
    )

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer
    )
Example #30
class TrainAndEval:
    def __init__(self, model_type, model_name, model_args):
        self.train_df = pd.read_pickle("D:/Language Models/train_df_500000")
        self.eval_df = pd.read_pickle("D:/Language Models/test_df_500000")
        self.model = ClassificationModel(model_type,
                                         model_name,
                                         use_cuda=False,
                                         args=model_args)

    def train(self):
        self.model.train_model(self.train_df)

    def eval(self):
        result, model_outputs, wrong_predictions = self.model.eval_model(
            self.eval_df)
        print(result, model_outputs, wrong_predictions)
        return result, model_outputs, wrong_predictions
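
A hypothetical driver for TrainAndEval, assuming the pickled DataFrames exist at the hard-coded paths above (the args values are illustrative):

runner = TrainAndEval('roberta', 'roberta-base',
                      {'num_train_epochs': 1, 'overwrite_output_dir': True})
runner.train()
runner.eval()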