Example #1
def main(source=source,
         data_dir='data',
         checkpoint_dir="outputs/eval2/roberta_finetune_nogptneo",
         best_model_dir='outputs/eval2/best_model_roberta_finetune_nogptneo',
         n_train=240000,
         n_valid=4000,
         n_test=4000,
         n_epochs=10,
         learning_rate=4e-05,
         train_batch_size=64,
         eval_batch_size=64,
         evaluate_during_training=True,
         evaluate_during_training_steps=2000,
         reprocess_input=True,
         overwrite_output_dir=True,
         n_gpu=2):

    # import pdb; pdb.set_trace()
    train_df = data_loading.load_split(data_dir, source, 'train', n=n_train)
    valid_df = data_loading.load_split(data_dir,
                                       source_test,
                                       'valid',
                                       n=n_valid)
    test_df = data_loading.load_split(data_dir, source_test, 'test', n=n_test)

    # Optional model configuration
    model_args = ClassificationArgs(
        num_train_epochs=n_epochs,
        evaluate_during_training=evaluate_during_training,
        evaluate_during_training_steps=evaluate_during_training_steps,
        best_model_dir=best_model_dir,
        manual_seed=0,
        train_batch_size=train_batch_size,
        eval_batch_size=eval_batch_size,
        overwrite_output_dir=overwrite_output_dir,
        n_gpu=n_gpu,
        output_dir=checkpoint_dir,
        reprocess_input_data=reprocess_input,
        learning_rate=learning_rate)

    # Create a ClassificationModel
    model = ClassificationModel("roberta",
                                model_name="roberta-large",
                                args=model_args,
                                use_cuda=True)

    # Train the model
    model.train_model(train_df,
                      eval_df=valid_df,
                      f1=sklearn.metrics.f1_score,
                      acc=sklearn.metrics.accuracy_score,
                      eer=eer)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer)
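The eer metric passed to train_model and eval_model above is defined elsewhere in the source project and is not shown on this page. Purely as a hedged sketch, a common equal-error-rate implementation for a custom metric of this shape might look like:

import numpy as np
from sklearn.metrics import roc_curve

def eer(labels, preds):
    # Equal error rate: the ROC operating point where the false-positive
    # and false-negative rates are (approximately) equal.
    fpr, tpr, _ = roc_curve(labels, preds)
    fnr = 1 - tpr
    idx = np.nanargmin(np.abs(fnr - fpr))
    return (fpr[idx] + fnr[idx]) / 2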
Example #2
def main(
        source='xl-1542M-k40;xl-1542M',
        data_dir='data',
        load_model_dir="outputs/checkpoint-15626-epoch-2",
        checkpoint_dir="outputs",
        n_train=250000,
        n_valid=10000,
        n_test=np.inf,
        reprocess_input=False,
    ):

    transformers_logger.info(f'source: {source}, checkpoint_dir: {checkpoint_dir}')

    pristine_articles = extract_articles('data/pristine')
    manipulated_articles = extract_articles('data/Manipulated')

    test_texts = pristine_articles + manipulated_articles
    test_labels = [0]*len(pristine_articles) + [1]*len(manipulated_articles)

    print(f'Testing {len(test_texts)} articles, of which {len(pristine_articles)} are pristine and {len(manipulated_articles)} are manipulated')
    # Preparing test data
    test_data = {
        'text':test_texts,
        'labels':test_labels
    }
    test_df = pd.DataFrame(data=test_data)

    # Optional model configuration
    model_args = ClassificationArgs(
        output_dir=checkpoint_dir,
        num_train_epochs=2,
        evaluate_during_training=True,
        save_steps=25000,
        evaluate_during_training_steps=25000,
        manual_seed=0,
        train_batch_size=256,
        eval_batch_size=256,
        overwrite_output_dir=True,
        reprocess_input_data=reprocess_input,
        n_gpu=2,
    )

    # Create a ClassificationModel
    model = ClassificationModel(
        "roberta",
        load_model_dir,
        args=model_args,
        use_cuda=True
    )

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score
    )
def main():
    script_info = pd.read_csv('./data/IMSDB/final_movie_budgets.csv', sep=',')
    script_info['Budget'] = [
        int(bud.replace(',', '')) for bud in script_info['Budget']
    ]  # reformatting budget

    # creating Budget Categories by a median (2-quantile) split
    script_info['Bud_Cat'] = pd.qcut(script_info['Budget'], 2, labels=[0, 1])

    # get list of scripts from data folder
    scripts = []
    for file in script_info['Filename']:
        with open(file, 'r') as txt:
            scripts.append(txt.read().replace('\n', ''))

    X_train, X_test, y_train, y_test = train_test_split(scripts,
                                                        script_info['Bud_Cat'],
                                                        test_size=0.2,
                                                        random_state=0)

    docs = [
        ' '.join(tokenize_script(script, stop_words=True))
        for script in X_train
    ]
    train_docs = [list(x) for x in zip(docs, y_train)]

    train_df = pd.DataFrame(train_docs)
    train_df.columns = ["text", "labels"]

    docs = [
        ' '.join(tokenize_script(script, stop_words=True)) for script in X_test
    ]
    test_docs = [list(x) for x in zip(docs, y_test)]

    test_df = pd.DataFrame(test_docs)
    test_df.columns = ["text", "labels"]

    model_args = ClassificationArgs(sliding_window=True,
                                    num_train_epochs=3,
                                    overwrite_output_dir=True)

    model = ClassificationModel("roberta",
                                "roberta-base",
                                args=model_args,
                                use_cuda=True)

    # Train the model
    model.train_model(train_df)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(test_df)

    print(result)
Example #4
def model_train(data, output_dir='models/'):
    '''
    Trains a roberta model based on input data.
    Inputs: 
      data (pd.DataFrame): with columns content (text), ground_truth_risk (label), probability_risk.
      output_dir (str): output path or directory to save model.
    Output:
      None, model will already be saved in specified output directory.
    '''

    # extract relevant columns
    df = pd.DataFrame(data[['content', 'ground_truth_risk', 'probability_risk']])

    # if ground truth risk is NA, derive it from the predicted probability for training
    df_NA = df[df['ground_truth_risk'].isnull()].copy()
    df_NA['ground_truth_risk'] = df_NA.apply(lambda x: to_binary(x['probability_risk']), axis=1)

    # update df
    df = df.dropna(subset=['ground_truth_risk'])
    df = pd.concat([df, df_NA], ignore_index=True)

    # format df for training
    df = pd.DataFrame(df[['content', 'ground_truth_risk']])
    # rename columns - requirement of the simpletransformers package
    df = df.rename({'content': 'text', 'ground_truth_risk': 'labels'}, axis=1)


    # processing text column
    df['text'] = df.apply(lambda x: text_processing(x.text,                     
                                                    lower=False, 
                                                    remove_url=True, 
                                                    remove_punctuation=False, 
                                                    remove_stopwords=False, 
                                                    replace_entity=True, 
                                                    replace_hash=True,
                                                    split_alphanumeric=False,
                                                    lemmatize=False,
                                                    stem=False), axis=1)

    # initialise Model
    model_args = ClassificationArgs(num_train_epochs=2, learning_rate=5e-5,
                                    output_dir=output_dir)
    model = ClassificationModel(model_type='roberta', model_name='roberta-base',
                                args=model_args, use_cuda=False)
    # train the model
    model.train_model(df)

    return
Example #5
def main(
        source=source,
        data_dir='data',
        load_model_dir="outputs/eval2/best_model_openai_finetune_1",
        checkpoint_dir="outputs/eval2/test_xl-1542M-nucleus_eval2_analytic",
        n_train=250000,
        n_valid=10000,
        n_test=np.inf,
        reprocess_input=True,
    ):

    transformers_logger.info(f'source: {source}, checkpoint_dir: {checkpoint_dir}')

    test_df = data_loading.load_split(data_dir, source, 'test', n=n_test)

    # Optional model configuration
    model_args = ClassificationArgs(
        output_dir=checkpoint_dir,
        num_train_epochs=2,
        evaluate_during_training=True,
        save_steps=25000,
        evaluate_during_training_steps=25000,
        manual_seed=0,
        train_batch_size=256,
        eval_batch_size=256,
        overwrite_output_dir=True,
        reprocess_input_data=reprocess_input,
        n_gpu=2,
        no_cache=True,
    )

    # Create a ClassificationModel
    model = ClassificationModel(
        "roberta",
        # model_name="roberta-large-openai-detector",
        load_model_dir,
        args=model_args,
        use_cuda=True
    )

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer
    )
Example #6
def main():
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    # Preparing train data
    train_df = pd.read_csv("data/train.csv")
    train_df = train_df[["comment_text", "target"]]
    train_df = clean_text(train_df, "comment_text")
    # train_df["target"] = class_labels(train_df["target"])
    train_df.columns = ["text", "labels"]
    # train_df["labels"] = train_df["labels"].astype(int)
    # Oversample the toxic rows by appending five extra copies
    train_df = pd.concat([train_df] + [train_df[train_df["labels"] > 0]] * 5)

    # Preparing eval data
    eval_df = pd.read_csv("data/test_public_expanded.csv")
    eval_df = eval_df[["comment_text", "toxicity"]]
    eval_df = clean_text(eval_df, "comment_text")
    # eval_df["toxicity"] = class_labels(eval_df["toxicity"])
    eval_df.columns = ["text", "labels"]

    train_df.to_csv("data/train_clean.csv", sep=",", index=False)
    eval_df.to_csv("data/eval_clean.csv", sep=",", index=False)

    # Optional model configuration
    model_args = ClassificationArgs(num_train_epochs=1,
                                    lazy_loading=True,
                                    lazy_labels_column=1,
                                    lazy_text_column=0,
                                    lazy_delimiter=',',
                                    regression=True)

    # Create a ClassificationModel
    model = ClassificationModel("roberta",
                                "roberta-base",
                                use_cuda=False,
                                args=model_args)

    # Train the model
    # pdb.set_trace()
    model.train_model("data/train_clean.csv")

    # Evaluate the model
    # pdb.set_trace()
    result, model_outputs, wrong_predictions = model.eval_model(
        "data/eval_clean.csv")
def model_predict(text):
    '''
    Takes in an array of text and returns the predicted risk.

    Input:
        text (arr): e.g. data[['content']]
    Output:
        pred (arr): label predicted by the model for each article
        prob_risk (arr): predicted probability of risk for each article,
            e.g. data['probability_risk'] = model_predict(data[['content']])[1]
        pred_risk (arr): 0 (low risk) / 1 (high risk) derived from prob_risk
    '''

    #read text file to get model path
    model_txt = open("../automation/curr_model.txt", "r")
    model_path = model_txt.read()
    model_txt.close()

    # loading saved model, specifying same args as model init
    # model names: path to directory containing model files
    # model naming convention : roberta_YYYY_MM
    model_args = ClassificationArgs(num_train_epochs=2, learning_rate=5e-5)
    model = ClassificationModel(model_type = 'roberta', model_name = model_path, \
                                args = model_args, use_cuda = False)

    # Preprocess text
    processed_text = text.apply(
        lambda x: text_processing(x,
                                  lower=False,
                                  remove_url=True,
                                  remove_punctuation=False,
                                  remove_stopwords=False,
                                  replace_entity=True,
                                  replace_hash=True,
                                  split_alphanumeric=False,
                                  lemmatize=False,
                                  stem=False))

    # predict on the preprocessed text
    pred, raw_outputs = model.predict(list(processed_text))

    # convert to probability of risk
    prob = softmax(raw_outputs, axis=1)
    prob_risk = [x[1] for x in prob]
    pred_risk = [predicted_risk(x) for x in prob_risk]

    return pred, prob_risk, pred_risk
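The text_processing and predicted_risk helpers are defined elsewhere in this project. Purely as an assumption about its shape, predicted_risk could be a simple threshold on the risk probability:

def predicted_risk(prob, threshold=0.5):
    # Hypothetical helper: map a risk probability to a 0/1 risk label.
    return int(prob >= threshold)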
Example #8
def make_baseline_estimator(config, train_data, val_data):
	model_args = ClassificationArgs(
		num_train_epochs=config.num_epoch, output_dir=config.output_dir,
		overwrite_output_dir=True,
		max_seq_length=config.max_length, train_batch_size=config.train_batch_size,
		eval_batch_size=config.eval_batch_size
	)
	model = ClassificationModel(
		config.model_type, config.model_name,
		num_labels=config.num_label,
		use_cuda=config.device != 'cpu',
		args=model_args
	)

	model.train_model(train_df=train_data, eval_df=val_data)

	return model
Example #9
def train(
    arch,
    model_name,
):
    model_args = ClassificationArgs(
        num_train_epochs=5,
        output_dir="./models",
        evaluate_during_training_steps=1000,
        train_batch_size=64,
        reprocess_input_data=True,
        evaluate_during_training=True,
        eval_batch_size=32,
        save_model_every_epoch=False,
        overwrite_output_dir=True,
        learning_rate=7e-5,
        save_eval_checkpoints=False,
        best_model_dir=f"./models/{model_name}/best_model",
        use_early_stopping=True,
        early_stopping_delta=1e-2,
        early_stopping_metric="mcc",
        tensorboard_dir='./runs/',
        early_stopping_metric_minimize=False,
        wandb_project='my_roberta',
        manual_seed=69,
        early_stopping_patience=5,
    )
    model = ClassificationModel(arch,
                                model_name,
                                args=model_args,
                                use_cuda=True)

    model.train_model(
        train_df,
        eval_df=test,
        accuracy=lambda x, y: accuracy_score(x, [round(a) for a in y]),
    )

    result, model_output, top_loss = model.eval_model(test)
    print(result)
    print(top_loss)

    pred, _ = model.predict(["thanks for bearing with us"])
    print(pred)
Example #10
def transformer2(model_name, train_df, eval_df, epochs, labels):
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)
    model_args = ClassificationArgs()
    model_args.num_train_epochs=epochs
    model_args.labels_list = [0, 1, 2, 3, 4]
    model_args.reprocess_input_data=True
    model_args.overwrite_output_dir=True
    model = ClassificationModel(model_name, 'bert-base-cased', num_labels=labels, args=model_args)
    # You can set class weights by using the optional weight argument
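    # For imbalanced data the optional weight argument takes one weight per
    # class, e.g. (illustrative values, not from this example):
    # model = ClassificationModel(model_name, 'bert-base-cased', num_labels=labels,
    #                             args=model_args, weight=[1.0, 2.0, 2.0, 2.0, 2.0])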

    # Train the model
    model.train_model(train_df)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(eval_df)

    return model, result, model_outputs, wrong_predictions
Example #11
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

categories = ["area", "assignee"]

for category in categories:
    test_df, train_df, target_names = load_dataframes(category)

    model_args = ClassificationArgs(
        output_dir=category + "_model",
        best_model_dir=category + "_model_best",
        overwrite_output_dir=True,
        train_batch_size=16,
        eval_batch_size=32,
        max_seq_length=256,
        num_train_epochs=2,
        save_model_every_epoch=False,
        save_eval_checkpoints=False,
    )

    def f1_multiclass(labels, preds):
        return f1_score(labels, preds, average="micro")

    # Create a ClassificationModel
    model = ClassificationModel(
        "bert",
        "finetuned",
        num_labels=len(target_names),
        args=model_args,
    )
Example #12
def cross_validation(lang):
    print(lang)
    model_name = 'EMBEDDIA/crosloengual-bert'

    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)

    # Train and evaluation data need to be in a Pandas DataFrame containing at least two columns.
    # If the DataFrame has a header, it should contain a 'text' and a 'labels' column.
    # If no header is present, the DataFrame should contain at least two columns,
    # with the first column holding the text (str) and the second column the label (int).
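    # A minimal illustration of the headered format (values invented):
    #   df = pd.DataFrame({'text': ['first example', 'second example'],
    #                      'labels': [0, 1]})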
    accs = []
    f1s = []
    df = load_single_lang(lang)

    kf = KFold(n_splits=10)
    for train_index, test_index in kf.split(df.index):
        df_train = df.iloc[train_index]
        df_test = df.iloc[test_index]

        # hyperparameters
        model_args = ClassificationArgs()
        model_args.logging_steps = 1000000
        model_args.save_eval_checkpoints = False
        model_args.save_steps = 1000000
        model_args.no_cache = True
        model_args.save_model_every_epoch = False

        model_args.num_train_epochs = 1
        model_args.learning_rate = 2e-4
        model_args.train_batch_size = 32
        model_args.overwrite_output_dir = True
        '''
        model_args.train_custom_parameters_only = True
        model_args.custom_parameter_groups = [
            {
                "params": ["classifier.weight"],
                "lr": 2e-4,
            },
            {
                "params": ["classifier.bias"],
                "lr": 2e-4,
                "weight_decay": 0.0,
            },
        ]
        '''

        # Create a ClassificationModel
        model = ClassificationModel('bert',
                                    model_name,
                                    num_labels=3,
                                    args=model_args)
        print(model.get_named_parameters())

        # Train the model
        print('Training ...')
        model.train_model(df_train)

        # Evaluate the model
        print('Evaluating ...')
        predictions, raw_outputs = model.predict(df_test['text'].values)
        out = eval(df_test['labels'].values, predictions)  # project-defined eval(), not the builtin
        accs.append(out['acc'])
        f1s.append(out['avg_f1'])

        del model

    # write results to file
    with open('results_csebert.txt', 'a+') as f:
        f.write("{} {} {}\n".format(lang, statistics.mean(accs),
                                    statistics.mean(f1s)))
Example #13
eval_df["text"] = xtest.cleaned
eval_df["labels"] = xtest.category

if TEST:
    eval_df = eval_df[0:SAMPLE]
    train_df = eval_df[0:SAMPLE]

print("Defining model")

# Optional model configuration
model_args = ClassificationArgs(
    num_train_epochs=EPOCH,
    no_save=True,
    overwrite_output_dir=True,
    save_eval_checkpoints=False,
    save_model_every_epoch=False,
    save_optimizer_and_scheduler=False,
    max_seq_length=LENGTH,
    fp16=True,
    train_batch_size=BATCH,
    eval_batch_size=BATCH,
)

# Create a ClassificationModel
model = ClassificationModel(FAMILY,
                            FAMILYMODEL,
                            num_labels=len(eval_df.labels.unique()),
                            args=model_args)
print("Model training")
# Train the model
model.train_model(train_df)
Example #14
def main(source=source,
         data_dir='data',
         checkpoint_dir="outputs/eval2/openai_finetune_1",
         best_model_dir='outputs/eval2/best_model_openai_finetune_1',
         model_name="roberta-large-openai-detector",
         n_train=300000,
         n_valid=8000,
         n_test=10000,
         n_epochs=5,
         learning_rate=1e-06,
         train_batch_size=64,
         eval_batch_size=64,
         evaluate_during_training=True,
         evaluate_during_training_steps=400,
         reprocess_input=True,
         overwrite_output_dir=True,
         n_gpu=2):

    print(
        f'{source}\n{data_dir}\n{checkpoint_dir}\n{best_model_dir}\n{model_name}\n{n_train}\n{n_valid}\n{n_test}\n{n_epochs}\n{learning_rate}\n{train_batch_size}\n{eval_batch_size}\n{evaluate_during_training}\n{evaluate_during_training_steps}\n{reprocess_input}\n{overwrite_output_dir}\n{n_gpu}\n'
    )

    # import pdb; pdb.set_trace()
    train_df = data_loading.load_split(data_dir, source, 'train', n=n_train)
    valid_df = data_loading.load_split(data_dir,
                                       source_test,
                                       'valid',
                                       n=n_valid)
    test_df = data_loading.load_split(data_dir, source_test, 'test', n=n_test)

    # Optional model configuration
    model_args = ClassificationArgs(
        num_train_epochs=n_epochs,
        evaluate_during_training=evaluate_during_training,
        evaluate_during_training_steps=evaluate_during_training_steps,
        best_model_dir=best_model_dir,
        manual_seed=0,
        train_batch_size=train_batch_size,
        eval_batch_size=eval_batch_size,
        overwrite_output_dir=overwrite_output_dir,
        n_gpu=n_gpu,
        output_dir=checkpoint_dir,
        reprocess_input_data=reprocess_input,
        learning_rate=learning_rate)

    # Create a ClassificationModel
    model = ClassificationModel("roberta",
                                model_name=model_name,
                                args=model_args,
                                use_cuda=True)

    # Train the model
    model.train_model(train_df,
                      eval_df=valid_df,
                      f1=sklearn.metrics.f1_score,
                      acc=sklearn.metrics.accuracy_score,
                      eer=eer)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer)
Example #15
def main(
    source=source,
    data_dir='data',
    checkpoint_dir="outputs/" + experiment_name,
    n_train=np.inf,
    n_valid=5000,
    n_epochs=10,
    n_test=np.inf,
    reprocess_input=True,
    small=True,
):

    train_texts, train_labels = data_loading.load_split(data_dir,
                                                        source,
                                                        'train',
                                                        n=n_train)
    valid_texts, valid_labels = data_loading.load_split(data_dir,
                                                        source_test,
                                                        'test',
                                                        n=n_valid)
    test_texts, test_labels = data_loading.load_split(data_dir,
                                                      source_test,
                                                      'test',
                                                      n=n_test)

    for i, text in enumerate(train_texts):
        for key in ['Article: ', 'Body: ', 'Abstract: ']:
            if key in text:
                train_texts[i] = text.split(key)[-1]

    train_labels = [int(not label) for label in train_labels]

    for i, text in enumerate(valid_texts):
        for key in ['Article: ', 'Body: ', 'Abstract: ']:
            if key in text:
                valid_texts[i] = text.split(key)[-1]

    valid_labels = [int(not label) for label in valid_labels]

    for i, text in enumerate(test_texts):
        for key in ['Article: ', 'Body: ', 'Abstract: ']:
            if key in text:
                test_texts[i] = text.split(key)[-1]

    test_labels = [int(not label) for label in test_labels]

    def sample_sequences(texts, labels):
        small_texts = []
        small_labels = []
        for text, label in zip(texts, labels):
            toks = text.split()
            for seq_len in [16, 32, 64, 128, 256]:
                if len(toks) > seq_len:
                    start_idx = random.randrange(len(toks) - seq_len)
                    subseq = toks[start_idx:start_idx + seq_len]
                    small_texts.append(" ".join(subseq))
                    small_labels.append(label)
            # import pdb; pdb.set_trace()
        all_texts = texts + small_texts
        all_labels = labels + small_labels
        return all_texts, all_labels

    if small:
        train_texts, train_labels = sample_sequences(train_texts, train_labels)

    # Preparing train data
    train_data = {'text': train_texts, 'labels': train_labels}
    train_df = pd.DataFrame(data=train_data)

    # Preparing eval data
    valid_data = {'text': valid_texts, 'labels': valid_labels}
    valid_df = pd.DataFrame(data=valid_data)

    # Preparing test data
    test_data = {'text': test_texts, 'labels': test_labels}
    test_df = pd.DataFrame(data=test_data)

    # Optional model configuration
    model_args = ClassificationArgs(num_train_epochs=n_epochs,
                                    evaluate_during_training=True,
                                    manual_seed=0,
                                    train_batch_size=16,
                                    eval_batch_size=32,
                                    overwrite_output_dir=True,
                                    n_gpu=2,
                                    output_dir=checkpoint_dir,
                                    reprocess_input_data=reprocess_input,
                                    cache_dir="cache_dir/" + experiment_name,
                                    best_model_dir='outputs/best_model_' +
                                    experiment_name,
                                    max_seq_length=256)

    # Create a ClassificationModel
    model = ClassificationModel("roberta",
                                "roberta-large",
                                args=model_args,
                                use_cuda=True)

    # Train the model
    model.train_model(train_df,
                      eval_df=valid_df,
                      f1=sklearn.metrics.f1_score,
                      acc=sklearn.metrics.accuracy_score,
                      eer=eer)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer)
Example #16
def bert_training(model_type, model_base, train_data, early_stop,
                  early_stop_delta, overwrite, epoch, batch_size,
                  learning_rate, output):

    # Bringing in the training data
    with open(train_data, 'r') as json_file:
        json_list = list(json_file)

    train = []
    for json_str in json_list:
        train.append(json.loads(json_str))

    # Data cleaning
    train_labels = [train[i]['label'] for i in range(len(train))]

    train_response = [
        remove_stopwords(convert_emojis(train[i]['response']))
        for i in range(len(train))
    ]

    # Split data into training and test sets
    labels_train, labels_test, response_train, response_test = train_test_split(
        train_labels, train_response, test_size=0.2, random_state=42)

    # Convert SARCASM/NO SARCASM labels into 1s and 0s
    labels_train_pd = (pd.DataFrame(labels_train) == 'SARCASM').astype(int)
    labels_test_pd = (pd.DataFrame(labels_test) == 'SARCASM').astype(int)
    response_train_pd = pd.DataFrame(response_train)
    response_test_pd = pd.DataFrame(response_test)

    train_bert = pd.DataFrame({
        'text':
        response_train_pd[0].replace(r'\n', ' ', regex=True),
        'label':
        labels_train_pd[0]
    })

    eval_bert = pd.DataFrame({
        'text':
        response_test_pd[0].replace(r'\n', ' ', regex=True),
        'label':
        labels_test_pd[0]
    })

    model_args = ClassificationArgs()
    model_args.use_early_stopping = early_stop
    model_args.early_stopping_delta = early_stop_delta
    model_args.overwrite_output_dir = overwrite
    model_args.num_train_epochs = epoch
    model_args.train_batch_size = batch_size
    model_args.learning_rate = learning_rate
    model_args.output_dir = output

    # Create a TransformerModel
    model = ClassificationModel(model_type,
                                model_base,
                                use_cuda=False,
                                args=model_args)

    # Train the model
    model.train_model(train_bert)

    # Evaluate the model
    model.eval_model(eval_bert)
Example #17
print(df.shape)
eval_df = df.drop(balanced_train_df.index, axis=0)
eval_df, test_df = train_test_split(eval_df, test_size=0.5)

train_df = balanced_train_df

# Optional model configuration
model_args = ClassificationArgs(
    num_train_epochs=10,
    do_lower_case=True,
    overwrite_output_dir=True,
    output_dir=get_path_from_project_dir('sentence_relevance',
                                         'trained_models'),
    best_model_dir=get_path_from_project_dir('sentence_relevance/best',
                                             'trained_models'),
    save_model_every_epoch=False,
    save_eval_checkpoints=False,
    save_steps=-1,
    evaluate_during_training_verbose=True,
    evaluate_during_training=True,
    early_stopping_consider_epochs=True,
    use_early_stopping=True,
    early_stopping_patience=5,
    early_stopping_delta=5e-3)
# Create a ClassificationModel
model = ClassificationModel("bert",
                            "bert-base-uncased",
                            args=model_args,
                            num_labels=len(set(df['labels'])))

# Train the model
model.train_model(train_df)
Example #18
def hyperargs():  # type: () -> {}
    """
  Builds different sets of arguments for the classifier.  Must be the same for
  training and predicting.

  :return: the labeled arguments
  :rtype: {}
  """

    retdict = {}

    for curwindow in [128, 64, 32, 256]:
        for curstride in [0.7, 0.8, 0.9]:
            accargs = ClassificationArgs()
            accargs.num_train_epochs = 5
            accargs.fp16 = False
            accargs.overwrite_output_dir = True
            accargs.evaluate_during_training = False
            accargs.sliding_window = True
            accargs.max_seq_length = curwindow
            accargs.stride = curstride
            accargs.labels_list = [1, 0]
            accargs.save_eval_checkpoints = False
            accargs.save_model_every_epoch = False
            accargs.silent = True
            accargs.manual_seed = 18
            retdict['basic5epochs' + str(curwindow) + 'win' +
                    str(int(curstride * 10.0)) + 'stride'] = accargs

    return retdict
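A hedged usage sketch of the argument grid above; the model type, base checkpoint, and the train_df/eval_df data are assumptions, not part of this example:

for run_name, args in hyperargs().items():
    model = ClassificationModel('roberta', 'roberta-base', args=args)
    model.train_model(train_df)               # train_df assumed from context
    result, _, _ = model.eval_model(eval_df)  # eval_df assumed from context
    print(run_name, result)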
Example #19
def main(
        sources=sources,
        data_dir='data',
        load_model_dir="outputs/analytic_checkpoint_v0",
        checkpoint_dir="outputs/hackathon_eval1",
        n_train=250000,
        n_valid=10000,
        n_test=np.inf,
        reprocess_input=False,
    ):


    # Optional model configuration
    model_args = ClassificationArgs(
        output_dir=checkpoint_dir,
        num_train_epochs=2,
        evaluate_during_training=True,
        save_steps=25000,
        evaluate_during_training_steps=25000,
        manual_seed=0,
        train_batch_size=256,
        eval_batch_size=256,
        overwrite_output_dir=True,
        reprocess_input_data=reprocess_input,
        n_gpu=1,
        no_cache=True,
    )

    # Create a ClassificationModel
    model = ClassificationModel(
        "roberta",
        # load_model_dir,
        model_name="roberta-large-openai-detector",
        args=model_args,
        use_cuda=True
    )

    for source in sources:
        print(source)
        
        test_texts, test_labels = data_loading.load_split(data_dir, source, 'test', n=n_test)
        for i, text in enumerate(test_texts):
            for key in ['Article: ', 'Body: ', 'Abstract: ']:
                if key in text:
                    test_texts[i] = text.split(key)[-1]
                
        test_labels = [not label for label in test_labels]

        # Preparing test data
        test_data = {
            'text':test_texts,
            'labels':test_labels
        }
        test_df = pd.DataFrame(data=test_data)



        # Evaluate the model
        result, model_outputs, wrong_predictions = model.eval_model(
            test_df,
            f1=sklearn.metrics.f1_score,
            acc=sklearn.metrics.accuracy_score,
            eer=eer
        )
        print(result)
        with open('hackathon_outputs/'+source+'.csv', 'w') as outfile:
            outfile.write(f'row_id, score\n')
            for line, output in enumerate(model_outputs):
                outfile.write(f'{line}, {output[1]}\n')
Example #20
    "max_seq_len": 512,
    "model": model_types[int(sys.argv[2])],
    "save": model_saves[int(sys.argv[2])]
}

df = pd.read_csv("data.csv")

train_df = df.iloc[:wandb_config["samples"], :]

train_df.columns = ["text", "labels"]

eval_df = df.iloc[wandb_config["samples"]:, :]

eval_df.columns = ["text", "labels"]

model_args = ClassificationArgs()
model_args.num_train_epochs = wandb_config["epochs"]
model_args.eval_batch_size = wandb_config["eval_batch_size"]
model_args.train_batch_size = wandb_config["train_batch_size"]
model_args.wandb_project = "transformer-aes"
model_args.wandb_kwargs = {
    "name": "{}-{}".format(wandb_config["model"], wandb_config["samples"])
}
model_args.learning_rate = wandb_config["lr"]
model_args.model = wandb_config["model"]
model_args.samples = wandb_config["samples"]
# model_args.max_seq_length = wandb_config["max_seq_length"]
model_args.regression = True
model_args.no_save = True
model_args.overwrite_output_dir = True
model_args.logging_steps = 1
Example #21
        "min_iter": 6,
    },
}

sweep_id = wandb.sweep(sweep_config, project="RTE - Hyperparameter Optimization")

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Preparing train data
train_df = load_rte_data_file("data/train.jsonl")
eval_df = pd.read_json("data/eval_df", lines=True, orient="records")
test_df = pd.read_json("data/test_df", lines=True, orient="records")

model_args = ClassificationArgs()
model_args.eval_batch_size = 8
model_args.evaluate_during_training = True
model_args.evaluate_during_training_silent = False
model_args.evaluate_during_training_steps = 1000
model_args.learning_rate = 4e-4
model_args.manual_seed = 4
model_args.max_seq_length = 256
model_args.multiprocessing_chunksize = 5000
model_args.no_cache = True
model_args.no_save = True
model_args.num_train_epochs = 10
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.train_batch_size = 16
model_args.gradient_accumulation_steps = 2
Example #22
def main(
    source=source,
    data_dir='data',
    checkpoint_dir="outputs/eval2/openai_finetune",
    n_train=120000,
    n_valid=5000,
    n_epochs=20,
    n_test=5000,
    reprocess_input=False,
):

    # import pdb; pdb.set_trace()
    train_texts, train_labels = data_loading.load_split(data_dir,
                                                        source,
                                                        'train',
                                                        n=n_train)
    valid_texts, valid_labels = data_loading.load_split(data_dir,
                                                        source_test,
                                                        'test',
                                                        n=n_valid)
    test_texts, test_labels = data_loading.load_split(data_dir,
                                                      source_test,
                                                      'test',
                                                      n=n_test)

    # for i, text in enumerate(train_texts):
    #     for key in ['Article: ', 'Body: ', 'Abstract: ']:
    #         if key in text:
    #             train_texts[i] = text.split(key)[-1]

    train_labels = [int(not label) for label in train_labels]

    # for i, text in enumerate(valid_texts):
    #     for key in ['Article: ', 'Body: ', 'Abstract: ']:
    #         if key in text:
    #             valid_texts[i] = text.split(key)[-1]

    valid_labels = [int(not label) for label in valid_labels]

    # for i, text in enumerate(test_texts):
    #     for key in ['Article: ', 'Body: ', 'Abstract: ']:
    #         if key in text:
    #             test_texts[i] = text.split(key)[-1]

    test_labels = [int(not label) for label in test_labels]

    # Preparing train data
    train_data = {'text': train_texts, 'labels': train_labels}
    train_df = pd.DataFrame(data=train_data)

    # Preparing eval data
    valid_data = {'text': valid_texts, 'labels': valid_labels}
    valid_df = pd.DataFrame(data=valid_data)

    # Preparing test data
    test_data = {'text': test_texts, 'labels': test_labels}
    test_df = pd.DataFrame(data=test_data)

    # Optional model configuration
    model_args = ClassificationArgs(
        num_train_epochs=n_epochs,
        evaluate_during_training=True,
        evaluate_during_training_steps=2000,
        best_model_dir='outputs/eval2/best_model_openai_finetune',
        manual_seed=0,
        train_batch_size=32,
        eval_batch_size=128,
        overwrite_output_dir=True,
        n_gpu=2,
        output_dir=checkpoint_dir,
        reprocess_input_data=reprocess_input,
        learning_rate=0.00001)

    # Create a ClassificationModel
    model = ClassificationModel("roberta",
                                model_name="roberta-large-openai-detector",
                                args=model_args,
                                use_cuda=True)

    # Train the model
    model.train_model(train_df,
                      eval_df=valid_df,
                      f1=sklearn.metrics.f1_score,
                      acc=sklearn.metrics.accuracy_score,
                      eer=eer)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer)
Example #23
df = pd.read_csv(r"./train.csv", header=None, names=["text", "labels"])
sentences_train = sentence_level_data_prep(df)
df.reset_index(inplace=True)
df.columns = ["ind", "text", "labels"]
# merge returns a new DataFrame, so the result must be assigned
sentences_train = sentences_train.merge(original_train[["ind", "labels"]],
                                        on="ind", how="inner")

sentences_train[
    "sentence_length"] = sentences_train.sentences_from_abstract.map(
        lambda x: len(x.split()))
sentences_train["label_text"] = pd.Categorical(sentences_train.labels)
sentences_train["labels"] = sentences_train.label_text.cat.codes

model_args = ClassificationArgs(
    num_train_epochs=10,
    sliding_window=True,
    fp16=False,
    use_early_stopping=True,
    reprocess_input_data=True,
    overwrite_output_dir=True,
)

# Create a ClassificationModel
model = ClassificationModel("roberta",
                            "roberta-base",
                            num_labels=7,
                            args=model_args)

# We train 4 models by selecting sentences above sent_len. We save these model for 10 epochs. At the end, we select best model from these 40 saved epoch models by selecting the one doing the best on the validation set.
#
for sent_len in [0, 6, 10, 15]:
    print(sent_len)
    sentences_train_filtred = sentences_train[(
Example #24
    },
}

sweep_id = wandb.sweep(sweep_config, project="concepticon")

model_args = ClassificationArgs(
    num_train_epochs=3,
    learning_rate=4e-6,
    no_cache=True if SWEEP else False,
    no_save=True if SWEEP else False,
    save_eval_checkpoints=False if SWEEP else True,
    save_model_every_epoch=False if SWEEP else True,
    overwrite_output_dir=True,
    reprocess_input_data=True,
    evaluate_during_training=True,
    evaluate_during_training_silent=False,
    evaluate_during_training_steps=1000,
    wandb_project="concepticon",
    train_batch_size=15,
    eval_batch_size=10,
    use_early_stopping=True,
    early_stopping_delta=0.01,
    early_stopping_metric="f1",
    early_stopping_metric_minimize=False,
    early_stopping_patience=5,
)


def train():
    wandb.init()
    model_args.wandb_kwargs = {"id": wandb.run.id}
Example #25
)

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Preparing train data
train_df = load_rte_data_file("data/train.jsonl")
eval_df = pd.read_json("data/eval_df.jsonl", lines=True, orient="records")
test_df = pd.read_json("data/test_df.jsonl", lines=True, orient="records")

sweep_result = pd.read_csv("sweep_results/deep-sweep.csv")

best_params = sweep_result.to_dict()

model_args = ClassificationArgs()
model_args.eval_batch_size = 32
model_args.evaluate_during_training = True
model_args.evaluate_during_training_silent = False
model_args.evaluate_during_training_steps = 1000
model_args.learning_rate = 4e-5
model_args.manual_seed = 4
model_args.max_seq_length = 256
model_args.multiprocessing_chunksize = 5000
model_args.no_cache = True
# model_args.no_save = True
model_args.num_train_epochs = 10
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.train_batch_size = 16
model_args.gradient_accumulation_steps = 2
Example #26
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# Path to the model.
PATH = "/projects/tir5/users/apagnoni/gpt-2-output-dataset/outputs/eval2/best_model_openai_finetune_1"

# Model configuration
model_args = ClassificationArgs(eval_batch_size=1, no_cache=True)

# Loading the model (can take some time).
model = ClassificationModel(
    "roberta",
    PATH,
    args=model_args,
    use_cuda=True,
)


def analytics(text):
    '''
    inputs
        text: (string)
    outputs
        (llr, evidence)
    '''
    predictions, raw_outputs = model.predict([text])
    llr = raw_outputs[0][1]
    return (llr, None)
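A minimal usage sketch of analytics (the input string is illustrative):

llr, evidence = analytics("This passage might have been machine-generated.")
print(f"llr-style score: {llr:.3f}")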
Example #27
WANDB_PROJ_COMPLETE_DATA = "model_complete_data"
WANDB_PROJ_AL_BASELINE = "model_al_baseline"
WANDB_PROJ_AL_EXP = "model_al_experiments"

# Model args for the simpletransformer model
# Add or modify parameters based on experiment
BEST_MODEL_SPEC_DIR = str(BEST_MODEL_DIR).format(WANDB_PROJ_AL_EXP)
MODEL_ARGS = ClassificationArgs(
    num_train_epochs=5,
    overwrite_output_dir=True,
    train_batch_size=16,
    max_seq_length=250,
    # modify based on the experiment
    wandb_project=WANDB_PROJ_AL_EXP,
    best_model_dir=BEST_MODEL_SPEC_DIR,
    cache_dir=str(CACHE_DIR),
    eval_batch_size=16,
    evaluate_during_training=True,
    evaluate_during_training_verbose=True,
    manual_seed=100,
    output_dir=str(OUTPUT_DIR),
    use_early_stopping=True,
    early_stopping_patience=3,
    reprocess_input_data=True,
)

# Model type and base checkpoint (e.g. "roberta" / "roberta-base")
MODEL_NAME = "roberta"
MODEL_TYPE = "roberta-base"

# Labels for classification
LABELS = {
Example #28
get_ipython().system(' pip install torchvision ')

from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging
import torch
import torchvision

# set logging messages
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Optional model configuration
model_args = ClassificationArgs(num_train_epochs=1)

# Create a ClassificationModel
model = ClassificationModel(
    'bert',
    'bert-base-cased',
    num_labels=3,
    args=model_args,
    #     args={'reprocess_input_data': True},
    use_cuda=False,
)

# Train the model
model.train_model(train_df)
Example #29
train_data = [
    ["Aragorn was the heir of Isildur", "true"],
    ["Frodo was the heir of Isildur", "false"],
]
train_df = pd.DataFrame(train_data)
train_df.columns = ["text", "labels"]

# Preparing eval data
eval_data = [
    ["Theoden was the king of Rohan", "true"],
    ["Merry was the king of Rohan", "false"],
]
eval_df = pd.DataFrame(eval_data)
eval_df.columns = ["text", "labels"]

model_args = ClassificationArgs()
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.evaluate_during_training = True
model_args.manual_seed = 4
model_args.use_multiprocessing = True
model_args.train_batch_size = 16
model_args.eval_batch_size = 8
model_args.labels_list = ["true", "false"]
model_args.wandb_project = "Simple Sweep"


def train():
    # Initialize a new wandb run
    wandb.init()
Example #30
def buildbertargs():  # type: () -> ClassificationArgs
    """
  Builds arguments for the classifier.  Must be the same for
  training and predicting.

  :return: the arguments
  :rtype: ClassificationArgs
  """

    accargs = ClassificationArgs()
    accargs.num_train_epochs = 5
    accargs.fp16 = False
    accargs.overwrite_output_dir = True
    accargs.evaluate_during_training = False
    accargs.sliding_window = True
    accargs.max_seq_length = 256
    accargs.stride = 0.9
    accargs.labels_list = [1, 0]
    accargs.save_model_every_epoch = False
    accargs.silent = True
    accargs.manual_seed = 18

    return accargs
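A hedged sketch of reusing the same arguments for both training and prediction, as the docstring requires; the model type, checkpoint, and train_df are assumptions:

args = buildbertargs()
model = ClassificationModel('bert', 'bert-base-cased', args=args, use_cuda=False)
model.train_model(train_df)  # train_df assumed from the surrounding project
preds, raw = model.predict(["held-out text to classify"])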