Esempio n. 1
0
def main(source=source,
         data_dir='data',
         checkpoint_dir="outputs/eval2/roberta_finetune_nogptneo",
         best_model_dir='outputs/eval2/best_model_roberta_finetune_nogptneo',
         n_train=240000,
         n_valid=4000,
         n_test=4000,
         n_epochs=10,
         learning_rate=4e-05,
         train_batch_size=64,
         eval_batch_size=64,
         evaluate_during_training=True,
         evaluate_during_training_steps=2000,
         reprocess_input=True,
         overwrite_output_dir=True,
         n_gpu=2):
    """Fine-tune roberta-large as a generated-text detector and evaluate it.

    Loads the three dataset splits via ``data_loading.load_split``, trains a
    simpletransformers ``ClassificationModel`` with f1/accuracy/EER metrics
    reported during training, then scores the held-out test split.

    NOTE(review): training data comes from ``source`` while valid/test come
    from the module-level name ``source_test`` — confirm the cross-source
    evaluation is intentional.
    """
    # Dataset splits: train from `source`, valid/test from `source_test`.
    train_df = data_loading.load_split(data_dir, source, 'train', n=n_train)
    valid_df = data_loading.load_split(data_dir, source_test, 'valid',
                                       n=n_valid)
    test_df = data_loading.load_split(data_dir, source_test, 'test', n=n_test)

    # Training/evaluation configuration for simpletransformers.
    model_args = ClassificationArgs(
        output_dir=checkpoint_dir,
        best_model_dir=best_model_dir,
        num_train_epochs=n_epochs,
        learning_rate=learning_rate,
        train_batch_size=train_batch_size,
        eval_batch_size=eval_batch_size,
        evaluate_during_training=evaluate_during_training,
        evaluate_during_training_steps=evaluate_during_training_steps,
        reprocess_input_data=reprocess_input,
        overwrite_output_dir=overwrite_output_dir,
        n_gpu=n_gpu,
        manual_seed=0)

    model = ClassificationModel("roberta",
                                model_name="roberta-large",
                                args=model_args,
                                use_cuda=True)

    # Extra metrics computed alongside the default loss.
    scoring = {'f1': sklearn.metrics.f1_score,
               'acc': sklearn.metrics.accuracy_score,
               'eer': eer}

    # Train with periodic evaluation on the validation split.
    model.train_model(train_df, eval_df=valid_df, **scoring)

    # Final held-out evaluation.
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df, **scoring)
Esempio n. 2
0
def main(
        source=source,
        data_dir='data',
        load_model_dir="outputs/eval2/best_model_openai_finetune_1",
        checkpoint_dir="outputs/eval2/test_xl-1542M-nucleus_eval2_analytic",
        n_train=250000,
        n_valid=10000,
        n_test=np.inf,
        reprocess_input=True,
    ):
    """Evaluate a previously fine-tuned detector checkpoint on a test split.

    Restores the model saved under ``load_model_dir`` and reports
    f1/accuracy/EER on the ``source`` test split. ``n_train``/``n_valid``
    are accepted for interface symmetry with the training entry points but
    are not used here.
    """
    transformers_logger.info(f'source: {source}, checkpoint_dir: {checkpoint_dir}')

    test_df = data_loading.load_split(data_dir, source, 'test', n=n_test)

    # Evaluation-time configuration (the training-related fields are unused
    # on this code path).
    model_args = ClassificationArgs(
        output_dir=checkpoint_dir,
        num_train_epochs=2,
        evaluate_during_training=True,
        save_steps=25000,
        evaluate_during_training_steps=25000,
        manual_seed=0,
        train_batch_size=256,
        eval_batch_size=256,
        overwrite_output_dir=True,
        reprocess_input_data=reprocess_input,
        n_gpu=2,
        no_cache=True,
    )

    # Load the local fine-tuned checkpoint rather than a hub model.
    model = ClassificationModel("roberta",
                                load_model_dir,
                                args=model_args,
                                use_cuda=True)

    scoring = dict(f1=sklearn.metrics.f1_score,
                   acc=sklearn.metrics.accuracy_score,
                   eer=eer)
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df, **scoring)
Esempio n. 3
0
def main(
        source='xl-1542M-k40;xl-1542M',
        data_dir='data',
        load_model_dir="outputs/checkpoint-15626-epoch-2",
        checkpoint_dir="outputs",
        n_train=250000,
        n_valid=10000,
        n_test=100,
        reprocess_input=False,
    ):
    """Run the hand-crafted analytics baseline over a small test sample.

    Prints the ``analytics.analytics`` output next to the gold label for each
    test example. Only ``data_dir``/``source``/``n_test`` are used; the other
    parameters exist for interface symmetry with the model entry points.
    """
    print('loading data')
    texts, labels = data_loading.load_split(data_dir, source, 'test', n=n_test)
    print('Done loading data')
    for sample, gold in zip(texts, labels):
        print(analytics.analytics(sample), gold)
Esempio n. 4
0
def main(source=source,
         data_dir='data',
         checkpoint_dir="outputs/eval2/openai_finetune_1",
         best_model_dir='outputs/eval2/best_model_openai_finetune_1',
         model_name="roberta-large-openai-detector",
         n_train=300000,
         n_valid=8000,
         n_test=10000,
         n_epochs=5,
         learning_rate=1e-06,
         train_batch_size=64,
         eval_batch_size=64,
         evaluate_during_training=True,
         evaluate_during_training_steps=400,
         reprocess_input=True,
         overwrite_output_dir=True,
         n_gpu=2):
    """Fine-tune the OpenAI RoBERTa detector checkpoint and evaluate it.

    Echoes the full configuration to stdout, trains a simpletransformers
    ``ClassificationModel`` (f1/accuracy/EER reported during training), then
    scores the held-out test split. Valid/test come from the module-level
    ``source_test``.
    """
    # Echo every hyperparameter so a run can be reconstructed from its log.
    print(
        f'{source}\n{data_dir}\n{checkpoint_dir}\n{best_model_dir}\n{model_name}\n{n_train}\n{n_valid}\n{n_test}\n{n_epochs}\n{learning_rate}\n{train_batch_size}\n{eval_batch_size}\n{evaluate_during_training}\n{evaluate_during_training_steps}\n{reprocess_input}\n{overwrite_output_dir}\n{n_gpu}\n'
    )

    # Dataset splits: train from `source`, valid/test from `source_test`.
    train_df = data_loading.load_split(data_dir, source, 'train', n=n_train)
    valid_df = data_loading.load_split(data_dir, source_test, 'valid',
                                       n=n_valid)
    test_df = data_loading.load_split(data_dir, source_test, 'test', n=n_test)

    # Training/evaluation configuration for simpletransformers.
    model_args = ClassificationArgs(
        output_dir=checkpoint_dir,
        best_model_dir=best_model_dir,
        num_train_epochs=n_epochs,
        learning_rate=learning_rate,
        train_batch_size=train_batch_size,
        eval_batch_size=eval_batch_size,
        evaluate_during_training=evaluate_during_training,
        evaluate_during_training_steps=evaluate_during_training_steps,
        reprocess_input_data=reprocess_input,
        overwrite_output_dir=overwrite_output_dir,
        n_gpu=n_gpu,
        manual_seed=0)

    model = ClassificationModel("roberta",
                                model_name=model_name,
                                args=model_args,
                                use_cuda=True)

    scoring = dict(f1=sklearn.metrics.f1_score,
                   acc=sklearn.metrics.accuracy_score,
                   eer=eer)

    # Train with periodic validation, then score the held-out test split.
    model.train_model(train_df, eval_df=valid_df, **scoring)
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df, **scoring)
Esempio n. 5
0
def main(
    source=source,
    data_dir='data',
    checkpoint_dir="outputs/" + experiment_name,
    n_train=np.inf,
    n_valid=5000,
    n_epochs=10,
    n_test=np.inf,
    reprocess_input=True,
    small=True,
):
    """Fine-tune roberta-large with prompt stripping and subsequence augmentation.

    Pipeline: load the splits, strip any generation prompt prefix from each
    text, invert label polarity, optionally augment the training set with
    random short subsequences (``small=True``), then train and evaluate a
    simpletransformers ``ClassificationModel`` with f1/accuracy/EER metrics.

    NOTE(review): the validation set is drawn from the 'test' split (first
    ``n_valid`` examples of ``source_test``) — confirm the overlap with the
    final test set is intended.
    """

    def _strip_prompts(texts):
        """Drop a leading generation prompt ('Article: ', ...) in place."""
        for i, text in enumerate(texts):
            for key in ['Article: ', 'Body: ', 'Abstract: ']:
                if key in text:
                    texts[i] = text.split(key)[-1]

    def _flip_labels(labels):
        """Invert label polarity (0 <-> 1), returning plain ints."""
        return [int(not label) for label in labels]

    def _sample_sequences(texts, labels):
        """Augment with one random window per length in {16, 32, 64, 128, 256}.

        Each text longer than a given window size contributes one uniformly
        placed subsequence carrying the same label; originals are kept.
        """
        small_texts = []
        small_labels = []
        for text, label in zip(texts, labels):
            toks = text.split()
            for seq_len in [16, 32, 64, 128, 256]:
                if len(toks) > seq_len:
                    start_idx = random.randrange(len(toks) - seq_len)
                    subseq = toks[start_idx:start_idx + seq_len]
                    small_texts.append(" ".join(subseq))
                    small_labels.append(label)
        return texts + small_texts, labels + small_labels

    train_texts, train_labels = data_loading.load_split(data_dir,
                                                        source,
                                                        'train',
                                                        n=n_train)
    valid_texts, valid_labels = data_loading.load_split(data_dir,
                                                        source_test,
                                                        'test',
                                                        n=n_valid)
    test_texts, test_labels = data_loading.load_split(data_dir,
                                                      source_test,
                                                      'test',
                                                      n=n_test)

    # Same cleaning for every split: strip prompts, then flip labels.
    _strip_prompts(train_texts)
    train_labels = _flip_labels(train_labels)
    _strip_prompts(valid_texts)
    valid_labels = _flip_labels(valid_labels)
    _strip_prompts(test_texts)
    test_labels = _flip_labels(test_labels)

    # Optional short-sequence augmentation (training set only).
    if small:
        train_texts, train_labels = _sample_sequences(train_texts, train_labels)

    # Two-column frames in the shape simpletransformers expects.
    train_df = pd.DataFrame(data={'text': train_texts, 'labels': train_labels})
    valid_df = pd.DataFrame(data={'text': valid_texts, 'labels': valid_labels})
    test_df = pd.DataFrame(data={'text': test_texts, 'labels': test_labels})

    # Training/evaluation configuration for simpletransformers.
    model_args = ClassificationArgs(num_train_epochs=n_epochs,
                                    evaluate_during_training=True,
                                    manual_seed=0,
                                    train_batch_size=16,
                                    eval_batch_size=32,
                                    overwrite_output_dir=True,
                                    n_gpu=2,
                                    output_dir=checkpoint_dir,
                                    reprocess_input_data=reprocess_input,
                                    cache_dir="cache_dir/" + experiment_name,
                                    best_model_dir='outputs/best_model_' +
                                    experiment_name,
                                    max_seq_length=256)

    model = ClassificationModel("roberta",
                                "roberta-large",
                                args=model_args,
                                use_cuda=True)

    # Train with periodic validation.
    model.train_model(train_df,
                      eval_df=valid_df,
                      f1=sklearn.metrics.f1_score,
                      acc=sklearn.metrics.accuracy_score,
                      eer=eer)

    # Final held-out evaluation.
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer)
Esempio n. 6
0
def main(
        data_dir, 
        log_dir, 
        source='xl-1542M-k40', 
        n_train=500000, 
        n_valid=10000, 
        n_test=np.inf,
        n_jobs=None, 
        n_jobs_custom=None, 
        verbose=False,
        save_featureizer=False,
        save_model=False,
        save_features=False,
        load_featureizer=None,
        load_features=None,
        load_model=None,
        no_hyperparam_search=False,
        tfidf_features=False,
        custom_features=False,
        dual=False,
        max_iter=1000,
        test_only=False,
        min_df=5,
    ):
    """Train/evaluate a logistic-regression detector over tf-idf and/or custom features.

    Stages, each timed and reported on stdout:
      1. load the train/valid/test text splits,
      2. build (or unpickle) a feature extractor and featurize the texts,
         or unpickle previously saved feature matrices from ``log_dir``,
      3. fit ``LogisticRegression`` — optionally grid-searching C over a
         predefined train/valid split — or unpickle a saved model,
      4. score valid/test accuracy and dump the stats to ``log_dir/stats.json``,
      5. optionally pickle the features, model, and featureizer to ``log_dir``.
    """
    start_time = time.time()
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # Loading data.
    # NOTE(review): 'valid' is loaded as the TRAINING set and 'train' as the
    # VALIDATION set — confirm this split swap is intentional (e.g. a smaller
    # training set) and not a typo.
    train_texts, train_labels = data_loading.load_split(data_dir, source, 'valid', n=n_train)
    valid_texts, valid_labels = data_loading.load_split(data_dir, source, 'train', n=n_valid)
    test_texts, test_labels = data_loading.load_split(data_dir, source, 'test', n=n_test)

    cur_time = time.time()
    print(f'{cur_time - start_time:.2f}\tFinished loading data.')
    start_time = cur_time

    # Extracting features.
    if not load_features:
        if not load_featureizer:
            # Build a fresh FeatureUnion from the requested feature families.
            transformers = []
            if tfidf_features:
                transformers.append([
                    'tfidf',
                    TfidfVectorizer(ngram_range=(1, 2), min_df=min_df, max_features=2**21)
                ])
            if custom_features:
                transformers.append([
                    'custom_features',
                    features.CustomFeatures(n_jobs=n_jobs_custom)
                ])
            assert len(transformers) >= 1, f'You should select at least one set of features to use.'
            vect = FeatureUnion(transformers, n_jobs=min(n_jobs, len(transformers)))
            train_features = vect.fit_transform(train_texts)
        else:
            # Reuse a pickled featureizer; skip featurizing the training set
            # when only testing.
            # NOTE(review): with test_only=True and no load_model,
            # `train_features` is never bound but model.fit below would use
            # it — verify callers always pass load_model with test_only.
            with open(os.path.join(load_featureizer, 'featureizer.pickle'), 'rb') as infile:
                vect = pickle.load(infile)
                if not test_only:
                    train_features = vect.transform(train_texts)
        valid_features = vect.transform(valid_texts)
        test_features = vect.transform(test_texts)
    else:
        # Reuse feature matrices pickled by a previous run (save_features).
        with open(os.path.join(log_dir, 'train_features.pickle'), 'rb') as infile:
            train_features = pickle.load(infile)
        with open(os.path.join(log_dir, 'valid_features.pickle'), 'rb') as infile:
            valid_features = pickle.load(infile)
        with open(os.path.join(log_dir, 'test_features.pickle'), 'rb') as infile:
            test_features = pickle.load(infile)

    cur_time = time.time()
    print(f'{cur_time - start_time:.2f}\tFinished extracting features. Number of features: {test_features.shape[1]}')
    start_time = cur_time

    # Training the model.
    if not load_model:
        model = LogisticRegression(solver='liblinear', dual=dual, max_iter=max_iter)
        if not no_hyperparam_search:
            # Grid-search C on a fixed train(-1)/valid(0) partition: train and
            # valid features are stacked and the PredefinedSplit marks which
            # rows are held out.
            # NOTE(review): the split assumes exactly n_train + n_valid rows
            # were loaded — breaks if load_split returned fewer (e.g. np.inf
            # caps); confirm sizes always match.
            params = {'C': [1/64, 1/32, 1/16, 1/8, 1/4, 1/2, 1, 2, 4, 8, 16, 32, 64]}
            split = PredefinedSplit([-1]*n_train+[0]*n_valid)
            search = GridSearchCV(model, params, cv=split, n_jobs=n_jobs, verbose=verbose, refit=False)
            search.fit(sparse.vstack([train_features, valid_features]), train_labels+valid_labels)
            model = model.set_params(**search.best_params_)
            cur_time = time.time()
            print(f'{cur_time - start_time:.2f}\tFinished hyperparam search.')
            start_time = cur_time
        # Refit on the training features with the chosen hyperparameters.
        model.fit(train_features, train_labels)
    else:
        with open(os.path.join(load_model, 'model.pickle'), 'rb') as infile:
            model = pickle.load(infile)
    cur_time = time.time()
    print(f'{cur_time - start_time:.2f}\tFinished training model.')
    start_time = cur_time

    # Scoring the model.
    valid_accuracy = model.score(valid_features, valid_labels)*100.
    test_accuracy = model.score(test_features, test_labels)*100.
    data = {
        'source':source,
        'n_train':n_train,
        'valid_accuracy':valid_accuracy,
        'test_accuracy':test_accuracy
    }
    cur_time = time.time()
    print(f'{cur_time - start_time:.2f}\tFinished evaluating model.')
    start_time = cur_time
    print(data)
    # NOTE(review): the file handle from open() is never closed explicitly.
    json.dump(data, open(os.path.join(log_dir, f'stats.json'), 'w'))

    # Saving the model.
    if save_features:
        with open(os.path.join(log_dir, 'train_features.pickle'), 'wb') as outfile:
            pickle.dump(train_features, outfile)
        with open(os.path.join(log_dir, 'valid_features.pickle'), 'wb') as outfile:
            pickle.dump(valid_features, outfile)
        with open(os.path.join(log_dir, 'test_features.pickle'), 'wb') as outfile:
            pickle.dump(test_features, outfile)
        
        cur_time = time.time()
        print(f'{cur_time - start_time:.2f}\tFinished saving features.')
        start_time = cur_time
    if save_model:
        with open(os.path.join(log_dir, 'model.pickle'), 'wb') as outfile:
            pickle.dump(model, outfile)
        cur_time = time.time()
        print(f'{cur_time - start_time:.2f}\tFinished saving model.')
        start_time = cur_time
    if save_featureizer:
        with open(os.path.join(log_dir, 'featureizer.pickle'), 'wb') as outfile:
            pickle.dump(vect, outfile)
            cur_time = time.time()
            print(f'{cur_time - start_time:.2f}\tFinished saving featureizer.')
            start_time = cur_time
Esempio n. 7
0
def main(
    source=source,
    data_dir='data',
    checkpoint_dir="outputs/eval2/openai_finetune",
    n_train=120000,
    n_valid=5000,
    n_epochs=20,
    n_test=5000,
    reprocess_input=False,
):
    """Fine-tune the OpenAI RoBERTa detector on raw (un-stripped) text.

    Unlike the prompt-stripping variants in this file, texts are used exactly
    as loaded; only the label polarity is inverted. The valid and test sets
    are both drawn from the 'test' split of the module-level ``source_test``.
    """
    train_texts, train_labels = data_loading.load_split(
        data_dir, source, 'train', n=n_train)
    valid_texts, valid_labels = data_loading.load_split(
        data_dir, source_test, 'test', n=n_valid)
    test_texts, test_labels = data_loading.load_split(
        data_dir, source_test, 'test', n=n_test)

    # Invert label polarity (0 <-> 1) on every split.
    train_labels = [int(not y) for y in train_labels]
    valid_labels = [int(not y) for y in valid_labels]
    test_labels = [int(not y) for y in test_labels]

    # Two-column frames in the shape simpletransformers expects.
    train_df = pd.DataFrame(data={'text': train_texts, 'labels': train_labels})
    valid_df = pd.DataFrame(data={'text': valid_texts, 'labels': valid_labels})
    test_df = pd.DataFrame(data={'text': test_texts, 'labels': test_labels})

    # Training/evaluation configuration for simpletransformers.
    model_args = ClassificationArgs(
        num_train_epochs=n_epochs,
        evaluate_during_training=True,
        evaluate_during_training_steps=2000,
        best_model_dir='outputs/eval2/best_model_openai_finetune',
        manual_seed=0,
        train_batch_size=32,
        eval_batch_size=128,
        overwrite_output_dir=True,
        n_gpu=2,
        output_dir=checkpoint_dir,
        reprocess_input_data=reprocess_input,
        learning_rate=0.00001)

    model = ClassificationModel("roberta",
                                model_name="roberta-large-openai-detector",
                                args=model_args,
                                use_cuda=True)

    scoring = dict(f1=sklearn.metrics.f1_score,
                   acc=sklearn.metrics.accuracy_score,
                   eer=eer)

    # Train with periodic validation, then score the held-out test split.
    model.train_model(train_df, eval_df=valid_df, **scoring)
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df, **scoring)
Esempio n. 8
0
def main(
        sources=sources,
        data_dir='data',
        load_model_dir="outputs/analytic_checkpoint_v0",
        checkpoint_dir="outputs/hackathon_eval1",
        n_train=250000,
        n_valid=10000,
        n_test=np.inf,
        reprocess_input=False,
    ):
    """Evaluate the OpenAI RoBERTa detector on several sources, dumping CSV scores.

    For each source in ``sources``: loads its test split, strips generation
    prompts, inverts label polarity, evaluates with f1/accuracy/EER, and
    writes one score per row to ``hackathon_outputs/<source>.csv``.

    ``n_train``/``n_valid``/``load_model_dir`` are currently unused — the
    local-checkpoint load is commented out in favor of the hub detector.
    """
    # Evaluation-time configuration (the training-related fields are unused
    # on this code path).
    model_args = ClassificationArgs(
        output_dir=checkpoint_dir,
        num_train_epochs=2,
        evaluate_during_training=True,
        save_steps=25000,
        evaluate_during_training_steps=25000,
        manual_seed=0,
        train_batch_size=256,
        eval_batch_size=256,
        overwrite_output_dir=True,
        reprocess_input_data=reprocess_input,
        n_gpu=1,
        no_cache=True,
    )

    # Hub detector; swap in `load_model_dir` to evaluate a local checkpoint.
    model = ClassificationModel(
        "roberta",
        # load_model_dir,
        model_name="roberta-large-openai-detector",
        args=model_args,
        use_cuda=True
    )

    for source in sources:
        print(source)

        test_texts, test_labels = data_loading.load_split(data_dir, source, 'test', n=n_test)

        # Strip any generation prompt prefix so only the body text is scored.
        for i, text in enumerate(test_texts):
            for key in ['Article: ', 'Body: ', 'Abstract: ']:
                if key in text:
                    test_texts[i] = text.split(key)[-1]

        # Invert label polarity; cast to int for consistency with the other
        # entry points in this file (sklearn treats True/1 identically).
        test_labels = [int(not label) for label in test_labels]

        # Two-column frame in the shape simpletransformers expects.
        test_df = pd.DataFrame(data={'text': test_texts, 'labels': test_labels})

        # Evaluate the model on this source.
        result, model_outputs, wrong_predictions = model.eval_model(
            test_df,
            f1=sklearn.metrics.f1_score,
            acc=sklearn.metrics.accuracy_score,
            eer=eer
        )
        print(result)

        # One line per example: index plus the model output for class 1
        # (presumably the "generated" score — confirm against eval_model docs).
        with open('hackathon_outputs/' + source + '.csv', 'w') as outfile:
            outfile.write('row_id, score\n')
            for line, output in enumerate(model_outputs):
                outfile.write(f'{line}, {output[1]}\n')