Example #1
def test_data_split_on_offenseval():
    task = of.Offenseval()
    task.load(offenseval_data_dir)
    # Without a dev split, the task's own train/test partition is used.
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=False)
    assert len(train_X) == 13240
    assert len(test_X) == 320
    assert isinstance(train_X[0], str)

    # With split_train_dev=True, the training data is re-split 90/10
    # into train and dev portions (dev is returned in the test slots).
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=True)
    assert len(train_X) == 13240 * 0.9
    assert len(test_X) == 13240 * 0.1
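From the calls in these examples one can infer the rough shape of utils.get_instances. The sketch below is an assumption based purely on that usage (the attribute names on the task object and the default proportions are guesses), not the library's actual implementation:

def get_instances(task, split_train_dev, proportion_train=0.9, proportion_dev=0.1):
    # Without a dev split, return the task's own train/test partition.
    if not split_train_dev:
        return task.train_X, task.train_y, task.test_X, task.test_y
    # Otherwise carve train and dev portions out of the training data;
    # the dev portion is returned in the test slots.
    n = len(task.train_X)
    n_train = int(n * proportion_train)
    n_dev = int(n * proportion_dev)
    train_X = task.train_X[:n_train]
    train_y = task.train_y[:n_train]
    dev_X = task.train_X[n_train:n_train + n_dev]
    dev_y = task.train_y[n_train:n_train + n_dev]
    return train_X, train_y, dev_X, dev_y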
Example #2
def run(task_name, data_dir, pipeline_name, print_predictions):
    logger.info('>> Running {} experiment'.format(task_name))
    tsk = task(task_name)
    logger.info('>> Loading data...')
    tsk.load(data_dir)
    logger.info('>> retrieving train/test instances...')
    train_X, train_y, test_X, test_y = utils.get_instances(
        tsk, split_train_dev=False)
    # Keep a reference to the raw test texts for printing predictions later.
    test_X_ref = test_X

    if pipeline_name.startswith('cnn'):
        pipe = cnn(pipeline_name)
        logger.info('>> encoding data for CNN...')
        train_X, train_y, test_X, test_y = pipe.encode(train_X, train_y,
                                                       test_X, test_y)
    else:
        pipe = pipeline(pipeline_name)

    logger.info('>> training pipeline ' + pipeline_name)
    pipe.fit(train_X, train_y)
    if pipeline_name == 'naive_bayes_counts_lex':
        logger.info("   -- Found {} tokens in lexicon".format(
            pipe.tokens_from_lexicon))

    logger.info('>> testing...')
    sys_y = pipe.predict(test_X)

    logger.info('>> evaluation...')
    logger.info(utils.eval(test_y, sys_y))

    if print_predictions:
        logger.info('>> predictions')
        utils.print_all_predictions(test_X_ref, test_y, sys_y, logger)
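For reference, a hypothetical invocation of this runner; the task name, data directory, and pipeline name below are illustrative, not taken from the project:

run('offenseval', 'data/offenseval', 'naive_bayes_counts', print_predictions=True)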
Example #3
def run(task_name, data_dir, pipeline_name):
    logger.info('>> Running {} experiment'.format(task_name))
    tsk = task(task_name)
    logger.info('>> Loading data...')
    tsk.load(data_dir)
    logger.info('>> retrieving train/test instances...')
    train_X, train_y, test_X, test_y = utils.get_instances(
        tsk, split_train_dev=False)

    if pipeline_name.startswith('cnn'):
        pipe = cnn(pipeline_name)
        logger.info('>> encoding data for CNN...')
        train_X, train_y, test_X, test_y = pipe.encode(train_X, train_y,
                                                       test_X, test_y)
    else:
        pipe = pipeline(pipeline_name)

    logger.info('>> training pipeline ' + pipeline_name)
    pipe.fit(train_X, train_y)

    logger.info('>> testing...')
    sys_y = pipe.predict(test_X)
    logger.info(utils.print_prediction(test_X, test_y, sys_y))
    logger.info('>> evaluation...')
    logger.info(utils.eval(test_y, sys_y))
Example #4
def test_data_load():
    task = vf.VuaFormat()
    task.load(test_data_dir)
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=False)
    assert len(train_X) == 199
    assert len(test_X) == 99
Example #5
def test_grid_search():
    task = of.Offenseval()
    task.load(offenseval_data_dir)
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=True, proportion_train=0.1, proportion_dev=0.01)
    params = {'clf__C': (0.1, 1)}
    best_sys_y = utils.grid_search(pipelines.svm_libsvc_counts(), params,
                                   train_X, train_y, test_X)
    assert len(best_sys_y) == len(test_y)
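Judging by its arguments and return value, utils.grid_search most likely wraps scikit-learn's GridSearchCV. A minimal sketch under that assumption (the cv value is an illustrative choice, not a known default of the helper):

from sklearn.model_selection import GridSearchCV

def grid_search(pipe, params, train_X, train_y, test_X):
    # Fit the grid over the pipeline's parameter space, then predict on
    # the held-out data with the refit best estimator.
    search = GridSearchCV(pipe, params, cv=3)
    search.fit(train_X, train_y)
    return search.best_estimator_.predict(test_X)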
Example #6
def test_naive_bayes_pipeline():
    task = of.Offenseval()
    task.load(offenseval_data_dir)
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=True, proportion_train=0.1, proportion_dev=0.01)
    pipe = pipelines.naive_bayes()
    pipe.fit(train_X, train_y)
    sys_y = pipe.predict(test_X)
    assert len(sys_y) == len(test_y)
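The step names 'clf' (Example #5's 'clf__C' parameter) and 'frm' (Example #10's named_steps) suggest the pipeline factories return two-step scikit-learn Pipelines. A plausible sketch of pipelines.naive_bayes(), assuming a count-based representation feeding a multinomial Naive Bayes classifier; the project's actual factory may differ:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

def naive_bayes():
    # Step names match those referenced elsewhere in these examples.
    return Pipeline([
        ('frm', CountVectorizer()),
        ('clf', MultinomialNB()),
    ])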
Example #7
def test_hate_speech():
    task = vf.VuaFormat()
    task.load(hate_speech_data_dir, ['testData.csv'])
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=True, proportion_train=0.1, proportion_dev=0.01)
    pipe = pipelines.naive_bayes()
    pipe.fit(train_X, train_y)
    sys_y = pipe.predict(test_X)
    assert len(sys_y) == len(test_y)
Example #8
def test_trac2018():
    task = vf.VuaFormat()
    task.load(trac_data_dir, 'devData.csv')
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=True, proportion_train=0.1, proportion_dev=0.01)
    pipe = pipelines.naive_bayes_counts()
    pipe.fit(train_X, train_y)
    sys_y = pipe.predict(test_X)
    assert len(sys_y) == len(test_y)
Example #9
def encode_data(data_dir):
    print('Loading data...')
    task = of.Offenseval()
    task.load(data_dir=data_dir)
    train_X, train_y, test_X, test_y = utils.get_instances(task, split_train_dev=False)
    print(len(train_X), 'train sequences')
    print(len(test_X), 'test sequences')

    train_X, train_y, test_X, test_y = encode(train_X, train_y, test_X, test_y)

    return train_X, train_y, test_X, test_y
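The encode step presumably maps raw texts to padded integer sequences suitable for a CNN. A minimal sketch, assuming a Keras-style tokenizer; the vocabulary size and sequence length are illustrative choices, and the project's actual encode may work differently:

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

def encode(train_X, train_y, test_X, test_y, num_words=20000, maxlen=100):
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(train_X)  # fit the vocabulary on training texts only
    train_X = pad_sequences(tokenizer.texts_to_sequences(train_X), maxlen=maxlen)
    test_X = pad_sequences(tokenizer.texts_to_sequences(test_X), maxlen=maxlen)
    return train_X, train_y, test_X, test_y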
Example #10
def run(task_name, data_dir, pipeline_name, print_predictions, error_analysis,
        remove_stopwords):
    logger.info('>> Running {} experiment'.format(task_name))
    tsk = task(task_name)
    logger.info('>> Loading data...')
    tsk.load(data_dir)
    logger.info('>> retrieving train/test instances...')
    train_X, train_y, test_X, test_y = utils.get_instances(
        tsk, split_train_dev=False)

    logger.info('>> Descriptive statistics dataset:')
    utils.descriptive_statistics(train_X, train_y, test_X, test_y)
    # Keep a reference to the raw test texts for printing predictions later.
    test_X_ref = test_X

    if remove_stopwords:
        # Switch to the stopword-removing variant of the chosen pipeline.
        if pipeline_name.startswith('cnn'):
            pipeline_name = pipeline_name.split('_')[0]
        pipeline_name = pipeline_name + '_stopwords'

    if pipeline_name.startswith('cnn'):
        pipe = cnn(pipeline_name)
        logger.info('>> encoding data for CNN...')
        train_X, train_y, test_X, test_y = pipe.encode(train_X, train_y,
                                                       test_X, test_y)

    else:
        pipe = pipeline(pipeline_name)

    logger.info('>> training pipeline ' + pipeline_name)

    pipe.fit(train_X, train_y)
    if pipeline_name == 'naive_bayes_counts_lex':
        logger.info("   -- Found {} tokens in lexicon".format(
            pipe.tokens_from_lexicon))

    logger.info('>> testing...')
    sys_y = pipe.predict(test_X)
    # logger.info(utils.print_prediction(test_X, test_y, sys_y))

    if print_predictions:
        logger.info('>> predictions')
        utils.print_all_predictions(test_X_ref, test_y, sys_y, logger)

    if error_analysis:
        # Used for error evaluation
        logger.info(utils.print_error_analysis(test_X, test_y, sys_y))
        # logger.info(utils.print_confusion_matrix(test_y, sys_y)) # Prints the confusion matrix

    utils.eval(test_y, sys_y, pipeline_name, data_dir)
    if pipeline_name.startswith('naive_bayes'):
        utils.important_features_per_class(pipe.named_steps.frm,
                                           pipe.named_steps.clf,
                                           n=10)
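A sketch of what utils.important_features_per_class could look like, assuming the 'frm' step is a fitted CountVectorizer and 'clf' a fitted MultinomialNB (the output format is illustrative, and get_feature_names_out requires scikit-learn >= 1.0):

import numpy as np

def important_features_per_class(vectorizer, classifier, n=10):
    feature_names = np.asarray(vectorizer.get_feature_names_out())
    for i, label in enumerate(classifier.classes_):
        # Features with the highest log-probability under this class.
        top = np.argsort(classifier.feature_log_prob_[i])[-n:][::-1]
        print(label, feature_names[top])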
Example #11
def test_representation():
    task = of.Offenseval()
    task.load(offenseval_data_dir)
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=True, proportion_train=0.1, proportion_dev=0.01)
    prep = preprocessing.Preprocessor(tokenize=True,
                                      normalize_tweet=False,
                                      lowercase=False,
                                      lemmatize=False)
    train_X = prep.transform(train_X)

    frmt = representation.count_vectorizer()
    train_X = frmt.fit_transform(train_X, train_y)
    assert not isinstance(train_X[0], str)
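The final assertion holds because fit_transform on a count vectorizer returns a sparse document-term matrix rather than strings. A self-contained illustration with scikit-learn's CountVectorizer, which representation.count_vectorizer() presumably wraps:

from sklearn.feature_extraction.text import CountVectorizer

texts = ['spam ham spam', 'ham eggs']
X = CountVectorizer().fit_transform(texts)  # scipy sparse matrix, shape (2, 3)
print(X.toarray())  # [[0 1 2], [1 1 0]] with columns eggs, ham, spam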
Example #12
def test_preprocessors():
    task = of.Offenseval()
    task.load(offenseval_data_dir)
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=True, proportion_train=0.1, proportion_dev=0.01)

    prep = preprocessing.Preprocessor(tokenize=False,
                                      normalize_tweet=False,
                                      lowercase=False,
                                      lemmatize=False)
    train_X_prep = prep.transform(train_X)
    assert len(train_X_prep) == len(train_X)
    assert isinstance(train_X_prep[0], str)

    prep = preprocessing.Preprocessor(tokenize=True,
                                      normalize_tweet=True,
                                      lowercase=True,
                                      lemmatize=True)
    train_X_prep = prep.transform(train_X)
    assert len(train_X_prep) == len(train_X)
    assert isinstance(train_X_prep[0], str)
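Taken together, the assertions pin down the Preprocessor's contract: transform maps a list of strings to an equally long list of strings, with each flag toggling an optional step. A minimal interface sketch under that assumption; the step bodies are placeholders, not the project's actual preprocessing logic:

class Preprocessor:
    def __init__(self, tokenize, normalize_tweet, lowercase, lemmatize):
        self.tokenize = tokenize
        self.normalize_tweet = normalize_tweet
        self.lowercase = lowercase
        self.lemmatize = lemmatize

    def transform(self, texts):
        # Apply only the enabled steps; the output stays a list of strings.
        out = []
        for text in texts:
            if self.lowercase:
                text = text.lower()
            out.append(text)
        return out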