Example #1
def make_submission(with_model):
    """
    Generates a submission for the leaderboard
    """

    train_files = resources.train_data_files()
    train_observations = filename_to_id(train_files)
    train_label_dict = resources.train_data_labels()
    train_labels = np.array(
        [train_label_dict[ob_id] for ob_id in train_observations])

    test_files = resources.test_data_files()
    test_observations = filename_to_id(test_files)

    preprocess_data = preprocess(with_model, train_files, test_files)

    trained_model = train(with_model, train_files, train_observations,
                          train_labels, *preprocess_data)

    Y = predict(with_model, trained_model, test_files, test_observations,
                *preprocess_data)

    # Write the output:

    output_file = "{}/submission_{}.txt".format(resolve('..'),
                                                strftime("%Y-%m-%d_%H:%M:%S"))

    with open(output_file, 'w') as f:
        for (observation, result) in zip(test_observations, Y):
            f.write("{} {}\n".format(observation, result))

    info(">> Wrote submission output to {}".format(output_file))
Example #2
def _create_CoreNLP_trainxml():
    """
    Generates CoreNLP output for the training data (run once)
    """
    input_files = resources.train_data_files()
    command.run_corenlp(resolve('~', 'corenlp'),
                        input_files,
                        resolve('..', 'data', 'CoreNLP', 'train_data'),
                        annotators=USE_ANNOTATORS)
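A companion sketch for the test data, assuming the same resources/command/resolve helpers used above; the 'test_data' output directory name is an assumption mirroring the training layout:

def _create_CoreNLP_testxml():
    """
    Generates CoreNLP output for the test data (run once) -- a sketch
    mirroring _create_CoreNLP_trainxml()
    """
    input_files = resources.test_data_files()
    command.run_corenlp(resolve('~', 'corenlp'),
                        input_files,
                        resolve('..', 'data', 'CoreNLP', 'test_data'),
                        annotators=USE_ANNOTATORS)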
Example #3
def all_sentences(for_data):
    """
    Returns a dict of all sentences data derived from CoreNLP. The key
    is the truncated filename (observation-ID), and the value is the output
    sentence data generated by parse_xml() for that particular file.

    @returns: {str: <sentence-data>}
    """
    assert for_data in ('train', 'test')

    if for_data == 'train':
        data_cache_file = CORENLP_TRAIN_DATA_CACHE
    else:
        data_cache_file = CORENLP_TEST_DATA_CACHE

    # If there's cached data, load it:
    if exists(data_cache_file):

        debug('> Loading cached CoreNLP data from {}'.format(data_cache_file))

        with open(data_cache_file, 'rb') as f:
            return pickle.load(f)

    # Otherwise, generate the output from parse_xml()
    debug('> CoreNLP data cache {} not found; generating...'.format(data_cache_file))

    if for_data == 'train':
        filenames = resources.train_data_files('CoreNLP')
    else:
        filenames = resources.test_data_files('CoreNLP')


    # parse_xml(filename)[1] keeps only the actual sentence data, not the
    # file name/observation identifier. splitext() also lops off the ".xml"
    # suffix of the CoreNLP output filename, preserving the original filename.
    data = {splitext(filename_to_id(filename))[0]: parse_xml(filename)[1]
            for filename in filenames}

    with open(data_cache_file, 'wb') as f:
        pickle.dump(data, f)

    debug('> CoreNLP data cached to {}'.format(data_cache_file))

    return data
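A brief usage sketch; the observation ID below is hypothetical:

# Loads from CORENLP_TRAIN_DATA_CACHE when present, otherwise parses the
# CoreNLP XML output and caches it:
train_sentences = all_sentences('train')

# Look up the parse_xml() sentence data for one (hypothetical) observation:
sentence_data = train_sentences.get('obs_00042')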
Example #4
def read_file(filetype, index=None):
    """
    Utility function to fetch a listing of the file or files that constitute 
    the training and test data set. 

    @param str filetype One of 'train' or 'test'
    @param int|None index If given, the file in the file list at the given index
    will be read from, otherwise all files will be read from
    @returns str|[str]
    """
    filelist = []

    if filetype == 'train':
        filelist = resources.train_data_files()
    elif filetype == 'test':
        filelist = resources.test_data_files()
    else:
        raise ValueError('filetype must be "train" or "test"; got ' + filetype)

    return map(utils.files.read_file, filelist) \
           if index is None else utils.files.read_file(filelist[index])
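A usage sketch for both call forms; note that this codebase targets Python 2, where map() returns a list:

# Read the contents of every training file:
all_train_texts = read_file('train')

# Read only the first file in the test set:
first_test_text = read_file('test', index=0)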
Example #5
def test(model_name,
         test_size=0.1,
         suppress_output=False,
         show_results=False,
         *args,
         **kwargs):
    """
    Runs a full test cycle for the given model

    Options:

    n_folds:         Number of cross-validation folds to produce
    suppress_output: If True, no output will be produced
    show_results:    If True, individual prediction results will be printed.
                     suppress_output must also be False for this option to work

    @returns (float:accuracy, int:correct_count, int:incorrect_count)
    """
    observations = resources.train_data_files('text')
    labels = resources.train_data_labels()

    # Fed into sklearn cross_validation
    Y = np.array([labels[ob_id] for ob_id in filename_to_id(observations)])

    # Divide the observation data into two sets for training and testing:

    (train_files, test_files, train_labels, true_labels) = \
        cross_validation.train_test_split(observations, Y, test_size=test_size)

    assert (len(train_files) == len(train_labels)
            and len(test_files) == len(true_labels))

    info("> test size: {}, |train| (kept): {}, |test| (held out): {}"\
         .format(test_size, len(train_files), len(test_files)))

    # Get any preprocessing data and pass it to train() and predict() later:
    data = preprocess(model_name, train_files, test_files)

    # Generate training features:
    trained_model = train(model_name,
                          train_files,
                          filename_to_id(train_files),
                          train_labels,
                          *data)

    # Same as training: the observation ID is just the basename of the input
    test_observation_ids = filename_to_id(test_files)

    # Use the trained model to make predictions:
    predicted_labels = predict(model_name,
                               trained_model,
                               test_files,
                               test_observation_ids,
                               *data)

    accuracy = metrics.accuracy_score(true_labels, predicted_labels)
    cm = metrics.confusion_matrix(true_labels, predicted_labels)
    f1_score = metrics.f1_score(true_labels, predicted_labels)
    correct = cm[0][0] + cm[1][1]
    incorrect = cm[1][0] + cm[0][1]
    incorrect_observations = set()

    # Collect misclassified observations unconditionally, since they are part
    # of the return value:
    for i in range(len(test_observation_ids)):
        if true_labels[i] != predicted_labels[i]:
            incorrect_observations.add(test_observation_ids[i])

    if not suppress_output:
        line = '*' * 80
        print
        print line
        print "Accuracy: {}%".format(accuracy * 100.0)
        print "F1-Score: {}".format(f1_score)
        print "Confusion matrix:\n", cm
        print "Incorrect labelled as 1: {}; Incorrect labelled as -1: {}".format(
            cm[1][0], cm[0][1])
        if show_results:
            print "Incorrect:"
            for i in range(len(test_observation_ids)):
                if true_labels[i] != predicted_labels[i]:
                    print "TRUE: {}, PREDICTED: {}, LEAD: {}".format(
                        true_labels[i], predicted_labels[i],
                        test_observation_ids[i])
        print
        print line
        print
    return (accuracy, correct, incorrect, incorrect_observations)
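A usage sketch; the model name is a hypothetical placeholder:

# Hold out 20% of the training observations and evaluate:
(accuracy, correct, incorrect, missed_ids) = test('bag_of_words',
                                                  test_size=0.2)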