import os
import pickle
from copy import copy
from os import listdir

import vocabulary_embeddings_extractor


def load_my_data(directory, test_split=0.2, nb_words=None):
    # directory = '/home/habi/research/data/convincingness/step5-gold-data/'
    # directory = '/home/user-ukp/data2/convincingness/step7-learning-11-no-eq/'
    files = listdir(directory)
    # print(files)

    # folds
    folds = dict()
    for file_name in files:
        training_file_names = copy(files)
        # remove current file
        training_file_names.remove(file_name)
        folds[file_name] = {"training": training_file_names, "test": file_name}

    # print(folds)

    word_to_indices_map, word_index_to_embeddings_map = vocabulary_embeddings_extractor.load_all()

    # results: map with fold_name (= file_name) and two tuples: (train_x, train_y, train_ids), (test_x, test_y, test_ids)
    output_folds_with_train_test_data = dict()

    # load all data first
    all_loaded_files = dict()
    for file_name in folds.keys():
        # print(file_name)
        instances, labels, ids = load_single_file(directory, file_name, word_to_indices_map, nb_words)
        all_loaded_files[file_name] = instances, labels, ids
    print("Loaded", len(all_loaded_files), "files")

    # parse each csv file in the directory
    for file_name in folds.keys():
        # print(file_name)

        # add new fold
        output_folds_with_train_test_data[file_name] = dict()

        # fill fold with train data
        current_fold = output_folds_with_train_test_data[file_name]

        test_instances, test_labels, ids = all_loaded_files.get(file_name)

        # add tuple
        current_fold["test"] = test_instances, test_labels, ids

        # now collect all training instances
        all_training_instances = []
        all_training_labels = []
        all_training_ids = []
        for training_file_name in folds.get(file_name)["training"]:
            training_instances, training_labels, ids = all_loaded_files.get(training_file_name)
            all_training_instances.extend(training_instances)
            all_training_labels.extend(training_labels)
            all_training_ids.extend(ids)

        current_fold["training"] = all_training_instances, all_training_labels, all_training_ids

    # now we should have all data loaded

    return output_folds_with_train_test_data, word_index_to_embeddings_map
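

# Minimal usage sketch (the directory path is a placeholder and this helper is
# illustrative only): each fold returned by load_my_data() uses one CSV file as
# the test set and all remaining files as training data, i.e. leave-one-out
# cross-validation over files.
def example_cross_validation(directory='/path/to/csv-folder/'):
    folds_data, word_index_to_embeddings_map = load_my_data(directory)
    for fold_name, fold in folds_data.items():
        train_x, train_y, train_ids = fold["training"]
        test_x, test_y, test_ids = fold["test"]
        print(fold_name, len(train_x), "training /", len(test_x), "test instances")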


if __name__ == '__main__':

    print(
        'This script trains a model on the UKPConvArgStrict dataset. Before running this script, you '
        'need to run "python/analysis/habernal_comparison/run_preprocessing.py" to extract the linguistic features '
        'from this dataset.')

    word_to_indices_map, word_index_to_embeddings_map, index_to_word_map = vocabulary_embeddings_extractor.load_all(
        embeddings_dir + 'vocabulary.embeddings.all.pkl.bz2')
    embeddings = load_embeddings(word_index_to_embeddings_map)

    train_model(embeddings)

    # Load the model and the embeddings from file
    with open(pkl_file, 'rb') as fh:
        model = pickle.load(fh)

    # Now load some test documents for RANKING and extract their features
    input_dir = os.path.abspath(test_data_path)
    tmp_dir = os.path.abspath('./data/tempdata')
    output_dir = os.path.abspath('./data/new_ranking_libsvm')

    # use this directory to get a mapping from features to integers that matches the training set
    feature_dir = os.path.join(os.path.expanduser(training_data_path),
Example #3
def load_my_data_separate_args(directory, test_split=0.2, nb_words=None, add_reversed_training_data=False,
                               embeddings_dir=''):
    # directory = '/home/habi/research/data/convincingness/step5-gold-data/'
    # directory = '/home/user-ukp/data2/convincingness/step7-learning-11-no-eq/'
    files = listdir(directory)
    
    # keep only .csv files; iterate over a copy, because removing items from a
    # list while iterating over it skips elements
    for file_name in list(files):
        if not file_name.endswith('.csv'):
            print("Skipping file without .csv suffix: %s" % (directory + '/' + file_name))
            files.remove(file_name)
    
    # print(files)

    # folds
    folds = dict()
    for file_name in files:
        training_file_names = copy(files)
        # remove current file
        training_file_names.remove(file_name)
        folds[file_name] = {"training": training_file_names, "test": file_name}

    # print(folds)

    word_to_indices_map, word_index_to_embeddings_map, index_to_word_map = vocabulary_embeddings_extractor.load_all(
        embeddings_dir + 'vocabulary.embeddings.all.pkl.bz2')

    # results: map with fold_name (= file_name) and two 7-tuples for training and test:
    # (instances_a1, instances_a2, labels, ids, turker_ids, texts_a1, texts_a2)
    output_folds_with_train_test_data = dict()

    # load all data first
    all_loaded_files = dict()
    for file_name in folds.keys():
        #print(file_name)
        instances_a1, instances_a2, labels, ids, turker_ids, texts_a1, texts_a2 = \
            load_single_file_separate_args(directory, file_name, word_to_indices_map, nb_words)
        all_loaded_files[file_name] = instances_a1, instances_a2, labels, ids, turker_ids, texts_a1, texts_a2
    print("Loaded", len(all_loaded_files), "files")

    # parse each csv file in the directory
    for file_name in folds.keys():
        #print("Test fold: ")
        #print(file_name)

        # add new fold
        output_folds_with_train_test_data[file_name] = dict()

        # fill fold with train data
        current_fold = output_folds_with_train_test_data[file_name]

        test_instances_a1, test_instances_a2, test_labels, ids, turkerids, test_a1, test_a2 = all_loaded_files.get(file_name)

        # add tuple
        current_fold["test"] = test_instances_a1, test_instances_a2, test_labels, ids, turkerids, test_a1, test_a2

        # now collect all training instances
        all_tr_instances_a1 = []
        all_tr_instances_a2 = []
        all_tr_labels = []
        all_tr_ids = []
        all_tr_turker_ids = []
        all_tr_a1 = []
        all_tr_a2 = []
        for training_file_name in folds.get(file_name)["training"]:
            tr_instances_a1, tr_instances_a2, training_labels, ids, turker_ids, tr_a1, tr_a2 = \
                                                                            all_loaded_files.get(training_file_name)
            #print("Training file: ")
            #print(training_file_name)
            all_tr_instances_a1.extend(tr_instances_a1)
            all_tr_instances_a2.extend(tr_instances_a2)
            all_tr_labels.extend(training_labels)
            all_tr_ids.extend(ids)
            all_tr_turker_ids.extend(turker_ids)
            all_tr_a1.extend(tr_a1)
            all_tr_a2.extend(tr_a2)

        current_fold["training"] = all_tr_instances_a1, all_tr_instances_a2, all_tr_labels, all_tr_ids, \
                all_tr_turker_ids, all_tr_a1, all_tr_a2

    # now we should have all data loaded

    return output_folds_with_train_test_data, word_index_to_embeddings_map, word_to_indices_map, index_to_word_map
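

# Minimal usage sketch (the directory path is a placeholder and this helper is
# illustrative only): the separate-args variant keeps the two argument sides
# apart, which suits models with two input branches, e.g. a Siamese-style
# network over argument pairs.
def example_separate_args(directory='/path/to/csv-folder/'):
    folds_data, _, _, _ = load_my_data_separate_args(directory)
    for fold_name, fold in folds_data.items():
        (tr_a1, tr_a2, tr_labels, tr_ids,
         tr_turker_ids, tr_texts_a1, tr_texts_a2) = fold["training"]
        (te_a1, te_a2, te_labels, te_ids,
         te_turker_ids, te_texts_a1, te_texts_a2) = fold["test"]
        print(fold_name, len(tr_labels), "training pairs,", len(te_labels), "test pairs")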


def load_my_data(directory, test_split=0.2, nb_words=None, add_reversed_training_data=False):
    # directory = '/home/habi/research/data/convincingness/step5-gold-data/'
    # directory = '/home/user-ukp/data2/convincingness/step7-learning-11-no-eq/'
    files = listdir(directory)
    # print(files)

    # folds
    folds = dict()
    for file_name in files:
        training_file_names = copy(files)
        # remove current file
        training_file_names.remove(file_name)
        folds[file_name] = {"training": training_file_names, "test": file_name}

    # print(folds)

    word_to_indices_map, word_index_to_embeddings_map = vocabulary_embeddings_extractor.load_all()

    # results: map with fold_name (= file_name) and two tuples: (train_x, train_y, train_ids), (test_x, test_y, test_ids)
    output_folds_with_train_test_data = dict()

    # load all data first
    all_loaded_files = dict()
    for file_name in folds.keys():
        # print(file_name)
        instances, labels, ids, x_vectors_reversed, y_labels_reversed = load_single_file(
            directory, file_name, word_to_indices_map, nb_words)
        all_loaded_files[file_name] = instances, labels, ids, x_vectors_reversed, y_labels_reversed
    print("Loaded", len(all_loaded_files), "files")

    # parse each csv file in the directory
    for file_name in folds.keys():
        # print(file_name)

        # add new fold
        output_folds_with_train_test_data[file_name] = dict()

        # fill fold with train data
        current_fold = output_folds_with_train_test_data[file_name]

        test_instances, test_labels, ids, test_x_vectors_reversed, test_y_labels_reversed = all_loaded_files.get(
            file_name)

        # add tuple
        current_fold["test"] = test_instances, test_labels, ids

        # now collect all training instances
        all_training_instances = []
        all_training_labels = []
        all_training_ids = []
        for training_file_name in folds.get(file_name)["training"]:
            training_instances, training_labels, ids, x_vectors_reversed, y_labels_reversed = all_loaded_files.get(
                training_file_name)
            all_training_instances.extend(training_instances)
            all_training_labels.extend(training_labels)
            all_training_ids.extend(ids)

            if add_reversed_training_data:
                all_training_instances.extend(x_vectors_reversed)
                all_training_labels.extend(y_labels_reversed)
                all_training_ids.extend(ids)

        current_fold["training"] = all_training_instances, all_training_labels, all_training_ids

    # now we should have all data loaded

    return output_folds_with_train_test_data, word_index_to_embeddings_map
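

# Hedged sketch of the reversed-data convention assumed by
# add_reversed_training_data (the actual reversal happens inside
# load_single_file, which is not shown here, so this only illustrates the
# assumed convention): swapping the two arguments of a pair flips the binary
# preference label, letting each annotated pair be used twice.
def reverse_pair(x_pair, y_label):
    arg_a, arg_b = x_pair  # a pair of encoded arguments
    return (arg_b, arg_a), 1 - y_label  # assumes 0/1 preference labels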