def load_my_data(directory, test_split=0.2, nb_words=None):
    """Build leave-one-file-out cross-validation folds from the files in *directory*.

    Every file becomes one fold: it is the test set and all remaining files
    form the training set.

    :param directory: path containing one data file per fold
    :param test_split: unused; kept for interface compatibility
    :param nb_words: vocabulary cap forwarded to ``load_single_file``
    :return: tuple ``(folds, word_index_to_embeddings_map)`` where ``folds``
        maps file name -> {"test": (x, y, ids), "training": (x, y, ids)}
    """
    file_names = listdir(directory)

    # fold name -> {"training": [all other files], "test": this file}
    folds = {
        name: {"training": [other for other in file_names if other != name],
               "test": name}
        for name in file_names
    }

    word_to_indices_map, word_index_to_embeddings_map = vocabulary_embeddings_extractor.load_all()

    # Parse each file exactly once and assemble every fold from this cache.
    all_loaded_files = {
        name: load_single_file(directory, name, word_to_indices_map, nb_words)
        for name in folds
    }
    print("Loaded", len(all_loaded_files), "files")

    output_folds_with_train_test_data = dict()
    for name, fold_spec in folds.items():
        train_instances, train_labels, train_ids = [], [], []
        for train_name in fold_spec["training"]:
            instances, labels, ids = all_loaded_files[train_name]
            train_instances.extend(instances)
            train_labels.extend(labels)
            train_ids.extend(ids)

        output_folds_with_train_test_data[name] = {
            "test": all_loaded_files[name],
            "training": (train_instances, train_labels, train_ids),
        }

    # now we should have all data loaded
    return output_folds_with_train_test_data, word_index_to_embeddings_map
# load the embeddings docid_to_idx_map = np.argsort(docids).flatten() test_items_feat, uids = concat_feature_sets( (test_ids), [X], ling_feat_spmatrix, embeddings, docid_to_idx_map) return test_items_feat, uids if __name__ == '__main__': print( 'This script trains a model on the UKPConvArgStrict dataset. So, before running this script, you ' 'need to run "python/analysis/habernal_comparison/run_preprocessing.py" to extract the linguistic features' 'from this dataset.') word_to_indices_map, word_index_to_embeddings_map, index_to_word_map = vocabulary_embeddings_extractor.load_all( embeddings_dir + 'vocabulary.embeddings.all.pkl.bz2') embeddings = load_embeddings(word_index_to_embeddings_map) train_model(embeddings) # Load the model and the embeddings from file with open(pkl_file, 'rb') as fh: model = pickle.load(fh) # Now load some test documents for RANKING and extract their features input_dir = os.path.abspath(test_data_path) tmp_dir = os.path.abspath('./data/tempdata') output_dir = os.path.abspath('./data/new_ranking_libsvm') # use this directory to get a mapping from features to integers that matches the training set feature_dir = os.path.join(os.path.expanduser(training_data_path),
def load_my_data_separate_args(directory, test_split=0.2, nb_words=None, add_reversed_training_data=False,
                               embeddings_dir=''):
    """Build leave-one-file-out CV folds, keeping the two arguments of each pair separate.

    :param directory: path containing one CSV file per topic
    :param test_split: unused; kept for interface compatibility
    :param nb_words: vocabulary cap forwarded to ``load_single_file_separate_args``
    :param add_reversed_training_data: unused in this variant; kept for interface compatibility
    :param embeddings_dir: directory holding ``vocabulary.embeddings.all.pkl.bz2``
    :return: tuple ``(folds, word_index_to_embeddings_map, word_to_indices_map,
        index_to_word_map)``; each fold maps "test"/"training" to a 7-tuple
        ``(instances_a1, instances_a2, labels, ids, turker_ids, a1_texts, a2_texts)``
    """
    files = listdir(directory)

    # BUG FIX: the original removed entries from `files` while iterating the
    # same list, which silently skips the element that follows each removed
    # non-CSV file. Iterate over a snapshot instead.
    for file_name in list(files):
        if file_name.split('.')[-1] != 'csv':
            print("Skipping files without .csv suffix: %s" % directory + '/' + file_name)
            files.remove(file_name)

    # Leave-one-out folds: each file is the test set exactly once.
    folds = dict()
    for file_name in files:
        training_file_names = copy(files)
        # remove current file
        training_file_names.remove(file_name)
        folds[file_name] = {"training": training_file_names, "test": file_name}

    word_to_indices_map, word_index_to_embeddings_map, index_to_word_map = vocabulary_embeddings_extractor.load_all(
        embeddings_dir + 'vocabulary.embeddings.all.pkl.bz2')

    # Parse each CSV exactly once; every fold is assembled from this cache.
    all_loaded_files = dict()
    for file_name in folds.keys():
        all_loaded_files[file_name] = load_single_file_separate_args(directory, file_name, word_to_indices_map,
                                                                     nb_words)
    print("Loaded", len(all_loaded_files), "files")

    # results: map with fold_name (= file_name) and train/test tuples
    output_folds_with_train_test_data = dict()
    for file_name in folds.keys():
        current_fold = dict()
        output_folds_with_train_test_data[file_name] = current_fold

        # cached 7-tuple for the held-out file
        current_fold["test"] = all_loaded_files.get(file_name)

        # now collect all training instances from the remaining files
        all_tr_instances_a1 = []
        all_tr_instances_a2 = []
        all_tr_labels = []
        all_tr_ids = []
        all_tr_turker_ids = []
        all_tr_a1 = []
        all_tr_a2 = []
        for training_file_name in folds.get(file_name)["training"]:
            tr_instances_a1, tr_instances_a2, tr_labels, tr_ids, tr_turker_ids, tr_a1, tr_a2 = \
                all_loaded_files.get(training_file_name)
            all_tr_instances_a1.extend(tr_instances_a1)
            all_tr_instances_a2.extend(tr_instances_a2)
            all_tr_labels.extend(tr_labels)
            all_tr_ids.extend(tr_ids)
            all_tr_turker_ids.extend(tr_turker_ids)
            all_tr_a1.extend(tr_a1)
            all_tr_a2.extend(tr_a2)

        current_fold["training"] = all_tr_instances_a1, all_tr_instances_a2, all_tr_labels, all_tr_ids, \
            all_tr_turker_ids, all_tr_a1, all_tr_a2

    # now we should have all data loaded
    return output_folds_with_train_test_data, word_index_to_embeddings_map, word_to_indices_map, index_to_word_map
def load_my_data(directory, test_split=0.2, nb_words=None, add_reversed_training_data=False):
    """Build leave-one-file-out CV folds, optionally augmenting training data
    with the reversed argument pairs returned by ``load_single_file``.

    :param directory: path containing one data file per fold
    :param test_split: unused; kept for interface compatibility
    :param nb_words: vocabulary cap forwarded to ``load_single_file``
    :param add_reversed_training_data: if True, append the reversed instances
        and labels (sharing the forward pairs' ids) to each training set
    :return: tuple ``(folds, word_index_to_embeddings_map)``
    """
    file_names = listdir(directory)

    # Each file is the held-out test set of exactly one fold.
    folds = {
        name: {"training": [other for other in file_names if other != name],
               "test": name}
        for name in file_names
    }

    word_to_indices_map, word_index_to_embeddings_map = vocabulary_embeddings_extractor.load_all()

    # Parse each file once; the cache maps file name to a 5-tuple:
    # (instances, labels, ids, reversed_instances, reversed_labels).
    cache = {
        name: load_single_file(directory, name, word_to_indices_map, nb_words)
        for name in folds
    }
    print("Loaded", len(cache), "files")

    output_folds_with_train_test_data = dict()
    for name, fold_spec in folds.items():
        test_instances, test_labels, test_ids, _rev_x, _rev_y = cache[name]

        train_x, train_y, train_ids = [], [], []
        for train_name in fold_spec["training"]:
            instances, labels, ids, rev_x, rev_y = cache[train_name]
            train_x.extend(instances)
            train_y.extend(labels)
            train_ids.extend(ids)
            if add_reversed_training_data:
                # reversed pairs reuse the ids of their forward originals
                train_x.extend(rev_x)
                train_y.extend(rev_y)
                train_ids.extend(ids)

        output_folds_with_train_test_data[name] = {
            "test": (test_instances, test_labels, test_ids),
            "training": (train_x, train_y, train_ids),
        }

    # now we should have all data loaded
    return output_folds_with_train_test_data, word_index_to_embeddings_map