def load_datasets_tira_evaluation(test_dataset_main_directory, preset_key):
    """Load the PAN dataset for **TIRA** evaluation.

    Loads the PAN training dataset+truth and the test dataset by calling the
    *process_data_files* module twice, then returns them along with the
    author IDs of the test dataset.

    Args:
        test_dataset_main_directory: Root directory of the test dataset
            (supplied by the TIRA evaluation environment).
        preset_key: Key into *PRESETS_DICTIONARY* selecting the language preset.

    Returns:
        Tuple of (docs_train, docs_test, y_train, author_ids_test).

    Raises:
        KeyError: If *preset_key* is not a known preset.
    """
    # Define the dictionary of presets. Each “preset” is a dictionary of some values.
    PRESETS_DICTIONARY = {
        'PAN18_English': {
            'dataset_name': 'PAN 2018 English',
            'xmls_subdirectory': '/en/',
            'truth_subpath': '/en-truth/truth.txt',
        }
    }
    PRESET = PRESETS_DICTIONARY[preset_key]

    # Define the constant and the paths
    TRAINING_DATASET_MAIN_DIRECTORY = "U:/TA"
    # # TEMP (TIRA): For local testing on TIRA
    # TRAINING_DATASET_MAIN_DIRECTORY = "E:/author-profiling/pan18-author-profiling-training-dataset-2018-02-27"

    # BUGFIX: The preset subpaths start with '/', and os.path.join() discards
    # every component before an absolute one (e.g. on Windows,
    # join("U:/TA", "/en/") -> "U:/en/", silently dropping "TA").
    # Strip the leading separator so the paths are appended as intended.
    xmls_directory_train = os.path.join(
        TRAINING_DATASET_MAIN_DIRECTORY, PRESET['xmls_subdirectory'].lstrip('/'))
    truth_path_train = os.path.join(
        TRAINING_DATASET_MAIN_DIRECTORY, PRESET['truth_subpath'].lstrip('/'))
    xmls_directory_test = os.path.join(
        test_dataset_main_directory, PRESET['xmls_subdirectory'].lstrip('/'))
    # ↳ Note: truth_path_test will not be provided to the participants.

    # Load the PAN 2018 training dataset and truth from the files into lists
    # BUGFIX: print() does not %-interpolate extra arguments the way logging
    # does; format the message explicitly.
    print("Loading the %s training dataset and truth..." % PRESET['dataset_name'])
    docs_train, y_train, author_ids_train, original_tweet_lengths_train = \
        process_data_files.load_pan_data(xmls_directory_train, truth_path_train, False, None)

    # Load the PAN 2018 test dataset from the files into lists
    print("Loading the %s test dataset..." % PRESET['dataset_name'])
    docs_test, y_test, author_ids_test, original_tweet_lengths_test = \
        process_data_files.load_pan_data(xmls_directory_test, None, False, None)
    # ↳ Note: truth_path_test will not be provided to the participants. As a result, *truths_test* will be empty.

    return docs_train, docs_test, y_train, author_ids_test
def load_datasets_development(preset_key):
    """Load the PAN dataset for the development phase.

    Loads the PAN training dataset and truth by calling the
    *process_data_files* module, then splits the dataset into stratified
    training and test sets and sorts each set by author ID.

    Args:
        preset_key: Key into *PRESETS_DICTIONARY* selecting the language preset.

    Returns:
        Tuple of (docs_train, docs_test, y_train, y_test).

    Raises:
        KeyError: If *preset_key* is not a known preset.
    """
    directory = 'U:/TA'
    # Define the dictionary of presets. Each “preset” is a dictionary of some values.
    PRESETS_DICTIONARY = {
        'PAN18_English': {
            'dataset_name': 'PAN 2018 English',
            'xmls_directory': directory + '/en/',
            'truth_path': directory + '/truth/truth.txt',
            'txts_destination_directory': directory,
        }
    }
    PRESET = PRESETS_DICTIONARY[preset_key]

    # Load the PAN 2018 training dataset and the truth from the files into lists
    # BUGFIX: print() does not %-interpolate extra arguments the way logging
    # does; format the message explicitly.
    print("Loading the %s training dataset and the truth..." % PRESET['dataset_name'])
    merged_tweets_of_authors, truths, author_ids, original_tweet_lengths = \
        process_data_files.load_pan_data(PRESET['xmls_directory'], PRESET['truth_path'],
                                         False, PRESET['txts_destination_directory'])

    # Split the dataset into balanced (stratified) training and test sets:
    docs_train, docs_test, y_train, y_test, author_ids_train, author_ids_test,\
        original_tweet_lengths_train, original_tweet_lengths_test =\
        train_test_split(merged_tweets_of_authors, truths, author_ids, original_tweet_lengths,
                         test_size=0.4, random_state=42, stratify=truths)
    # ↳ *stratify=truths* selects a balanced sample from the data, with the same class proportion as the *truths* list.

    # • Sort all lists in the ascending order of *author_ids* (separately, for the training and test set)
    # This is only done for the sakes of consistency between the *load_datasets_development()* and
    # *load_datasets_tira_evaluation()* functions, because the output of the latter is sorted by *author_ids*, while the
    # former is shuffled by the *train_test_split()* function.

    # Sort the training set. (The comprehension variable was renamed from
    # *tuple*, which shadowed the builtin of the same name.)
    author_ids_train, docs_train, y_train, original_tweet_lengths_train = [
        list(group) for group in zip(*sorted(
            zip(author_ids_train, docs_train, y_train, original_tweet_lengths_train)))
    ]
    # Sort the test set
    author_ids_test, docs_test, y_test, original_tweet_lengths_test = [
        list(group) for group in zip(*sorted(
            zip(author_ids_test, docs_test, y_test, original_tweet_lengths_test)))
    ]

    # # TEMP: Used for producing a mimic of the **TIRA** environment
    # ProcessDataFiles.split_train_and_test_files(author_ids_train, author_ids_test, y_train, y_test, preset_key)

    return docs_train, docs_test, y_train, y_test