def prepare_character_based_experiment():
    """Produce the character-based dataset variants.

    Writes four processed splits under ``data_character_based/``:
    train, single-reference dev, multi-reference dev, and multi-reference
    test. Relies on module-level configuration (``dataset_directory``,
    ``processed_datasets_directory``, ``dataset_type``, ``only_source``,
    ``multi_reference``, ``entities_to_delexicalize``, ``log_level``).
    """
    processed_dataset_subdirectory = processed_datasets_directory + "data_character_based/"
    # The whole dataset is lowercased except for some probably erroneous
    # examples in the dev set; lowercase it also for character based to be
    # consistent.
    lowercase = True
    tokenize = False
    delexicalize = False
    delexicalize_after_grouping = False
    group_inputs = False

    def _process(input_file, output_name, collect_multi_references):
        # Build the argument bundle and run the shared processing step for
        # one raw-file -> processed-split pair.
        args = arguments.DatsetArguments(
            dataset_directory + input_file, dataset_type,
            processed_dataset_subdirectory + output_name,
            only_source, multi_reference, lowercase, tokenize,
            delexicalize, delexicalize_after_grouping, group_inputs,
            collect_multi_references, entities_to_delexicalize, log_level)
        NLGevaluation.data.read_datasets.process_dataset(args)

    # training data
    _process("train.json", "train", collect_multi_references=False)
    # development data - single reference
    _process("valid.json", "dev", collect_multi_references=False)
    # development data multi-ref
    _process("valid.json", "dev_multi_ref", collect_multi_references=True)
    # test data multi-ref
    _process("test.json", "test_multi_ref", collect_multi_references=True)
def preprocess_data(input_file, output_name, output_folder, delexicalize,
                    delexicalize_after_grouping, group_inputs, tokenize,
                    record_delimiter, separate_inputs_for_number_of_records):
    """Process one raw dataset file into a named output under *output_folder*.

    Paths are assembled from the module-level ``dataset_directory`` and
    ``processed_datasets_directory``; the remaining processing flags
    (``lowercase``, ``collect_multi_references``, ...) are taken from
    module-level configuration as well.
    """
    source_path = dataset_directory + input_file
    target_path = processed_datasets_directory + output_folder + output_name
    # Bundle all options into the project's argument object and delegate
    # to the shared dataset-processing entry point.
    dataset_arguments = arguments.DatsetArguments(
        source_path, dataset_type, target_path,
        only_source, multi_reference, lowercase, tokenize,
        delexicalize, delexicalize_after_grouping, group_inputs,
        collect_multi_references, entities_to_delexicalize, log_level,
        record_delimiter=record_delimiter,
        separate_inputs_for_number_of_records=separate_inputs_for_number_of_records)
    NLGevaluation.data.read_datasets.process_dataset(dataset_arguments)
def prepare_word_based_experiment():
    """Produce the word-based dataset variants.

    Writes three processed splits under ``data_word_based/``: train,
    multi-reference dev, and multi-reference test. Uses the module-level
    ``lowercase`` and ``tokenize`` settings (unlike the character-based
    experiment, which overrides them locally), plus the other module-level
    configuration values.
    """
    processed_dataset_subdirectory = processed_datasets_directory + "data_word_based/"
    delexicalize = False
    delexicalize_after_grouping = False
    group_inputs = False

    def _process(input_file, output_name, collect_multi_references):
        # Build the argument bundle and run the shared processing step for
        # one raw-file -> processed-split pair.
        args = arguments.DatsetArguments(
            dataset_directory + input_file, dataset_type,
            processed_dataset_subdirectory + output_name,
            only_source, multi_reference, lowercase, tokenize,
            delexicalize, delexicalize_after_grouping, group_inputs,
            collect_multi_references, entities_to_delexicalize, log_level)
        NLGevaluation.data.read_datasets.process_dataset(args)

    # training data
    _process("train.json", "train", collect_multi_references=False)
    # development data multi-ref
    _process("valid.json", "dev_multi_ref", collect_multi_references=True)
    # test data multi-ref
    _process("test.json", "test_multi_ref", collect_multi_references=True)
def prepare_word_based_delex_experiment():
    """Produce the delexicalized word-based dataset variants.

    Writes three single-reference splits (train, dev, test) under
    ``data_word_based_delex/`` with delexicalization enabled. Uses the
    module-level ``lowercase``/``tokenize`` settings and the other
    module-level configuration values.
    """
    processed_dataset_subdirectory = processed_datasets_directory + "data_word_based_delex/"
    delexicalize = True
    delexicalize_after_grouping = False
    group_inputs = False
    collect_multi_references = False
    # TODO add additional normalization from
    # https://github.com/shawnwun/RNNLG/utils/nlp.py/normalize?

    def _process(input_file, output_name):
        # Build the argument bundle and run the shared processing step for
        # one raw-file -> processed-split pair.
        args = arguments.DatsetArguments(
            dataset_directory + input_file, dataset_type,
            processed_dataset_subdirectory + output_name,
            only_source, multi_reference, lowercase, tokenize,
            delexicalize, delexicalize_after_grouping, group_inputs,
            collect_multi_references, entities_to_delexicalize, log_level)
        NLGevaluation.data.read_datasets.process_dataset(args)

    # training data
    _process("train.json", "train")
    # development data
    _process("valid.json", "dev")
    # test data
    _process("test.json", "test")