import logging

# Dataset utilities come from the multimodal keras_wrapper package used by
# nmt-keras. keep_n_captions is assumed to be defined alongside this function
# (as in nmt-keras' data_engine/prepare_data.py).
from keras_wrapper.dataset import Dataset, loadDataset, saveDataset


def build_dataset(params):
    """
    Builds (or loads) a Dataset instance.

    :param params: Parameters specifying the Dataset options
    :return: Dataset object
    """
    if params['REBUILD_DATASET']:  # Build a new dataset instance
        if params['VERBOSE'] > 0:
            silence = False
            logging.info('Building ' + params['DATASET_NAME'] + '_' +
                         params['SRC_LAN'] + params['TRG_LAN'] + ' dataset')
        else:
            silence = True

        base_path = params['DATA_ROOT_PATH']
        name = params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN']
        ds = Dataset(name, base_path, silence=silence)

        # OUTPUT DATA
        # Load the train, val and test splits of the target-language sentences
        # (outputs). The files contain one sentence per line.
        ds.setOutput(base_path + '/' + params['TEXT_FILES']['train'] + params['TRG_LAN'],
                     'train',
                     type='text',
                     id=params['OUTPUTS_IDS_DATASET'][0],
                     tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                     build_vocabulary=True,
                     pad_on_batch=params.get('PAD_ON_BATCH', True),
                     sample_weights=params.get('SAMPLE_WEIGHTS', True),
                     fill=params.get('FILL', 'end'),
                     max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
                     max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                     min_occ=params.get('MIN_OCCURRENCES_OUTPUT_VOCAB', 0))
        if params.get('ALIGN_FROM_RAW', True) and not params.get('HOMOGENEOUS_BATCHES', False):
            ds.setRawOutput(base_path + '/' + params['TEXT_FILES']['train'] + params['TRG_LAN'],
                            'train',
                            type='file-name',
                            id='raw_' + params['OUTPUTS_IDS_DATASET'][0])

        for split in ['val', 'test']:
            if params['TEXT_FILES'].get(split) is not None:
                ds.setOutput(base_path + '/' + params['TEXT_FILES'][split] + params['TRG_LAN'],
                             split,
                             type='text',
                             id=params['OUTPUTS_IDS_DATASET'][0],
                             pad_on_batch=params.get('PAD_ON_BATCH', True),
                             tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                             sample_weights=params.get('SAMPLE_WEIGHTS', True),
                             max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
                             max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0))
                if params.get('ALIGN_FROM_RAW', True) and not params.get('HOMOGENEOUS_BATCHES', False):
                    ds.setRawOutput(base_path + '/' + params['TEXT_FILES'][split] + params['TRG_LAN'],
                                    split,
                                    type='file-name',
                                    id='raw_' + params['OUTPUTS_IDS_DATASET'][0])

        # INPUT DATA
        # The 'train' split must be processed first, since it builds the vocabulary.
        for split in ['train', 'val', 'test']:
            if params['TEXT_FILES'].get(split) is not None:
                build_vocabulary = split == 'train'
                ds.setInput(base_path + '/' + params['TEXT_FILES'][split] + params['SRC_LAN'],
                            split,
                            type='text',
                            id=params['INPUTS_IDS_DATASET'][0],
                            pad_on_batch=params.get('PAD_ON_BATCH', True),
                            tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                            build_vocabulary=build_vocabulary,
                            fill=params.get('FILL', 'end'),
                            max_text_len=params.get('MAX_INPUT_TEXT_LEN', 70),
                            max_words=params.get('INPUT_VOCABULARY_SIZE', 0),
                            min_occ=params.get('MIN_OCCURRENCES_INPUT_VOCAB', 0))

                if len(params['INPUTS_IDS_DATASET']) > 1:
                    if 'train' in split:
                        # Second input: the target text offset by one timestep
                        # (offset=1), sharing the target-output vocabulary.
                        ds.setInput(base_path + '/' + params['TEXT_FILES'][split] + params['TRG_LAN'],
                                    split,
                                    type='text',
                                    id=params['INPUTS_IDS_DATASET'][1],
                                    required=False,
                                    tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                                    pad_on_batch=params.get('PAD_ON_BATCH', True),
                                    build_vocabulary=params['OUTPUTS_IDS_DATASET'][0],
                                    offset=1,
                                    fill=params.get('FILL', 'end'),
                                    max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
                                    max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0))
                    else:
                        # Placeholder ('ghost') input for splits where the
                        # offset target is not available.
                        ds.setInput(None,
                                    split,
                                    type='ghost',
                                    id=params['INPUTS_IDS_DATASET'][-1],
                                    required=False)

                if params.get('ALIGN_FROM_RAW', True) and not params.get('HOMOGENEOUS_BATCHES', False):
                    ds.setRawInput(base_path + '/' + params['TEXT_FILES'][split] + params['SRC_LAN'],
                                   split,
                                   type='file-name',
                                   id='raw_' + params['INPUTS_IDS_DATASET'][0])

        if params.get('POS_UNK', False):
            if params.get('HEURISTIC', 0) > 0:
                ds.loadMapping(params['MAPPING'])

        # If there are multiple references per sentence, keep only one (n=1)
        # for the evaluation sets.
        keep_n_captions(ds, repeat=1, n=1, set_names=params['EVAL_ON_SETS'])

        # The dataset is fully loaded; store it for reuse in future runs.
        saveDataset(ds, params['DATASET_STORE_PATH'])
    else:
        # Recover a previously stored dataset with a single call.
        ds = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] +
                         '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')

    return ds
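
# A minimal usage sketch for the NMT build_dataset above. The parameter keys
# are exactly the ones the function reads; the corpus name, directory layout
# and file prefixes below are illustrative assumptions, not a verified config.
# TEXT_FILES values are prefixes to which the language extension is appended,
# e.g. 'training.' + 'en' -> 'training.en', with one sentence per line.
def _example_build_nmt_dataset():
    params = {
        'REBUILD_DATASET': True,
        'VERBOSE': 1,
        'DATASET_NAME': 'EuTrans',         # hypothetical corpus name
        'SRC_LAN': 'es',
        'TRG_LAN': 'en',
        'DATA_ROOT_PATH': 'data',          # hypothetical data directory
        'TEXT_FILES': {'train': 'training.', 'val': 'dev.', 'test': 'test.'},
        'INPUTS_IDS_DATASET': ['source_text', 'state_below'],
        'OUTPUTS_IDS_DATASET': ['target_text'],
        'EVAL_ON_SETS': ['val'],
        'DATASET_STORE_PATH': 'datasets',  # hypothetical output directory
    }
    return build_dataset(params)
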
def build_dataset(params, vocabulary=None, vocabulary_len=None):
    """
    Builds (or loads) a Dataset instance for the quality-estimation (QE) model
    variants: the 'Predictor' NMT model and the word/phrase/sentence/document
    level Estimator/Encoder models.

    :param params: Parameters specifying the Dataset options
    :param vocabulary: Optional pre-built vocabulary, forwarded to the Dataset
    :param vocabulary_len: Optional pre-built vocabulary sizes, forwarded to the Dataset
    :return: Dataset object
    """
    # Avoid mutable default arguments: create fresh dicts when none are given.
    if vocabulary is None:
        vocabulary = dict()
    if vocabulary_len is None:
        vocabulary_len = dict()

    if params['REBUILD_DATASET']:  # Build a new dataset instance
        if params['VERBOSE'] > 0:
            silence = False
            logging.info('Building ' + params['DATASET_NAME'] + '_' +
                         params['SRC_LAN'] + params['TRG_LAN'] + ' dataset')
        else:
            silence = True

        base_path = params['DATA_ROOT_PATH']
        name = params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN']
        doc_size = 0
        if 'SECOND_DIM_SIZE' in params:
            doc_size = params['SECOND_DIM_SIZE']
        ds = Dataset(name, base_path, silence=silence, vocabulary=vocabulary,
                     vocabulary_len=vocabulary_len, doc_size=doc_size)

        # OUTPUT DATA
        # Load the train, val and test splits of the target-language sentences
        # (outputs). The files contain one sentence per line.
        if params['MODEL_TYPE'] == 'Predictor':
            if 'PRED_VOCAB' in params:
                # Reuse the vocabulary stored under the output id; set
                # build_vocabulary=True instead to build a new one.
                ds.setOutput(base_path + '/' + params['TEXT_FILES']['train'] + params['TRG_LAN'],
                             'train',
                             type='text',
                             id=params['OUTPUTS_IDS_DATASET'][0],
                             tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                             build_vocabulary=params['OUTPUTS_IDS_DATASET'][0],
                             pad_on_batch=params.get('PAD_ON_BATCH', True),
                             sample_weights=params.get('SAMPLE_WEIGHTS', True),
                             fill=params.get('FILL', 'end'),
                             max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
                             max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                             min_occ=params.get('MIN_OCCURRENCES_OUTPUT_VOCAB', 0),
                             bpe_codes=params.get('BPE_CODES_PATH', None))
            else:
                ds.setOutput(base_path + '/' + params['TEXT_FILES']['train'] + params['TRG_LAN'],
                             'train',
                             type='text',
                             id=params['OUTPUTS_IDS_DATASET'][0],
                             tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                             build_vocabulary=True,
                             pad_on_batch=params.get('PAD_ON_BATCH', True),
                             sample_weights=params.get('SAMPLE_WEIGHTS', True),
                             fill=params.get('FILL', 'end'),
                             max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
                             max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                             min_occ=params.get('MIN_OCCURRENCES_OUTPUT_VOCAB', 0),
                             bpe_codes=params.get('BPE_CODES_PATH', None))
        elif params['MODEL_TYPE'] in ('EstimatorSent', 'EncSent') or \
                'EstimatorDoc' in params['MODEL_TYPE'] or 'EncDoc' in params['MODEL_TYPE']:
            # Sentence- and document-level QE: the output is a real-valued score.
            ds.setOutput(base_path + '/' + params['TEXT_FILES']['train'] + params['PRED_SCORE'],
                         'train',
                         type='real',
                         id=params['OUTPUTS_IDS_DATASET'][0],
                         tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                         build_vocabulary=False,
                         pad_on_batch=params.get('PAD_ON_BATCH', False),
                         sample_weights=params.get('SAMPLE_WEIGHTS', False),
                         fill=params.get('FILL', 'end'),
                         max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
                         max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                         min_occ=params.get('MIN_OCCURRENCES_OUTPUT_VOCAB', 0),
                         bpe_codes=params.get('BPE_CODES_PATH', None))
        elif params['MODEL_TYPE'] in ('EstimatorWord', 'EncWord', 'EncWordAtt',
                                      'EncPhraseAtt', 'EstimatorPhrase'):
            # Word- and phrase-level QE: the output is a sequence of tags.
            ds.setOutput(base_path + '/' + params['TEXT_FILES']['train'] + params['PRED_SCORE'],
                         'train',
                         type='text',
                         id=params['OUTPUTS_IDS_DATASET'][0],
                         tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                         build_vocabulary=True,
                         pad_on_batch=params.get('PAD_ON_BATCH', True),
                         sample_weights=params.get('SAMPLE_WEIGHTS', False),
                         fill=params.get('FILL', 'end'),
                         max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
                         max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                         min_occ=params.get('MIN_OCCURRENCES_OUTPUT_VOCAB', 0),
                         bpe_codes=params.get('BPE_CODES_PATH', None))

        if params.get('ALIGN_FROM_RAW', True) and not params.get('HOMOGENEOUS_BATCHES', False):
            ds.setRawOutput(base_path + '/' + params['TEXT_FILES']['train'] + params['TRG_LAN'],
                            'train',
                            type='file-name',
                            id='raw_' + params['OUTPUTS_IDS_DATASET'][0])

        val_test_list = params.get('EVAL_ON_SETS', ['val'])
        no_ref = params.get('NO_REF', False)
        if no_ref:
            val_test_list = []
        for split in val_test_list:
            if params['TEXT_FILES'].get(split) is not None:
                if params['MODEL_TYPE'] == 'Predictor':
                    ds.setOutput(base_path + '/' + params['TEXT_FILES'][split] + params['TRG_LAN'],
                                 split,
                                 type='text',
                                 id=params['OUTPUTS_IDS_DATASET'][0],
                                 pad_on_batch=params.get('PAD_ON_BATCH', True),
                                 tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                                 sample_weights=params.get('SAMPLE_WEIGHTS', True),
                                 max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
                                 max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                                 bpe_codes=params.get('BPE_CODES_PATH', None))
                elif params['MODEL_TYPE'] in ('EstimatorSent', 'EncSent') or \
                        'EstimatorDoc' in params['MODEL_TYPE'] or 'EncDoc' in params['MODEL_TYPE']:
                    ds.setOutput(base_path + '/' + params['TEXT_FILES'][split] + params['PRED_SCORE'],
                                 split,
                                 type='real',
                                 id=params['OUTPUTS_IDS_DATASET'][0],
                                 pad_on_batch=params.get('PAD_ON_BATCH', True),
                                 tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                                 sample_weights=params.get('SAMPLE_WEIGHTS', False),
                                 max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
                                 max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                                 bpe_codes=params.get('BPE_CODES_PATH', None))
                elif params['MODEL_TYPE'] in ('EstimatorWord', 'EncWord', 'EncWordAtt',
                                              'EncPhraseAtt', 'EstimatorPhrase'):
                    ds.setOutput(base_path + '/' + params['TEXT_FILES'][split] + params['PRED_SCORE'],
                                 split,
                                 type='text',
                                 id=params['OUTPUTS_IDS_DATASET'][0],
                                 pad_on_batch=params.get('PAD_ON_BATCH', True),
                                 tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                                 sample_weights=params.get('SAMPLE_WEIGHTS', False),
                                 max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
                                 max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                                 bpe_codes=params.get('BPE_CODES_PATH', None))
                if params.get('ALIGN_FROM_RAW', True) and not params.get('HOMOGENEOUS_BATCHES', False):
                    ds.setRawOutput(base_path + '/' + params['TEXT_FILES'][split] + params['TRG_LAN'],
                                    split,
                                    type='file-name',
                                    id='raw_' + params['OUTPUTS_IDS_DATASET'][0])

        # INPUT DATA
        # The 'train' split must be processed first, since it builds the vocabulary.
        max_src_in_len = params.get('MAX_SRC_INPUT_TEXT_LEN', None)
        if max_src_in_len is None:
            params['MAX_SRC_INPUT_TEXT_LEN'] = params['MAX_INPUT_TEXT_LEN']

        max_trg_in_len = params.get('MAX_TRG_INPUT_TEXT_LEN', None)
        if max_trg_in_len is None:
            params['MAX_TRG_INPUT_TEXT_LEN'] = params['MAX_INPUT_TEXT_LEN']

        data_type_src = 'text'
        data_type_trg = 'text'

        # Document-level models use the 3d ('doc') input type on both sides.
        if 'EstimatorDoc' in params['MODEL_TYPE'] or 'EncDoc' in params['MODEL_TYPE']:
            data_type_src = 'doc'
            data_type_trg = 'doc'

        # Phrase-level models use the 3d ('doc') input type on the target side only.
        if params['MODEL_TYPE'] in ('EstimatorPhrase', 'EncPhraseAtt'):
            data_type_trg = 'doc'

        ext = params['TRG_LAN']
        target_dict = 'target_text'

        # if params['MODEL_TYPE'] != 'Predictor':
        #     ext = 'mt'

        for split in ['train', 'val', 'test']:
            if params['TEXT_FILES'].get(split) is not None:
                build_vocabulary = split == 'train'
                if 'PRED_VOCAB' in params:
                    ds.setInput(base_path + '/' + params['TEXT_FILES'][split] + params['SRC_LAN'],
                                split,
                                type=data_type_src,
                                id=params['INPUTS_IDS_DATASET'][0],
                                pad_on_batch=params.get('PAD_ON_BATCH', True),
                                tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                                build_vocabulary=params['INPUTS_IDS_DATASET'][0],
                                fill=params.get('FILL', 'end'),
                                max_text_len=params.get('MAX_SRC_INPUT_TEXT_LEN', 70),
                                max_words=params.get('INPUT_VOCABULARY_SIZE', 0),
                                min_occ=params.get('MIN_OCCURRENCES_INPUT_VOCAB', 0),
                                bpe_codes=params.get('BPE_CODES_PATH', None))
                else:
                    ds.setInput(base_path + '/' + params['TEXT_FILES'][split] + params['SRC_LAN'],
                                split,
                                type=data_type_src,
                                id=params['INPUTS_IDS_DATASET'][0],
                                pad_on_batch=params.get('PAD_ON_BATCH', True),
                                tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                                build_vocabulary=build_vocabulary,
                                fill=params.get('FILL', 'end'),
                                max_text_len=params.get('MAX_SRC_INPUT_TEXT_LEN', 70),
                                max_words=params.get('INPUT_VOCABULARY_SIZE', 0),
                                min_occ=params.get('MIN_OCCURRENCES_INPUT_VOCAB', 0),
                                bpe_codes=params.get('BPE_CODES_PATH', None))

                if len(params['INPUTS_IDS_DATASET']) == 2:
                    if 'PRED_VOCAB' not in params and 'train' in split:
                        ds.setInput(base_path + '/' + params['TEXT_FILES'][split] + ext,
                                    split,
                                    type=data_type_trg,
                                    id=params['INPUTS_IDS_DATASET'][1],
                                    required=False,
                                    tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                                    pad_on_batch=params.get('PAD_ON_BATCH', True),
                                    build_vocabulary=build_vocabulary,
                                    offset=0,
                                    fill=params.get('FILL', 'end'),
                                    max_text_len=params.get('MAX_TRG_INPUT_TEXT_LEN', 3),
                                    max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                                    bpe_codes=params.get('BPE_CODES_PATH', None))
                    else:
                        # ds.setInput(None,
                        #             split,
                        #             type='ghost',
                        #             id=params['INPUTS_IDS_DATASET'][-1],
                        #             required=False)
                        ds.setInput(base_path + '/' + params['TEXT_FILES'][split] + ext,
                                    split,
                                    type=data_type_trg,
                                    id=params['INPUTS_IDS_DATASET'][1],
                                    required=False,
                                    tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                                    pad_on_batch=params.get('PAD_ON_BATCH', True),
                                    build_vocabulary=target_dict,
                                    offset=0,
                                    fill=params.get('FILL', 'end'),
                                    max_text_len=params.get('MAX_TRG_INPUT_TEXT_LEN', 3),
                                    max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                                    bpe_codes=params.get('BPE_CODES_PATH', None))

                if len(params['INPUTS_IDS_DATASET']) > 2:
                    if 'PRED_VOCAB' not in params and 'train' in split:
                        ds.setInput(base_path + '/' + params['TEXT_FILES'][split] + ext,
                                    split,
                                    type=data_type_trg,
                                    id=params['INPUTS_IDS_DATASET'][1],
                                    required=False,
                                    tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                                    pad_on_batch=params.get('PAD_ON_BATCH', True),
                                    build_vocabulary=build_vocabulary,
                                    offset=1,
                                    fill=params.get('FILL', 'end'),
                                    max_text_len=params.get('MAX_TRG_INPUT_TEXT_LEN', 3),
                                    max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                                    bpe_codes=params.get('BPE_CODES_PATH', None))
                        ds.setInput(base_path + '/' + params['TEXT_FILES'][split] + ext,
                                    split,
                                    type=data_type_trg,
                                    id=params['INPUTS_IDS_DATASET'][2],
                                    required=False,
                                    tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                                    pad_on_batch=params.get('PAD_ON_BATCH', True),
                                    build_vocabulary=target_dict,
                                    offset=-1,
                                    fill=params.get('FILL', 'end'),
                                    max_text_len=params.get('MAX_TRG_INPUT_TEXT_LEN', 3),
                                    max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                                    bpe_codes=params.get('BPE_CODES_PATH', None))
                        ds.setInput(base_path + '/' + params['TEXT_FILES'][split] + ext,
                                    split,
                                    type=data_type_trg,
                                    id=params['INPUTS_IDS_DATASET'][3],
                                    required=False,
                                    tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                                    pad_on_batch=params.get('PAD_ON_BATCH', True),
                                    build_vocabulary=target_dict,
                                    offset=0,
                                    fill=params.get('FILL', 'end'),
                                    max_text_len=params.get('MAX_TRG_INPUT_TEXT_LEN', 3),
                                    max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                                    bpe_codes=params.get('BPE_CODES_PATH', None))
                    else:
                        # ds.setInput(None,
                        #             split,
                        #             type='ghost',
                        #             id=params['INPUTS_IDS_DATASET'][-1],
                        #             required=False)
                        ds.setInput(base_path + '/' + params['TEXT_FILES'][split] + ext,
                                    split,
                                    type=data_type_trg,
                                    id=params['INPUTS_IDS_DATASET'][1],
                                    required=False,
                                    tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                                    pad_on_batch=params.get('PAD_ON_BATCH', True),
                                    build_vocabulary=target_dict,
                                    offset=1,
                                    fill=params.get('FILL', 'end'),
                                    max_text_len=params.get('MAX_TRG_INPUT_TEXT_LEN', 3),
                                    max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                                    bpe_codes=params.get('BPE_CODES_PATH', None))
                        ds.setInput(base_path + '/' + params['TEXT_FILES'][split] + ext,
                                    split,
                                    type=data_type_trg,
                                    id=params['INPUTS_IDS_DATASET'][2],
                                    required=False,
                                    tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                                    pad_on_batch=params.get('PAD_ON_BATCH', True),
                                    build_vocabulary=target_dict,
                                    offset=-1,
                                    fill=params.get('FILL', 'end'),
                                    max_text_len=params.get('MAX_TRG_INPUT_TEXT_LEN', 3),
                                    max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                                    bpe_codes=params.get('BPE_CODES_PATH', None))
                        ds.setInput(base_path + '/' + params['TEXT_FILES'][split] + ext,
                                    split,
                                    type=data_type_trg,
                                    id=params['INPUTS_IDS_DATASET'][3],
                                    required=False,
                                    tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
                                    pad_on_batch=params.get('PAD_ON_BATCH', True),
                                    build_vocabulary=target_dict,
                                    offset=0,
                                    fill=params.get('FILL', 'end'),
                                    max_text_len=params.get('MAX_TRG_INPUT_TEXT_LEN', 3),
                                    max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                                    bpe_codes=params.get('BPE_CODES_PATH', None))

                if params.get('ALIGN_FROM_RAW', True) and not params.get('HOMOGENEOUS_BATCHES', False):
                    ds.setRawInput(base_path + '/' + params['TEXT_FILES'][split] + params['SRC_LAN'],
                                   split,
                                   type='file-name',
                                   id='raw_' + params['INPUTS_IDS_DATASET'][0])

        if params.get('POS_UNK', False):
            if params.get('HEURISTIC', 0) > 0:
                ds.loadMapping(params['MAPPING'])

        # If there are multiple references per sentence, keep only one (n=1)
        # for the evaluation sets.
        if not params.get('NO_REF', False):
            keep_n_captions(ds, repeat=1, n=1, set_names=params['EVAL_ON_SETS'])

        # The dataset is fully loaded; store it for reuse in future runs.
        saveDataset(ds, params['DATASET_STORE_PATH'])
    else:
        # Recover a previously stored dataset with a single call.
        ds = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] +
                         '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')

    return ds
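
# A minimal usage sketch for the QE build_dataset above, configured for
# sentence-level estimation ('EstimatorSent'). All names, extensions and paths
# are illustrative assumptions, not a verified config: 'PRED_SCORE' is the
# extension of the score files (one real value per line, e.g. train.hter), and
# the target side reads the MT-output files named by the 'TRG_LAN' extension.
def _example_build_qe_dataset():
    params = {
        'REBUILD_DATASET': True,
        'VERBOSE': 1,
        'MODEL_TYPE': 'EstimatorSent',
        'DATASET_NAME': 'qe_corpus',       # hypothetical corpus name
        'SRC_LAN': 'src',                  # hypothetical source-file extension
        'TRG_LAN': 'mt',                   # hypothetical MT-output extension
        'PRED_SCORE': 'hter',              # hypothetical score-file extension
        'DATA_ROOT_PATH': 'data',          # hypothetical data directory
        'TEXT_FILES': {'train': 'train.', 'val': 'dev.'},
        'INPUTS_IDS_DATASET': ['source_text', 'target_text'],
        'OUTPUTS_IDS_DATASET': ['sent_qe'],
        'EVAL_ON_SETS': ['val'],
        'MAX_INPUT_TEXT_LEN': 70,          # fallback for MAX_SRC/TRG_INPUT_TEXT_LEN
        'DATASET_STORE_PATH': 'datasets',  # hypothetical output directory
    }
    return build_dataset(params)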