def build_glossary(glossary_text_file, dest_filename, separator='\t'):
    """
    Preprocess a glossary file with the format ``word <separator> desired_replacement``
    and store it as a pickled dict (via dict2pkl).

    If a word appears on several lines, the last occurrence wins. A line
    without the separator maps the whole line to an empty replacement
    (``split(...)[1:]`` is empty).

    :param glossary_text_file: Path to the glossary file.
    :param dest_filename: Output filename (forwarded to dict2pkl).
    :param separator: Separator between words and replacements.
    """
    print("Reading glossary from %s" % glossary_text_file)
    glossary = dict()
    # Use a context manager so the file handle is closed deterministically;
    # the original `open(...).read()` never closed the file.
    with open(glossary_text_file) as glossary_file:
        for glossary_line in glossary_file.read().splitlines():
            split_line = glossary_line.split(separator)
            # Everything after the first field is the replacement; a
            # replacement may itself span several separator-delimited fields.
            glossary[split_line[0]] = ' '.join(split_line[1:])
    print("Done. Saving glossary into %s" % dest_filename)
    dict2pkl(glossary, dest_filename)
def train_model(params):
    """
    Training function. Sets the training parameters from params. Builds or loads
    the model and launches the training.

    If ``params['RELOAD'] > 0``, a previously stored model is loaded from
    ``params['STORE_PATH']`` and training resumes from that point; otherwise a
    new model is built and its configuration is pickled next to it.

    :param params: Dictionary of network hyperparameters.
    :return: None
    """
    if params['RELOAD'] > 0:
        # Consistency fix: use the module-level `logger` throughout (the
        # original mixed root `logging` calls with `logger.debug`).
        logger.info('Resuming training.')
    check_params(params)

    # Load data
    dataset = build_dataset(params)
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    # Build model
    if params['RELOAD'] == 0:  # build new model
        video_model = VideoDesc_Model(params,
                                      type=params['MODEL_TYPE'],
                                      verbose=params['VERBOSE'],
                                      model_name=params['MODEL_NAME'],
                                      vocabularies=dataset.vocabulary,
                                      store_path=params['STORE_PATH'])
        # Store the configuration used to build this model alongside it.
        dict2pkl(params, params['STORE_PATH'] + '/config')

        # Define the inputs and outputs mapping from our Dataset instance to
        # our model. The `len(...) > i` guard skips dataset ids beyond the
        # number of inputs/outputs the model actually exposes.
        inputMapping = dict()
        for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
            if len(video_model.ids_inputs) > i:
                pos_source = dataset.ids_inputs.index(id_in)
                id_dest = video_model.ids_inputs[i]
                inputMapping[id_dest] = pos_source
        video_model.setInputsMapping(inputMapping)

        outputMapping = dict()
        for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
            if len(video_model.ids_outputs) > i:
                pos_target = dataset.ids_outputs.index(id_out)
                id_dest = video_model.ids_outputs[i]
                outputMapping[id_dest] = pos_target
        video_model.setOutputsMapping(outputMapping)
    else:
        # Resume from previously trained model (presumably the stored model
        # already carries its input/output mappings — confirm in loadModel).
        video_model = loadModel(params['STORE_PATH'], params['RELOAD'])
        video_model.setOptimizer()

    # Callbacks
    callbacks = buildCallbacks(params, video_model, dataset)

    # Training
    total_start_time = timer()
    logger.debug('Starting training!')
    training_params = {
        'n_epochs': params['MAX_EPOCH'],
        'batch_size': params['BATCH_SIZE'],
        'homogeneous_batches': params['HOMOGENEOUS_BATCHES'],
        'maxlen': params['MAX_OUTPUT_TEXT_LEN'],
        'lr_decay': params['LR_DECAY'],
        'lr_gamma': params['LR_GAMMA'],
        'epochs_for_save': params['EPOCHS_FOR_SAVE'],
        'verbose': params['VERBOSE'],
        'eval_on_sets': params['EVAL_ON_SETS_KERAS'],
        'n_parallel_loaders': params['PARALLEL_LOADERS'],
        'extra_callbacks': callbacks,
        'reload_epoch': params['RELOAD'],
        'epoch_offset': params['RELOAD'],
        'data_augmentation': params['DATA_AUGMENTATION'],
        'patience': params.get('PATIENCE', 0),
        'metric_check': params.get('STOP_METRIC', None)
    }
    video_model.trainNet(dataset, training_params)
    total_end_time = timer()
    time_difference = total_end_time - total_start_time
    logger.info('In total is {0:.2f}s = {1:.2f}m'.format(time_difference, time_difference / 60.0))
# NOTE(review): this chunk is the tail of a larger (Python 2: `print i`)
# script that reads a lexical translation table. The enclosing loop that
# defines `line`, and the initializations of `i`, `d`, `cur_source`,
# `tmp_dict` and `args`, are outside this chunk; the indentation below is
# reconstructed from a collapsed source — confirm against the original file.
i += 1
# Progress report every 1000 processed lines when verbose.
if (i % 1000) == 0 and args.verbose > 0:
    print i
if cur_source != -1:
    d[cur_source] = tmp_dict  # Set dict for previous word
    cur_source = line[0]
    tmp_dict = dict()
    # line[2] is presumably a log-probability (score exponentiated with
    # e**x) — TODO confirm against the table format.
    tmp_dict[line[1]] = pow(np.e, float(line[2]))
else:
    tmp_dict[line[1]] = pow(np.e, float(line[2]))
# Flush the accumulated entry for the last source word of the table.
d[cur_source] = tmp_dict
del tmp_dict
# For each source word, sort its candidate targets by descending score.
e = {}
j = 0
for elt in d:
    if (j % 1000) == 0 and args.verbose > 0:
        print j
    j += 1
    e[elt] = sorted(d[elt], key=d[elt].get)[::-1]
# Keep only the highest-scoring target per source word...
f1 = {}
j = 0
for elt in e:
    if (j % 1000) == 0 and args.verbose > 0:
        print j
    j += 1
    f1[elt] = e[elt][0]
# ...and store the resulting source->best_target dict as a pickle.
dict2pkl(f1, args.dest)
def train_model(params, load_dataset=None):
    """
    Training function. Sets the training parameters from params. Builds or loads
    the model and launches the training.

    :param dict params: Dictionary of network hyperparameters.
    :param str load_dataset: Load dataset from file or build it from the parameters.
    :return: None
    """
    if params['RELOAD'] > 0:
        logger.info('Resuming training.')
        # Load data
        if load_dataset is None:
            if params['REBUILD_DATASET']:
                logger.info('Rebuilding dataset.')
                dataset = build_dataset(params)
            else:
                logger.info('Updating dataset.')
                dataset = loadDataset(params['DATASET_STORE_PATH'] +
                                      '/Dataset_' + params['DATASET_NAME'] +
                                      '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')
                # Estimate which epoch we are resuming from, unless
                # RELOAD_EPOCH says RELOAD already is an epoch index.
                # Guard against an empty training split.
                epoch_offset = 0 if dataset.len_train == 0 else \
                    int(params['RELOAD'] * params['BATCH_SIZE'] / dataset.len_train)
                params['EPOCH_OFFSET'] = params['RELOAD'] if params['RELOAD_EPOCH'] else epoch_offset
                # Refresh every text split from file and store the updated dataset.
                for split, filename in iteritems(params['TEXT_FILES']):
                    dataset = update_dataset_from_file(
                        dataset,
                        params['DATA_ROOT_PATH'] + '/' + filename + params['SRC_LAN'],
                        params,
                        splits=list([split]),
                        output_text_filename=params['DATA_ROOT_PATH'] + '/' + filename + params['TRG_LAN'],
                        remove_outputs=False,
                        compute_state_below=True,
                        recompute_references=True)
                    dataset.name = params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN']
                saveDataset(dataset, params['DATASET_STORE_PATH'])
        else:
            logger.info('Reloading and using dataset.')
            dataset = loadDataset(load_dataset)
    else:
        # Load data
        if load_dataset is None:
            dataset = build_dataset(params)
        else:
            dataset = loadDataset(load_dataset)

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    # Build model. Only a fresh model (RELOAD == 0) sets its optimizer and
    # clears the store dirs at construction time.
    # Idiom fix: was the redundant `True if ... else False` form.
    set_optimizer = params['RELOAD'] == 0
    clear_dirs = params['RELOAD'] == 0

    # build new model
    nmt_model = TranslationModel(params,
                                 model_type=params['MODEL_TYPE'],
                                 verbose=params['VERBOSE'],
                                 model_name=params['MODEL_NAME'],
                                 vocabularies=dataset.vocabulary,
                                 store_path=params['STORE_PATH'],
                                 set_optimizer=set_optimizer,
                                 clear_dirs=clear_dirs)

    # Define the inputs and outputs mapping from our Dataset instance to our model
    inputMapping = dict()
    for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
        pos_source = dataset.ids_inputs.index(id_in)
        id_dest = nmt_model.ids_inputs[i]
        inputMapping[id_dest] = pos_source
    nmt_model.setInputsMapping(inputMapping)

    outputMapping = dict()
    for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
        pos_target = dataset.ids_outputs.index(id_out)
        id_dest = nmt_model.ids_outputs[i]
        outputMapping[id_dest] = pos_target
    nmt_model.setOutputsMapping(outputMapping)

    if params['RELOAD'] > 0:
        # Load the stored weights into the freshly built model and re-apply
        # the current configuration.
        nmt_model = updateModel(nmt_model, params['STORE_PATH'], params['RELOAD'],
                                reload_epoch=params['RELOAD_EPOCH'])
        nmt_model.setParams(params)
        nmt_model.setOptimizer()
        if params.get('EPOCH_OFFSET') is None:
            # Robustness fix: apply the same len_train == 0 guard as the
            # branch above (the original could raise ZeroDivisionError here).
            params['EPOCH_OFFSET'] = params['RELOAD'] if params['RELOAD_EPOCH'] else \
                (0 if dataset.len_train == 0 else
                 int(params['RELOAD'] * params['BATCH_SIZE'] / dataset.len_train))

    # Store configuration as pkl
    dict2pkl(params, params['STORE_PATH'] + '/config')

    # Callbacks
    callbacks = buildCallbacks(params, nmt_model, dataset)

    # Training
    total_start_time = timer()
    logger.debug('Starting training!')
    training_params = {
        'n_epochs': params['MAX_EPOCH'],
        'batch_size': params['BATCH_SIZE'],
        'homogeneous_batches': params['HOMOGENEOUS_BATCHES'],
        'maxlen': params['MAX_OUTPUT_TEXT_LEN'],
        'joint_batches': params['JOINT_BATCHES'],
        'lr_decay': params.get('LR_DECAY', None),  # LR decay parameters
        'initial_lr': params.get('LR', 1.0),
        'reduce_each_epochs': params.get('LR_REDUCE_EACH_EPOCHS', True),
        'start_reduction_on_epoch': params.get('LR_START_REDUCTION_ON_EPOCH', 0),
        'lr_gamma': params.get('LR_GAMMA', 0.9),
        'lr_reducer_type': params.get('LR_REDUCER_TYPE', 'linear'),
        'lr_reducer_exp_base': params.get('LR_REDUCER_EXP_BASE', 0),
        'lr_half_life': params.get('LR_HALF_LIFE', 50000),
        'lr_warmup_exp': params.get('WARMUP_EXP', -1.5),
        'min_lr': params.get('MIN_LR', 1e-9),
        'epochs_for_save': params['EPOCHS_FOR_SAVE'],
        'verbose': params['VERBOSE'],
        'eval_on_sets': params['EVAL_ON_SETS_KERAS'],
        'n_parallel_loaders': params['PARALLEL_LOADERS'],
        'extra_callbacks': callbacks,
        'reload_epoch': params['RELOAD'],
        'epoch_offset': params.get('EPOCH_OFFSET', 0),
        'data_augmentation': params['DATA_AUGMENTATION'],
        'patience': params.get('PATIENCE', 0),  # early stopping parameters
        'metric_check': params.get('STOP_METRIC', None) if params.get('EARLY_STOP', False) else None,
        'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True),
        'each_n_epochs': params.get('EVAL_EACH', 1),
        'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0),
        'tensorboard': params.get('TENSORBOARD', False),
        'n_gpus': params.get('N_GPUS', 1),
        'tensorboard_params': {
            'log_dir': params.get('LOG_DIR', 'tensorboard_logs'),
            'histogram_freq': params.get('HISTOGRAM_FREQ', 0),
            'batch_size': params.get('TENSORBOARD_BATCH_SIZE', params['BATCH_SIZE']),
            'write_graph': params.get('WRITE_GRAPH', True),
            'write_grads': params.get('WRITE_GRADS', False),
            'write_images': params.get('WRITE_IMAGES', False),
            'embeddings_freq': params.get('EMBEDDINGS_FREQ', 0),
            'embeddings_layer_names': params.get('EMBEDDINGS_LAYER_NAMES', None),
            'embeddings_metadata': params.get('EMBEDDINGS_METADATA', None),
            'label_word_embeddings_with_vocab': params.get('LABEL_WORD_EMBEDDINGS_WITH_VOCAB', False),
            'word_embeddings_labels': params.get('WORD_EMBEDDINGS_LABELS', None),
        }
    }
    nmt_model.trainNet(dataset, training_params)
    total_end_time = timer()
    time_difference = total_end_time - total_start_time
    logger.info('In total is {0:.2f}s = {1:.2f}m'.format(time_difference, time_difference / 60.0))
def train_model(params):
    """
    Training function. Sets the training parameters from params. Builds or loads
    the model and launches the training.

    If ``params['RELOAD'] > 0``, a previously stored model is loaded from
    ``params['STORE_PATH']``; otherwise a new Captioning_Model is built and its
    configuration is pickled next to it.

    :param params: Dictionary of network hyperparameters.
    :return: None
    """
    if params['RELOAD'] > 0:
        # Consistency fix: use the module-level `logger` throughout (the
        # original mixed root `logging` calls with `logger.debug`).
        logger.info('Resuming training.')

    # Load data
    dataset = build_dataset(params)
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    # Build model
    if params['RELOAD'] == 0:  # build new model
        video_model = Captioning_Model(params,
                                       model_type=params['MODEL_TYPE'],
                                       verbose=params['VERBOSE'],
                                       model_name=params['MODEL_NAME'],
                                       vocabularies=dataset.vocabulary,
                                       store_path=params['STORE_PATH'])
        # Store the configuration used to build this model alongside it.
        dict2pkl(params, params['STORE_PATH'] + '/config')

        # Define the inputs and outputs mapping from our Dataset instance to
        # our model. The `len(...) > i` guard skips dataset ids beyond the
        # number of inputs/outputs the model actually exposes.
        inputMapping = dict()
        for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
            if len(video_model.ids_inputs) > i:
                pos_source = dataset.ids_inputs.index(id_in)
                id_dest = video_model.ids_inputs[i]
                inputMapping[id_dest] = pos_source
        video_model.setInputsMapping(inputMapping)

        outputMapping = dict()
        for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
            if len(video_model.ids_outputs) > i:
                pos_target = dataset.ids_outputs.index(id_out)
                id_dest = video_model.ids_outputs[i]
                outputMapping[id_dest] = pos_target
        video_model.setOutputsMapping(outputMapping)
    else:
        # Resume from previously trained model (presumably the stored model
        # already carries its input/output mappings — confirm in loadModel).
        video_model = loadModel(params['STORE_PATH'], params['RELOAD'])
        video_model.setOptimizer()

    # Callbacks
    callbacks = buildCallbacks(params, video_model, dataset)

    # Training
    total_start_time = timer()
    logger.debug('Starting training!')
    training_params = {
        'n_epochs': params['MAX_EPOCH'],
        'batch_size': params['BATCH_SIZE'],
        'homogeneous_batches': params['HOMOGENEOUS_BATCHES'],
        'maxlen': params['MAX_OUTPUT_TEXT_LEN'],
        'joint_batches': params['JOINT_BATCHES'],
        'lr_decay': params.get('LR_DECAY', None),  # LR decay parameters
        'initial_lr': params.get('LR', 1.0),
        'reduce_each_epochs': params.get('LR_REDUCE_EACH_EPOCHS', True),
        'start_reduction_on_epoch': params.get('LR_START_REDUCTION_ON_EPOCH', 0),
        'lr_gamma': params.get('LR_GAMMA', 0.9),
        'lr_reducer_type': params.get('LR_REDUCER_TYPE', 'linear'),
        'lr_reducer_exp_base': params.get('LR_REDUCER_EXP_BASE', 0),
        'lr_half_life': params.get('LR_HALF_LIFE', 50000),
        'lr_warmup_exp': params.get('WARMUP_EXP', -1.5),
        'min_lr': params.get('MIN_LR', 1e-9),
        'epochs_for_save': params['EPOCHS_FOR_SAVE'],
        'verbose': params['VERBOSE'],
        'eval_on_sets': params['EVAL_ON_SETS_KERAS'],
        'n_parallel_loaders': params['PARALLEL_LOADERS'],
        'extra_callbacks': callbacks,
        'reload_epoch': params['RELOAD'],
        'epoch_offset': params.get('EPOCH_OFFSET', 0),
        'data_augmentation': params['DATA_AUGMENTATION'],
        'patience': params.get('PATIENCE', 0),  # early stopping parameters
        'metric_check': params.get('STOP_METRIC', None) if params.get('EARLY_STOP', False) else None,
        'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True),
        'each_n_epochs': params.get('EVAL_EACH', 1),
        'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0),
        'tensorboard': params.get('TENSORBOARD', False),
        'n_gpus': params.get('N_GPUS', 1),
        'tensorboard_params': {
            'log_dir': params.get('LOG_DIR', 'tensorboard_logs'),
            'histogram_freq': params.get('HISTOGRAM_FREQ', 0),
            'batch_size': params.get('TENSORBOARD_BATCH_SIZE', params['BATCH_SIZE']),
            'write_graph': params.get('WRITE_GRAPH', True),
            'write_grads': params.get('WRITE_GRADS', False),
            'write_images': params.get('WRITE_IMAGES', False),
            'embeddings_freq': params.get('EMBEDDINGS_FREQ', 0),
            'embeddings_layer_names': params.get('EMBEDDINGS_LAYER_NAMES', None),
            'embeddings_metadata': params.get('EMBEDDINGS_METADATA', None),
            'label_word_embeddings_with_vocab': params.get('LABEL_WORD_EMBEDDINGS_WITH_VOCAB', False),
            'word_embeddings_labels': params.get('WORD_EMBEDDINGS_LABELS', None),
        }
    }
    video_model.trainNet(dataset, training_params)
    total_end_time = timer()
    time_difference = total_end_time - total_start_time
    logger.info('In total is {0:.2f}s = {1:.2f}m'.format(time_difference, time_difference / 60.0))
def train_model(params):
    """
    Training function. Sets the training parameters from params. Build or loads the model and launches the training.

    Three modes are handled:
      * ``params['MODE'] == 'finetuning'``: rebuild the architecture, load
        weights from ``params['RELOAD_PATH']`` and continue training.
      * ``params['RELOAD'] == 0`` or ``params['LOAD_WEIGHTS_ONLY']``: build a
        new model, optionally transferring weights from pre-trained models.
      * otherwise: resume from a previously stored model.

    NOTE(review): when LOAD_WEIGHTS_ONLY is set, ``params['RELOAD']`` is used
    as a *list* of checkpoints (``len(...)`` and indexing below), while
    elsewhere it is compared as an int — confirm the expected type in the
    configuration.

    :param params: Dictionary of network hyperparameters.
    :return: None
    """
    if params['RELOAD'] > 0:
        logging.info('Resuming training.')
    check_params(params)

    ########### Load data
    dataset = build_dataset(params)
    # '-vidtext-embed' datasets take the output vocabulary size from the
    # second input instead of from the first output.
    if not '-vidtext-embed' in params['DATASET_NAME']:
        params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
    else:
        params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][1]]
    ###########

    ########### Build model
    if params['MODE'] == 'finetuning':
        # Rebuild the architecture without optimizer/dir setup, then load the
        # stored weights into it.
        # video_model = loadModel(params['PRE_TRAINED_MODEL_STORE_PATHS'], params['RELOAD'])
        video_model = VideoDesc_Model(params,
                                      type=params['MODEL_TYPE'],
                                      verbose=params['VERBOSE'],
                                      model_name=params['MODEL_NAME'] + '_reloaded',
                                      vocabularies=dataset.vocabulary,
                                      store_path=params['STORE_PATH'],
                                      set_optimizer=False,
                                      clear_dirs=False)
        video_model = updateModel(video_model, params['RELOAD_PATH'], params['RELOAD'], reload_epoch=False)
        video_model.setParams(params)

        # Define the inputs and outputs mapping from our Dataset instance to
        # our model. The `len(...) > i` guard skips dataset ids beyond the
        # number of inputs/outputs the model exposes.
        inputMapping = dict()
        for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
            if len(video_model.ids_inputs) > i:
                pos_source = dataset.ids_inputs.index(id_in)
                id_dest = video_model.ids_inputs[i]
                inputMapping[id_dest] = pos_source
        video_model.setInputsMapping(inputMapping)

        outputMapping = dict()
        for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
            if len(video_model.ids_outputs) > i:
                pos_target = dataset.ids_outputs.index(id_out)
                id_dest = video_model.ids_outputs[i]
                outputMapping[id_dest] = pos_target
        video_model.setOutputsMapping(outputMapping)
        video_model.setOptimizer()
        # Extend the epoch budget so finetuning runs MAX_EPOCH epochs on top
        # of the reloaded checkpoint.
        params['MAX_EPOCH'] += params['RELOAD']
    else:
        if params['RELOAD'] == 0 or params['LOAD_WEIGHTS_ONLY']:  # build new model
            video_model = VideoDesc_Model(params,
                                          type=params['MODEL_TYPE'],
                                          verbose=params['VERBOSE'],
                                          model_name=params['MODEL_NAME'],
                                          vocabularies=dataset.vocabulary,
                                          store_path=params['STORE_PATH'],
                                          set_optimizer=True)
            # Store the configuration used to build this model alongside it.
            dict2pkl(params, params['STORE_PATH'] + '/config')

            # Define the inputs and outputs mapping from our Dataset instance to our model
            inputMapping = dict()
            for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
                if len(video_model.ids_inputs) > i:
                    pos_source = dataset.ids_inputs.index(id_in)
                    id_dest = video_model.ids_inputs[i]
                    inputMapping[id_dest] = pos_source
            video_model.setInputsMapping(inputMapping)

            outputMapping = dict()
            for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
                if len(video_model.ids_outputs) > i:
                    pos_target = dataset.ids_outputs.index(id_out)
                    id_dest = video_model.ids_outputs[i]
                    outputMapping[id_dest] = pos_target
            video_model.setOutputsMapping(outputMapping)

            # Only load weights from pre-trained model
            if params['LOAD_WEIGHTS_ONLY'] and params['RELOAD'] > 0:
                # Here RELOAD / PRE_TRAINED_MODEL_STORE_PATHS / LAYERS_MAPPING
                # are treated as parallel lists: one weight transfer per
                # pre-trained checkpoint.
                for i in range(0, len(params['RELOAD'])):
                    old_model = loadModel(params['PRE_TRAINED_MODEL_STORE_PATHS'][i], params['RELOAD'][i])
                    video_model = transferWeights(old_model, video_model, params['LAYERS_MAPPING'][i])
                video_model.setOptimizer()
                # Training starts from scratch after the transfer.
                params['RELOAD'] = 0
        else:  # resume from previously trained model
            video_model = loadModel(params['PRE_TRAINED_MODEL_STORE_PATHS'], params['RELOAD'])
            # Override the stored learning rate with the current config's.
            video_model.params['LR'] = params['LR']
            video_model.setOptimizer()
            # Re-home the model if it was stored under a different path.
            if video_model.model_path != params['STORE_PATH']:
                video_model.setName(params['MODEL_NAME'], models_path=params['STORE_PATH'], clear_dirs=False)
    # Update optimizer either if we are loading or building a model
    video_model.params = params
    video_model.setOptimizer()
    ###########

    ########### Test model saving/loading functions
    # saveModel(video_model, params['RELOAD'])
    # video_model = loadModel(params['STORE_PATH'], params['RELOAD'])
    ###########

    ########### Callbacks
    callbacks = buildCallbacks(params, video_model, dataset)
    ###########

    ########### Training
    total_start_time = timer()
    logger.debug('Starting training!')
    training_params = {
        'n_epochs': params['MAX_EPOCH'],
        'batch_size': params['BATCH_SIZE'],
        'homogeneous_batches': params['HOMOGENEOUS_BATCHES'],
        'maxlen': params['MAX_OUTPUT_TEXT_LEN'],
        'lr_decay': params['LR_DECAY'],
        'lr_gamma': params['LR_GAMMA'],
        'epochs_for_save': params['EPOCHS_FOR_SAVE'],
        'verbose': params['VERBOSE'],
        'eval_on_sets': params['EVAL_ON_SETS_KERAS'],
        'n_parallel_loaders': params['PARALLEL_LOADERS'],
        'extra_callbacks': callbacks,
        'reload_epoch': params['RELOAD'],
        'epoch_offset': params['RELOAD'],
        'data_augmentation': params['DATA_AUGMENTATION'],
        'patience': params.get('PATIENCE', 0),  # early stopping parameters
        'metric_check': params.get('STOP_METRIC', None),
        'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True),
        'each_n_epochs': params.get('EVAL_EACH', 1),
        'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0)
    }
    video_model.trainNet(dataset, training_params)
    total_end_time = timer()
    time_difference = total_end_time - total_start_time
    logging.info('In total is {0:.2f}s = {1:.2f}m'.format(time_difference, time_difference / 60.0))