def invoke_model(parameters):
    # `metric_name`, `maximize` and `d` (the shell environment) are assumed to be
    # module-level globals defined by the surrounding Spearmint wrapper.
    model_params = load_parameters()
    model_name = model_params["MODEL_TYPE"]
    for parameter in parameters.keys():
        model_params[parameter] = parameters[parameter][0]
        logger.debug("Assigning to %s the value %s" % (str(parameter), parameters[parameter][0]))
        model_name += '_' + str(parameter) + '_' + str(parameters[parameter][0])
    model_params["SKIP_VECTORS_HIDDEN_SIZE"] = model_params["TARGET_TEXT_EMBEDDING_SIZE"]
    model_params["MODEL_NAME"] = model_name
    # Models and evaluation results will be stored here
    model_params["STORE_PATH"] = '/home/lvapeab/smt/software/egocentric-video-description/meta-optimizers/spearmint/trained_models/' + \
                                 model_params["MODEL_NAME"] + '/'
    check_params(model_params)
    assert model_params['MODE'] == 'training', 'You can only launch Spearmint when training!'
    logging.info('Running training.')
    train_model(model_params)
    results_path = model_params['STORE_PATH'] + '/' + model_params['EVAL_ON_SETS'][0] + '.' + model_params['METRICS'][0]

    # Recover the highest metric score
    metric_pos_cmd = "head -n 1 " + results_path + \
                     " |awk -v metric=" + metric_name + \
                     " 'BEGIN{FS=\",\"}" \
                     "{for (i=1; i<=NF; i++) if ($i == metric) print i;}'"
    metric_pos = subprocess.Popen(metric_pos_cmd,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT,
                                  shell=True).communicate()[0][:-1]
    cmd = "tail -n +2 " + results_path + \
          " |awk -v m_pos=" + str(metric_pos) + \
          " 'BEGIN{FS=\",\"}{print $m_pos}'|sort -gr|head -n 1"
    ps = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, env=d)
    metric_value = float(ps.communicate()[0])
    print("Best %s: %f" % (metric_name, metric_value))
    return 1. - metric_value if maximize else metric_value  # Spearmint minimizes a function
def load_tests_params():
    params = load_parameters()
    params['BATCH_SIZE'] = 10
    params['DROPOUT_P'] = 0.1
    params['RECURRENT_INPUT_DROPOUT_P'] = 0.01
    params['RECURRENT_DROPOUT_P'] = 0.01
    params['USE_NOISE'] = True
    params['NOISE_AMOUNT'] = 0.01
    params['USE_BATCH_NORMALIZATION'] = True
    params['BATCH_NORMALIZATION_MODE'] = 1
    params['SOURCE_TEXT_EMBEDDING_SIZE'] = 8
    params['TARGET_TEXT_EMBEDDING_SIZE'] = 8
    params['DECODER_HIDDEN_SIZE'] = 4
    params['ENCODER_HIDDEN_SIZE'] = 4
    params['RELOAD'] = 0
    params['MAX_EPOCH'] = 1
    params['USE_CUDNN'] = False
    params['MODEL_TYPE'] = 'Transformer'
    params['N_LAYERS_ENCODER'] = 2
    params['N_LAYERS_DECODER'] = 2
    params['MULTIHEAD_ATTENTION_ACTIVATION'] = 'relu'
    params['MODEL_SIZE'] = 8
    params['FF_SIZE'] = params['MODEL_SIZE'] * 4
    params['N_HEADS'] = 2
    params['REBUILD_DATASET'] = True
    params['OPTIMIZED_SEARCH'] = True
    params['POS_UNK'] = False
    return params
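# Usage sketch (an assumption, mirroring how the nmt-keras test suite uses such
# helpers, not part of the snippet above): the shrunken parameters are meant to
# drive a quick end-to-end training smoke test. `train_model` is assumed to be
# importable from nmt_keras.training, as in the rest of these snippets.
def smoke_test_transformer():
    params = load_tests_params()
    train_model(params)  # trains the tiny Transformer for MAX_EPOCH = 1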
def load_in(encoding, direction):
    if encoding == "char":
        print(encoding)
        args = parse_args_char(direction)
    else:
        args = parse_args_bpe(direction)
    if args["config"] is None:
        logging.info("Reading parameters from config.py")
        from config import load_parameters
        params = load_parameters()
    else:
        logging.info("Loading parameters from %s" % str(args["config"]))
        params = pkl2dict(args["config"])
    try:
        for arg in args["changes"]:
            try:
                k, v = arg.split('=')  # was arg["split"]('='), which raises TypeError on a string
            except ValueError:
                print('Overwritten arguments must have the form key=Value. \n Currently are: %s' % str(args["changes"]))
                exit(1)
            try:
                params[k] = ast.literal_eval(v)
            except ValueError:
                params[k] = v
    except ValueError:
        print('Error processing arguments: (', k, ",", v, ")")
        exit(2)
    params = check_params(params)
    model, dataset = loadmodel(encoding, args)  # this should become a function
    # sample_ensemble(args, params, models, dataset)
    return args, params, model, dataset
def test_load_dataset():
    params = load_parameters()
    ds = loadDataset('./Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')
    assert isinstance(ds, Dataset)
    assert isinstance(ds.vocabulary, dict)
    assert len(ds.vocabulary.keys()) >= 3  # was `ds.vocabulary.keys() >= 3`, a TypeError on Python 3
    for voc in ds.vocabulary:
        assert len(ds.vocabulary[voc].keys()) == 2
def test_load_dataset(self):
    params = load_parameters()
    ds = loadDataset('./Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')
    self.assertIsInstance(ds, Dataset)
    self.assertIsInstance(ds.vocabulary, dict)
    self.assertGreaterEqual(len(ds.vocabulary.keys()), 3)  # compare the count, not the view object
    for voc in ds.vocabulary:
        self.assertEqual(len(ds.vocabulary[voc].keys()), 2)
def test_load_textfiles(self):
    params = load_parameters()
    filename = params['DATA_ROOT_PATH'] + params['TEXT_FILES']['val'] + params['TRG_LAN']
    hyp = open(filename, 'r')
    refs, hypo = load_textfiles([open(filename, 'r')], hyp)
    self.assertIsInstance(refs, dict)
    self.assertIsInstance(hypo, dict)
    self.assertEqual(refs, hypo)
def test_load_textfiles():
    params = load_parameters()
    filename = params['DATA_ROOT_PATH'] + params['TEXT_FILES']['val'] + params['TRG_LAN']
    hyp = codecs.open(filename, 'r', encoding='utf-8')
    refs, hypo = load_textfiles([codecs.open(filename, 'r', encoding='utf-8').readlines()], hyp)
    assert isinstance(refs, dict)
    assert isinstance(hypo, dict)
    assert refs == hypo
def test_load_textfiles():
    params = load_parameters()
    filename = params['DATA_ROOT_PATH'] + params['TEXT_FILES']['val'] + params['TRG_LAN']
    hyp = open(filename, 'r')
    refs, hypo = load_textfiles([open(filename, 'r')], hyp)
    assert isinstance(refs, dict)
    assert isinstance(hypo, dict)
    assert refs == hypo
def test_update_dataset_from_file():
    params = load_parameters()
    for rebuild_dataset in [True, False]:
        params['REBUILD_DATASET'] = rebuild_dataset
        params['DATASET_STORE_PATH'] = '.'
        for splits in [[], None, ['val']]:
            ds = build_dataset(params)
            assert isinstance(ds, Dataset)
            for output_text_filename in [None,
                                         os.path.join(params['DATA_ROOT_PATH'],
                                                      params['TEXT_FILES']['test'] + params['TRG_LAN'])]:
                for remove_outputs in [True, False]:
                    for compute_state_below in [True, False]:
                        for recompute_references in [True, False]:
                            ds2 = update_dataset_from_file(copy.deepcopy(ds),
                                                           os.path.join(params['DATA_ROOT_PATH'],
                                                                        params['TEXT_FILES']['test'] + params['SRC_LAN']),
                                                           params,
                                                           splits=splits,
                                                           output_text_filename=output_text_filename,
                                                           remove_outputs=remove_outputs,
                                                           compute_state_below=compute_state_below,
                                                           recompute_references=recompute_references)
                            assert isinstance(ds2, Dataset)

    # Final check: We update the val set with the test data. We check that dimensions match.
    split = 'val'
    len_test = 2996
    ds2 = update_dataset_from_file(copy.deepcopy(ds),
                                   params['DATA_ROOT_PATH'] + params['TEXT_FILES']['test'] + params['SRC_LAN'],
                                   params,
                                   splits=[split],
                                   output_text_filename=os.path.join(params['DATA_ROOT_PATH'],
                                                                     params['TEXT_FILES']['test'] + params['TRG_LAN']),
                                   remove_outputs=False,
                                   compute_state_below=True,
                                   recompute_references=True)
    assert isinstance(ds2, Dataset)
    assert eval('ds2.len_' + split) == len_test
    assert eval('all(ds2.loaded_' + split + ')')
    assert len(eval('ds2.X_' + split + str([params['INPUTS_IDS_DATASET'][0]]))) == len_test
    assert len(eval('ds2.Y_' + split + str([params['OUTPUTS_IDS_DATASET'][0]]))) == len_test


if __name__ == '__main__':
    pytest.main([__file__])
def test_load_dataset():
    params = load_parameters()
    ds = loadDataset(os.path.join('datasets',
                                  'Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl'))
    assert isinstance(ds, Dataset)
    assert isinstance(ds.vocabulary, dict)
    assert len(list(ds.vocabulary)) >= 3
    for voc in ds.vocabulary:
        assert len(list(ds.vocabulary[voc])) == 2
def test_CocoScore(self):
    params = load_parameters()
    filename = params['DATA_ROOT_PATH'] + params['TEXT_FILES']['val'] + params['TRG_LAN']
    hyp = open(filename, 'r')
    refs, hypo = load_textfiles([open(filename, 'r')], hyp)

    final_scores = CocoScore(refs, hypo, metrics_list=None, language=params['TRG_LAN'])
    self.assertIsInstance(final_scores, dict)
    self.assertIn('Bleu_1', final_scores.keys())
    self.assertIn('Bleu_2', final_scores.keys())
    self.assertIn('Bleu_3', final_scores.keys())
    self.assertIn('Bleu_4', final_scores.keys())
    self.assertIn('TER', final_scores.keys())
    self.assertIn('METEOR', final_scores.keys())
    self.assertIn('ROUGE_L', final_scores.keys())
    self.assertIn('CIDEr', final_scores.keys())
    self.assertAlmostEqual(final_scores['Bleu_1'], 1.0, delta=1e-6)
    self.assertAlmostEqual(final_scores['Bleu_2'], 1.0, delta=1e-6)
    self.assertAlmostEqual(final_scores['Bleu_3'], 1.0, delta=1e-6)
    self.assertAlmostEqual(final_scores['Bleu_4'], 1.0, delta=1e-6)
    self.assertAlmostEqual(final_scores['TER'], 0.0, delta=1e-6)
    self.assertAlmostEqual(final_scores['METEOR'], 1.0, delta=1e-6)
    self.assertAlmostEqual(final_scores['ROUGE_L'], 1.0, delta=1e-6)
    self.assertAlmostEqual(final_scores['CIDEr'], 10.0, delta=1e-1)

    final_scores = CocoScore(refs, hypo, metrics_list=['BLeu'], language=params['TRG_LAN'])
    self.assertIsInstance(final_scores, dict)
    self.assertIn('Bleu_1', final_scores.keys())
    self.assertIn('Bleu_2', final_scores.keys())
    self.assertIn('Bleu_3', final_scores.keys())
    self.assertIn('Bleu_4', final_scores.keys())
    self.assertNotIn('TER', final_scores.keys())
    self.assertNotIn('METEOR', final_scores.keys())
    self.assertNotIn('ROUGE_L', final_scores.keys())
    self.assertNotIn('CIDEr', final_scores.keys())
    self.assertAlmostEqual(final_scores['Bleu_1'], 1.0, delta=1e-6)
    self.assertAlmostEqual(final_scores['Bleu_2'], 1.0, delta=1e-6)
    self.assertAlmostEqual(final_scores['Bleu_3'], 1.0, delta=1e-6)
    self.assertAlmostEqual(final_scores['Bleu_4'], 1.0, delta=1e-6)

    final_scores = CocoScore(refs, hypo, metrics_list=['BLEU', 'ter'], language=params['TRG_LAN'])
    self.assertIsInstance(final_scores, dict)
    self.assertIn('Bleu_1', final_scores.keys())
    self.assertIn('Bleu_2', final_scores.keys())
    self.assertIn('Bleu_3', final_scores.keys())
    self.assertIn('Bleu_4', final_scores.keys())
    self.assertIn('TER', final_scores.keys())
    self.assertNotIn('METEOR', final_scores.keys())
    self.assertNotIn('ROUGE_L', final_scores.keys())
    self.assertNotIn('CIDEr', final_scores.keys())
    self.assertAlmostEqual(final_scores['Bleu_1'], 1.0, delta=1e-6)
    self.assertAlmostEqual(final_scores['Bleu_2'], 1.0, delta=1e-6)
    self.assertAlmostEqual(final_scores['Bleu_3'], 1.0, delta=1e-6)
    self.assertAlmostEqual(final_scores['Bleu_4'], 1.0, delta=1e-6)
    self.assertAlmostEqual(final_scores['TER'], 0.0, delta=1e-6)
def test_build_datset():
    params = load_parameters()
    for verbose in range(2):
        params['REBUILD_DATASET'] = True
        params['VERBOSE'] = verbose
        ds = build_dataset(params)
        assert isinstance(ds, Dataset)
        len_splits = [('train', 9900), ('val', 100), ('test', 2996)]
        for split, len_split in len_splits:
            assert eval('ds.len_' + split) == len_split
            assert eval('all(ds.loaded_' + split + ')')
            assert len(eval('ds.X_' + split + str([params['INPUTS_IDS_DATASET'][0]]))) == len_split
            assert len(eval('ds.Y_' + split + str([params['OUTPUTS_IDS_DATASET'][0]]))) == len_split
def test_update_dataset_from_file(self):
    params = load_parameters()
    params['REBUILD_DATASET'] = True
    params['DATASET_STORE_PATH'] = './'
    ds = build_dataset(params)
    self.assertIsInstance(ds, Dataset)
    for splits in [[], ['val']]:
        for output_text_filename in [None,
                                     params['DATA_ROOT_PATH'] + params['TEXT_FILES']['test'] + params['TRG_LAN']]:
            for remove_outputs in [True, False]:
                for compute_state_below in [True, False]:
                    for recompute_references in [True, False]:
                        ds2 = update_dataset_from_file(copy.deepcopy(ds),
                                                       params['DATA_ROOT_PATH'] + params['TEXT_FILES']['test'] + params['SRC_LAN'],
                                                       params,
                                                       splits=splits,
                                                       output_text_filename=output_text_filename,
                                                       remove_outputs=remove_outputs,
                                                       compute_state_below=compute_state_below,
                                                       recompute_references=recompute_references)
                        self.assertIsInstance(ds2, Dataset)

    # Final check: We update the val set with the test data. We check that dimensions match.
    split = 'val'
    len_test = 2996
    ds2 = update_dataset_from_file(copy.deepcopy(ds),
                                   params['DATA_ROOT_PATH'] + params['TEXT_FILES']['test'] + params['SRC_LAN'],
                                   params,
                                   splits=[split],
                                   output_text_filename=params['DATA_ROOT_PATH'] + params['TEXT_FILES']['test'] + params['TRG_LAN'],
                                   remove_outputs=False,
                                   compute_state_below=True,
                                   recompute_references=True)
    self.assertIsInstance(ds2, Dataset)
    self.assertEqual(eval('ds2.len_' + split), len_test)
    self.assertTrue(eval('all(ds2.loaded_' + split + ')'))
    self.assertEqual(len(eval('ds2.X_' + split + str([params['INPUTS_IDS_DATASET'][0]]))), len_test)
    self.assertEqual(len(eval('ds2.Y_' + split + str([params['OUTPUTS_IDS_DATASET'][0]]))), len_test)
def invoke_model(parameters):
    """
    Loads a model, trains it and evaluates it.
    :param parameters: Model parameters
    :return: Metric to minimize value.
    """
    model_params = load_parameters()
    model_name = model_params["MODEL_TYPE"]
    for parameter in list(parameters):
        model_params[parameter] = parameters[parameter][0]
        logger.debug("Assigning to %s the value %s" % (str(parameter), parameters[parameter][0]))
        model_name += '_' + str(parameter) + '_' + str(parameters[parameter][0])
    model_params["MODEL_NAME"] = model_name
    # Models and evaluation results will be stored here
    model_params["STORE_PATH"] = os.path.join('trained_models', model_params["MODEL_NAME"])
    check_params(model_params)
    assert model_params['MODE'] == 'training', 'You can only launch Spearmint when training!'
    logger.info('Running training.')
    train_model(model_params)
    results_path = os.path.join(model_params['STORE_PATH'],
                                model_params['EVAL_ON_SETS'][0] + '.' + model_params['METRICS'][0])

    # Recover the highest metric score
    metric_pos_cmd = "head -n 1 " + results_path + \
                     " |awk -v metric=" + metric_name + \
                     " 'BEGIN{FS=\",\"}" \
                     "{for (i=1; i<=NF; i++) if ($i == metric) print i;}'"
    metric_pos = subprocess.Popen(metric_pos_cmd,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT,
                                  shell=True).communicate()[0][:-1]
    cmd = "tail -n +2 " + results_path + \
          " |awk -v m_pos=" + str(metric_pos) + \
          " 'BEGIN{FS=\",\"}{print $m_pos}'|sort -gr|head -n 1"
    ps = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, env=d)
    metric_value = float(ps.communicate()[0])
    print("Best %s: %f" % (metric_name, metric_value))
    return 1. - metric_value if maximize else metric_value  # Spearmint minimizes a function
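# Hedged sketch of the surrounding Spearmint glue (not part of the snippet above):
# Spearmint imports the experiment module and repeatedly calls a `main(job_id, params)`
# entry point, minimizing its return value. The module-level names used by
# invoke_model (`metric_name`, `maximize`, `d`) are assumed to be defined roughly
# like this; the metric name and flag values are illustrative assumptions.
import os
import subprocess

metric_name = 'Bleu_4'   # column looked up in the evaluation CSV (assumption)
maximize = True          # BLEU is maximized, so invoke_model returns 1 - score
d = dict(os.environ)     # environment passed to the shell pipelines above


def main(job_id, params):
    print('Job #%d' % job_id)
    print(params)
    return invoke_model(params)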
def test_keep_n_captions():
    params = load_parameters()
    params['REBUILD_DATASET'] = True
    ds = build_dataset(params)
    len_splits = {'train': 9900, 'val': 100, 'test': 2996}
    for splits in [[], None, ['val'], ['val', 'test']]:
        prepare_references(ds, 1, n=1, set_names=splits)
        if splits is not None:
            for split in splits:
                len_split = len_splits[split]
                assert eval('ds.len_' + split) == len_split
                assert eval('all(ds.loaded_' + split + ')')
                assert len(eval('ds.X_' + split + str([params['INPUTS_IDS_DATASET'][0]]))) == len_split
                assert len(eval('ds.Y_' + split + str([params['OUTPUTS_IDS_DATASET'][0]]))) == len_split
def test_update_parameters(self):
    params = load_parameters()
    updates = {'ENCODER_HIDDEN_SIZE': 0,
               'BIDIRECTIONAL_ENCODER': False,
               'NEW_PARAMETER': 'new_value',
               'ADDITIONAL_OUTPUT_MERGE_MODE': 'Concat'}
    new_params = update_parameters(params, updates, restrict=False)
    for k, new_val in updates.items():  # .items() works on both Python 2 and 3; .iteritems() is Python 2 only
        self.assertEqual(new_params[k], updates[k])
    new_params = update_parameters(params, updates, restrict=True)
    for k, _ in updates.items():
        self.assertEqual(new_params[k], params.get(k, 'new_value'))
    self.assertEqual(new_params['NEW_PARAMETER'], updates['NEW_PARAMETER'])
def test_update_parameters():
    params = load_parameters()
    updates = {'ENCODER_HIDDEN_SIZE': 0,
               'BIDIRECTIONAL_ENCODER': False,
               'NEW_PARAMETER': 'new_value',
               'ADDITIONAL_OUTPUT_MERGE_MODE': 'Concat'}
    new_params = update_parameters(params, updates, restrict=False)
    for k, new_val in iteritems(updates):
        assert new_params[k] == updates[k]
    new_params = update_parameters(params, updates, restrict=True)
    for k, _ in iteritems(updates):
        assert new_params[k] == params.get(k, 'new_value')
    assert new_params['NEW_PARAMETER'] == updates['NEW_PARAMETER']
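# A minimal sketch of the semantics these two tests imply for update_parameters
# (an illustration under stated assumptions, not the library's actual code):
# with restrict=False every update is applied; with restrict=True keys already
# present in `params` keep their old value, while genuinely new keys are still added.
def update_parameters_sketch(params, updates, restrict=False):
    new_params = dict(params)
    for k, v in updates.items():
        if not restrict or k not in params:
            new_params[k] = v
    return new_params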
def test_build_datset(self):
    params = load_parameters()
    params['REBUILD_DATASET'] = True
    params['DATASET_STORE_PATH'] = './'
    ds = build_dataset(params)
    self.assertIsInstance(ds, Dataset)
    len_splits = [('train', 9900), ('val', 100), ('test', 2996)]
    for split, len_split in len_splits:
        self.assertEqual(eval('ds.len_' + split), len_split)
        self.assertTrue(eval('all(ds.loaded_' + split + ')'))
        self.assertEqual(len(eval('ds.X_' + split + str([params['INPUTS_IDS_DATASET'][0]]))), len_split)
        self.assertEqual(len(eval('ds.Y_' + split + str([params['OUTPUTS_IDS_DATASET'][0]]))), len_split)
def load_tests_params():
    params = load_parameters()
    params['BATCH_SIZE'] = 10
    params['DROPOUT_P'] = 0.1
    params['RECURRENT_INPUT_DROPOUT_P'] = 0.01
    params['RECURRENT_DROPOUT_P'] = 0.01
    params['USE_NOISE'] = True
    params['NOISE_AMOUNT'] = 0.01
    params['USE_BATCH_NORMALIZATION'] = True
    params['BATCH_NORMALIZATION_MODE'] = 1
    params['SOURCE_TEXT_EMBEDDING_SIZE'] = 8
    params['TARGET_TEXT_EMBEDDING_SIZE'] = 8
    params['DECODER_HIDDEN_SIZE'] = 4
    params['ENCODER_HIDDEN_SIZE'] = 4
    params['RELOAD'] = 0
    params['MAX_EPOCH'] = 2
    return params
def test_keep_n_captions():
    params = load_parameters()
    params['REBUILD_DATASET'] = True
    params['DATASET_STORE_PATH'] = './'
    ds = build_dataset(params)
    len_splits = {'train': 9900, 'val': 100, 'test': 2996}
    for splits in [[], None, ['val'], ['val', 'test']]:
        keep_n_captions(ds, 1, n=1, set_names=splits)
        if splits is not None:
            for split in splits:
                len_split = len_splits[split]
                assert eval('ds.len_' + split) == len_split
                assert eval('all(ds.loaded_' + split + ')')
                assert len(eval('ds.X_' + split + str([params['INPUTS_IDS_DATASET'][0]]))) == len_split
                assert len(eval('ds.Y_' + split + str([params['OUTPUTS_IDS_DATASET'][0]]))) == len_split


if __name__ == '__main__':
    pytest.main([__file__])
def resume_training(latest_epoch, use_gpu):
    params = load_parameters()
    params['MODEL_TYPE'] = 'AttentionRNNEncoderDecoder'
    params['USE_CUDNN'] = use_gpu
    params['N_GPUS'] = 2
    params['MAX_EPOCH'] = latest_epoch + 1000
    params['BATCH_SIZE'] = 128
    params['EARLY_STOP'] = True
    params['PATIENCE'] = 10
    params['SAVE_EACH_EVALUATION'] = True
    params['STORE_PATH'] = PATH + "model/"
    params['ATTENTION_MODE'] = "add"
    params['N_LAYERS_ENCODER'] = 2
    params['N_LAYERS_DECODER'] = 2
    params['SOURCE_TEXT_EMBEDDING_SIZE'] = 512
    params['TARGET_TEXT_EMBEDDING_SIZE'] = 512
    params['SKIP_VECTORS_HIDDEN_SIZE'] = 512
    params['ATTENTION_SIZE'] = 512
    params['ENCODER_HIDDEN_SIZE'] = 512
    params['DECODER_HIDDEN_SIZE'] = 512
    params['ENCODER_RNN_TYPE'] = "LSTM"
    params['DECODER_RNN_TYPE'] = "ConditionalLSTM"
    params['METRICS'] = ['coco']
    params['KERAS_METRICS'] = ['perplexity']
    params['APPLY_DETOKENIZATION'] = True
    params['LENGTH_PENALTY'] = True
    params['LENGTH_NORM_FACTOR'] = 1.0
    params['RELOAD'] = latest_epoch
    params['BEAM_SIZE'] = 1
    params['BEAM_SEARCH'] = True
    params['PLOT_EVALUATION'] = True
    params['MAX_PLOT_Y'] = 1.
    params['MODE'] = 'training'
    params['TENSORBOARD'] = True

    # The original called "RESUME TRAINING".format(mode), which referenced an
    # undefined name and was a no-op on a brace-less string.
    result = pyfiglet.figlet_format("RESUME TRAINING", font="digital")
    print(result)

    train_model(params, load_dataset=os.getcwd() + "/dataset/Dataset_tutorial_dataset.pkl")
def test_build(self):
    params = load_parameters()
    params['DATASET_STORE_PATH'] = './'
    params['REBUILD_DATASET'] = True
    dataset = build_dataset(params)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
    for encoder_rnn_type in ['LSTM', 'GRU']:
        for decoder_rnn_type in ['LSTM', 'GRU', 'ConditionalLSTM', 'ConditionalGRU']:
            params['ENCODER_RNN_TYPE'] = encoder_rnn_type
            params['DECODER_RNN_TYPE'] = decoder_rnn_type
            for n_layers in range(2):
                params['N_LAYERS_DECODER'] = n_layers
                params['N_LAYERS_ENCODER'] = n_layers
                nmt_model = TranslationModel(params,
                                             model_type=params['MODEL_TYPE'],
                                             verbose=params['VERBOSE'],
                                             model_name=params['MODEL_NAME'],
                                             vocabularies=dataset.vocabulary,
                                             store_path=params['STORE_PATH'],
                                             clear_dirs=False)
                self.assertIsInstance(nmt_model, Model_Wrapper)

                # Check inputs
                inputMapping = dict()
                for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
                    pos_source = dataset.ids_inputs.index(id_in)
                    id_dest = nmt_model.ids_inputs[i]
                    inputMapping[id_dest] = pos_source
                nmt_model.setInputsMapping(inputMapping)

                outputMapping = dict()
                for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
                    pos_target = dataset.ids_outputs.index(id_out)
                    id_dest = nmt_model.ids_outputs[i]
                    outputMapping[id_dest] = pos_target
                nmt_model.setOutputsMapping(outputMapping)
    return True
def resume_training(latest_epoch):
    # `use_gpu` and `MODEL_PATH` are assumed to be module-level globals.
    params = load_parameters()
    params['RELOAD'] = latest_epoch
    params['MODEL_TYPE'] = 'Transformer'
    params['USE_CUDNN'] = use_gpu
    params['EARLY_STOP'] = True
    params['PATIENCE'] = 10
    params['SAVE_EACH_EVALUATION'] = True
    params['STORE_PATH'] = MODEL_PATH
    params['N_LAYERS_ENCODER'] = 2
    params['N_LAYERS_DECODER'] = 2
    params['N_HEADS'] = 100
    params['POS_UNK'] = False  # current Transformer model requires this
    params['ATTEND_ON_OUTPUT'] = True  # current Transformer model requires this
    params['MODEL_SIZE'] = 100
    params['SOURCE_TEXT_EMBEDDING_SIZE'] = 100
    params['TARGET_TEXT_EMBEDDING_SIZE'] = 100
    params['SKIP_VECTORS_HIDDEN_SIZE'] = 100
    params['ENCODER_HIDDEN_SIZE'] = 100
    params['DECODER_HIDDEN_SIZE'] = 100
    params['APPLY_DETOKENIZATION'] = True
    params['LENGTH_PENALTY'] = True
    params['LENGTH_NORM_FACTOR'] = 0.8
    params['MAX_INPUT_TEXT_LEN'] = 128
    params['MAX_OUTPUT_TEXT_LEN'] = 128
    params['STOP_METRIC'] = 'perplexity'
    params['BEAM_SIZE'] = 20
    params['N_GPUS'] = 2
    params['START_EVAL_ON_EPOCH'] = 1
    params['BATCH_SIZE'] = 128
    params['EVAL_EACH'] = 1
    params['MAX_EPOCH'] = 100
    params['PLOT_EVALULATION'] = True
    params['MODE'] = 'training'
    params['BEAM_SEARCH'] = True
    params['TENSORBOARD'] = True
    train_model(params, load_dataset=MODEL_PATH + "/dataset/Dataset_tutorial_dataset.pkl")
def _init_parameters_with_DKGE(self):
    entity_emb, relation_emb, entity_context, relation_context, \
        entity_gcn_weight, relation_gcn_weight, gate_entity, gate_relation, \
        v_entity, v_relation = config.load_parameters(200)
    entity_mapping_dict, relation_mapping_dict = config.construct_snapshots_mapping_dict()[4:]
    self.entity_gcn_weight.data = entity_gcn_weight
    self.relation_gcn_weight.data = relation_gcn_weight
    self.gate_entity.data = gate_entity
    self.gate_relation.data = gate_relation
    self.v_ent.data = v_entity
    self.v_rel.data = v_relation
    for id1, id2 in entity_mapping_dict.items():
        self.entity_emb.data[id2] = entity_emb[id1]
        self.entity_context.weight.data[id2] = entity_context[id1]
    for id1, id2 in relation_mapping_dict.items():
        self.relation_emb.data[id2] = relation_emb[id1]
        self.relation_context.weight.data[id2] = relation_context[id1]
def load_tests_params():
    params = load_parameters()
    params['BATCH_SIZE'] = 10
    params['WEIGHT_DECAY'] = 1e-4
    params['RECURRENT_WEIGHT_DECAY'] = 1e-4
    params['DROPOUT_P'] = 0.01
    params['RECURRENT_INPUT_DROPOUT_P'] = 0.01
    params['RECURRENT_DROPOUT_P'] = 0.01
    params['USE_NOISE'] = True
    params['NOISE_AMOUNT'] = 0.01
    params['USE_BATCH_NORMALIZATION'] = True
    params['BATCH_NORMALIZATION_MODE'] = 1
    params['SOURCE_TEXT_EMBEDDING_SIZE'] = 8
    params['TARGET_TEXT_EMBEDDING_SIZE'] = 8
    params['DECODER_HIDDEN_SIZE'] = 4
    params['ENCODER_HIDDEN_SIZE'] = 4
    params['ATTENTION_SIZE'] = params['DECODER_HIDDEN_SIZE']
    params['SKIP_VECTORS_HIDDEN_SIZE'] = params['DECODER_HIDDEN_SIZE']
    params['DOUBLE_STOCHASTIC_ATTENTION_REG'] = 0.7
    params['RELOAD'] = 0
    params['MAX_EPOCH'] = 2
    return params
def resume_training(latest_epoch):
    params = load_parameters()
    params['RELOAD'] = latest_epoch
    params['MODEL_TYPE'] = 'AttentionRNNEncoderDecoder'
    params['USE_CUDNN'] = use_gpu
    params['EARLY_STOP'] = True
    params['PATIENCE'] = 10
    params['SAVE_EACH_EVALUATION'] = True
    params['STORE_PATH'] = MODEL_PATH
    params['SOURCE_TEXT_EMBEDDING_SIZE'] = 32
    params['TARGET_TEXT_EMBEDDING_SIZE'] = 32
    params['SKIP_VECTORS_HIDDEN_SIZE'] = 32
    params['ATTENTION_SIZE'] = 32
    params['ENCODER_HIDDEN_SIZE'] = 32
    params['DECODER_HIDDEN_SIZE'] = 32
    params['N_LAYERS_ENCODER'] = 4
    params['N_LAYERS_DECODER'] = 4
    params['APPLY_DETOKENIZATION'] = True
    params['MAX_INPUT_TEXT_LEN'] = 24
    params['MAX_OUTPUT_TEXT_LEN'] = 24
    params['STOP_METRIC'] = 'perplexity'
    params['POS_UNK'] = True
    params['BEAM_SIZE'] = 20
    params['N_GPUS'] = 2
    params['START_EVAL_ON_EPOCH'] = 1
    params['BATCH_SIZE'] = 256
    params['EVAL_EACH'] = 1
    params['MAX_EPOCH'] = 300
    params['PLOT_EVALULATION'] = True
    params['MODE'] = 'training'
    params['BEAM_SEARCH'] = True
    params['TENSORBOARD'] = True
    params['LR'] = 0.1
    train_model(params, load_dataset=MODEL_PATH + "/dataset/Dataset_tutorial_dataset.pkl")
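# Hedged driver sketch (not part of the snippets above): both resume_training
# variants rely on module-level globals. Something like the following is assumed
# to precede them; the names exist in the snippets, but the values here are
# illustrative only.
import os

use_gpu = True
MODEL_PATH = os.getcwd() + "/model"

if __name__ == '__main__':
    latest_epoch = 10  # e.g., the last checkpoint found under MODEL_PATH
    resume_training(latest_epoch)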
def test_CocoScore():
    params = load_parameters()
    filename = params['DATA_ROOT_PATH'] + params['TEXT_FILES']['val'] + params['TRG_LAN']
    hyp = codecs.open(filename, 'r', encoding='utf-8')
    refs = [codecs.open(filename, 'r', encoding='utf-8').readlines()]
    refs, hypo = load_textfiles(refs, hyp)

    final_scores = CocoScore(refs, hypo, metrics_list=None, language=params['TRG_LAN'])
    assert isinstance(final_scores, dict)
    assert 'Bleu_1' in list(final_scores)
    assert 'Bleu_2' in list(final_scores)
    assert 'Bleu_3' in list(final_scores)
    assert 'Bleu_4' in list(final_scores)
    assert 'TER' in list(final_scores)
    assert 'METEOR' in list(final_scores)
    assert 'ROUGE_L' in list(final_scores)
    assert 'CIDEr' in list(final_scores)
    # abs() added: the original one-sided checks (score - x <= eps) passed for
    # any score below the expected value, matching assertAlmostEqual's intent.
    assert abs(final_scores['Bleu_1'] - 1.0) <= 1e-6
    assert abs(final_scores['Bleu_2'] - 1.0) <= 1e-6
    assert abs(final_scores['Bleu_3'] - 1.0) <= 1e-6
    assert abs(final_scores['Bleu_4'] - 1.0) <= 1e-6
    assert abs(final_scores['TER'] - 0.0) <= 1e-6
    assert abs(final_scores['METEOR'] - 1.0) <= 1e-6
    assert abs(final_scores['ROUGE_L'] - 1.0) <= 1e-6
    assert abs(final_scores['CIDEr'] - 10.0) <= 1e-1

    final_scores = CocoScore(refs, hypo, metrics_list=['BLeu'], language=params['TRG_LAN'])
    assert isinstance(final_scores, dict)
    assert 'Bleu_1' in list(final_scores)
    assert 'Bleu_2' in list(final_scores)
    assert 'Bleu_3' in list(final_scores)
    assert 'Bleu_4' in list(final_scores)
    assert 'TER' not in list(final_scores)
    assert 'METEOR' not in list(final_scores)
    assert 'ROUGE_L' not in list(final_scores)
    assert 'CIDEr' not in list(final_scores)
    assert abs(final_scores['Bleu_1'] - 1.0) <= 1e-6
    assert abs(final_scores['Bleu_2'] - 1.0) <= 1e-6
    assert abs(final_scores['Bleu_3'] - 1.0) <= 1e-6
    assert abs(final_scores['Bleu_4'] - 1.0) <= 1e-6

    final_scores = CocoScore(refs, hypo, metrics_list=['BLEU', 'ter'], language=params['TRG_LAN'])
    assert isinstance(final_scores, dict)
    assert 'Bleu_1' in list(final_scores)
    assert 'Bleu_2' in list(final_scores)
    assert 'Bleu_3' in list(final_scores)
    assert 'Bleu_4' in list(final_scores)
    assert 'TER' in list(final_scores)
    assert 'METEOR' not in list(final_scores)
    assert 'ROUGE_L' not in list(final_scores)
    assert 'CIDEr' not in list(final_scores)
    assert abs(final_scores['Bleu_1'] - 1.0) <= 1e-6
    assert abs(final_scores['Bleu_2'] - 1.0) <= 1e-6
    assert abs(final_scores['Bleu_3'] - 1.0) <= 1e-6
    assert abs(final_scores['Bleu_4'] - 1.0) <= 1e-6
    assert abs(final_scores['TER'] - 0.0) <= 1e-6

    hyp = codecs.open(filename, 'r', encoding='utf-8')
    refs = [codecs.open(filename, 'r', encoding='utf-8').readlines(),
            codecs.open(filename, 'r', encoding='utf-8').readlines(),
            codecs.open(filename, 'r', encoding='utf-8').readlines()]
    refs, hypo = load_textfiles(refs, hyp)
    final_scores = CocoScore(refs, hypo, metrics_list=None, language=params['TRG_LAN'])
    assert isinstance(final_scores, dict)
    assert 'Bleu_1' in list(final_scores)
    assert 'Bleu_2' in list(final_scores)
    assert 'Bleu_3' in list(final_scores)
    assert 'Bleu_4' in list(final_scores)
    assert 'TER' in list(final_scores)
    assert 'METEOR' in list(final_scores)
    assert 'ROUGE_L' in list(final_scores)
    assert 'CIDEr' in list(final_scores)
    assert abs(final_scores['Bleu_1'] - 1.0) <= 1e-6
    assert abs(final_scores['Bleu_2'] - 1.0) <= 1e-6
    assert abs(final_scores['Bleu_3'] - 1.0) <= 1e-6
    assert abs(final_scores['Bleu_4'] - 1.0) <= 1e-6
    assert abs(final_scores['TER'] - 0.0) <= 1e-6
    assert abs(final_scores['METEOR'] - 1.0) <= 1e-6
    assert abs(final_scores['ROUGE_L'] - 1.0) <= 1e-6
    assert abs(final_scores['CIDEr'] - 10.0) <= 1e-1
def main():
    args = parse_args()
    server_address = (args.address, args.port)
    httpd = HTTPServer(server_address, NMTHandler)
    logger.setLevel(args.logging_level)
    parameters = load_parameters()
    if args.config is not None:
        logger.info("Loading parameters from %s" % str(args.config))
        parameters = update_parameters(parameters, pkl2dict(args.config))
    if args.online:
        online_parameters = load_parameters_online()
        parameters = update_parameters(parameters, online_parameters)
    try:
        for arg in args.changes:
            try:
                k, v = arg.split('=')
            except ValueError:
                print('Overwritten arguments must have the form key=Value. \n Currently are: %s' % str(args.changes))
                exit(1)
            try:
                parameters[k] = ast.literal_eval(v)
            except ValueError:
                parameters[k] = v
    except ValueError:
        print('Error processing arguments: (', k, ",", v, ")")
        exit(2)

    dataset = loadDataset(args.dataset)

    # For converting predictions into sentences
    # Dataset backwards compatibility
    bpe_separator = dataset.BPE_separator if hasattr(dataset, "BPE_separator") and dataset.BPE_separator is not None else '@@'
    # Build BPE tokenizer if necessary
    if 'bpe' in parameters['TOKENIZATION_METHOD'].lower():
        logger.info('Building BPE')
        if not dataset.BPE_built:
            dataset.build_bpe(parameters.get('BPE_CODES_PATH', parameters['DATA_ROOT_PATH'] + '/training_codes.joint'),
                              separator=bpe_separator)
    # Build tokenization function
    tokenize_f = eval('dataset.' + parameters.get('TOKENIZATION_METHOD', 'tokenize_bpe'))
    detokenize_function = eval('dataset.' + parameters.get('DETOKENIZATION_METHOD', 'detokenize_bpe'))
    dataset.build_moses_tokenizer(language=parameters['SRC_LAN'])
    dataset.build_moses_detokenizer(language=parameters['TRG_LAN'])
    tokenize_general = dataset.tokenize_moses
    detokenize_general = dataset.detokenize_moses

    # Prediction parameters
    params_prediction = dict()
    params_prediction['max_batch_size'] = parameters.get('BATCH_SIZE', 20)
    params_prediction['n_parallel_loaders'] = parameters.get('PARALLEL_LOADERS', 1)
    params_prediction['beam_size'] = parameters.get('BEAM_SIZE', 6)
    params_prediction['maxlen'] = parameters.get('MAX_OUTPUT_TEXT_LEN_TEST', 100)
    params_prediction['optimized_search'] = parameters['OPTIMIZED_SEARCH']
    params_prediction['model_inputs'] = parameters['INPUTS_IDS_MODEL']
    params_prediction['model_outputs'] = parameters['OUTPUTS_IDS_MODEL']
    params_prediction['dataset_inputs'] = parameters['INPUTS_IDS_DATASET']
    params_prediction['dataset_outputs'] = parameters['OUTPUTS_IDS_DATASET']
    params_prediction['search_pruning'] = parameters.get('SEARCH_PRUNING', False)
    params_prediction['normalize_probs'] = True
    params_prediction['alpha_factor'] = parameters.get('ALPHA_FACTOR', 1.0)
    params_prediction['coverage_penalty'] = True
    params_prediction['length_penalty'] = True
    params_prediction['length_norm_factor'] = parameters.get('LENGTH_NORM_FACTOR', 0.0)
    params_prediction['coverage_norm_factor'] = parameters.get('COVERAGE_NORM_FACTOR', 0.0)
    params_prediction['pos_unk'] = parameters.get('POS_UNK', False)
    params_prediction['heuristic'] = parameters.get('HEURISTIC', 0)
    params_prediction['state_below_index'] = -1
    params_prediction['output_text_index'] = 0
    params_prediction['state_below_maxlen'] = -1 if parameters.get('PAD_ON_BATCH', True) else parameters.get('MAX_OUTPUT_TEXT_LEN', 50)
    params_prediction['output_max_length_depending_on_x'] = parameters.get('MAXLEN_GIVEN_X', True)
    params_prediction['output_max_length_depending_on_x_factor'] = parameters.get('MAXLEN_GIVEN_X_FACTOR', 3)
    params_prediction['output_min_length_depending_on_x'] = parameters.get('MINLEN_GIVEN_X', True)
    params_prediction['output_min_length_depending_on_x_factor'] = parameters.get('MINLEN_GIVEN_X_FACTOR', 2)
    params_prediction['attend_on_output'] = parameters.get('ATTEND_ON_OUTPUT', 'transformer' in parameters['MODEL_TYPE'].lower())

    # Manage pos_unk strategies
    if parameters['POS_UNK']:
        mapping = None if dataset.mapping == dict() else dataset.mapping
    else:
        mapping = None
    if 'transformer' in parameters['MODEL_TYPE'].lower():
        params_prediction['pos_unk'] = False
        params_prediction['coverage_penalty'] = False

    # Training parameters
    parameters_training = dict()
    if args.online:
        logger.info('Loading models from %s' % str(args.models))
        parameters_training = {  # Training parameters
            'n_epochs': parameters['MAX_EPOCH'],
            'shuffle': False,
            'loss': parameters.get('LOSS', 'categorical_crossentropy'),
            'batch_size': parameters.get('BATCH_SIZE', 1),
            'homogeneous_batches': False,
            'optimizer': parameters.get('OPTIMIZER', 'SGD'),
            'lr': parameters.get('LR', 0.1),
            'lr_decay': parameters.get('LR_DECAY', None),
            'lr_gamma': parameters.get('LR_GAMMA', 1.),
            'epochs_for_save': -1,
            'verbose': args.verbose,
            'eval_on_sets': parameters.get('EVAL_ON_SETS_KERAS', None),
            'n_parallel_loaders': parameters['PARALLEL_LOADERS'],
            'extra_callbacks': [],  # callbacks,
            'reload_epoch': parameters['RELOAD'],
            'epoch_offset': parameters['RELOAD'],
            'data_augmentation': parameters['DATA_AUGMENTATION'],
            'patience': parameters.get('PATIENCE', 0),
            'metric_check': parameters.get('STOP_METRIC', None),
            'eval_on_epochs': parameters.get('EVAL_EACH_EPOCHS', True),
            'each_n_epochs': parameters.get('EVAL_EACH', 1),
            'start_eval_on_epoch': parameters.get('START_EVAL_ON_EPOCH', 0),
            'additional_training_settings': {'k': parameters.get('K', 1),
                                             'tau': parameters.get('TAU', 1),
                                             'lambda': parameters.get('LAMBDA', 0.5),
                                             'c': parameters.get('C', 0.5),
                                             'd': parameters.get('D', 0.5)}
        }
        model_instances = [TranslationModel(parameters,
                                            model_type=parameters['MODEL_TYPE'],
                                            verbose=parameters['VERBOSE'],
                                            model_name=parameters['MODEL_NAME'] + '_' + str(i),
                                            vocabularies=dataset.vocabulary,
                                            store_path=parameters['STORE_PATH'],
                                            set_optimizer=False)
                           for i in range(len(args.models))]
        models = [updateModel(model, path, -1, full_path=True)
                  for (model, path) in zip(model_instances, args.models)]
    else:
        models = [loadModel(m, -1, full_path=True) for m in args.models]

    for nmt_model in models:
        nmt_model.setParams(parameters)
        nmt_model.setOptimizer()

    parameters['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[parameters['INPUTS_IDS_DATASET'][0]]
    parameters['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[parameters['OUTPUTS_IDS_DATASET'][0]]

    # Get word2index and index2word dictionaries
    index2word_y = dataset.vocabulary[parameters['OUTPUTS_IDS_DATASET'][0]]['idx2words']
    word2index_y = dataset.vocabulary[parameters['OUTPUTS_IDS_DATASET'][0]]['words2idx']
    index2word_x = dataset.vocabulary[parameters['INPUTS_IDS_DATASET'][0]]['idx2words']
    word2index_x = dataset.vocabulary[parameters['INPUTS_IDS_DATASET'][0]]['words2idx']

    excluded_words = None
    interactive_beam_searcher = NMTSampler(models,
                                           dataset,
                                           parameters,
                                           params_prediction,
                                           parameters_training,
                                           tokenize_f,
                                           detokenize_function,
                                           tokenize_general,
                                           detokenize_general,
                                           mapping=mapping,
                                           word2index_x=word2index_x,
                                           word2index_y=word2index_y,
                                           index2word_y=index2word_y,
                                           eos_symbol=args.eos_symbol,
                                           excluded_words=excluded_words,
                                           online=args.online,
                                           verbose=args.verbose)

    httpd.sampler = interactive_beam_searcher
    logger.info('Server starting at %s' % str(server_address))
    httpd.serve_forever()
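# Hedged usage sketch: the argument names below are inferred from the attribute
# accesses in main() (args.address, args.port, args.dataset, args.models, ...);
# the exact flag spellings depend on parse_args() and are an assumption.
#
#     python sample_server.py --address 127.0.0.1 --port 8080 \
#         --dataset datasets/Dataset_EuTrans_esen.pkl \
#         --models trained_models/EuTrans/epoch_4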
import sys
sys.path.append('../nmt-keras')
sys.path.append('../nmt-keras/nmt_keras')

import utils
from config import load_parameters
from data_engine.prepare_data import keep_n_captions
from keras_wrapper.cnn_model import loadModel
from keras_wrapper.dataset import loadDataset
from keras_wrapper.utils import decode_predictions_beam_search
from model_zoo import TranslationModel

params = load_parameters()
dataset = loadDataset('query_to_reply/Dataset_Cornell_base.pkl')
dataset.setInput('data/Ross_test.query',
                 'test',
                 type='text',
                 id='source_text',
                 pad_on_batch=True,
                 tokenization='tokenize_basic',
                 fill='end',
                 max_text_len=100,
                 min_occ=0)
dataset.setInput(None,
                 'test',
                 type='ghost',
                 id='state_below',
                 required=False)

## get model predictions
params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
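# Hedged continuation sketch: the snippet above stops right after setting the
# input vocabulary size. In the nmt-keras tutorials the prediction step usually
# proceeds roughly as follows; the model path/epoch and the beam-search options
# here are assumptions, not part of the original snippet.
params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
nmt_model = loadModel('query_to_reply', 10)  # hypothetical store path and epoch

params_prediction = {
    'max_batch_size': 50,
    'predict_on_sets': ['test'],
    'beam_size': 12,
    'maxlen': 50,
    'model_inputs': ['source_text', 'state_below'],
    'model_outputs': ['target_text'],
    'dataset_inputs': ['source_text', 'state_below'],
    'dataset_outputs': ['target_text'],
}
predictions = nmt_model.predictBeamSearchNet(dataset, params_prediction)['test']
vocab = dataset.vocabulary['target_text']['idx2words']
predictions = decode_predictions_beam_search(predictions, vocab, verbose=params['VERBOSE'])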
def test_train():
    params = load_parameters()
    params['REBUILD_DATASET'] = True
    params['DATASET_STORE_PATH'] = './'
    dataset = build_dataset(params)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
    params['SOURCE_TEXT_EMBEDDING_SIZE'] = 2
    params['TARGET_TEXT_EMBEDDING_SIZE'] = 2
    params['ENCODER_HIDDEN_SIZE'] = 2
    params['DECODER_HIDDEN_SIZE'] = 2
    params['ATTENTION_SIZE'] = 2
    params['SKIP_VECTORS_HIDDEN_SIZE'] = 2
    params['DEEP_OUTPUT_LAYERS'] = [('linear', 2)]
    params['STORE_PATH'] = './'
    nmt_model = TranslationModel(params,
                                 model_type=params['MODEL_TYPE'],
                                 verbose=params['VERBOSE'],
                                 model_name=params['MODEL_NAME'],
                                 vocabularies=dataset.vocabulary,
                                 store_path=params['STORE_PATH'],
                                 clear_dirs=False)

    # Check inputs
    inputMapping = dict()
    for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
        pos_source = dataset.ids_inputs.index(id_in)
        id_dest = nmt_model.ids_inputs[i]
        inputMapping[id_dest] = pos_source
    nmt_model.setInputsMapping(inputMapping)

    outputMapping = dict()
    for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
        pos_target = dataset.ids_outputs.index(id_out)
        id_dest = nmt_model.ids_outputs[i]
        outputMapping[id_dest] = pos_target
    nmt_model.setOutputsMapping(outputMapping)

    callbacks = buildCallbacks(params, nmt_model, dataset)
    training_params = {'n_epochs': 1,
                       'batch_size': 50,
                       'homogeneous_batches': False,
                       'maxlen': 10,
                       'joint_batches': params['JOINT_BATCHES'],
                       'lr_decay': params['LR_DECAY'],
                       'lr_gamma': params['LR_GAMMA'],
                       'epochs_for_save': 1,
                       'verbose': params['VERBOSE'],
                       'eval_on_sets': params['EVAL_ON_SETS_KERAS'],
                       'n_parallel_loaders': params['PARALLEL_LOADERS'],
                       'extra_callbacks': callbacks,
                       'reload_epoch': 0,
                       'epoch_offset': 0,
                       'data_augmentation': False,
                       'patience': 1,  # early stopping parameters
                       'metric_check': 'Bleu_4',
                       'eval_on_epochs': True,
                       'each_n_epochs': 1,
                       'start_eval_on_epoch': 0}
    nmt_model.trainNet(dataset, training_params)
    return True
def check_params(params):
    if 'Glove' in params['MODEL_TYPE'] and params['GLOVE_VECTORS'] is None:
        logger.warning("You set a model that uses pretrained word vectors but you didn't specify a vector file. "
                       "We'll train WITHOUT pretrained embeddings!")
    if params["USE_DROPOUT"] and params["USE_BATCH_NORMALIZATION"]:
        logger.warning("It's not recommended to use both dropout and batch normalization")
    if params['MODE'] == 'sampling':
        assert len(params["EVAL_ON_SETS"]) == 1, 'It is only possible to sample over 1 set'
    if 'Bidirectional' in params["MODEL_TYPE"]:
        assert params["LSTM_ENCODER_HIDDEN_SIZE"] * 2 == params["IMG_EMBEDDING_HIDDEN_SIZE"], \
            "LSTM_ENCODER_HIDDEN_SIZE must be IMG_EMBEDDING_HIDDEN_SIZE/2"


if __name__ == "__main__":
    params = load_parameters()
    check_params(params)
    if params['MODE'] == 'training':
        logging.info('Running training.')
        main(params)
    elif params['MODE'] == 'sampling':
        logging.info('Running sampling.')
        apply_VQA_model(params)
    logging.info('Done!')