    indexes = open("../wang/data/" + mode + "/dev.filtered", 'r').read().strip('\n').split('\n')
    ign_indexes_val = [int(x) - 1 for x in indexes]
    indexes = open("../wang/data/" + mode + "/test.filtered", 'r').read().strip('\n').split('\n')
    ign_indexes_test = [int(x) - 1 for x in indexes]
else:
    ign_indexes_train, ign_indexes_val, ign_indexes_test = [], [], []

print("Reading vocabulary...")
vocab_dict, word_embedding_array = DataProcessor().prepare_vocab_embeddingdict()

print("Reading training set...")
train_data = DataProcessor().prepare_news_data(vocab_dict, data_type="training")  # subsampled
train_batch = batch_load_data(train_data)  # data in whole batch with padded matrices

print("Reading validation set...")
val_batch = batch_load_data(DataProcessor().prepare_news_data(
    vocab_dict, data_type="validation",
    normalizer=train_data.normalizer, pca_model=train_data.pca_model))

print("Reading test set...")
test_batch = batch_load_data(DataProcessor().prepare_news_data(
    vocab_dict, data_type="test",
    normalizer=train_data.normalizer, pca_model=train_data.pca_model))

del train_data
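# A minimal sketch (an assumption, not taken from the original script) of how the
# ign_indexes_* lists are typically consumed: dropping those positions from a
# batch's outputs before scoring the filtered dev/test subsets.
# keep_filtered is a hypothetical helper name.
def keep_filtered(items, ign_indexes):
    ignored = set(ign_indexes)
    return [item for i, item in enumerate(items) if i not in ignored]

# e.g. filtered_val_preds = keep_filtered(val_predictions, ign_indexes_val)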
FLAGS.mlp_size = 100
FLAGS.use_dropout = False
FLAGS.use_dropout_outatt = False
FLAGS.dropout = 0.8
prev_drpt = FLAGS.use_dropout

# set sentence, doc length to maximum
output = open("tunning_" + FLAGS.data_mode + "/" + FLAGS.data_mode +
              "_hp_grid_tuning_%s.txt" % args.file_suffix, 'w')

vocab_dict, word_embedding_array = DataProcessor().prepare_vocab_embeddingdict()
train_data = DataProcessor().prepare_news_data(vocab_dict, data_type="training")  # subsampled
# data in whole batch with padded matrices
val_batch = batch_load_data(DataProcessor().prepare_news_data(vocab_dict, data_type="validation"))

setup_by_id = {}
results_by_id = {}
results_by_id_mrr = {}
setup_id = 0
best_global_acc = -1
best_global_mrr = -1
best_setup_id = -1
best_setup_id_mrr = -1

parameter_grid = {
    "batch_size": [64],
    "learning_rate": [0.0001],
    "mlp_size": [100],
    "sentembed_size": [348],
# dir ubuntu
FLAGS.raw_data_dir = "/home/usuario/datasets"

if args.dataset == 'eus':
    FLAGS.max_audio_length = 680  # obtained from sequence lengths histogram
    FLAGS.max_freq_length = 201
elif args.dataset == 'quz':
    FLAGS.max_audio_length = 100  # TBD
    FLAGS.max_freq_length = 100

# set sentence, doc length to maximum
output = open("tunning_" + FLAGS.data_mode + "/" + FLAGS.data_mode +
              "_hp_grid_tuning_%s.txt" % args.file_suffix, 'w')

vocab_dict, inverted_vocab = get_vocab()
train_data = DataProcessor(vocab_dict, inverted_vocab, data_type="train")
val_batch = batch_load_data(DataProcessor(vocab_dict, inverted_vocab, data_type="val"))

setup_by_id = {}
results_by_id_wer = {}
results_by_id_cer = {}
setup_id = 0
best_global_wer = 200
best_global_cer = 200
best_setup_id_wer = -1
best_setup_id_cer = -1

## FLAGS.___ = ___  # set as constant so it doesn't clutter output
# FLAGS.use_conv2d = True
# FLAGS.use_dropout = False
# FLAGS.birnn = False
# FLAGS.lookahead_conv = False
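# Sketch of the grid-search driver this setup feeds (an assumption about how the
# script continues, not copied from it: the parameter_grid values are illustrative
# and train_and_eval is a hypothetical helper standing in for one training run).
import itertools

parameter_grid = {
    "batch_size": [32],
    "learning_rate": [0.0001],
    "size": [600],
}
param_names = sorted(parameter_grid)
for values in itertools.product(*(parameter_grid[name] for name in param_names)):
    assignment = dict(zip(param_names, values))
    setup_by_id[setup_id] = assignment

    wer, cer = train_and_eval(assignment, train_data, val_batch)  # hypothetical helper
    results_by_id_wer[setup_id] = wer
    results_by_id_cer[setup_id] = cer

    # Both metrics are error rates, so lower is better; hence the 200 initial values above.
    if wer < best_global_wer:
        best_global_wer, best_setup_id_wer = wer, setup_id
    if cer < best_global_cer:
        best_global_cer, best_setup_id_cer = cer, setup_id

    output.write("%d\t%s\tWER=%.2f\tCER=%.2f\n" % (setup_id, str(assignment), wer, cer))
    output.flush()
    setup_id += 1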
FLAGS.batch_size = 20
FLAGS.learning_rate = 0.0001
FLAGS.size = 600
FLAGS.max_gradient_norm = 10
FLAGS.sentembed_size = 350

vocab_dict, word_embedding_array = DataProcessor().prepare_vocab_embeddingdict()
train_data = DataProcessor().prepare_news_data(vocab_dict, data_type="training")  # subsampled

# data without padding
validation_data = DataProcessor().prepare_news_data(vocab_dict, data_type="validation")  # complete
# data in whole batch with padded matrices
val_batch = batch_load_data(validation_data)
# we don't need this anymore
del validation_data

###############################################################################################

assignments = {
    'batch_size': 64,
    'log_learning_rate': -6.907755278982137,  # ln(1e-3)
    'size': 2500,
    'sentembed_size': 1024
}
value = evaluate_model(assignments, train_data, val_batch, metric)

conn = Connection(client_token="<your_API_token>")
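# Connection here is SigOpt's Python client (`from sigopt import Connection`).
# Below is a minimal sketch of the usual suggest/observe loop that such a setup
# feeds, under assumptions not in the original: the experiment name, parameter
# bounds, and observation budget are illustrative, while evaluate_model, metric,
# train_data and val_batch are the objects defined earlier in this script.
experiment = conn.experiments().create(
    name="hp_search",  # illustrative name
    parameters=[
        dict(name="batch_size", type="int", bounds=dict(min=16, max=128)),
        dict(name="log_learning_rate", type="double", bounds=dict(min=-9.2, max=-4.6)),
        dict(name="size", type="int", bounds=dict(min=200, max=2500)),
        dict(name="sentembed_size", type="int", bounds=dict(min=128, max=1024)),
    ],
)

for _ in range(20):  # illustrative observation budget
    suggestion = conn.experiments(experiment.id).suggestions().create()
    value = evaluate_model(suggestion.assignments, train_data, val_batch, metric)
    conn.experiments(experiment.id).observations().create(
        suggestion=suggestion.id,
        value=value,
    )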