    'rnn_dropout_prob', 'max_gradient_norm', 'minibatch_size',
    'beam_width', 'geomeanpplx', 'duration',
    sep='\t')

# Load the caption corpus and cap the training set at the caption-generation set size
# (fixed shuffle seed so the same subset is reused every time).
datasources = data.load_datasources(corpus)
datasources['train'] = datasources['train'].shuffle(0).take(capgen_size)
vocab = datasources['train'].tokenize_sents().text_sents.get_vocab(config.min_token_freq)

dataset = data.Dataset(
    vocab=vocab,
    train_datasource=datasources['train'],
    val_datasource=datasources['val'],
    test_datasource=capgen_test,
)
dataset.compile_sents()
test_index_sents = dataset.test.index_sents

# Start a new hyperparameter search log with a '#'-prefixed, tab-separated header row
# if one does not exist yet.
if not lib.file_exists(config.hyperpar_dir + '/langmodtrans/' + corpus + '/1_search.txt'):
    with open(config.hyperpar_dir + '/langmodtrans/' + corpus + '/1_search.txt', 'w', encoding='utf-8') as f:
        print('#', 'init_method', 'max_init_weight',
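# Illustrative sketch (not part of the experiment code): the search log written above is
# a plain tab-separated file whose first row is a '#'-prefixed header, so it can be read
# back with the standard csv module. The path here is a placeholder; the real file lives
# under config.hyperpar_dir + '/langmodtrans/<corpus>/'.
import csv

with open('1_search.txt', encoding='utf-8') as f:
    rows = list(csv.reader(f, delimiter='\t'))
header = rows[0]    # '#' followed by the column names
results = rows[1:]  # one evaluated hyperparameter combination per row (assumed layout)
print(len(results), 'combinations logged so far')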
corpus_size = round(10**corpus_size_factor_exponent * capgen_size)

full_timer = lib.Timer()

# Sample corpus_size text-only training sentences, seeded by the run number.
datasources = data.load_datasources(corpus)
datasources['train'] = datasources['train'].without_images().shuffle(run).take(corpus_size)

# Vocabulary coverage: how much of the caption test vocabulary is shared with the
# language-model vocabulary, and how many types fall outside it.
langmod_vocab = datasources['train'].tokenize_sents().text_sents.get_vocab(config.min_token_freq)
capgen_vocab = capgen_test.tokenize_sents().text_sents.get_vocab(config.min_token_freq).intersection(langmod_vocab)
capgen_full_vocab = capgen_test.tokenize_sents().text_sents.get_vocab()
capgen_num_out_of_vocab_tokens = capgen_full_vocab.size - capgen_vocab.size

dataset = data.Dataset(
    vocab=langmod_vocab,
    train_datasource=datasources['train'],
    val_datasource=datasources['val'],
)
dataset.compile_sents()

# Number of unknown-index tokens in each validation sentence.
capgen_num_unknowns_per_sent = np.sum(dataset.val.index_sents.targets == data.Vocab.UNKNOWN_INDEX, axis=1).tolist()

# Instantiate the neural model with the language-model hyperparameters.
with model_neural_trad.TradNeuralModel(
        vocab_size=langmod_vocab.size,
        init_method=langmod_init_method,
        max_init_weight=langmod_max_init_weight,
        embed_size=langmod_embed_size,
        rnn_size=langmod_rnn_size,
        post_image_size=langmod_post_image_size,
        pre_output_size=langmod_pre_output_size,
        post_image_activation=langmod_post_image_activation,
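# Illustrative sketch (made-up values, not the experiment's data): the unknown-token
# statistic above is a per-sentence count along axis 1 of the index matrix, with the
# assumed UNKNOWN_INDEX standing in for data.Vocab.UNKNOWN_INDEX.
import numpy as np

UNKNOWN_INDEX = 0
targets = np.array([
    [5, 0, 7, 2],  # one unknown token
    [3, 4, 0, 0],  # two unknown tokens
])
num_unknowns_per_sent = np.sum(targets == UNKNOWN_INDEX, axis=1).tolist()
print(num_unknowns_per_sent)  # [1, 2]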
    corpus_size_factor_exponent, run)
lib.create_dir(config.results_dir + '/langmodtrans/' + corpus + '/' + dir_name)

corpus_size = round(10**corpus_size_factor_exponent * capgen_size)

# Sample corpus_size text-only training sentences for this run and build the
# language-model vocabulary from them.
datasources = data.load_datasources(corpus)
datasources['train'] = datasources['train'].without_images().shuffle(run).take(corpus_size)
langmod_vocab = datasources['train'].tokenize_sents().text_sents.get_vocab(config.min_token_freq)

dataset = data.Dataset(
    vocab=langmod_vocab,
    train_datasource=datasources['train'],
    val_datasource=datasources['val'],
    test_datasource=capgen_test,
)
dataset.compile_sents()

# Keep one test caption per image group.
selected_test_sents = dataset.test.shuffle(run).take(one_per_group=True).tokenize_sents().compile_sents(langmod_vocab)
selected_index_sents = selected_test_sents.index_sents

# Record which training sentences ended up in this corpus sample.
with open(config.results_dir + '/langmodtrans/' + corpus + '/' + dir_name + '/1_corpus_indexes.txt', 'w', encoding='utf-8') as f:
    print(*dataset.train.individual_indexes, sep='\n', file=f)
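# Illustrative sketch (assumed capgen_size): the corpus_size formula above makes the
# language-model corpus a power-of-ten multiple of the caption-generation training set.
capgen_size = 30000
for corpus_size_factor_exponent in [-1.0, -0.5, 0.0, 0.5, 1.0]:
    corpus_size = round(10**corpus_size_factor_exponent * capgen_size)
    print(corpus_size_factor_exponent, corpus_size)
# -1.0 -> 3000, -0.5 -> 9487, 0.0 -> 30000, 0.5 -> 94868, 1.0 -> 300000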
    print()
    continue

full_timer = lib.Timer()

# One results directory per architecture / dataset / run combination.
dir_name = '{}_{}_{}'.format(architecture, dataset_name, run)
lib.create_dir(config.results_dir + '/whereimage/' + architecture + '/' + dir_name)

datasources = data.load_datasources(dataset_name)
vocab = datasources['train'].tokenize_sents().text_sents.get_vocab(config.min_token_freq)

dataset = data.Dataset(
    vocab=vocab,
    train_datasource=datasources['train'],
    val_datasource=datasources['val'],
    test_datasource=datasources['test'],
)
dataset.compile_sents()

# Keep one test caption per image group and record which ones were selected.
selected_test_sents = dataset.test.shuffle(run).take(one_per_group=True).tokenize_sents().compile_sents(vocab)
selected_index_sents = selected_test_sents.index_sents

with open(config.results_dir + '/whereimage/' + architecture + '/' + dir_name + '/selected_test.txt', 'w', encoding='utf-8') as f:
    print(*selected_test_sents.individual_indexes, sep='\n', file=f)
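# Illustrative sketch (hypothetical data, not the library's API): a seeded shuffle
# followed by take(one_per_group=True), as used above, amounts to picking a single
# caption per image group at random.
import random

rng = random.Random(0)  # assumed seed, analogous to shuffle(run)
captions_by_image = {
    'img_001': ['a dog runs on grass', 'a dog is running'],
    'img_002': ['two cats asleep', 'cats sleeping on a sofa'],
}
selected = {image: rng.choice(captions) for image, captions in captions_by_image.items()}
print(selected)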