def __init__(self, DATA_DIR, dset, is_train, verbose, reward='RECALL'):
    # This method sets up the simulator, the state/action vocabularies, and the maximum number of actions.
    n_words = 100  # 374000  # words for the vocabulary
    vocab_path = os.path.join(DATA_DIR, 'data/D_cbow_pdw.pkl')  # Path to the python dictionary containing the vocabulary.
    wordemb_path = os.path.join(DATA_DIR, 'data/D_cbow_pdw.pkl')  # Path to the python dictionary containing the word embeddings.
    dataset_path = os.path.join(DATA_DIR, 'data/msa_dataset.hdf5')  # Path to the hdf5 dataset containing queries and ground-truth documents.
    docs_path = os.path.join(DATA_DIR, 'data/msa_corpus.hdf5')  # Path to the articles and links.
    docs_path_term = os.path.join(DATA_DIR, 'data/msa_corpus.hdf5')  # Path to the articles and links.

    ############################
    # Search Engine Parameters #
    ############################
    n_threads = 1  # 20  # Number of parallel processes that will execute the queries on the search engine.
    index_name = 'index'  # Index name for the search engine. Used when engine is 'lucene'.
    index_name_term = 'index_terms'  # Index name for the search engine. Used when engine is 'lucene'.
    use_cache = False  # If True, cache (query, retrieved docs) pairs. Watch for memory usage.
    max_terms_per_doc = 15  # Maximum number of candidate terms from each feedback doc. Must always be less than max_words_input.

    self.actions_space = dict()
    self.states_space = dict()
    self.current_queries = []
    self.D_gt_id = []
    # self.word2vec = Word2Vec.load_word2vec_format(GLOVE_FILE, binary=binary)
    self.word_index = None
    # self.embedding_dim = self.word2vec.syn0.shape[1]

    self.vocab = utils.load_vocab(vocab_path, n_words)
    vocabinv = {}
    for k, v in self.vocab.items():
        vocabinv[v] = k

    self.reward = reward
    self.is_train = is_train
    self.search = Search(engine=lucene_search.LuceneSearch(
        DATA_DIR, self.vocab, n_threads, max_terms_per_doc,
        index_name, index_name_term, docs_path, docs_path_term, use_cache))

    t0 = time()
    dh5 = dataset_hdf5.DatasetHDF5(dataset_path)
    self.qi = dh5.get_queries(dset)
    self.dt = dh5.get_doc_ids(dset)
    # print("Loading queries and docs {}".format(time() - t0))
    self.reset()
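# -----------------------------------------------------------------------------
# Hedged sketch (assumption, not the project's actual utils.load_vocab): the
# constructor above only needs vocab_path to yield a {word: integer id} dict
# truncated to n_words entries. A helper along these lines would satisfy that
# contract; the name load_vocab_sketch and the exact truncation rule are
# illustrative only.
import pickle

def load_vocab_sketch(path, n_words):
    """Load a pickled word dictionary and keep at most n_words entries,
    mapping each kept word to a contiguous integer id."""
    with open(path, 'rb') as f:
        wdict = pickle.load(f)
    vocab = {}
    for i, word in enumerate(wdict):
        if i >= n_words:
            break
        vocab[word] = i
    return vocab
# -----------------------------------------------------------------------------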
def __init__(self, cfg, dset, is_train, verbose, rewardtype='RECALL'):
    # This method sets up the simulator, the state/action vocabularies, and the maximum number of actions.
    n_words = 374000  # 100  # words for the vocabulary
    DATA_DIR = cfg['data']['base_path']
    vocab_path = os.path.join(DATA_DIR, 'data/D_cbow_pdw_8B.pkl')  # Path to the python dictionary containing the vocabulary.
    wordemb_path = os.path.join(DATA_DIR, 'data/D_cbow_pdw_8B.pkl')  # Path to the python dictionary containing the word embeddings.
    dataset_path = os.path.join(DATA_DIR, 'data/msa_dataset.hdf5')  # Path to the hdf5 dataset containing queries and ground-truth documents.
    docs_path = os.path.join(DATA_DIR, 'data/msa_corpus.hdf5')  # Path to the articles and links.
    docs_path_term = os.path.join(DATA_DIR, 'data/msa_corpus.hdf5')  # Path to the articles and links.

    ############################
    # Search Engine Parameters #
    ############################
    n_threads = 1  # 20  # Number of parallel processes that will execute the queries on the search engine.
    index_name = 'index'  # Index name for the search engine. Used when engine is 'lucene'.
    index_name_term = 'index_terms'  # Index name for the search engine. Used when engine is 'lucene'.
    use_cache = False  # If True, cache (query, retrieved docs) pairs. Watch for memory usage.
    max_terms_per_doc = 15  # Maximum number of candidate terms from each feedback doc. Must always be less than max_words_input.
    # self.batch_size_train = 2  # The batch size during training.

    self.cfg = cfg
    t0 = time()
    self.vocab = utils.load_vocab(vocab_path, n_words)
    vocabinv = {}
    for k, v in self.vocab.items():
        vocabinv[v] = k
    self.vocabinv = vocabinv

    self.rewardtype = rewardtype
    self.is_train = is_train
    self.search = Search(engine=lucene_search.LuceneSearch(
        DATA_DIR, self.vocab, n_threads, max_terms_per_doc,
        index_name, index_name_term, docs_path, docs_path_term, use_cache))
    self.batch_size = cfg['agent']['batch_size']

    t0 = time()
    dh5 = dataset_hdf5.DatasetHDF5(dataset_path)
    self.qi = dh5.get_queries(dset)
    cfg['data']['querydb_size'] = len(self.qi)
    self.dt = dh5.get_doc_ids(dset)
    print("Loading queries and docs {}".format(time() - t0))
    self.counsteps = 0
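# The constructor above reads only cfg['data']['base_path'] and
# cfg['agent']['batch_size'], and writes cfg['data']['querydb_size'] back after
# loading the queries. A minimal config that satisfies that contract could look
# like the dict below; the path and batch size are illustrative placeholders,
# not the project's shipped defaults.
example_cfg = {
    'data': {
        'base_path': '/path/to/project',  # directory that contains the data/ folder
    },
    'agent': {
        'batch_size': 64,  # minibatch size used by the agent
    },
}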
def train():
    if prm.optimizer.lower() == 'adam':
        optimizer = adam
    elif prm.optimizer.lower() == 'sgd':
        optimizer = sgd
    elif prm.optimizer.lower() == 'rmsprop':
        optimizer = rmsprop
    elif prm.optimizer.lower() == 'adadelta':
        optimizer = adadelta

    options = locals().copy()
    print 'parameters:', str(options)

    prm_k = vars(prm).keys()
    prm_d = vars(prm)
    prm_k.sort()
    for x in prm_k:
        if not x.startswith('__'):
            print x, '=', prm_d[x]

    print 'loading Vocabulary...'
    vocab = utils.load_vocab(prm.vocab_path, prm.n_words)
    options['vocab'] = vocab
    options['vocabinv'] = {}
    for k, v in vocab.items():
        options['vocabinv'][v] = k

    print 'Loading Environment...'
    if prm.engine.lower() == 'lucene':
        import lucene_search
        options['engine'] = lucene_search.LuceneSearch()
    elif prm.engine.lower() == 'elastic':
        import elastic_search
        options['engine'] = elastic_search.ElasticSearch()

    print 'Loading Dataset...'
    dh5 = dataset_hdf5.DatasetHDF5(prm.dataset_path)
    qi_train = dh5.get_queries(dset='train')
    dt_train = dh5.get_doc_ids(dset='train')
    qi_valid = dh5.get_queries(dset='valid')
    dt_valid = dh5.get_doc_ids(dset='valid')
    qi_test = dh5.get_queries(dset='test')
    dt_test = dh5.get_doc_ids(dset='test')

    if prm.train_size == -1:
        train_size = len(qi_train)
    else:
        train_size = min(prm.train_size, len(qi_train))

    if prm.valid_size == -1:
        valid_size = len(qi_valid)
    else:
        valid_size = min(prm.valid_size, len(qi_valid))

    if prm.test_size == -1:
        test_size = len(qi_test)
    else:
        test_size = min(prm.test_size, len(qi_test))

    print '%d train examples' % len(qi_train)
    print '%d valid examples' % len(qi_valid)
    print '%d test examples' % len(qi_test)

    # This creates the initial parameters as np ndarrays.
    # Dict name (string) -> np ndarray
    params, exclude_params = init_params(options)

    if prm.wordemb_path:
        print 'loading pre-trained word embeddings'
        params = load_wemb(params, vocab)
        options['W'] = params['W']

    if prm.reload_model:
        load_params(prm.reload_model, params)

    print 'Building model'
    # This creates Theano Shared Variables from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams have different copies of the weights.
    tparams = init_tparams(params)
    for kk, value in tparams.iteritems():
        tparams[kk] = theano.shared(value, name=kk)

    iin, out, updates, f_pred, consider_constant = build_model(tparams, options)

    # Get only parameters that are not in the exclude_params list.
    tparams_ = OrderedDict([(kk, vv) for kk, vv in tparams.iteritems()
                            if kk not in exclude_params])

    grads = tensor.grad(out[0], wrt=itemlist(tparams_),
                        consider_constant=consider_constant)

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams_, grads, iin, out, updates)

    history_errs = []
    best_p = None

    if prm.validFreq == -1:
        validFreq = len(qi_train) / prm.batch_size_train
    else:
        validFreq = prm.validFreq

    if prm.saveFreq == -1:
        saveFreq = len(qi_train) / prm.batch_size_train
    else:
        saveFreq = prm.saveFreq

    uidx = 0  # the number of updates done
    estop = False  # early stop flag
    start_time = time.time()

    print 'Optimization'
    try:
        for eidx in xrange(prm.max_epochs):
            n_samples = 0

            # Get new shuffled indices for the training set.
            kf = get_minibatches_idx(len(qi_train), prm.batch_size_train, shuffle=True)

            for _, train_index in kf:
                st = time.time()
                uidx += 1

                qi, qi_i, qi_lst, D_gt_id, D_gt_url = get_samples(qi_train, dt_train, train_index, options)

                # Share the current queries with the search engine.
                options['current_queries'] = qi_lst

                n_samples += len(qi)
                is_train = 1.
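                # f_grad_shared runs the forward/backward pass and returns a flat list:
                # the total cost, the entropy cost, and then, for every reformulation
                # iteration, the action probabilities, sampled actions, retrieval
                # metrics, baseline values, baseline cost, and retrieved document ids.
                # The values are popped off in exactly that order below.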
                out = f_grad_shared(qi_i, D_gt_id, is_train)
                cost = out.pop(0)
                cost_ent = out.pop(0)
                lr_t = f_update(prm.lrate)

                if np.isnan(cost) or np.isinf(cost):
                    print 'NaN detected'
                    return 1., 1., 1.

                if np.mod(uidx, prm.dispFreq) == 0:
                    print '\n================================================================================'
                    print 'Epoch', eidx, 'Update', uidx, 'Cost', cost, 'LR_t', lr_t
                    print 'Time Minibatch Update: ' + str(time.time() - st)
                    print 'Input Query: ', qi[0].replace('\n', '\\n')
                    print
                    print 'Target Docs: ', str(D_gt_url[0])
                    print
                    print 'Input Query Vocab: ', utils.idx2text(qi_i[0], options['vocabinv'])

                    for ii in range(prm.n_iterations):
                        prob = out.pop(0)
                        ans = out.pop(0)
                        metrics = out.pop(0)
                        bl = out.pop(0)
                        cost_bl = out.pop(0)
                        D_id = out.pop(0)

                        print
                        print 'Iteration', ii
                        print 'Baseline Value', bl.mean(), 'Cost', cost_bl
                        print ' '.join(prm.metrics_map.keys())
                        print metrics.mean(0)
                        print
                        print 'Retrieved Docs: ', str([options['engine'].id_title_map[d_id] for d_id in D_id[0]])
                        print
                        print 'Reformulated Query:', options['reformulated_queries'][ii][0]
                        print
                        print 'Query ANS: ',
                        # Print the current query terms; selected terms (ans == 1) are upper-cased.
                        for kk, word in enumerate(options['current_queries'][0][:ans.shape[1]]):
                            if word not in options['vocab'] and word != '':
                                word += '<unk>'
                            if ans[0, kk] == 1:
                                word = word.upper()
                            print str(word),
                        print
                        print
                        print 'prob[:,:,0].max(1).mean(), prob[:,:,0].mean(), prob[:,:,0].min(1).mean()', prob[:, :, 0].max(1).mean(), prob[:, :, 0].mean(), prob[:, :, 0].min(1).mean()
                        print 'prob[:,:,1].max(1).mean(), prob[:,:,1].mean(), prob[:,:,1].min(1).mean()', prob[:, :, 1].max(1).mean(), prob[:, :, 1].mean(), prob[:, :, 1].min(1).mean()

                    print '==================================================================================\n'

                if np.mod(uidx, validFreq) == 0 or uidx == 1:
                    kf_train = get_minibatches_idx(len(qi_train), prm.batch_size_pred, shuffle=True, max_samples=train_size)
                    kf_valid = get_minibatches_idx(len(qi_valid), prm.batch_size_pred, shuffle=True, max_samples=valid_size)
                    kf_test = get_minibatches_idx(len(qi_test), prm.batch_size_pred, shuffle=True, max_samples=test_size)

                    print '\nEvaluating - Training Set'
                    train_metrics = pred_error(f_pred, qi_train, dt_train, options, kf_train)
                    print '\nEvaluating - Validation Set'
                    valid_metrics = pred_error(f_pred, qi_valid, dt_valid, options, kf_valid)
                    print '\nEvaluating - Test Set'
                    test_metrics = pred_error(f_pred, qi_test, dt_test, options, kf_test)

                    his = [train_metrics, valid_metrics, test_metrics]
                    history_errs.append(his)
                    metric_idx = prm.metrics_map[prm.reward.upper()]

                    # Keep the parameters that achieve the best validation score so far.
                    if (uidx == 0 or
                            valid_metrics[-1, metric_idx] >= np.array(history_errs)[:, 1, -1, metric_idx].max()):
                        best_p = unzip(tparams)
                        bad_counter = 0

                    print '====================================================================================================='
                    print ' '.join(prm.metrics_map.keys())
                    print
                    print 'Train:'
                    print train_metrics
                    print
                    print 'Valid:'
                    print valid_metrics
                    print
                    print 'Test:'
                    print test_metrics
                    print
                    print '====================================================================================================='

                    # Patience-based early stopping on the validation metric.
                    if (len(history_errs) > prm.patience and
                            valid_metrics[-1, metric_idx] <= np.array(history_errs)[:-prm.patience, 1, -1, metric_idx].max()):
                        bad_counter += 1
                        if bad_counter > prm.patience:
                            print 'Early Stop!'
                            estop = True
                            break

                if prm.saveto and np.mod(uidx, saveFreq) == 0:
                    print 'Saving...',
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(prm.saveto, history_errs=history_errs, **params)
                    print 'Done'

            print 'Seen %d samples' % n_samples

            if estop:
                break

    except KeyboardInterrupt:
        print "Training interrupted"

    return
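# -----------------------------------------------------------------------------
# Hedged sketch (assumption): train() depends on a get_minibatches_idx helper
# that is not shown in this file. Judging from the call sites above -- it is
# iterated as (minibatch_index, list_of_sample_indices) pairs and accepts
# shuffle and max_samples arguments -- a compatible implementation could look
# like the function below. It mirrors the classic Theano LSTM-tutorial helper
# and is not necessarily the project's exact code.
import numpy as np

def get_minibatches_idx_sketch(n, minibatch_size, shuffle=False, max_samples=None):
    """Split range(n) into minibatches of indices, optionally shuffled and capped."""
    idx_list = np.arange(n, dtype='int64')
    if shuffle:
        np.random.shuffle(idx_list)
    if max_samples is not None:
        idx_list = idx_list[:max_samples]
    minibatches = []
    start = 0
    while start < len(idx_list):
        minibatches.append(idx_list[start:start + minibatch_size])
        start += minibatch_size
    return list(enumerate(minibatches))
# -----------------------------------------------------------------------------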