Example #1
    def __init__(self, DATA_DIR, dset, is_train, verbose, reward='RECALL'):
        # Initializes the simulator, the state/action vocabularies, and the maximum number of actions.
        n_words = 100  # vocabulary size (374000 in the full setting)
        vocab_path = os.path.join(
            DATA_DIR, 'data/D_cbow_pdw.pkl'
        )  # Path to the python dictionary containing the vocabulary.
        wordemb_path = os.path.join(
            DATA_DIR, 'data/D_cbow_pdw.pkl'
        )  # Path to the python dictionary containing the word embeddings.
        dataset_path = os.path.join(
            DATA_DIR, 'data/msa_dataset.hdf5'
        )  # path to load the hdf5 dataset containing queries and ground-truth documents.
        docs_path = os.path.join(
            DATA_DIR,
            'data/msa_corpus.hdf5')  # Path to load the articles and links.
        docs_path_term = os.path.join(
            DATA_DIR,
            'data/msa_corpus.hdf5')  # Path to load the articles and links.
        ############################
        # Search Engine Parameters #
        ############################
        n_threads = 1  # 20 in the full setting; number of parallel processes that will execute the queries on the search engine.
        index_name = 'index'  # index name for the search engine. Used when engine is 'lucene'.
        index_name_term = 'index_terms'  # index name for the search engine. Used when engine is 'lucene'.
        use_cache = False  # If True, cache (query-retrieved docs) pairs. Watch for memory usage.
        max_terms_per_doc = 15  # Maximum number of candidate terms from each feedback doc. Must always be less than max_words_input.

        self.actions_space = dict()
        self.states_space = dict()
        self.current_queries = []
        self.D_gt_id = []

        # self.word2vec = Word2Vec.load_word2vec_format(GLOVE_FILE, binary=binary)
        self.word_index = None
        # self.embedding_dim = self.word2vec.syn0.shape[1]

        self.vocab = utils.load_vocab(vocab_path, n_words)
        vocabinv = {}
        for k, v in self.vocab.items():
            vocabinv[v] = k
        self.reward = reward
        self.is_train = is_train
        self.search = Search(engine=lucene_search.LuceneSearch(
            DATA_DIR, self.vocab, n_threads, max_terms_per_doc, index_name,
            index_name_term, docs_path, docs_path_term, use_cache))

        t0 = time()
        dh5 = dataset_hdf5.DatasetHDF5(dataset_path)
        self.qi = dh5.get_queries(dset)
        self.dt = dh5.get_doc_ids(dset)
        # print("Loading queries and docs {}".format(time() - t0))
        self.reset()
Example #2
    def __init__(self, cfg, dset, is_train, verbose, rewardtype='RECALL'):
        # Initializes the simulator, the state/action vocabularies, and the maximum number of actions.
        n_words = 374000  # vocabulary size (100 in the reduced setting above)
        DATA_DIR = cfg['data']['base_path']
        vocab_path = os.path.join(
            DATA_DIR, 'data/D_cbow_pdw_8B.pkl'
        )  # Path to the python dictionary containing the vocabulary.
        wordemb_path = os.path.join(
            DATA_DIR, 'data/D_cbow_pdw_8B.pkl'
        )  # Path to the python dictionary containing the word embeddings.
        dataset_path = os.path.join(
            DATA_DIR, 'data/msa_dataset.hdf5'
        )  # path to load the hdf5 dataset containing queries and ground-truth documents.
        docs_path = os.path.join(
            DATA_DIR,
            'data/msa_corpus.hdf5')  # Path to load the articles and links.
        docs_path_term = os.path.join(
            DATA_DIR,
            'data/msa_corpus.hdf5')  # Path to load the articles and links.
        ############################
        # Search Engine Parameters #
        ############################
        n_threads = 1  # 20 in the full setting; number of parallel processes that will execute the queries on the search engine.
        index_name = 'index'  # index name for the search engine. Used when engine is 'lucene'.
        index_name_term = 'index_terms'  # index name for the search engine. Used when engine is 'lucene'.
        use_cache = False  # If True, cache (query-retrieved docs) pairs. Watch for memory usage.
        max_terms_per_doc = 15  # Maximum number of candidate terms from each feedback doc. Must always be less than max_words_input.
        #self.batch_size_train=2 # The batch size during training.
        self.cfg = cfg
        t0 = time()

        self.vocab = utils.load_vocab(vocab_path, n_words)
        vocabinv = {}
        for k, v in self.vocab.items():
            vocabinv[v] = k
        self.vocabinv = vocabinv

        self.rewardtype = rewardtype
        self.is_train = is_train
        self.search = Search(engine=lucene_search.LuceneSearch(
            DATA_DIR, self.vocab, n_threads, max_terms_per_doc, index_name,
            index_name_term, docs_path, docs_path_term, use_cache))

        self.batch_size = cfg['agent']['batch_size']
        t0 = time()
        dh5 = dataset_hdf5.DatasetHDF5(dataset_path)
        self.qi = dh5.get_queries(dset)
        cfg['data']['querydb_size'] = len(self.qi)
        self.dt = dh5.get_doc_ids(dset)
        print("Loading queries and docs {}".format(time() - t0))
        self.counsteps = 0
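
Below is a minimal usage sketch for this configuration-driven constructor. It assumes the class is QueryReformulatorEnv (as imported in Example #4) and that config.yml (the file name used in Example #4) provides the keys read above, cfg['data']['base_path'] and cfg['agent']['batch_size']; the values themselves are illustrative only.

import yaml

from QueryReformulatorEnv import QueryReformulatorEnv

# Load the YAML config; __init__ above reads cfg['data']['base_path'] and
# cfg['agent']['batch_size'].
with open('config.yml', 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)

# Construct the environment for the training split.
env = QueryReformulatorEnv(cfg, dset='train', is_train=True, verbose=True, rewardtype='RECALL')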
Example #3
def train():

    if prm.optimizer.lower() == 'adam':
        optimizer = adam
    elif prm.optimizer.lower() == 'sgd':
        optimizer = sgd
    elif prm.optimizer.lower() == 'rmsprop':
        optimizer = rmsprop
    elif prm.optimizer.lower() == 'adadelta':
        optimizer = adadelta
    else:
        raise ValueError('Unknown optimizer: ' + prm.optimizer)

    options = locals().copy()

    print 'parameters:', str(options)
    prm_k = vars(prm).keys()
    prm_d = vars(prm)
    prm_k.sort()
    for x in prm_k:
        if not x.startswith('__'):
            print x,'=', prm_d[x]

    print 'loading Vocabulary...'
    vocab = utils.load_vocab(prm.vocab_path, prm.n_words)
    options['vocab'] = vocab

    options['vocabinv'] = {}
    for k,v in vocab.items():
        options['vocabinv'][v] = k

    print 'Loading Environment...'
    if prm.engine.lower() == 'lucene':
        import lucene_search
        options['engine'] = lucene_search.LuceneSearch()
    elif prm.engine.lower() == 'elastic':
        import elastic_search
        options['engine'] = elastic_search.ElasticSearch()

    print 'Loading Dataset...'
    dh5 = dataset_hdf5.DatasetHDF5(prm.dataset_path)
    qi_train = dh5.get_queries(dset='train')
    dt_train = dh5.get_doc_ids(dset='train')
    qi_valid = dh5.get_queries(dset='valid')
    dt_valid = dh5.get_doc_ids(dset='valid')
    qi_test = dh5.get_queries(dset='test')
    dt_test = dh5.get_doc_ids(dset='test')
    
    if prm.train_size == -1:
        train_size = len(qi_train)
    else:
        train_size = min(prm.train_size, len(qi_train))

    if prm.valid_size == -1:
        valid_size = len(qi_valid)
    else:
        valid_size = min(prm.valid_size, len(qi_valid))

    if prm.test_size == -1:
        test_size = len(qi_test)
    else:
        test_size = min(prm.test_size, len(qi_test))

    print '%d train examples' % len(qi_train)
    print '%d valid examples' % len(qi_valid)
    print '%d test examples' % len(qi_test)

    # This creates the initial parameters as np ndarrays.
    # Dict name (string) -> np ndarray
    params, exclude_params = init_params(options)

    if prm.wordemb_path:
        print 'loading pre-trained word embeddings'
        params = load_wemb(params, vocab)
        options['W'] = params['W']

    if prm.reload_model:
        load_params(prm.reload_model, params)

    print 'Building model'
    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams hold different copies of the weights.
    tparams = init_tparams(params)
    for kk, value in tparams.iteritems():
        tparams[kk] = theano.shared(value, name=kk)

    iin, out, updates, f_pred, consider_constant \
            = build_model(tparams, options)

    # Get only the parameters that are not in the exclude_params list.
    tparams_ = OrderedDict([(kk, vv) for kk, vv in tparams.iteritems() if kk not in exclude_params])

    grads = tensor.grad(out[0], wrt=itemlist(tparams_), consider_constant=consider_constant)

    lr = tensor.scalar(name='lr')
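    # f_grad_shared computes the model outputs and caches the gradients in shared
    # variables; f_update then applies the optimizer step for the given learning rate.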
    f_grad_shared, f_update = optimizer(lr, tparams_, grads, iin, out, updates)

    history_errs = []
    best_p = None

    if prm.validFreq == -1:
        validFreq = len(qi_train) / prm.batch_size_train
    else:
        validFreq = prm.validFreq

    if prm.saveFreq == -1:
        saveFreq = len(qi_train) / prm.batch_size_train
    else:
        saveFreq = prm.saveFreq

    uidx = 0  # the number of updates done
    estop = False  # early stop
    start_time = time.time()

    print 'Optimization'
    
    try:
        for eidx in xrange(prm.max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = get_minibatches_idx(len(qi_train), prm.batch_size_train, shuffle=True)

            for _, train_index in kf:
                st = time.time()

                uidx += 1
                qi, qi_i, qi_lst, D_gt_id, D_gt_url = get_samples(qi_train, dt_train, train_index, options)

                # share the current queries with the search engine.
                options['current_queries'] = qi_lst

                n_samples += len(qi)

                is_train = 1.

                out = f_grad_shared(qi_i, D_gt_id, is_train)

                cost = out.pop(0)
                cost_ent = out.pop(0)

                lr_t = f_update(prm.lrate)

                if np.isnan(cost) or np.isinf(cost):
                    print 'NaN detected'
                    return 1., 1., 1.
    
                if np.mod(uidx, prm.dispFreq) == 0:

                    print '\n================================================================================'
                    print 'Epoch', eidx, 'Update', uidx, 'Cost', cost, 'LR_t', lr_t
                    print 'Time Minibatch Update: ' + str(time.time() - st)
                    print 'Input Query:       ', qi[0].replace('\n','\\n')
                    print
                    print 'Target Docs:       ', str(D_gt_url[0])
                    print
                    print 'Input Query Vocab: ', utils.idx2text(qi_i[0], options['vocabinv'])
                    for ii in range(prm.n_iterations):
                        prob = out.pop(0)
                        ans = out.pop(0)
                        metrics = out.pop(0)
                        bl = out.pop(0)
                        cost_bl = out.pop(0)
                        D_id = out.pop(0)
                        print 
                        print 'Iteration', ii
                        print 'Baseline Value', bl.mean(), 'Cost', cost_bl
                        print '  '.join(prm.metrics_map.keys())
                        print metrics.mean(0)
                        print
                        print 'Retrieved Docs:    ', str([options['engine'].id_title_map[d_id] for d_id in D_id[0]])
                        print
                        print 'Reformulated Query:', options['reformulated_queries'][ii][0]
                        print
                        print 'Query ANS:         ',
                        for kk, word in enumerate(options['current_queries'][0][:ans.shape[1]]):                         
                            if word not in options['vocab'] and word != '':
                                word += '<unk>'
                            if ans[0,kk] == 1:
                                word = word.upper()
                            print str(word), 
                        print
                        print
                        print 'prob[:,:,0].max(1).mean(), prob[:,:,0].mean(), prob[:,:,0].min(1).mean()', prob[:,:,0].max(1).mean(), prob[:,:,0].mean(), prob[:,:,0].min(1).mean()
                        print 'prob[:,:,1].max(1).mean(), prob[:,:,1].mean(), prob[:,:,1].min(1).mean()', prob[:,:,1].max(1).mean(), prob[:,:,1].mean(), prob[:,:,1].min(1).mean()
                    print '==================================================================================\n'


                if np.mod(uidx, validFreq) == 0 or uidx == 1:
             
                    kf_train = get_minibatches_idx(len(qi_train), prm.batch_size_pred, shuffle=True, max_samples=train_size)
                    kf_valid = get_minibatches_idx(len(qi_valid), prm.batch_size_pred, shuffle=True, max_samples=valid_size)
                    kf_test = get_minibatches_idx(len(qi_test), prm.batch_size_pred, shuffle=True, max_samples=test_size)

                    print '\nEvaluating - Training Set'
                    train_metrics = pred_error(f_pred, qi_train, dt_train, options, kf_train)

                    print '\nEvaluating - Validation Set'
                    valid_metrics = pred_error(f_pred, qi_valid, dt_valid, options, kf_valid)

                    print '\nEvaluating - Test Set'
                    test_metrics = pred_error(f_pred, qi_test, dt_test, options, kf_test)


                    his = [train_metrics, valid_metrics, test_metrics]
                    history_errs.append(his)
                    metric_idx = prm.metrics_map[prm.reward.upper()]
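                    # history_errs stores [train, valid, test] metric arrays per evaluation:
                    # index 1 selects the validation set, -1 the last reformulation step,
                    # and metric_idx the metric used as reward.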
                    if (uidx == 0 or
                        valid_metrics[-1, metric_idx] >= np.array(history_errs)[:,1,-1,metric_idx].max()):

                        best_p = unzip(tparams)
                        bad_counter = 0


                    print '====================================================================================================='
                    print '  '.join(prm.metrics_map.keys())
                    print
                    print 'Train:'
                    print train_metrics
                    print
                    print 'Valid:'
                    print valid_metrics
                    print
                    print 'Test:'
                    print test_metrics
                    print
                    print '====================================================================================================='
                    if (len(history_errs) > prm.patience and
                        valid_metrics[-1, metric_idx] <= np.array(history_errs)[:-prm.patience,
                                                               1,-1,metric_idx].max()):
                        bad_counter += 1
                        if bad_counter > prm.patience:
                            print 'Early Stop!'
                            estop = True
                            break

                if prm.saveto and np.mod(uidx, saveFreq) == 0:
                    print 'Saving...',

                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(prm.saveto, history_errs=history_errs, **params)

                    print 'Done'

            print 'Seen %d samples' % n_samples

            if estop:
                break

    except KeyboardInterrupt:
        print "Training interupted"
    return
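
get_minibatches_idx is used throughout train() but is not shown here. Below is a minimal sketch of a compatible helper, assuming it returns (minibatch number, example indices) pairs and that max_samples truncates the shuffled index list; this is an assumption about the missing helper, not the repository's actual implementation.

import numpy as np

def get_minibatches_idx(n, minibatch_size, shuffle=False, max_samples=None):
    # Return a list of (minibatch number, array of example indices) pairs.
    idx_list = np.arange(n, dtype='int64')
    if shuffle:
        np.random.shuffle(idx_list)
    if max_samples is not None:
        idx_list = idx_list[:max_samples]
    minibatches = []
    start = 0
    while start < len(idx_list):
        minibatches.append(idx_list[start:start + minibatch_size])
        start += minibatch_size
    return list(enumerate(minibatches))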
Example #4
import yaml
import unicodedata
from time import time
import dataset_hdf5
from tensorflow.contrib.learn.python.learn.preprocessing import categorical_vocabulary
from sklearn.decomposition import PCA
from QueryReformulatorEnv import QueryReformulatorEnv
# Parameters
# ==================================================
with open('config.yml', 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)
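# The script below expects at least cfg['data']['query_dataset_path'] to be set.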

# Data Preparation
# ==================================================

# Load data
print 'Loading queries Dataset...'
t0 = time()
dh5 = dataset_hdf5.DatasetHDF5(cfg['data']['query_dataset_path'])
qi_train = dh5.get_queries(dset='train')
dt_train = dh5.get_doc_ids(dset='train')
qi_valid = dh5.get_queries(dset='valid')
dt_valid = dh5.get_doc_ids(dset='valid')
qi_test = dh5.get_queries(dset='test')
dt_test = dh5.get_doc_ids(dset='test')
print("Loading queries and docs {}".format(time() - t0))
print '%d train examples' % len(qi_train)
print '%d valid examples' % len(qi_valid)
print '%d test examples' % len(qi_test)
#print 'qi_train',qi_train
#print 'dt_train',dt_train

# Build vocabulary
t0 = time()