Example #1
 def __init__(self, stops, minsize=3):
     """initialize index variables"""
     self.ix = None
     self.tokenizer = StandardAnalyzer(stoplist=stops, minsize=minsize)
     self.umls = umls.UMLSLookup()
     self.term_dict = {}
     self.token2cuis = {}
     self.concept_dict = {"__NULL__": 0}
     self.synsets = {}
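
For context, here is a minimal sketch of how the analyzer configured above filters tokens, assuming whoosh's StandardAnalyzer (the stoplist below is illustrative): tokens are lowercased, and anything in the stoplist or shorter than minsize is dropped.

from whoosh.analysis import StandardAnalyzer

stops = frozenset(['the', 'of', 'and'])
tokenizer = StandardAnalyzer(stoplist=stops, minsize=3)

# stopwords and tokens shorter than minsize are removed
tokens = [token.text for token in tokenizer('The diagnosis of chronic heart failure')]
print(tokens)  # ['diagnosis', 'chronic', 'heart', 'failure']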
Example #2
	def disambiguate_query(self, ix, term_dict, concept_dict, token2cuis, query, table_name): 
		"""shallow word-sense disambiguation: disambiguate polysemous terms based on shallow word-concept connectivity within UMLS"""
		qcuis = {}
		umls_lookup = umls.UMLSLookup()
		# tokenize query 
		q = self.tokenize_query(query)
		# convert query into gensim doc2idx format
		q2idx = ix.doc2idx(q)
		# get cuis from query tokens
		for idx in q2idx:
			if idx in token2cuis and token2cuis[idx] != ["__NULL__"]: 
				for cui in token2cuis[idx]:
					if cui in qcuis:  # increase cui count
						qcuis[cui] += 1
					else:  # initialize cui count
						qcuis[cui] = 1
		# perform shallow word-sense disambiguation
		enc_query = []
		for idx in q2idx:
			if idx in term_dict:  # disambiguate only for terms contained within term_dict
				max_edges = 0  # relative maximum connections (edges)
				if len(token2cuis[idx]) == 1:  # monosemous term
					ref_cui = token2cuis[idx][0]
					# encode (term, cui) pair
					enc_query.append([term_dict[idx], concept_dict[ref_cui]])
				else:  # polysemous term
					candidates = []
					# loop over candidate concepts
					for subj_cui in token2cuis[idx]:
						num_edges = 0  # number of edges
						if qcuis[subj_cui] == 1:  # subj_cui is only associated with current term (idx)
							obj_cuis = list(set(qcuis.keys()).difference({subj_cui}))
						else:  # subj_cui is associated with other terms in the query too
							obj_cuis = list(qcuis.keys())
						num_edges += umls_lookup.compute_num_edges(obj_cuis, subj_cui, table_name)  # remember that subj and obj are inverted within UMLS <s, p, o> triples
						# verify connectivity
						if num_edges > max_edges:
							# new maximum: restart candidates from subj_cui
							candidates = [subj_cui]
							# update max_edges
							max_edges = num_edges
						elif num_edges == max_edges:
							# tie: keep subj_cui as an additional candidate
							candidates.append(subj_cui)
					# keep the head candidate - when disambiguation remains inconclusive, this falls back to the most likely concept according to QuickUMLS ordering
					ref_cui = candidates[0]
					# encode (term, cui) pair
					enc_query.append([term_dict[idx], concept_dict[ref_cui]])
			else:  # term oov
				continue
		return enc_query
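
The connectivity criterion above can be isolated into a toy sketch. Everything here is illustrative: pick_cui and num_edges_fn are hypothetical names, and the stub dictionary stands in for umls_lookup.compute_num_edges.

def pick_cui(candidate_cuis, context_cuis, num_edges_fn):
    """Return the candidate CUI most connected to the other query CUIs,
    falling back to the first candidate (e.g., QuickUMLS ordering) on ties."""
    best_cui, max_edges = candidate_cuis[0], -1
    for cui in candidate_cuis:
        others = [c for c in context_cuis if c != cui]
        edges = num_edges_fn(others, cui)
        if edges > max_edges:
            best_cui, max_edges = cui, edges
    return best_cui

# stub edge counts: pretend the second candidate is better connected
fake_edges = {'C0011849': 3, 'C0011847': 1}
print(pick_cui(['C0011847', 'C0011849'], ['C0011849', 'C0027051'],
               lambda objs, subj: fake_edges.get(subj, 0)))  # -> C0011849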
Example #3
	def get_syns(self, term2cui, term_dict):
		"""get synonymic relations between words within corpus (derived from a semantic lexicon)"""
		syns = {}
		umls_lookup = umls.UMLSLookup()
		analyzer = SimpleAnalyzer()
		for term, cui in term2cui.items():
			if term in term_dict:
				if cui != '__NULL__':
					# get synset composed of single-word terms (reference term excluded)
					synset = {
						syn[0].lower()
						for syn in umls_lookup.lookup_synonyms(cui, preferred=False)
						if len(list(analyzer(syn[0]))) == 1
						and syn[0].lower() in term_dict
						and syn[0].lower() != term
					}
					syns[term] = list(synset)
				else:
					syns[term] = list()
		return syns
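
The single-word filter inside the synset comprehension can be seen in isolation; a small sketch assuming whoosh's SimpleAnalyzer, with an illustrative synonym list.

from whoosh.analysis import SimpleAnalyzer

analyzer = SimpleAnalyzer()
synonyms = ['Heart Attack', 'Myocardial Infarction', 'MI']

# keep only synonyms that tokenize to a single word, as in the comprehension above
single_word = [s.lower() for s in synonyms if len(list(analyzer(s))) == 1]
print(single_word)  # ['mi']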
Example #4
	def cui2source(self, term2cui, source='MSH'):
		"""keep only CUIs presenting an entry in the given 'source' lexicon"""
		cui2source = {}
		umls_lookup = umls.UMLSLookup()
		for term, cui in tqdm(term2cui.items()):
			if cui == '__NULL__':  # skip __NULL__ concepts
				cui2source[term] = '__NULL__'
			else:
				# lookup codes and sources from UMLS 
				codes_and_sources = umls_lookup.lookup_code(cui=cui, preferred=False)
				source_code = [code for code, src, _ in codes_and_sources if src == source]
				if source_code:  # CUI in source - keep it
					cui2source[term] = cui
				else:  # CUI not in source - discard it
					cui2source[term] = '__NULL__'
		# return cui2source
		return cui2source
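
The same filtering logic as a standalone toy, with umls_lookup.lookup_code stubbed out; the CUIs and (code, source, preferred) triples below are illustrative.

def restrict_to_source(term2cui, lookup_code, source='MSH'):
    out = {}
    for term, cui in term2cui.items():
        # keep the CUI only if at least one of its codes comes from 'source'
        if cui != '__NULL__' and any(src == source for _, src, _ in lookup_code(cui)):
            out[term] = cui
        else:
            out[term] = '__NULL__'
    return out

fake_codes = {'C0027051': [('D009203', 'MSH', 'Y')],
              'C1234567': [('X1', 'SNOMEDCT_US', 'N')]}
print(restrict_to_source({'infarction': 'C0027051', 'foo': 'C1234567'},
                         lambda cui: fake_codes.get(cui, [])))
# {'infarction': 'C0027051', 'foo': '__NULL__'}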
Example #5
def main(_):
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    # load options
    opts = Options()
    # set folders
    corpus_folder = 'corpus/' + opts.corpus_name + '/' + opts.corpus_name
    index_folder = 'corpus/' + opts.corpus_name + '/index'
    model_folder = 'corpus/' + opts.corpus_name + '/models/' + opts.model_name
    data_folder = 'corpus/' + opts.corpus_name + '/data'
    query_folder = 'corpus/' + opts.corpus_name + '/queries'
    qrels_folder = 'corpus/' + opts.corpus_name + '/qrels'
    rankings_folder = 'corpus/' + opts.corpus_name + '/rankings/' + opts.model_name

    # create folders
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)
    if not os.path.exists(index_folder):
        os.makedirs(index_folder)
    if not os.path.exists(rankings_folder):
        os.makedirs(rankings_folder)
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)
    if not os.path.exists(query_folder) or not os.path.exists(qrels_folder):
        print(
            'folders containing queries and qrels are required - please add them'
        )
        return False

    # establish connection with UMLS db
    umls_lookup = umls.UMLSLookup()
    # load queries
    q = tf_utils.read_ohsu_queries(query_folder + '/' + opts.query_fname)
    """
	PRE PROCESSING
	"""

    # pre process distributional data
    if not os.path.exists(data_folder + '/words.json'):
        # compute required data
        words = tf_utils.process_corpus(corpus_folder, data_folder)
        # build dataset to train CBOW + RMC model
        data, cfs, word_dict, reverse_word_dict = tf_utils.build_dataset(
            words, opts.min_cut_freq, data_folder)
        del words  # free memory from unnecessary data
        # 'cfs' is assumed to hold the (word, frequency) counts computed above
        print('Most common words (+ UNK)', cfs[:10])
        print('Total number of words (+ UNK) within {}: {}'.format(
            opts.corpus_name, len(data)))
        print('Number of unique words (+ UNK) for {}: {}'.format(
            opts.corpus_name, len(cfs)))
    else:
        # load required data
        print('load processed data required to train CBOW + RMC model')
        with open(data_folder + '/data.json', 'r') as df:
            data = json.load(df)
        # docs.json and idfs.json are loaded later, right before retrieval
        with open(data_folder + '/cfs.json', 'r') as cff:
            cfs = json.load(cff)
        with open(data_folder + '/word_dict.json', 'r') as wdf:
            word_dict = json.load(wdf)
        # compute reverse word dictionary
        reverse_word_dict = dict(zip(word_dict.values(), word_dict.keys()))

    # pre process relational data
    if not os.path.exists(data_folder + '/term2cui.json'):
        # map terms to cuis using QuickUMLS
        term2cui = tf_utils.get_term2cui(word_dict,
                                         data_folder,
                                         threshold=opts.threshold,
                                         stypes_fname=opts.stypes_fname)
    else:
        # load (term, cui) pairs
        print('load (term, cui) pairs')
        with open(data_folder + '/term2cui.json', 'r') as tcf:
            term2cui = json.load(tcf)
    # get synonyms for each word in the vocabulary using the semantic lexicon
    print('get synonyms for each word in the vocabulary using the semantic lexicon')
    syns = tf_utils.get_syns(term2cui, word_dict, umls_lookup)
    # get synonyms as an array of synonym pairs
    syns = [
        list(itertools.product([word], synset))
        for word, synset in syns.items()
    ]
    syns = [pairs for pairs in syns if pairs]
    syns = np.array([pair for pairs in syns for pair in pairs])
    print('Total number of synonymy relations within {}: {}'.format(
        opts.corpus_name, syns.shape[0]))

    # load required data to perform retrieval
    print('load required data to perform retrieval')
    with open(data_folder + '/docs.json', 'r') as cf:
        corpus = json.load(cf)
    with open(data_folder + '/idfs.json', 'r') as wf:
        idfs = json.load(wf)
    # get docs and docnos from corpus
    docnos = list(corpus.keys())
    docs = list(corpus.values())
    del corpus  # free memory space
    """
	NETWORK TRAINING
	"""

    # begin training
    with tf.Graph().as_default(), tf.Session() as sess:
        # set graph-level random seed
        tf.set_random_seed(opts.seed)
        # start data index
        tf_globals.initialize()
        # setup the model
        model = JointRCM(len(word_dict), syns, opts)
        # create model saving operation - keeps as many saved models as number of epochs
        saver = tf.train.Saver(max_to_keep=opts.epochs)
        # initialize the variables using global_variables_initializer()
        sess.run(tf.global_variables_initializer())

        print('start training')
        print('number of batches per epoch: {}'.format(
            len(data) // opts.batch_size))
        best_score_per_epoch = []
        for epoch in range(opts.epochs):
            # train CBOW
            print('training epoch {}'.format(epoch + 1))
            # loop over (len(data) // opts.batch_size) batches
            for i in tqdm(range(len(data) // opts.batch_size)):
                batch_inputs, batch_labels = tf_utils.generate_batch(
                    data, opts.batch_size, opts.context_window)
                feed_dict = {
                    model.inputs: batch_inputs,
                    model.labels: batch_labels
                }
                # run cbow train_op
                sess.run(model.cbow_train_op, feed_dict=feed_dict)
                if (i + 1) % opts.minimize_rcm_every == 0:
                    # run rcm train_op
                    sess.run(model.rcm_train_op)
            # store trained CBOW
            print('storing model at epoch {}'.format(epoch + 1))
            model_checkpoint_path = os.path.join(
                os.getcwd(), model_folder,
                opts.model_name + str(epoch + 1) + '.ckpt')
            save_path = saver.save(sess, model_checkpoint_path)
            print("model saved in file: {}".format(save_path))
            """
			DOCUMENT RETRIEVAL 
			"""

            # get embs after training epoch
            word_embs = sess.run(model.word_embs)
            # evaluate CBOW for IR tasks
            print('evaluating at epoch {}'.format(epoch + 1))
            # compute doc embeddings and return list of filtered doc ids
            doc_embs, filt_ids = tf_utils.compute_doc_embs(
                docs, word_dict, word_embs, idfs)
            # set query embs and ids
            q_embs = []
            q_ids = []
            # loop over queries and generate rankings
            for qid, qtext in q.items():
                # prepare queries for semantic matching
                q_proj = tf_utils.prepare_query(qtext[opts.field], word_dict,
                                                word_embs)
                if q_proj is None:
                    print('query {} does not contain known terms'.format(qid))
                else:
                    q_embs.append(q_proj)
                    q_ids.append(qid)
            q_embs = np.array(q_embs)
            # perform search and evaluate model effectiveness
            tf_utils.semantic_search(docnos, doc_embs, q_ids, q_embs,
                                     rankings_folder,
                                     opts.model_name + '_' + str(epoch + 1),
                                     filt_ids)
            scores = tf_utils.evaluate(
                ['Rprec', 'P_5', 'P_10', 'P_20', 'ndcg', 'map'],
                rankings_folder, opts.model_name + '_' + str(epoch + 1),
                qrels_folder, opts.qrels_fname)
            best_score_per_epoch.append(scores[opts.ref_measure])
    print('best model (in terms of {}) found at epoch: {}'.format(
        opts.ref_measure,
        np.argsort(best_score_per_epoch)[-1] + 1))
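
tf_utils.compute_doc_embs is external to this example. A plausible minimal sketch of the idea, assuming it builds an IDF-weighted average of word embeddings (an assumption, not the confirmed implementation; all names below are illustrative):

import numpy as np

def average_doc_embedding(doc_tokens, word_dict, word_embs, idfs):
    """IDF-weighted average of the embeddings of in-vocabulary tokens."""
    vecs, weights = [], []
    for tok in doc_tokens:
        if tok in word_dict:
            vecs.append(word_embs[word_dict[tok]])
            weights.append(idfs.get(tok, 1.0))
    if not vecs:
        return None  # document has no in-vocabulary terms
    return np.average(np.array(vecs), axis=0, weights=weights)

word_dict = {'heart': 0, 'attack': 1}
word_embs = np.random.rand(2, 4)
print(average_doc_embedding(['heart', 'attack', 'oov'], word_dict,
                            word_embs, {'heart': 2.0, 'attack': 1.0}))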
Example #6
def main():
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    # set folders
    corpus_folder = 'corpus/' + FLAGS.corpus_name + '/' + FLAGS.corpus_name
    index_folder = 'corpus/' + FLAGS.corpus_name + '/index'
    # model_folder = 'corpus/' + FLAGS.corpus_name + '/models/' + FLAGS.model_name
    data_folder = 'corpus/' + FLAGS.corpus_name + '/data'
    query_folder = 'corpus/' + FLAGS.corpus_name + '/queries'
    qrels_folder = 'corpus/' + FLAGS.corpus_name + '/qrels'
    rankings_folder = 'corpus/' + FLAGS.corpus_name + '/rankings/' + FLAGS.model_name

    # create folders
    if not os.path.exists(rankings_folder):
        os.makedirs(rankings_folder)
        # if not os.path.exists(model_folder):
        # os.makedirs(model_folder)
    if not os.path.exists(query_folder) or not os.path.exists(qrels_folder):
        print(
            'folders containing queries and qrels are required - please add them'
        )
        return False

    # set random seed - enable reproducibility
    np.random.seed(FLAGS.seed)
    # establish connection with UMLS db
    umls_lookup = umls.UMLSLookup()

    # load required data
    print(
        'load processed data required to retrofit word vectors and perform retrieval tasks'
    )
    with open(data_folder + '/docs.json', 'r') as df:
        corpus = json.load(df)
    with open(data_folder + '/idfs.json', 'r') as wf:
        idfs = json.load(wf)
    with open(data_folder + '/cfs.json', 'r') as cff:
        cfs = json.load(cff)
    with open(data_folder + '/word_dict.json', 'r') as wdf:
        word_dict = json.load(wdf)
    # compute reverse word dict
    reverse_word_dict = dict(zip(word_dict.values(), word_dict.keys()))
    # store docnos and docs as separate lists
    docnos = list(corpus.keys())
    docs = list(corpus.values())
    del corpus  # free memory space

    # pre process relational data
    if not os.path.exists(data_folder + '/term2cui.json'):
        # map terms to cuis using QuickUMLS
        term2cui = tf_utils.get_term2cui(word_dict,
                                         data_folder,
                                         threshold=FLAGS.threshold,
                                         stypes_fname=FLAGS.stypes_fname)
    else:
        # load (term, cui) pairs
        print('load (term, cui) pairs')
        with open(data_folder + '/term2cui.json', 'r') as tcf:
            term2cui = json.load(tcf)
    """
	SEMANTIC PROCESSING
	"""

    # load semantic model
    print('load semantic model')
    with tf.Session() as sess:
        # restore model and get required tensors
        saver = tf.train.import_meta_graph(FLAGS.semantic_model + '.ckpt.meta')
        saver.restore(sess, FLAGS.semantic_model + '.ckpt')
        word_embs = sess.run(tf.get_default_graph().get_tensor_by_name(
            'embeddings/word_embs:0'))
    """
	RETROFITTING
	"""

    if FLAGS.retrofit:
        # get synonyms for each word within vocabulary
        print('get synonyms')
        syns = tf_utils.get_syns(term2cui, word_dict, umls_lookup)
        if FLAGS.syn_weights:
            # convert collection frequencies from list to dict
            cfs = dict(cfs)
        else:
            cfs = None
        # retrofit word vectors
        print('retrofit word vectors for {} iterations'.format(
            FLAGS.iterations))
        word_embs = retrofit(word_embs,
                             syns,
                             reverse_word_dict,
                             FLAGS.iterations,
                             alpha=1.0,
                             beta=FLAGS.beta,
                             cfs=cfs)

    # compute doc embeddings
    print('compute document vectors w/ retrofitted word vectors')
    doc_embs, filt_ids = tf_utils.compute_doc_embs(docs, word_dict, word_embs,
                                                   idfs)

    if not FLAGS.reranking:
        """
		RETRIEVAL
		"""
        print('perform retrieval over the entire collection')
        # load queries
        q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
        # set query embs and ids
        q_embs = []
        q_ids = []
        # loop over queries and generate rankings
        for qid, qtext in q.items():
            # prepare queries for semantic matching
            q_proj = tf_utils.prepare_query(qtext[FLAGS.qfield], word_dict,
                                            word_embs)
            if q_proj is None:
                print('query {} does not contain known terms'.format(qid))
            else:
                q_embs.append(q_proj)
                q_ids.append(qid)
        q_embs = np.array(q_embs)
        # perform search and evaluate model effectiveness
        tf_utils.semantic_search(docnos, doc_embs, q_ids, q_embs,
                                 rankings_folder, FLAGS.model_name)
        scores = tf_utils.evaluate(
            ['Rprec', 'P_5', 'P_10', 'P_20', 'ndcg', 'map'], rankings_folder,
            FLAGS.model_name, qrels_folder, FLAGS.qrels_fname)

    else:
        """
		RE-RANKING
		"""
        print('perform re-ranking over top 1000 documents from a baseline run')
        # parse and store qrels
        with open(qrels_folder + '/' + FLAGS.qrels_fname + '.txt',
                  'r') as qrelf:
            qrels = pytrec_eval.parse_qrel(qrelf)
        # initialize evaluator over qrels
        evaluator = pytrec_eval.RelevanceEvaluator(
            qrels, {'P'})  # evaluate on Precision

        # parse input run
        print('parse input run')
        with open(FLAGS.run_path, 'r') as runf:
            run = pytrec_eval.parse_run(runf)

        # load queries
        q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
        # get query ids
        qids = list(q.keys())
        # shuffle query ids
        np.random.shuffle(qids)

        if FLAGS.fixed_gamma:
            # perform re-ranking based on a fixed value of gamma
            print('perform re-ranking w/ gamma=%.2f' % (FLAGS.fixed_gamma))
            # initialize combined (output) run
            crun = trec_utils.OnlineTRECRun(FLAGS.model_name + '_gamma_' +
                                            str(FLAGS.fixed_gamma))
            # combine rankings using fixed gamma
            comb_run = tf_utils.compute_combined_run(
                run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
                SCORE_NORMALIZERS[FLAGS.normalizer], FLAGS.fixed_gamma)
            # store test ranking in combined run
            for qid, doc_ids_and_scores in comb_run.items():
                crun.add_ranking(
                    qid, [(score, docno)
                          for docno, score in doc_ids_and_scores.items()])
            # close and store run
            crun.close_and_write(out_path=rankings_folder + '/' +
                                 FLAGS.model_name + '_gamma_' +
                                 str(FLAGS.fixed_gamma) + '.txt',
                                 overwrite=True)
            print('combined run stored in {}'.format(rankings_folder))
            # evaluate combined run
            print('evaluate run combined w/ gamma=%.2f' % (FLAGS.fixed_gamma))
            tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                              FLAGS.model_name + '_gamma_' +
                              str(FLAGS.fixed_gamma), qrels_folder,
                              FLAGS.qrels_fname)
        else:
            # learn optimal weight to combine runs
            print("learn optimal weight to combine runs with sweep: {}".format(
                FLAGS.sweep))
            # set variable to store scores and weights
            scores_and_weights = []
            # initialize kfold with FLAGS.num_folds
            kfold = sklearn.model_selection.KFold(n_splits=FLAGS.num_folds)
            for fold, (train_qids, test_qids) in enumerate(kfold.split(qids)):
                print('fold no. {}'.format(fold))
                # restrict queries to train_qids and test_qids
                qtrain = {qids[ix]: q[qids[ix]] for ix in train_qids}
                qtest = {qids[ix]: q[qids[ix]] for ix in test_qids}
                # obtain best combination on training queries
                train_score, best_train_weight = max(
                    tf_utils.perform_reranking(
                        run, FLAGS.qfield, qtrain, docnos, doc_embs, word_dict,
                        word_embs, FLAGS.sweep,
                        SCORE_NORMALIZERS[FLAGS.normalizer], FLAGS.ref_measure,
                        evaluator))
                print(
                    'fold %d: best_train_weight=%.2f, %s =%.4f' %
                    (fold, best_train_weight, FLAGS.ref_measure, train_score))
                # compute combined run with best combination on test queries
                test_crun = tf_utils.compute_combined_run(
                    run, FLAGS.qfield, qtest, docnos, doc_embs, word_dict,
                    word_embs, SCORE_NORMALIZERS[FLAGS.normalizer],
                    best_train_weight)
                # evaluate test run
                test_res = evaluator.evaluate(test_crun)
                # compute aggregated measure score for test queries
                test_score = pytrec_eval.compute_aggregated_measure(
                    FLAGS.ref_measure, [
                        qscore[FLAGS.ref_measure]
                        for qscore in test_res.values()
                    ])
                # store averaged scores w/ best weights
                scores_and_weights.append(
                    (np.mean([train_score, test_score]), best_train_weight))

            # get (best) weight that produces the highest averaged score
            best_score, best_weight = max(scores_and_weights)
            print('found best weight=%.2f' % (best_weight))
            # initialize combined (output) run
            crun = trec_utils.OnlineTRECRun(FLAGS.model_name +
                                            '_best_weight_' +
                                            str(best_weight))
            # compute combined run based on test weight
            comb_run = tf_utils.compute_combined_run(
                run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
                SCORE_NORMALIZERS[FLAGS.normalizer], best_weight)
            # store ranking in crun
            for qid, doc_ids_and_scores in comb_run.items():
                crun.add_ranking(
                    qid, [(score, doc_id)
                          for doc_id, score in doc_ids_and_scores.items()])
            # close and store run
            crun.close_and_write(out_path=rankings_folder + '/' +
                                 FLAGS.model_name + '_best_weight_' +
                                 str(best_weight) + '.txt',
                                 overwrite=True)
            print('combined run stored in {}'.format(rankings_folder))
            # evaluate combined run
            print(
                'evaluate run combined w/ {}-fold cross validation and best weight={}'
                .format(FLAGS.num_folds, best_weight))
            tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                              FLAGS.model_name + '_best_weight_' +
                              str(best_weight), qrels_folder,
                              FLAGS.qrels_fname)
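
tf_utils.compute_combined_run is also external. A minimal sketch of the score fusion it presumably performs - a convex combination of normalized lexical and semantic scores - under that assumption (min-max normalization stands in for SCORE_NORMALIZERS; all names are illustrative):

def minmax(scores):
    lo, hi = min(scores.values()), max(scores.values())
    span = (hi - lo) or 1.0
    return {doc: (s - lo) / span for doc, s in scores.items()}

def combine_runs(lexical, semantic, gamma):
    """Weight normalized lexical scores by gamma and semantic scores by 1 - gamma."""
    lex, sem = minmax(lexical), minmax(semantic)
    docs = set(lex) | set(sem)
    return {d: gamma * lex.get(d, 0.0) + (1 - gamma) * sem.get(d, 0.0)
            for d in docs}

print(combine_runs({'d1': 12.3, 'd2': 8.9}, {'d1': 0.42, 'd3': 0.77}, gamma=0.6))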
Example #7
def main():
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    # load options
    opts = Options()
    # set folders
    query_folder = 'corpus/' + opts.corpus_name + '/queries'
    qrels_folder = 'corpus/' + opts.corpus_name + '/qrels'
    rankings_folder = 'corpus/' + opts.corpus_name + '/rankings/' + opts.model_name

    # create folders
    if not os.path.exists(rankings_folder):
        os.makedirs(rankings_folder)
    if not os.path.exists(query_folder) or not os.path.exists(qrels_folder):
        print(
            'folders containing queries and qrels are required - please add them'
        )
        return False

    # load utils functions - set random seed
    utils = Utils(opts.seed)
    # load UMLS lookup functions
    umls_lookup = umls.UMLSLookup()

    # load queries
    print('load {} queries'.format(opts.corpus_name))
    queries = utils.read_queries(query_folder + '/' + opts.qfname)

    # load BoW run
    bow_model = read_ranking(opts.bow_model_path)

    # load models
    print('load models')
    txt_d2v_model = gensim.models.Doc2Vec.load(opts.txt_d2v_model_path)
    concept_d2v_model = gensim.models.Doc2Vec.load(opts.concept_d2v_model_path)
    retro_model = np.load(opts.retro_model_path, allow_pickle=True).item()

    ##### QUERY EXPANSION #####

    N_top_docs = opts.num_top_docs
    N_top_words_doc = opts.num_top_words_per_doc
    N_top_words_query = opts.num_top_words

    queries_t = {}
    queries_c = {}
    queries_r = {}

    print('perform query expansion for each model')
    for qid, qtext in tqdm(queries.items()):

        # get N_top_docs for given query
        print('get top {} docs for query {}'.format(N_top_docs, qid))
        query_top_docs = get_query_top_docs(bow_model, qid, N_top_docs)
        # for each doc in query_top_docs, pick the N_top_words_doc most similar
        # terms and add them to a pool; then sort the pool and take the
        # N_top_words_query best terms to append to the query

        top_concept_t = {}
        top_concept_c = {}
        top_concept_r = {}

        for top_doc_id in query_top_docs:
            # reset per-doc top-word lists - otherwise top_c/top_r would be
            # undefined (or stale) for docs missing from a model
            top_c = []
            top_r = []
            top_t = top_words_of_doc(txt_d2v_model, top_doc_id,
                                     N_top_words_doc)

            if top_doc_id in concept_d2v_model.docvecs:
                top_c = top_words_of_doc(concept_d2v_model, top_doc_id,
                                         N_top_words_doc)

            if top_doc_id in retro_model:
                top_retro_doc = retro_model[top_doc_id]
                if opts.beta < 0.5:  # prioritize concepts
                    top_r = top_words_of_vector(concept_d2v_model,
                                                top_retro_doc, N_top_words_doc)
                else:  # prioritize words
                    top_r = top_words_of_vector(txt_d2v_model, top_retro_doc,
                                                N_top_words_doc)

            for i in range(N_top_words_doc):
                if len(top_t) == N_top_words_doc:  # doc_id found by txt_d2v_model
                    term, score = top_t[i]
                    if term in top_concept_t:  # combsum
                        top_concept_t[term] += score
                    else:
                        top_concept_t[term] = score

                if len(top_c) == N_top_words_doc:  # doc_id found by concept_d2v_model
                    term, score = top_c[i]
                    if term in top_concept_c:  # combsum
                        top_concept_c[term] += score
                    else:
                        top_concept_c[term] = score

                if len(top_r) == N_top_words_doc:  # doc_id found by retro_model
                    term, score = top_r[i]
                    if term in top_concept_r:  # combsum
                        top_concept_r[term] += score
                    else:
                        top_concept_r[term] = score

        # sort pooled candidates by ascending score: [(id1, min_sim), ..., (idn, max_sim)]
        sorted_candidates_t = sorted(top_concept_t.items(), key=operator.itemgetter(1))
        sorted_candidates_c = sorted(top_concept_c.items(), key=operator.itemgetter(1))
        sorted_candidates_r = sorted(top_concept_r.items(), key=operator.itemgetter(1))
        top_term_t = sorted_candidates_t[-N_top_words_query:]
        top_term_c = sorted_candidates_c[-N_top_words_query:]
        top_term_r = sorted_candidates_r[-N_top_words_query:]

        query_new_t = qtext[opts.qfield]
        query_new_c = qtext[opts.qfield]
        query_new_r = qtext[opts.qfield]

        # query_new_t = ''
        # query_new_c = ''
        # query_new_r = ''

        count_t = 0
        count_c = 0
        count_r = 0

        for term, _ in top_term_t:
            query_new_t += ' ' + term
            count_t += 1

        for cui, _ in top_term_c:
            cui = cui.upper()
            term_variants = [
                term_and_source for term_and_source in
                umls_lookup.lookup_synonyms(cui=cui, preferred=True)
                if term_and_source[1] == 'MSH'
            ]
            if not term_variants:  # no MSH entry for this CUI - skip it
                continue
            term = term_variants[0][0]  # preferred term
            query_new_c += ' ' + term
            count_c += 1

        if opts.beta < 0.5:
            for cui, _ in top_term_r:
                cui = cui.upper()
                term_variants = [
                    term_and_source for term_and_source in
                    umls_lookup.lookup_synonyms(cui=cui, preferred=True)
                    if term_and_source[1] == 'MSH'
                ]
                if not term_variants:  # no MSH entry for this CUI - skip it
                    continue
                term = term_variants[0][0]  # preferred term
                query_new_r += ' ' + term
                count_r += 1
        else:
            for term, _ in top_term_r:
                query_new_r += ' ' + term
                count_r += 1

        queries_t[qid] = {opts.qfield: query_new_t}
        queries_c[qid] = {opts.qfield: query_new_c}
        queries_r[qid] = {opts.qfield: query_new_r}

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    # set Index instance
    ix = Index()

    print('search and evaluate text-based doc2vec query expansion')
    # perform lexical search over given query field w/ chosen model
    ix.lexical_search(queries_t, opts.qfield, rankings_folder,
                      opts.model_name + '_txt_d2v')
    # evaluate performed search
    scores = utils.evaluate(['recall.20', 'P_20', 'map'], rankings_folder,
                            opts.model_name + '_txt_d2v', qrels_folder,
                            opts.qrels_fname)

    print('search and evaluate concept-based doc2vec query expansion')
    # perform lexical search over given query field w/ chosen model
    ix.lexical_search(queries_c, opts.qfield, rankings_folder,
                      opts.model_name + '_concept_d2v')
    # evaluate performed search
    scores = utils.evaluate(['recall.20', 'P_20', 'map'], rankings_folder,
                            opts.model_name + '_concept_d2v', qrels_folder,
                            opts.qrels_fname)

    print('search and evaluate retrofitted doc2vec query expansion')
    # perform lexical search over given query field w/ chosen model
    ix.lexical_search(queries_r, opts.qfield, rankings_folder,
                      opts.model_name + '_retro_d2v')
    # evaluate performed search
    scores = utils.evaluate(['recall.20', 'P_20', 'map'], rankings_folder,
                            opts.model_name + '_retro_d2v', qrels_folder,
                            opts.qrels_fname)
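
The CombSUM pooling used in the expansion loop above, reduced to a standalone toy: each candidate term's similarity scores are summed across the top documents, and the highest-scoring terms are kept (the function name and scores below are illustrative).

from collections import defaultdict

def pool_expansion_terms(per_doc_top_words, n_top_words_query):
    pooled = defaultdict(float)
    for top_words in per_doc_top_words:
        for term, score in top_words:
            pooled[term] += score  # combsum
    # ascending sort, so the best candidates sit at the tail
    ranked = sorted(pooled.items(), key=lambda kv: kv[1])
    return ranked[-n_top_words_query:]

doc1 = [('stenosis', 0.81), ('aortic', 0.77)]
doc2 = [('stenosis', 0.69), ('valve', 0.64)]
print(pool_expansion_terms([doc1, doc2], 2))  # [('aortic', 0.77), ('stenosis', ~1.5)]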