Exemple #1
0
    def testTermQuery(self):
        """Check a hand-built boolean query against the equivalent string query.

        Term queries for 'information' and 'retrieval' are added to a boolean
        query builder as 'should' clauses. The hits for the built query must
        match, in docid and score, the hits for the plain string
        'information retrieval'.
        """
        should = querybuilder.JBooleanClauseOccur['should'].value
        query_builder = querybuilder.get_boolean_query_builder()
        for term in ('information', 'retrieval'):
            query_builder.add(querybuilder.get_term_query(term), should)

        hits1 = self.searcher.search(query_builder.build())
        hits2 = self.searcher.search('information retrieval')

        # zip() stops at the shorter sequence, so a length mismatch between
        # the two result lists would otherwise go unnoticed.
        self.assertEqual(len(hits1), len(hits2))
        for h1, h2 in zip(hits1, hits2):
            self.assertEqual(h1.docid, h2.docid)
            self.assertEqual(h1.score, h2.score)
Exemple #2
0
    def testIncompatabilityWithRM3(self):
        """Check that a query-object search is rejected once RM3 is enabled.

        A boolean query over 'information' and 'retrieval' is first expected
        to return 10 hits. After set_rm3() is called on the searcher, running
        the same query object must raise NotImplementedError.
        """
        should = querybuilder.JBooleanClauseOccur['should'].value
        builder = querybuilder.get_boolean_query_builder()
        for word in ('information', 'retrieval'):
            clause = querybuilder.get_term_query(word)
            builder.add(clause, should)
        boolean_query = builder.build()

        # Baseline: the built query works before RM3 is switched on.
        results = self.searcher.search(boolean_query)
        self.assertEqual(10, len(results))

        self.searcher.set_rm3()
        self.assertTrue(self.searcher.is_using_rm3())

        # With RM3 active the same query object is expected to be refused.
        with self.assertRaises(NotImplementedError):
            self.searcher.search(boolean_query)
def buildQuery(queries, en, es, de):
    """Build a boolean query from the words of every enabled language.

    Args:
        queries: Mapping indexed with the keys "en", "es" and "de"; each
            value is iterated to obtain words. Only the keys whose flag is
            truthy are read.
        en: Truthy to include the words under ``queries["en"]``.
        es: Truthy to include the words under ``queries["es"]``.
        de: Truthy to include the words under ``queries["de"]``.

    Returns:
        The result of ``build()`` on a boolean query builder to which one
        term query per word has been added as a 'should' clause.
    """
    should = querybuilder.JBooleanClauseOccur['should'].value
    builder = querybuilder.get_boolean_query_builder()

    # Languages are processed in the fixed order en, es, de.
    for language, enabled in (("en", en), ("es", es), ("de", de)):
        if not enabled:
            continue
        for word in queries[language]:
            builder.add(querybuilder.get_term_query(word), should)

    return builder.build()
    def testTermQuery2(self):
        """Check term queries built with a non-stemming analyzer.

        Term queries for 'inform' and 'retriev' are created with the analyzer
        returned by get_lucene_analyzer(stemming=False) and combined as
        'should' clauses. The hits must match, in docid and score, those of
        the plain string query 'information retrieval'.
        """
        # NOTE(review): 'inform'/'retriev' look like pre-stemmed forms of the
        # string query's words, which is presumably why stemming is disabled.
        analyzer_args = {'stemming': False}
        term_query1 = querybuilder.get_term_query('inform', analyzer=get_lucene_analyzer(**analyzer_args))
        term_query2 = querybuilder.get_term_query('retriev', analyzer=get_lucene_analyzer(**analyzer_args))

        should = querybuilder.JBooleanClauseOccur['should'].value

        boolean_query1 = querybuilder.get_boolean_query_builder()
        boolean_query1.add(term_query1, should)
        boolean_query1.add(term_query2, should)

        hits1 = self.searcher.search(boolean_query1.build())
        hits2 = self.searcher.search('information retrieval')

        # zip() stops at the shorter sequence, so a length mismatch between
        # the two result lists would otherwise go unnoticed.
        self.assertEqual(len(hits1), len(hits2))
        for h1, h2 in zip(hits1, hits2):
            self.assertEqual(h1.docid, h2.docid)
            self.assertEqual(h1.score, h2.score)
Exemple #5
0
    def testBuildBoostedQuery(self):
        """Check that boost queries scale the hit scores as expected.

        First both terms are boosted by 2 and compared with the unboosted
        query: docids must match and each score must equal twice the
        unboosted score (within 0.001). Then the terms are boosted by 2 and 3
        respectively, and every score must differ from the uniformly boosted
        run.
        """
        term_query1 = querybuilder.get_term_query('information')
        term_query2 = querybuilder.get_term_query('retrieval')

        should = querybuilder.JBooleanClauseOccur['should'].value

        # Both terms boosted by the same factor.
        uniform_builder = querybuilder.get_boolean_query_builder()
        uniform_builder.add(querybuilder.get_boost_query(term_query1, 2.), should)
        uniform_builder.add(querybuilder.get_boost_query(term_query2, 2.), should)
        hits1 = self.searcher.search(uniform_builder.build())

        # The same terms without any boost.
        plain_builder = querybuilder.get_boolean_query_builder()
        plain_builder.add(term_query1, should)
        plain_builder.add(term_query2, should)
        hits2 = self.searcher.search(plain_builder.build())

        # zip() stops at the shorter sequence, so check the lengths first;
        # otherwise a length mismatch would go unnoticed.
        self.assertEqual(len(hits1), len(hits2))
        for h1, h2 in zip(hits1, hits2):
            self.assertEqual(h1.docid, h2.docid)
            self.assertAlmostEqual(h1.score, h2.score * 2, delta=0.001)

        # Different boost per term.
        mixed_builder = querybuilder.get_boolean_query_builder()
        mixed_builder.add(querybuilder.get_boost_query(term_query1, 2.), should)
        mixed_builder.add(querybuilder.get_boost_query(term_query2, 3.), should)
        hits3 = self.searcher.search(mixed_builder.build())

        self.assertEqual(len(hits1), len(hits3))
        for h1, h3 in zip(hits1, hits3):
            self.assertNotEqual(h1.score, h3.score)
Exemple #6
0
def search(expander, rankers, topicreader, index, anserini, output):
    """Run retrieval once per ranker and write one run file per ranker.

    For each ranker the queries are read from
    ``'{output}.{model_name}.txt'`` and the results are written to
    ``'{output}.{model_name}.{ranker_name}.txt'``. One of three strategies
    is used:

    * ``expander`` is an ``OnFields`` or ``BertQE`` instance: every raw
      query is evaluated into a term -> weight mapping, turned into a
      boolean query of boosted term queries and searched in-process.
    * ``topicreader == 'TsvString'``: every ``qid<TAB>text`` line of the
      query file is searched in-process.
    * otherwise: Anserini's ``SearchCollection`` is run as a shell command.

    Args:
        expander: Object providing ``get_model_name()``; its type selects
            the search strategy.
        rankers: Iterable of ranker flags. ``'-bm25'`` and ``'-qld'``
            configure the in-process searcher; any flag is forwarded to the
            command line in the fallback strategy.
        topicreader: Topic reader name, passed to ``utils.get_raw_query``
            and to the ``-topicreader`` command-line option.
        index: Index location, passed to ``SimpleSearcher`` and ``-index``.
        anserini: Path prefix to which
            ``'target/appassembler/bin/SearchCollection'`` is appended
            verbatim (no separator is inserted).
        output: Prefix of the query and run file names.

    Raises:
        Exception: Any error raised while searching is printed together
            with its traceback and then re-raised.
    """
    # Information Retrieval using Anserini
    rank_cmd = '{}target/appassembler/bin/SearchCollection'.format(anserini)

    model_name = expander.get_model_name()
    try:
        Q_filename = '{}.{}.txt'.format(output, model_name)
        for ranker in rankers:
            Q_pred = '{}.{}.{}.txt'.format(output, model_name, utils.get_ranker_name(ranker))
            searcher = SimpleSearcher(index)
            if ranker == '-bm25':
                searcher.set_bm25(0.9, 0.4)
            elif ranker == '-qld':
                searcher.set_qld()

            if isinstance(expander, (OnFields, BertQE)):
                # `with` guarantees the run file is closed even when a query fails.
                with open(Q_pred, 'w') as run_file:
                    list_of_raw_queries = utils.get_raw_query(topicreader, Q_filename)
                    # SECURITY: eval() executes whatever the query file contains.
                    # Only use this with query files produced by a trusted
                    # expander; consider ast.literal_eval if the content is
                    # always a plain dict literal.
                    q_dic = {qid.strip(): eval(query) for qid, query in list_of_raw_queries.items()}

                    should = querybuilder.JBooleanClauseOccur['should'].value
                    for qid, weighted_terms in q_dic.items():
                        boosted_terms = []
                        for q_term, q_weight in weighted_terms.items():
                            try:
                                boosted_terms.append(
                                    querybuilder.get_boost_query(querybuilder.get_term_query(q_term), q_weight))
                            except Exception:
                                # Best effort: the term does not exist in the
                                # indexed collection (e.g. stop words), skip it.
                                continue

                        boolean_query_builder = querybuilder.get_boolean_query_builder()
                        for boosted_term in boosted_terms:
                            boolean_query_builder.add(boosted_term, should)

                        hits = searcher.search(boolean_query_builder.build(), k=10000)
                        # Only the first 1000 hits are written; bounding the loop
                        # by len(hits) replaces the former catch-all `except`
                        # that also hid genuine write/format errors.
                        seen_docids = set()
                        for i in range(min(1000, len(hits))):
                            hit = hits[i]
                            if hit.docid not in seen_docids:
                                seen_docids.add(hit.docid)
                                run_file.write(f'{qid} Q0  {hit.docid:15} {i+1:2}  {hit.score:.5f} Pyserini \n')

            elif topicreader == 'TsvString':
                with open(Q_pred, 'w') as run_file, open(Q_filename, 'r') as topics_file:
                    for line in topics_file:
                        if not line.strip():
                            # A blank (e.g. trailing) line carries no query.
                            continue
                        qid, qtext = line.split('\t')
                        hits = searcher.search(qtext, k=1000)
                        seen_docids = set()
                        for i in range(len(hits)):
                            hit = hits[i]
                            if hit.docid not in seen_docids:
                                seen_docids.add(hit.docid)
                                run_file.write(f'{qid} Q0  {hit.docid:15} {i+1:2} {hit.score:.5f} Pyserini\n')

            else:
                cli_cmd = '\"{}\" {} -threads 44 -topicreader {} -index {} -topics {} -output {}'.format(rank_cmd, ranker, topicreader, index, Q_filename, Q_pred)
                print('{}\n'.format(cli_cmd))
                # Closing the pipe releases it once the output has been read.
                with os.popen(cli_cmd) as stream:
                    print(stream.read())
    except Exception:  # all exception related to calling the SearchCollection cannot be captured here!! since it is outside the process scope
        print('INFO: MAIN: SEARCH: There has been error in {}!\n{}'.format(expander, traceback.format_exc()))
        raise