Example #1
from collections import defaultdict

from gensim.corpora import Dictionary
from pyserini.index import IndexReader

# BaselineBM25 and Utils are project-local modules from the same repository.


class LDAReranker:  # class name not shown in the original excerpt; chosen for illustration
    def __init__(self, strategy="GREEDY", seed=2020, max_iter=20):
        """
        This class produces a baseline BM25 ranking and uses LDA topic modelling
        in combination with the general re-ranking procedure of Huang and Hu (2009)
        """
        self.seed = seed
        self.max_iter = max_iter
        self.utils = Utils()

        # Number of documents to rank and re-rank
        self.N = 100

        # Select a strategy for weighting the final topics
        self.strategy = strategy
    
        # K to use in TOP-K-AVG strategy
        self.top_k = 10 

        # TODO: ideally, avoid re-running the full baseline ranking every time we re-rank
        self.baseline = BaselineBM25(k=self.N)
        self.baseline.rank()

        # For each topic, the system outputs N retrieved articles.
        self.batch_hits = self.baseline.get_batch_hits()

        # Read index to retrieve document contents
        # N.B. the `contents` field is currently empty; we stored "raw" instead.
        self.index_loc = self.baseline.get_index_loc()
        reader = IndexReader(self.index_loc)

        # Vocabulary in index
        #vocabulary = [ term.term for term in reader.terms()]
        #print(f"{len(vocabulary)} terms in vocabulary")

        # Topics and retrieved articles are represented as keyword sequences
        self.topics = self.baseline.get_topics()
        self.topic_keywords = {
            topic_id: topic['title'].lower().split()
            for topic_id, topic in self.topics.items()
        }
        self.query_ids = self.baseline.get_query_ids()

        # Preprocess (analyze) the documents retrieved for each query
        docs_per_query = {
            query_id: [reader.analyze(reader.doc(hit.docid).raw()) for hit in hits]
            for query_id, hits in self.batch_hits.items()
        }

        # Prepare bag-of-words dataset for gensim
        self.X = defaultdict(list)
        for query_id in self.query_ids:
            # Dictionary expects a list of token lists
            dictionary = Dictionary(docs_per_query[query_id])
            self.X[query_id] = [dictionary.doc2bow(doc) for doc in docs_per_query[query_id]]
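
# ---------------------------------------------------------------------
# Usage sketch (not part of the original source): the per-query corpora
# in self.X are already in gensim's bag-of-words format, so one LDA
# model per query could be fit as below. The class name LDAReranker and
# num_topics=5 are illustrative assumptions, not values from the source.
# ---------------------------------------------------------------------
from gensim.models import LdaModel

reranker = LDAReranker()
lda_per_query = {
    query_id: LdaModel(corpus=reranker.X[query_id],
                       num_topics=5,               # assumed value for illustration
                       random_state=reranker.seed,
                       iterations=reranker.max_iter)
    for query_id in reranker.query_ids
}
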
# ----------------
# IndexReaderUtils
# ----------------

from pyserini.index import IndexReader

# Instead of searching the index, we now retrieve a document from it directly
reader = IndexReader(index_loc)

# Retrieve a document using its docid ("hits" comes from an earlier search, not shown here)
#doc_id = 'd6ed7028c686e5756ceb0aa0c9b62e0d'
doc_id = hits[0].docid

# See class Document in https://github.com/castorini/pyserini/blob/master/pyserini/search/_base.py
# properties: docid; id (alias); lucene_document; contents; raw
doc = reader.doc(doc_id).raw()
#print(doc)

# Get analyzed form (tokenized, stemmed, stopwords removed)
analyzed = reader.analyze(doc)
#print(analyzed)

# The document vector (a {term: frequency} mapping) is also stored in the index
doc_vector = reader.get_document_vector(doc_id)
utils.top_n_words(doc_vector, 10)
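
# `utils.top_n_words` is a project-local helper. A plausible equivalent
# (an assumption, not the original implementation), relying on the fact
# that get_document_vector() returns a {term: frequency} dict:
def top_n_words(doc_vector, n):
    # Sort terms by descending in-document frequency and print the top n
    for term, freq in sorted(doc_vector.items(), key=lambda kv: kv[1], reverse=True)[:n]:
        print(f"{term}: {freq}")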

# ----------------
# Topics
# ----------------
from pyserini.search import get_topics
topics = get_topics('core18')
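
# get_topics returns a dict keyed by topic id; each value is a dict of
# topic fields (Example #1 above relies on the 'title' field). A quick
# sanity check, printing the first few titles:
for topic_id, topic in list(topics.items())[:3]:
    print(topic_id, topic['title'])
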
Example #3
import json
import multiprocessing
import pickle
import time

import pandas as pd
from tqdm import tqdm

from pyserini.index import IndexReader
from pyserini.ltr import *


class MsmarcoLtrSearcher:
    def __init__(self, model: str, ibm_model: str, index: str, data: str):
        self.model = model
        self.ibm_model = ibm_model
        self.fe = FeatureExtractor(index,
                                   max(multiprocessing.cpu_count() // 2, 1))
        self.index_reader = IndexReader(index)
        self.data = data

    def add_fe(self):
        #self.fe.add(RunList('collections/msmarco-ltr-passage/run.monot5.run_list.whole.trec','t5'))
        for qfield, ifield in [('analyzed', 'contents'),
                               ('text_unlemm', 'text_unlemm'),
                               ('text_bert_tok', 'text_bert_tok')]:
            print(qfield, ifield)
            self.fe.add(
                BM25Stat(SumPooler(),
                         k1=2.0,
                         b=0.75,
                         field=ifield,
                         qfield=qfield))
            self.fe.add(
                BM25Stat(AvgPooler(),
                         k1=2.0,
                         b=0.75,
                         field=ifield,
                         qfield=qfield))
            self.fe.add(
                BM25Stat(MedianPooler(),
                         k1=2.0,
                         b=0.75,
                         field=ifield,
                         qfield=qfield))
            self.fe.add(
                BM25Stat(MaxPooler(),
                         k1=2.0,
                         b=0.75,
                         field=ifield,
                         qfield=qfield))
            self.fe.add(
                BM25Stat(MinPooler(),
                         k1=2.0,
                         b=0.75,
                         field=ifield,
                         qfield=qfield))
            self.fe.add(
                BM25Stat(MaxMinRatioPooler(),
                         k1=2.0,
                         b=0.75,
                         field=ifield,
                         qfield=qfield))

            self.fe.add(
                LmDirStat(SumPooler(), mu=1000, field=ifield, qfield=qfield))
            self.fe.add(
                LmDirStat(AvgPooler(), mu=1000, field=ifield, qfield=qfield))
            self.fe.add(
                LmDirStat(MedianPooler(), mu=1000, field=ifield,
                          qfield=qfield))
            self.fe.add(
                LmDirStat(MaxPooler(), mu=1000, field=ifield, qfield=qfield))
            self.fe.add(
                LmDirStat(MinPooler(), mu=1000, field=ifield, qfield=qfield))
            self.fe.add(
                LmDirStat(MaxMinRatioPooler(),
                          mu=1000,
                          field=ifield,
                          qfield=qfield))

            self.fe.add(NormalizedTfIdf(field=ifield, qfield=qfield))
            self.fe.add(ProbalitySum(field=ifield, qfield=qfield))  # sic: spelling matches the pyserini class name

            self.fe.add(DfrGl2Stat(SumPooler(), field=ifield, qfield=qfield))
            self.fe.add(DfrGl2Stat(AvgPooler(), field=ifield, qfield=qfield))
            self.fe.add(DfrGl2Stat(MedianPooler(), field=ifield,
                                   qfield=qfield))
            self.fe.add(DfrGl2Stat(MaxPooler(), field=ifield, qfield=qfield))
            self.fe.add(DfrGl2Stat(MinPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                DfrGl2Stat(MaxMinRatioPooler(), field=ifield, qfield=qfield))

            self.fe.add(
                DfrInExpB2Stat(SumPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                DfrInExpB2Stat(AvgPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                DfrInExpB2Stat(MedianPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                DfrInExpB2Stat(MaxPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                DfrInExpB2Stat(MinPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                DfrInExpB2Stat(MaxMinRatioPooler(),
                               field=ifield,
                               qfield=qfield))

            self.fe.add(DphStat(SumPooler(), field=ifield, qfield=qfield))
            self.fe.add(DphStat(AvgPooler(), field=ifield, qfield=qfield))
            self.fe.add(DphStat(MedianPooler(), field=ifield, qfield=qfield))
            self.fe.add(DphStat(MaxPooler(), field=ifield, qfield=qfield))
            self.fe.add(DphStat(MinPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                DphStat(MaxMinRatioPooler(), field=ifield, qfield=qfield))

            self.fe.add(Proximity(field=ifield, qfield=qfield))
            self.fe.add(TpScore(field=ifield, qfield=qfield))
            self.fe.add(TpDist(field=ifield, qfield=qfield))

            self.fe.add(DocSize(field=ifield))

            self.fe.add(QueryLength(qfield=qfield))
            self.fe.add(QueryCoverageRatio(qfield=qfield))
            self.fe.add(UniqueTermCount(qfield=qfield))
            self.fe.add(MatchingTermCount(field=ifield, qfield=qfield))
            self.fe.add(SCS(field=ifield, qfield=qfield))

            self.fe.add(TfStat(AvgPooler(), field=ifield, qfield=qfield))
            self.fe.add(TfStat(MedianPooler(), field=ifield, qfield=qfield))
            self.fe.add(TfStat(SumPooler(), field=ifield, qfield=qfield))
            self.fe.add(TfStat(MinPooler(), field=ifield, qfield=qfield))
            self.fe.add(TfStat(MaxPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                TfStat(MaxMinRatioPooler(), field=ifield, qfield=qfield))

            self.fe.add(
                TfIdfStat(True, AvgPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                TfIdfStat(True, MedianPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                TfIdfStat(True, SumPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                TfIdfStat(True, MinPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                TfIdfStat(True, MaxPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                TfIdfStat(True,
                          MaxMinRatioPooler(),
                          field=ifield,
                          qfield=qfield))

            self.fe.add(
                NormalizedTfStat(AvgPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                NormalizedTfStat(MedianPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                NormalizedTfStat(SumPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                NormalizedTfStat(MinPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                NormalizedTfStat(MaxPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                NormalizedTfStat(MaxMinRatioPooler(),
                                 field=ifield,
                                 qfield=qfield))

            self.fe.add(IdfStat(AvgPooler(), field=ifield, qfield=qfield))
            self.fe.add(IdfStat(MedianPooler(), field=ifield, qfield=qfield))
            self.fe.add(IdfStat(SumPooler(), field=ifield, qfield=qfield))
            self.fe.add(IdfStat(MinPooler(), field=ifield, qfield=qfield))
            self.fe.add(IdfStat(MaxPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                IdfStat(MaxMinRatioPooler(), field=ifield, qfield=qfield))

            self.fe.add(IcTfStat(AvgPooler(), field=ifield, qfield=qfield))
            self.fe.add(IcTfStat(MedianPooler(), field=ifield, qfield=qfield))
            self.fe.add(IcTfStat(SumPooler(), field=ifield, qfield=qfield))
            self.fe.add(IcTfStat(MinPooler(), field=ifield, qfield=qfield))
            self.fe.add(IcTfStat(MaxPooler(), field=ifield, qfield=qfield))
            self.fe.add(
                IcTfStat(MaxMinRatioPooler(), field=ifield, qfield=qfield))

            self.fe.add(
                UnorderedSequentialPairs(3, field=ifield, qfield=qfield))
            self.fe.add(
                UnorderedSequentialPairs(8, field=ifield, qfield=qfield))
            self.fe.add(
                UnorderedSequentialPairs(15, field=ifield, qfield=qfield))
            self.fe.add(OrderedSequentialPairs(3, field=ifield, qfield=qfield))
            self.fe.add(OrderedSequentialPairs(8, field=ifield, qfield=qfield))
            self.fe.add(OrderedSequentialPairs(15, field=ifield,
                                               qfield=qfield))
            self.fe.add(UnorderedQueryPairs(3, field=ifield, qfield=qfield))
            self.fe.add(UnorderedQueryPairs(8, field=ifield, qfield=qfield))
            self.fe.add(UnorderedQueryPairs(15, field=ifield, qfield=qfield))
            self.fe.add(OrderedQueryPairs(3, field=ifield, qfield=qfield))
            self.fe.add(OrderedQueryPairs(8, field=ifield, qfield=qfield))
            self.fe.add(OrderedQueryPairs(15, field=ifield, qfield=qfield))

        # Load the IBM Model 1 translation tables (self.ibm_model is a directory path)
        start = time.time()
        self.fe.add(
            IbmModel1(f"{self.ibm_model}/title_unlemm", "text_unlemm",
                      "title_unlemm", "text_unlemm"))
        end = time.time()
        print('Loading IBM model took %.2f seconds' % (end - start))
        start = end
        self.fe.add(
            IbmModel1(f"{self.ibm_model}/url_unlemm", "text_unlemm",
                      "url_unlemm", "text_unlemm"))
        end = time.time()
        print('Loading IBM model took %.2f seconds' % (end - start))
        start = end
        self.fe.add(
            IbmModel1(f"{self.ibm_model}/body", "text_unlemm", "body",
                      "text_unlemm"))
        end = time.time()
        print('Loading IBM model took %.2f seconds' % (end - start))
        start = end
        self.fe.add(
            IbmModel1(f"{self.ibm_model}/text_bert_tok", "text_bert_tok",
                      "text_bert_tok", "text_bert_tok"))
        end = time.time()
        print('Loading IBM model took %.2f seconds' % (end - start))

    def batch_extract(self, df, queries, fe):
        tasks = []
        task_infos = []
        group_lst = []

        for qid, group in tqdm(df.groupby('qid')):
            task = {
                "qid": qid,
                "docIds": [],
                "rels": [],
                "query_dict": queries[qid]
            }
            for t in group.reset_index().itertuples():
                if self.data == 'document':
                    # Some document ids may be missing from the index; skip them
                    if self.index_reader.doc(t.pid) is not None:
                        task["docIds"].append(t.pid)
                        task_infos.append((qid, t.pid, t.rel))
                else:
                    task["docIds"].append(t.pid)
                    task_infos.append((qid, t.pid, t.rel))
            tasks.append(task)
            group_lst.append((qid, len(task['docIds'])))
            if len(tasks) == 1000:
                features = fe.batch_extract(tasks)
                task_infos = pd.DataFrame(task_infos,
                                          columns=['qid', 'pid', 'rel'])
                group = pd.DataFrame(group_lst, columns=['qid', 'count'])
                print(features.shape)
                print(task_infos.qid.drop_duplicates().shape)
                print(group.mean())
                print(features.head(10))
                print(features.info())
                yield task_infos, features, group
                tasks = []
                task_infos = []
                group_lst = []
        # Flush the remaining tasks (final partial batch)
        if len(tasks) > 0:
            features = fe.batch_extract(tasks)
            task_infos = pd.DataFrame(task_infos,
                                      columns=['qid', 'pid', 'rel'])
            group = pd.DataFrame(group_lst, columns=['qid', 'count'])
            print(features.shape)
            print(task_infos.qid.drop_duplicates().shape)
            print(group.mean())
            print(features.head(10))
            print(features.info())
            yield task_infos, features, group

        return

    def batch_predict(self, models, dev_extracted, feature_name):
        task_infos, features, group = dev_extracted
        dev_X = features.loc[:, feature_name]

        # Sum predictions over the model ensemble
        task_infos['score'] = 0.
        for gbm in models:
            task_infos['score'] += gbm.predict(dev_X)

    def search(self, dev, queries):
        batch_info = []
        start_extract = time.time()
        with open(self.model + '/model.pkl', 'rb') as f:
            models = pickle.load(f)
        with open(self.model + '/metadata.json', 'r') as f:
            metadata = json.load(f)
        feature_used = metadata['feature_names']
        for dev_extracted in self.batch_extract(dev, queries, self.fe):
            end_extract = time.time()
            print(f'Extracting features for a batch of up to 1000 queries took {end_extract - start_extract:.2f}s')
            task_infos, features, group = dev_extracted
            start_predict = time.time()
            self.batch_predict(models, dev_extracted, feature_used)
            end_predict = time.time()
            print(f'Predicting a batch of up to 1000 queries took {end_predict - start_predict:.2f}s')
            batch_info.append(task_infos)
            start_extract = time.time()
        batch_info = pd.concat(batch_info, axis=0, ignore_index=True)
        return batch_info
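
# ---------------------------------------------------------------------
# Usage sketch (not from the original source): the paths below are
# illustrative. `dev` is expected to be a DataFrame with 'qid', 'pid'
# and 'rel' columns, and `queries` a dict mapping qid to the per-field
# query representations consumed by the feature extractor.
# ---------------------------------------------------------------------
searcher = MsmarcoLtrSearcher(model='ltr_model',        # dir containing model.pkl and metadata.json
                              ibm_model='ibm_model',    # dir with the IBM Model 1 translation tables
                              index='indexes/msmarco-passage-ltr',
                              data='passage')
searcher.add_fe()
results = searcher.search(dev, queries)  # one row per (qid, pid), with a 'score' column added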