Esempio n. 1
0
def main():
    """Write one feature line per (query, document) training example.

    ``sys.argv[1]`` is passed to ``IndexReader`` and ``sys.argv[2]`` is the
    path of the output file. Every line has the form
    ``<label> qid:<query_id> 0:<f0> 1:<f1> ...`` where the label is ``1``
    for positive examples and ``0`` otherwise.
    """
    data_dir = os.path.join(os.path.dirname(__file__), 'data')

    def in_data_dir(filename):
        return os.path.join(data_dir, filename)

    paths = dict(
        qrels_path=in_data_dir('msmarco-doctrain-qrels.tsv'),
        top100_path=in_data_dir('msmarco-doctrain-top100'),
        queries_path=in_data_dir('msmarco-doctrain-queries.tsv'),
    )

    reader = IndexReader(sys.argv[1])

    with open(sys.argv[2], 'w') as sink:
        for example in generate_examples(**paths):
            query_id, query, doc_id, is_positive = example
            terms = reader.analyze(query)

            # The position in this tuple is the feature id written below.
            features = (
                np.sum(compute_tf(terms, reader, doc_id)),
                np.sum(compute_idf(terms, reader)),
                np.sum(compute_tf_idf(terms, reader, doc_id)),
                compute_document_length(reader, doc_id),
                np.sum(compute_bm25(terms, reader, doc_id)),
            )

            label = '1' if is_positive else '0'
            fields = [label, f'qid:{query_id}']
            fields.extend(f'{idx}:{value}'
                          for idx, value in enumerate(features))

            sink.write(' '.join(fields) + '\n')
Esempio n. 2
0
def main(queries_file, qrels_file, output_file, write_negative):
    """Write ranking features for every qrels line to *output_file*.

    Args:
        queries_file: Path handed to ``read_topics``; the result is indexed
            by integer query id and each entry provides a ``'title'``.
        qrels_file: Tab-separated file; column 0 is the query id, column 2
            the document id and column 3 the relevance target.
        output_file: Path of the file to (over)write.
        write_negative: When true, each qrels line is followed by one extra
            line for a document chosen by ``get_negative_docid``, written
            with target ``0``.
    """
    queries = read_topics(queries_file)
    index_reader = IndexReader('indexes/msmarco-passage')
    document_count = int(index_reader.stats()['documents'])

    # Both handles live in one `with` so the qrels file is closed too, even
    # when feature extraction raises part-way through.
    with open(qrels_file, 'r') as qrels, \
            open(output_file, 'w') as output_file_handle:
        for line in qrels:
            fields = line.strip().split('\t')

            qid = int(fields[0])
            docid = fields[2]
            target = fields[3]
            query = queries[qid]['title']

            features = compute_features(index_reader, query, docid)
            output_file_handle.write(
                format_qrel_line(target, qid, features, docid))

            # The evaluation set doesn't need negative examples.
            if write_negative:
                negative_docid = str(get_negative_docid(document_count, docid))
                features = compute_features(index_reader, query,
                                            negative_docid)
                output_file_handle.write(
                    format_qrel_line(0, qid, features, negative_docid))
Esempio n. 3
0
 def __init__(self, model: str, ibm_model: str, index: str, data: str):
     """Store the configuration and open the feature extractor and index.

     ``model``, ``ibm_model`` and ``data`` are stored unchanged; ``index``
     is passed to both ``FeatureExtractor`` and ``IndexReader``.
     """
     self.model, self.ibm_model = model, ibm_model
     # Half of the reported CPU count, but never less than one.
     half_cpus = max(multiprocessing.cpu_count() // 2, 1)
     self.fe = FeatureExtractor(index, half_cpus)
     self.index_reader = IndexReader(index)
     self.data = data
Esempio n. 4
0
 def _compute_idf(index_path):
     """Return a ``{term: idf}`` dict covering every term of the index.

     The IDF of a term is ``log(N / df)``, where ``N`` is the
     ``'documents'`` entry of the index statistics and ``df`` is the
     ``df`` attribute reported for the term.
     """
     from pyserini.index import IndexReader
     reader = IndexReader(index_path)
     # Single pass over the term iterator; insertion order is kept.
     doc_freqs = {entry.term: entry.df for entry in reader.terms()}
     num_docs = reader.stats()['documents']
     idf_values = np.log(num_docs / np.array(list(doc_freqs.values())))
     return dict(zip(doc_freqs, idf_values))
Esempio n. 5
0
def compute_idf(query_terms: List[str],
                index_reader: IndexReader) -> np.ndarray:
    """Return one smoothed IDF value per query term.

    Each value is ``log((|C| - df(term) + 0.5) / (df(term) + 0.5))`` where
    ``|C|`` is the ``'documents'`` entry of the index statistics and
    ``df(term)`` is the first element returned by ``get_term_counts``.
    """
    collection_size = index_reader.stats()['documents']

    def idf_of(term):
        # analyzer=None: the term is looked up exactly as given.
        doc_freq = index_reader.get_term_counts(term, analyzer=None)[0]
        return np.log(
            np.divide(collection_size - doc_freq + 0.5, doc_freq + 0.5))

    return np.array([idf_of(term) for term in query_terms], dtype=float)
Esempio n. 6
0
 def __init__(self,
              k1: float = 1.6,
              b: float = 0.75,
              index_path: str = None):
     """Store the BM25 parameters and optionally open an index.

     When ``index_path`` is truthy, ``use_corpus_estimator`` is set and an
     ``IndexReader`` for that path is kept in ``index_utils``; otherwise
     ``index_utils`` is not created.
     """
     self.k1, self.b = k1, b
     self.analyzer = Analyzer(get_lucene_analyzer())
     self.use_corpus_estimator = bool(index_path)
     if self.use_corpus_estimator:
         self.index_utils = IndexReader(index_path)
Esempio n. 7
0
    def __init__(self, strategy="GREEDY", seed=2020, max_iter=20):
        """
        Build the BM25 baseline and the per-query bag-of-words datasets.

        This class produces a baseline BM25 ranking and uses LDA topic
        modelling in combination with the general re-ranking procedure of
        Huang and Hu (2009). Note that constructing it already runs the
        baseline ranking.
        """
        self.seed, self.max_iter = seed, max_iter
        self.utils = Utils()

        # Number of documents to rank and rerank
        self.N = 100
        # Strategy for weighing final topics
        self.strategy = strategy
        # K to use in the TOP-K-AVG strategy
        self.top_k = 10

        # TODO ideally we don't want to first rank every time for the reranking
        baseline = BaselineBM25(k=self.N)
        self.baseline = baseline
        baseline.rank()

        # For each topic, the system outputs N retrieved articles.
        self.batch_hits = baseline.get_batch_hits()

        # Read the index to retrieve document contents.
        # N.B. the `contents` field is currently empty; we stored "raw" instead.
        self.index_loc = baseline.get_index_loc()
        reader = IndexReader(self.index_loc)

        # Topics and the retrieved articles are represented as keyword sequences
        self.topics = baseline.get_topics()
        keywords_by_topic = {}
        for topic_id, topic in self.topics.items():
            keywords_by_topic[topic_id] = topic['title'].lower().split()
        self.topic_keywords = keywords_by_topic
        self.query_ids = baseline.get_query_ids()

        # Analyzed (preprocessed) token lists of the retrieved documents, per query
        docs_per_query = {}
        for query_id, hits in self.batch_hits.items():
            analyzed_docs = []
            for hit in hits:
                raw_text = reader.doc(hit.docid).raw()
                analyzed_docs.append(reader.analyze(raw_text))
            docs_per_query[query_id] = analyzed_docs

        # Prepare the bag-of-words dataset for gensim
        self.X = defaultdict(list)
        for query_id in self.query_ids:
            token_lists = docs_per_query[query_id]
            # Dictionary expects a list of lists, elements being lists of tokens
            vocabulary = Dictionary(token_lists)
            self.X[query_id] = [vocabulary.doc2bow(tokens)
                                for tokens in token_lists]
Esempio n. 8
0
def main():
    """Smoke-test the local pyserini/Anserini installation.

    Runs a fixed BM25 query against the MS MARCO passage index, then checks
    the index term count, the ``msmarco-passage-dev-subset`` topics and the
    analyzer output. Prints ``INSTALLATION OK`` when every check passes;
    otherwise prints an error header followed by the failure. Nothing is
    raised to the caller.
    """
    try:
        # Location of the generated index
        index_loc = "indexes/msmarco-passage/lucene-index-msmarco"

        # Create a searcher and make BM25 the active scorer
        searcher = SimpleSearcher(index_loc)
        searcher.set_bm25(k1=0.9, b=0.4)
        # Fetch 3 results for the given test query and compare their docids
        results = searcher.search('this is a test query', k=3)
        expected = ['5578280', '2016011', '7004677']
        docids = [hit.docid for hit in results]
        if expected != docids:
            # One formatted message instead of a multi-argument exception,
            # which would be printed as a raw tuple below.
            raise Exception(
                f'Test query results do not match expected: '
                f'{expected} (expected), {docids} (actual)')

        # IndexReader can give information about the index
        indexer = IndexReader(index_loc)
        if indexer.stats()['total_terms'] != 352316036:
            raise Exception(
                'There are an unexpected number of terms in your index set, perhaps something went wrong while downloading and indexing the dataset?'
            )

        topics = get_topics("msmarco-passage-dev-subset")
        if not topics:
            raise Exception(
                'Could not find msmarco-passage-dev-subset... Best approach is to retry indexing the dataset.'
            )
        first_query = next(iter(topics.values()))['title']
        if first_query != "why do people grind teeth in sleep":
            raise Exception(
                'Found a different first query than expected in the dataset. Did you download the right dataset?'
            )

        # Using the pyserini tokenizer/stemmer/etc. to create queries from scratch
        query = "This is a test query in which things are tested. Found using www.google.com of course!"
        # Tokenizing in pyserini is called Analyzing
        output = indexer.analyze(query)
        if len(output) != 9:
            raise Exception(
                'Tokenizer is not working correctly, something is probably wrong in Anserini. Perhaps try to install Anserini again.'
            )
    except Exception as inst:
        # Top-level boundary of a diagnostic script: report, don't crash.
        print('ERROR: something went wrong in the installation')
        print(inst)
    else:
        print("INSTALLATION OK")
Esempio n. 9
0
def compute_tf(query_terms: List[str], index_reader: IndexReader,
               doc_id: str) -> np.ndarray:
    """Return the frequency of each query term in document *doc_id*.

    Frequencies are read from the document vector of the index; terms that
    are missing from the vector get ``0``. The result is a float array
    aligned with ``query_terms``.
    """
    term_counts = index_reader.get_document_vector(doc_id)
    return np.array([term_counts.get(term, 0) for term in query_terms],
                    dtype=float)
Esempio n. 10
0
def main():
    """Generate feature files for the MS MARCO document train and dev splits.

    Each split is processed with ``num_queries=100`` using one shared
    ``IndexReader``.
    """
    reader = IndexReader("../anserini/indexes/msmarco-doc/lucene-index-msmarco")

    # (queries file, qrels file, output file, num_queries) per split.
    # The 2019 test split is currently disabled:
    #   ("data/msmarco-test2019-queries.tsv.gz",
    #    "data/2019qrels-docs.txt.gz",
    #    "data/msmarco-doc-libsvm/msmarco-doctest-libsvm.txt", 10)
    splits = [
        ("data/msmarco-doctrain-queries.tsv.gz",
         "data/msmarco-doctrain-qrels.tsv.gz",
         "data/msmarco-doc-libsvm/msmarco-doctrain-libsvm.txt",
         100),
        ("data/msmarco-docdev-queries.tsv.gz",
         "data/msmarco-docdev-qrels.tsv.gz",
         "data/msmarco-doc-libsvm/msmarco-docdev-libsvm.txt",
         100),
    ]
    for queries_path, qrels_path, output_path, query_limit in splits:
        generate_libsvm_representation(reader, queries_path, qrels_path,
                                       output_path, num_queries=query_limit)
Esempio n. 11
0
def compute_bm25(query_terms: List[str],
                 index_reader: IndexReader,
                 doc_id: str,
                 k1=0.9,
                 b=0.4) -> np.ndarray:
    """Return the BM25 weight of each query term in document *doc_id*.

    Args:
        query_terms: Terms to score; they are passed with ``analyzer=None``.
        index_reader: Object providing ``compute_bm25_term_weight``.
        doc_id: Identifier of the document to score against.
        k1: BM25 ``k1`` parameter forwarded to the index reader.
        b: BM25 ``b`` parameter forwarded to the index reader.

    Returns:
        A float array aligned with ``query_terms`` (one weight per term,
        not a single summed score); empty when ``query_terms`` is empty.
    """
    return np.array([
        index_reader.compute_bm25_term_weight(doc_id,
                                              term,
                                              analyzer=None,
                                              k1=k1,
                                              b=b)
        for term in query_terms
    ], dtype=float)
Esempio n. 12
0
class Bm25Reranker(Reranker):
    """Rerank candidate texts for a query with a BM25-style score.

    Without ``index_path`` the per-term weights are IDF values estimated
    from the candidate texts themselves. With ``index_path`` the per-term
    weights are taken from the index via ``compute_bm25_term_weight`` for
    each text's ``metadata['docid']``.
    """

    def __init__(self,
                 k1: float = 1.6,
                 b: float = 0.75,
                 index_path: str = None):
        self.k1 = k1
        self.b = b
        self.use_corpus_estimator = False
        self.analyzer = Analyzer(get_lucene_analyzer())
        if index_path:
            self.use_corpus_estimator = True
            self.index_utils = IndexReader(index_path)

    def rerank(self, query: Query, texts: List[Text]) -> List[Text]:
        """Return deep copies of *texts* with ``score`` set.

        The input objects are not modified and the order is preserved.
        A NaN score is replaced by ``0``.
        """
        texts = deepcopy(texts)
        if not texts:
            # Nothing to score; also avoids log(0) and the mean of an
            # empty list further down.
            return texts

        query_words_set = set(self.analyzer.analyze(query.text))
        sentences = [self.analyzer.analyze(t.text) for t in texts]

        if not self.use_corpus_estimator:
            sentence_sets = [set(words) for words in sentences]
            idfs = {
                w: math.log(
                    len(sentence_sets) /
                    (1 + sum(int(w in sent) for sent in sentence_sets)))
                for w in query_words_set
            }
        mean_len = np.mean([len(words) for words in sentences])

        for sent_words, text in zip(sentences, texts):
            # Set membership gives the same counts as scanning the query
            # word list, without the per-token linear search.
            tf = Counter(w for w in sent_words if w in query_words_set)
            if self.use_corpus_estimator:
                idfs = {
                    w: self.index_utils.compute_bm25_term_weight(
                        text.metadata['docid'], w)
                    for w in tf
                }
            # BM25 length normalisation uses the length of *this* text;
            # using the number of candidates would make the term constant
            # for all texts and leave `b` without per-document effect.
            d_len = len(sent_words)
            score = sum(idfs[w] * tf[w] * (self.k1 + 1) /
                        (tf[w] + self.k1 * (1 - self.b + self.b *
                                            (d_len / mean_len))) for w in tf)
            if np.isnan(score):
                score = 0
            text.score = score
        return texts
Esempio n. 13
0
from pyserini.index import IndexReader
import math, numpy


# Shared reader used by the predictor functions below.
index_reader = IndexReader('marcoindex')
# Hard-coded collection statistics used as numerators by IDF / ictf.
# NOTE(review): presumably the document and term totals of 'marcoindex' —
# confirm they match the index actually in use.
number_of_docs = 8_841_823
number_of_all_terms = 491_404_850

def IDF(term):
    """Return ``log10(number_of_docs / df)`` for *term*.

    ``df`` is the document frequency reported by the module-level
    ``index_reader``. A term with ``df == 0`` raises ``ZeroDivisionError``.
    """
    df, _cf = index_reader.get_term_counts(term)
    return math.log10(number_of_docs / df)

def ictf(term):
    """Return ``log10(number_of_all_terms / cf)`` for *term*.

    ``cf`` is the collection frequency reported by the module-level
    ``index_reader``. A term with ``cf == 0`` raises ``ZeroDivisionError``.
    """
    _df, cf = index_reader.get_term_counts(term)
    return math.log10(number_of_all_terms / cf)

def SCS(query):
    """Return ``log10(1 / |q|)`` plus the mean ``ictf`` of the query terms.

    The query is split on whitespace. NOTE(review): this looks like the
    "simplified clarity score" pre-retrieval predictor — confirm the
    intended definition. An empty query raises ``ZeroDivisionError``.
    """
    q_terms = query.split()
    # `ictf` takes only the term; it uses the module-level index reader.
    ictf_values = [ictf(t) for t in q_terms]

    part_A = math.log10(1 / len(q_terms))
    part_B = numpy.mean(ictf_values)
    return part_A + part_B

def SCQ(term):
    """Return ``(1 + log10(cf)) * IDF(term)`` for *term*.

    ``cf`` is the collection frequency reported by the module-level
    ``index_reader``. A term with ``cf == 0`` raises ``ValueError`` from
    ``math.log10``.
    """
    _df, cf = index_reader.get_term_counts(term)
    part_A = 1 + math.log10(cf)
    # `IDF` takes only the term; it uses the module-level index reader.
    part_B = IDF(term)
    return part_A * part_B
Esempio n. 14
0
    def load_samples(self):
        """Load the samples for ``self.mode`` from ``self.args.msmarco_dir``.

        Reads ``queries.<mode>.tsv``, ``top_candidates.<mode>.tsv`` and, in
        ``'train'`` mode, ``qrels.<mode>.tsv``.

        In ``'train'`` mode every relevant passage of a query is paired with
        the first ``self.args.p`` non-relevant candidates of that query and
        the result is stored in ``self.qids``, ``self.pos_pids``,
        ``self.neg_pids``, ``self.pos_scores`` and ``self.neg_scores``
        (positive scores are computed with the index, negative scores are
        taken from the candidates file).

        In any other mode all candidates of the first
        ``self.args.num_eval_queries`` queries are stored in ``self.qids``,
        ``self.pids`` and ``self.scores``.
        """
        indexer = IndexReader(self.args.index_dir)
        custom_bm25 = search.LuceneSimilarities.bm25(self.args.bm25_k1,
                                                     self.args.bm25_b)
        msmarco_dir = self.args.msmarco_dir
        qrels_path = os.path.join(msmarco_dir, f"qrels.{self.mode}.tsv")
        candidates_path = os.path.join(msmarco_dir,
                                       f"top_candidates.{self.mode}.tsv")
        queries_path = os.path.join(msmarco_dir, f"queries.{self.mode}.tsv")

        # all queries text (for calculating the BM25 scores)
        queries_text = {}
        with open(queries_path, 'r') as queries_file:
            for line in queries_file:
                qid, text = line.split('\t')
                queries_text[qid] = text.rstrip()

        if self.mode == 'train':
            # qrel (labels)
            qrel_lst = defaultdict(list)
            with open(qrels_path, 'r') as qrels_file:
                for line in qrels_file:
                    qid, _, pid, _ = line.split('\t')
                    qrel_lst[qid].append(int(pid))
            # Plain dict: a candidate whose qid has no qrels raises KeyError
            # below instead of silently creating an empty entry.
            qrel_lst = dict(qrel_lst)

            # top docs by BM25 (neg samples)
            top_lst = defaultdict(list)
            with open(candidates_path, 'r') as candidates_file:
                for line in tqdm(candidates_file,
                                 desc=f"{self.mode} top candidates"):
                    qid, pid, score = line.split('\t')
                    if int(pid) not in qrel_lst[qid]:
                        top_lst[qid].append({
                            'pid': int(pid),
                            'score': float(score)
                        })
            top_lst = dict(top_lst)

            qids, pos_pids, neg_pids, pos_scores, neg_scores = [], [], [], [], []
            for qid in tqdm(qrel_lst, desc=f"{self.mode} samples"):
                if qid not in top_lst:
                    continue
                for pos_pid in qrel_lst[qid]:
                    pos_score = indexer.compute_query_document_score(
                        str(pos_pid),
                        queries_text[qid],
                        similarity=custom_bm25)
                    # `p` limits how many negatives are paired with each
                    # positive (the original author was unsure whether it
                    # was meant as a probability).
                    neg_docs = top_lst[qid][:self.args.p]

                    for neg_doc in neg_docs:
                        qids.append(qid)
                        pos_pids.append(pos_pid)
                        neg_pids.append(neg_doc['pid'])
                        pos_scores.append(pos_score)
                        neg_scores.append(neg_doc['score'])
            self.qids, self.pos_pids, self.neg_pids = qids, pos_pids, neg_pids
            self.pos_scores, self.neg_scores = pos_scores, neg_scores
        else:
            # top docs by BM25
            top_lst = defaultdict(list)
            with open(candidates_path, 'r') as candidates_file:
                for line in tqdm(candidates_file,
                                 desc=f"{self.mode} top candidates"):
                    qid, pid, score = line.split('\t')
                    top_lst[qid].append({
                        'pid': int(pid),
                        'score': float(score)
                    })
            top_lst = dict(top_lst)

            qids, pids, scores = [], [], []
            for i, qid in enumerate(top_lst):
                for doc in top_lst[qid]:
                    qids.append(qid)
                    pids.append(doc['pid'])
                    scores.append(doc['score'])
                if (i + 1) == self.args.num_eval_queries:
                    break
            self.qids, self.pids, self.scores = qids, pids, scores
def check_sparse(index):
    """Validate every prebuilt index named in *index*, printing progress.

    For each name a header line is printed, then
    ``IndexReader.validate_prebuilt_index`` is called, then a blank
    separator is printed.
    """
    for index_name in index:
        header = f'# Validating "{index_name}"...'
        print(header)
        IndexReader.validate_prebuilt_index(index_name)
        print('\n')
def main(is_training):
    """Write embedding-similarity plus index features for every qrels line.

    Args:
        is_training: Selects the train file set when true, the test file
            set otherwise. In training mode each written line is followed
            by one line for a document chosen by ``get_negative_docid``,
            written with target ``0``.

    Lines whose query or document vector starts with NaN are skipped.
    """
    embeddings_file = 'glove.840B.300d'
    print(f'Processing {embeddings_file}')

    if is_training:
        qrels_file = 'qrels.train.tsv'
        queries_file = 'queries.train.tsv'
        query_embeddings_file = f'embeddings/{embeddings_file}/queries-embeddings.train.tsv'
        doc_embeddings_file = f'embeddings/{embeddings_file}/documents-embeddings.train.tsv'
        output_file = f'ranklib-features/{embeddings_file}/data_ranklib-embeddings-train.txt'
    else:
        qrels_file = 'runs/run.msmarco-test2019-queries-bm25.trec'
        queries_file = 'msmarco-test2019-queries.tsv'
        query_embeddings_file = f'embeddings/{embeddings_file}/queries-embeddings.test.tsv'
        doc_embeddings_file = f'embeddings/{embeddings_file}/documents-embeddings.test.tsv'
        output_file = f'ranklib-features/{embeddings_file}/data_ranklib-embeddings-test.txt'

    queries = read_topics(queries_file)
    index_reader = IndexReader('indexes/msmarco-passage')

    # Context managers close the qrels and query-embedding handles on exit,
    # including when a line fails to parse.
    with open(qrels_file, 'r') as qrels, \
            open(query_embeddings_file, 'r') as query_embeddings_handle:
        print('Reading query vectors')
        query_vector_id, query_vector_values = load_fasttext_line(
            query_embeddings_handle.readline())

        print('Reading document vectors')
        doc_vectors = load_fasttext_vectors(doc_embeddings_file, False)
        doc_ids = list(doc_vectors.keys())

        count = 0
        print('Calculating features')
        # Create the output directory without spawning a shell.
        os.makedirs(f'ranklib-features/{embeddings_file}', exist_ok=True)
        with open(output_file, 'w') as output_file_handle:
            for line in qrels:
                fields = line.strip().split('\t')

                qid = int(fields[0])
                docid = fields[2]
                target = fields[3]
                query = queries[qid]['title']

                # Query vectors are streamed; read on until the id changes.
                # NOTE(review): this assumes the query-embedding file is
                # grouped by query id in the same order as the qrels.
                if int(query_vector_id) != qid:
                    old_id = query_vector_id
                    while int(old_id) == int(query_vector_id):
                        query_vector_id, query_vector_values = load_fasttext_line(
                            query_embeddings_handle.readline())

                doc_vector = doc_vectors[docid]
                if math.isnan(query_vector_values[0]) or math.isnan(doc_vector[0]):
                    count += 1
                    continue

                features = {
                    **compute_similarity(query_vector_values, doc_vector),
                    **compute_features(index_reader, query, docid)
                }
                output_file_handle.write(
                    format_qrel_line(target, qid, features, docid))

                # The evaluation set doesn't need negative examples.
                if is_training:
                    negative_docid = str(get_negative_docid(doc_ids, docid))
                    # Both feature groups must describe the negative
                    # document, not the positive one it is paired with.
                    features = {
                        **compute_similarity(query_vector_values,
                                             doc_vectors[negative_docid]),
                        **compute_features(index_reader, query,
                                           negative_docid)
                    }
                    output_file_handle.write(
                        format_qrel_line(0, qid, features, negative_docid))

                if count % 10000 == 0:
                    print(count)
                count += 1
Esempio n. 17
0
        line = line.strip().split("\t")
        # Try and parse the keys into integers
        try:
            topic_key = int(line[0])
        except ValueError:
            topic_key = line[0]
        topics[topic_key] = {
            'title': line[1],
        }

    return topics


# NOTE(review): the meaning of C_size is not visible in this fragment —
# confirm against the code that uses it.
C_size = 50

# Module-level reader for the index stored at 'indexes/msmarco-passage'.
index_reader = IndexReader('indexes/msmarco-passage')

top_25 = [{
    'term': 'you',
    'cf': 3704969
}, {
    'term': 'your',
    'cf': 2871978
}, {
    'term': 'from',
    'cf': 2433977
}, {
    'term': 'us',
    'cf': 2215803
}, {
    'term': 'can',
# Each entry of `hits` contains: docid, retrieval score, and document content
# N.B. "black bear attacks" is the title of topic 336
query = 'black bear attacks'
hits = searcher.search(query)

# Print the first 10 hits
utils.print_top_n_results(hits, 10)

# ----------------
# IndexReaderUtils
# ----------------

from pyserini.index import IndexReader

# Now we do not search the index, but retrieve a document directly from the index
reader = IndexReader(index_loc)

# Retrieve a document using its docid (here: the docid of the top hit)
#id = 'd6ed7028c686e5756ceb0aa0c9b62e0d'
# NOTE(review): `id` shadows the Python builtin of the same name; rename it
# together with all later uses when this script is next touched.
id = hits[0].docid

# See class Document in https://github.com/castorini/pyserini/blob/master/pyserini/search/_base.py
# properties: docid; id (alias); lucene_document; contents; raw
doc = reader.doc(id).raw()
#print(doc)

# Get the analyzed form (tokenized, stemmed, stopwords removed)
analyzed = reader.analyze(doc)
#print(analyzed)

# Raw document VECTOR is also stored
Esempio n. 19
0
def compute_document_length(index_reader: IndexReader, doc_id: str) -> int:
    """Return the length of the raw stored form of document *doc_id*.

    The length is ``len()`` of whatever ``index_reader.doc_raw`` returns,
    not a token count.
    """
    raw_document = index_reader.doc_raw(doc_id)
    return len(raw_document)
Esempio n. 20
0
class MsmarcoLtrSearcher:
    """Feature-based reranker built on a ``FeatureExtractor``.

    ``add_fe`` registers the feature set, ``batch_extract`` yields extracted
    features in batches of queries, and ``search`` scores each batch with
    the models stored under the ``model`` directory.
    """

    # Number of queries sent to the feature extractor per batch.
    _BATCH_SIZE = 1000

    def __init__(self, model: str, ibm_model: str, index: str, data: str):
        self.model = model
        self.ibm_model = ibm_model
        self.fe = FeatureExtractor(index,
                                   max(multiprocessing.cpu_count() // 2, 1))
        self.index_reader = IndexReader(index)
        self.data = data

    def add_fe(self):
        """Register every feature on ``self.fe``.

        The registration order is kept stable: for each (query field, index
        field) pair the statistics are added in a fixed sequence, followed
        by the four IBM Model 1 features.
        """
        # The two pooler orders below are the orders in which the pooled
        # statistics have always been registered; keep them as they are.
        sum_first = (SumPooler, AvgPooler, MedianPooler, MaxPooler,
                     MinPooler, MaxMinRatioPooler)
        avg_first = (AvgPooler, MedianPooler, SumPooler, MinPooler,
                     MaxPooler, MaxMinRatioPooler)
        gaps = (3, 8, 15)

        #self.fe.add(RunList('collections/msmarco-ltr-passage/run.monot5.run_list.whole.trec','t5'))
        for qfield, ifield in [('analyzed', 'contents'),
                               ('text_unlemm', 'text_unlemm'),
                               ('text_bert_tok', 'text_bert_tok')]:
            print(qfield, ifield)
            fields = {'field': ifield, 'qfield': qfield}

            for pooler in sum_first:
                self.fe.add(BM25Stat(pooler(), k1=2.0, b=0.75, **fields))
            for pooler in sum_first:
                self.fe.add(LmDirStat(pooler(), mu=1000, **fields))

            self.fe.add(NormalizedTfIdf(**fields))
            self.fe.add(ProbalitySum(**fields))

            for stat in (DfrGl2Stat, DfrInExpB2Stat, DphStat):
                for pooler in sum_first:
                    self.fe.add(stat(pooler(), **fields))

            self.fe.add(Proximity(**fields))
            self.fe.add(TpScore(**fields))
            self.fe.add(TpDist(**fields))

            self.fe.add(DocSize(field=ifield))

            self.fe.add(QueryLength(qfield=qfield))
            self.fe.add(QueryCoverageRatio(qfield=qfield))
            self.fe.add(UniqueTermCount(qfield=qfield))
            self.fe.add(MatchingTermCount(**fields))
            self.fe.add(SCS(**fields))

            for pooler in avg_first:
                self.fe.add(TfStat(pooler(), **fields))
            for pooler in avg_first:
                self.fe.add(TfIdfStat(True, pooler(), **fields))
            for stat in (NormalizedTfStat, IdfStat, IcTfStat):
                for pooler in avg_first:
                    self.fe.add(stat(pooler(), **fields))

            for pair_feature in (UnorderedSequentialPairs,
                                 OrderedSequentialPairs,
                                 UnorderedQueryPairs,
                                 OrderedQueryPairs):
                for gap in gaps:
                    self.fe.add(pair_feature(gap, **fields))

        # Join with an explicit separator for every model so that
        # `ibm_model` works with or without a trailing slash.
        ibm_root = self.ibm_model.rstrip('/')
        ibm_models = [
            ("title_unlemm", "text_unlemm", "title_unlemm", "text_unlemm"),
            ("url_unlemm", "text_unlemm", "url_unlemm", "text_unlemm"),
            ("body", "text_unlemm", "body", "text_unlemm"),
            ("text_bert_tok", "text_bert_tok", "text_bert_tok",
             "text_bert_tok"),
        ]
        start = time.time()
        for subdir, arg1, arg2, arg3 in ibm_models:
            self.fe.add(IbmModel1(f"{ibm_root}/{subdir}", arg1, arg2, arg3))
            end = time.time()
            print('IBM model Load takes %.2f seconds' % (end - start))
            start = end

    @staticmethod
    def _extract_batch(fe, tasks, task_infos, group_lst):
        """Run the extractor on one batch and package the three results."""
        features = fe.batch_extract(tasks)
        task_infos = pd.DataFrame(task_infos, columns=['qid', 'pid', 'rel'])
        group = pd.DataFrame(group_lst, columns=['qid', 'count'])
        print(features.shape)
        print(task_infos.qid.drop_duplicates().shape)
        print(group.mean())
        print(features.head(10))
        print(features.info())
        return task_infos, features, group

    def batch_extract(self, df, queries, fe):
        """Yield ``(task_infos, features, group)`` per batch of queries.

        Args:
            df: DataFrame with ``qid``, ``pid`` and ``rel`` columns.
            queries: Mapping from qid to the query dict sent to *fe*.
            fe: Object providing ``batch_extract(tasks)``.

        When ``self.data == 'document'``, candidates that are missing from
        the index are dropped.
        """
        tasks = []
        task_infos = []
        group_lst = []

        for qid, rows in tqdm(df.groupby('qid')):
            task = {
                "qid": qid,
                "docIds": [],
                "rels": [],
                "query_dict": queries[qid]
            }
            for t in rows.reset_index().itertuples():
                if (self.data == 'document'
                        and self.index_reader.doc(t.pid) is None):
                    continue
                task["docIds"].append(t.pid)
                task_infos.append((qid, t.pid, t.rel))
            tasks.append(task)
            group_lst.append((qid, len(task['docIds'])))
            if len(tasks) == self._BATCH_SIZE:
                yield self._extract_batch(fe, tasks, task_infos, group_lst)
                tasks = []
                task_infos = []
                group_lst = []
        # deal with rest
        if len(tasks) > 0:
            yield self._extract_batch(fe, tasks, task_infos, group_lst)

    def batch_predict(self, models, dev_extracted, feature_name):
        """Add a ``score`` column to the batch's ``task_infos`` in place.

        The score is the sum of ``predict`` over all *models*, evaluated on
        the *feature_name* columns of the batch features. Returns ``None``.
        """
        task_infos, features, group = dev_extracted
        dev_X = features.loc[:, feature_name]

        task_infos['score'] = 0.
        for gbm in models:
            task_infos['score'] += gbm.predict(dev_X)

    def search(self, dev, queries):
        """Extract features for *dev*, score them, and return all rows.

        Loads ``model.pkl`` and ``metadata.json`` from the ``self.model``
        directory; ``metadata['feature_names']`` selects the feature
        columns. Returns the concatenated ``task_infos`` of every batch.
        """
        batch_info = []
        start_extract = time.time()
        # NOTE: pickle executes code on load; the model directory must come
        # from a trusted source.
        with open(self.model + '/model.pkl', 'rb') as model_file:
            models = pickle.load(model_file)
        with open(self.model + '/metadata.json', 'r') as metadata_file:
            metadata = json.load(metadata_file)
        feature_used = metadata['feature_names']
        for dev_extracted in self.batch_extract(dev, queries, self.fe):
            end_extract = time.time()
            print(f'extract 1000 queries take {end_extract - start_extract}s')
            task_infos, features, group = dev_extracted
            start_predict = time.time()
            self.batch_predict(models, dev_extracted, feature_used)
            end_predict = time.time()
            print(f'predict 1000 queries take {end_predict - start_predict}s')
            batch_info.append(task_infos)
            start_extract = time.time()
        batch_info = pd.concat(batch_info, axis=0, ignore_index=True)
        return batch_info
Esempio n. 21
0
    def index_reader(self):
        """Return a new pyserini ``IndexReader`` opened on ``self.path``."""
        # Imported here rather than at module level, as in the original.
        from pyserini.index import IndexReader

        index_dir = str(self.path)
        return IndexReader(index_dir)
Esempio n. 22
0
def run():
    """Command-line entry point: rank the topics and evaluate the ranking.

    Steps:
        1. Parse the CLI flags and set the module-level ``verbose`` flag.
        2. Open the Lucene index and build the ``Models`` / ``Index`` helpers.
        3. With ``--compute_pickle``, rebuild ``blob/mapping.pickle`` from
           ``trec_index``; in every case load the mapping from that file.
        4. Write one ranking line per retrieved document for up to
           ``--n_queries`` topics to ``output/ranking-<model>-<HH:MM>.txt``.
        5. Dump the ``pytrec_evaluation`` results to
           ``output/results-<model>-<HH:MM>.json``.

    Exits with status 1 when ``--model`` is neither ``bm25`` nor ``tf_idf``.
    """
    global verbose

    parser = argparse.ArgumentParser(description="TREC-COVID document ranker CLI")
    parser.add_argument("-v", "--verbose", help="Increase output verbosity",
                        action="store_true", default=False)
    parser.add_argument("-cp", "--compute_pickle",
                        help="Compute mapping from internal lucene id's to external docid's",
                        action="store_true", default=False)
    parser.add_argument("-n", "--n_queries", help="Maximum number of queries to run",
                        type=int, default=999)
    parser.add_argument("-m", "--model",
                        help="which model used in ranking from {bm25, tf_idf}",
                        default="bm25")
    parser.add_argument("-d", "--doc_at_a_time", help="Use document_at_a_time algorithm",
                        action="store_true", default=False)
    parser.add_argument("-k", "--k_docs", help="Number of documents to retrieve",
                        type=int, default=100)
    parser.add_argument("-r", "--rerank",
                        help="Which rerank model to use 'rocchio', or 'ide'",
                        default="none")
    args = parser.parse_args()

    verbose = args.verbose
    model = args.model
    doc_at_a_time = args.doc_at_a_time
    k = args.k_docs
    rerank = args.rerank

    index_reader = IndexReader(LUCENE_INDEX)
    searcher = SimpleSearcher(LUCENE_INDEX)
    models = Models(index_reader, QRELFILE)
    trec_index = Index(index_reader, searcher)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs('output', exist_ok=True)

    if args.compute_pickle:
        print("Computing id index dict")
        docidx_docid = {
            docidx: (trec_index.get_docid_from_index(docidx),
                     trec_index.get_n_of_words_in_inverted_list_doc(docidx))
            for docidx in range(trec_index.get_max_docindex())
        }
        # Make sure the target directory exists before writing the mapping.
        os.makedirs('blob', exist_ok=True)
        with open('blob/mapping.pickle', 'wb') as handle:
            pickle.dump(docidx_docid, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # The mapping is always (re)loaded from disk, even right after computing it.
    # NOTE: unpickling runs arbitrary code; the file must come from this tool.
    with open('blob/mapping.pickle', 'rb') as handle:
        print("Loading id index dict")
        docidx_docid = pickle.load(handle)
    print("Finished initializing id index dict")

    topics = parse_topics(TOPICSFILE)

    if model == "bm25":
        rankfun = score_bm25
    elif model == "tf_idf":
        rankfun = score_tf_idf
    else:
        print("Model should be 'tf_idf' or 'bm25' (default)!")
        sys.exit(1)

    current_time = time.strftime("%H:%M", time.localtime())
    rankfile = "output/ranking-{0}-{1}.txt".format(model, current_time)
    resultfile = "output/results-{0}-{1}.json".format(model, current_time)

    # Topics are keyed "1".."N"; run at most n_queries of them.
    last_topic = min(args.n_queries, len(topics))

    # The with-block alone closes the file; no extra try/finally is needed
    # (the old finally clause raised NameError whenever open() itself failed).
    with open(rankfile, 'w') as outfile:
        for idx in range(1, last_topic + 1):
            query = topics[str(idx)]["query"]
            if doc_at_a_time:
                ranked = document_at_a_time(query, trec_index, models, k, docidx_docid)
                run_tag = "document_at_a_time"
            else:
                ranked = get_docs_and_score_query(
                    query, rankfun, trec_index, models, idx, k, docidx_docid,
                    rerank=rerank)
                run_tag = "score_query"
            # Ranks are 1-based in the output file.
            for rank, (score, docid) in enumerate(ranked, 1):
                outfile.write(write_output(idx, docid, rank, score, run_tag))

    results = pytrec_evaluation(rankfile, QRELFILE)
    with open(resultfile, 'w') as outjson:
        json.dump(results, outjson)
Esempio n. 23
0
from pyserini.index import IndexReader
from pyserini.search import SimpleSearcher
# Command-line options for the script; parsed at import time.
parser = argparse.ArgumentParser()
parser.add_argument('--msmarco_dir', type=str, default="./data")
parser.add_argument('--index_dir', type=str, default="./data/index")
parser.add_argument('--output_dir', type=str, default="./data/bm25_result")
parser.add_argument('--bm25_k1', type=float, default=0.6)
parser.add_argument('--bm25_b', type=float, default=0.8)
parser.add_argument('--threads', type=int, default=4)
parser.add_argument('--sample', type=int, default=0)
args = parser.parse_args()

# exist_ok avoids the check-then-create race of exists() followed by makedirs().
os.makedirs(args.output_dir, exist_ok=True)

# Reader and searcher are opened on the same index directory.
indexer = IndexReader(args.index_dir)
searcher = SimpleSearcher(args.index_dir)
searcher.set_bm25(k1=args.bm25_k1, b=args.bm25_b)
# The index's document count is used as the hit limit ``k`` in calculate_bm25.
num_candidates = indexer.stats()['documents']


def calculate_bm25(query):
    """Search one query and write its hits to ``<output_dir>/<qid>.tsv``.

    Args:
        query: ``(qid, text)`` pair. ``text`` is sent to the module-level
            ``searcher`` with ``k=num_candidates``.

    Each hit becomes one ``docid<TAB>score`` line, in the order returned by
    the searcher.
    """
    qid, text = query
    result_path = os.path.join(args.output_dir, f"{qid}.tsv")
    with open(result_path, 'w') as outfile:
        hits = searcher.search(text, k=num_candidates)
        for hit in hits:
            outfile.write(f"{hit.docid}\t{hit.score}\n")


if __name__ == "__main__":
    # load the queries