Example #1
import os
import sys

import numpy as np
from pyserini.index import IndexReader

# generate_examples and the compute_* feature helpers are defined elsewhere
# in the module this excerpt was taken from.


def main():
    base_path = os.path.join(os.path.dirname(__file__), 'data')
    paths = {
        'qrels_path': os.path.join(base_path, 'msmarco-doctrain-qrels.tsv'),
        'top100_path': os.path.join(base_path, 'msmarco-doctrain-top100'),
        'queries_path': os.path.join(base_path,
                                     'msmarco-doctrain-queries.tsv'),
    }

    index_reader = IndexReader(sys.argv[1])

    with open(sys.argv[2], 'w') as output_file:
        for (query_id, query, doc_id,
             is_positive) in generate_examples(**paths):
            query_terms = index_reader.analyze(query)

            feature_vector = [
                np.sum(compute_tf(query_terms, index_reader, doc_id)),
                np.sum(compute_idf(query_terms, index_reader)),
                np.sum(compute_tf_idf(query_terms, index_reader, doc_id)),
                compute_document_length(index_reader, doc_id),
                np.sum(compute_bm25(query_terms, index_reader, doc_id)),
            ]

            line = [
                '1' if is_positive else '0',
                f'qid:{query_id}',
            ]

            # Feature indices in the SVMlight / LETOR format are 1-based
            for i, feature in enumerate(feature_vector, start=1):
                line.append(f'{i}:{feature}')

            output_file.write(' '.join(line) + '\n')
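The feature helpers used above come from the surrounding module. As a rough illustration, a per-term TF feature could be derived from the stored document vector along these lines (a sketch, not the original implementation; it assumes document vectors were stored at indexing time):

import numpy as np
from pyserini.index import IndexReader

def compute_tf(query_terms, index_reader: IndexReader, doc_id: str):
    # Term frequency of each analyzed query term in the document, read from
    # the stored document vector ({term: frequency}, or None if not stored)
    doc_vector = index_reader.get_document_vector(doc_id) or {}
    return np.array([doc_vector.get(term, 0) for term in query_terms], dtype=float)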
Example #2
    def __init__(self, strategy="GREEDY", seed=2020, max_iter=20):
        """
        This class produces a baseline BM25 ranking and uses LDA topic modelling
        in combination with the general re-ranking procedure of Huang and Hu (2009)
        """
        self.seed = seed
        self.max_iter = max_iter
        self.utils = Utils()

        # Number of documents to rank and re-rank
        self.N = 100

        # Select a strategy for weighting the final topics
        self.strategy = strategy
    
        # K to use in TOP-K-AVG strategy
        self.top_k = 10 

        # TODO: ideally we should not recompute the baseline ranking every time before re-ranking
        self.baseline = BaselineBM25(k=self.N)
        self.baseline.rank()

        # For each topic, the system outputs N retrieved articles.
        self.batch_hits = self.baseline.get_batch_hits()

        # Read index to retrieve document contents
        # N.B. the `contents` field is currently empty; we stored "raw" instead.
        self.index_loc = self.baseline.get_index_loc()
        reader = IndexReader(self.index_loc)

        # Vocabulary in index
        #vocabulary = [ term.term for term in reader.terms()]
        #print(f"{len(vocabulary)} terms in vocabulary")

        # Topics and the retrieved articles are represented as keyword sequences
        self.topics = self.baseline.get_topics()
        self.topic_keywords = {topic_id: topic['title'].lower().split()
                               for topic_id, topic in self.topics.items()}
        self.query_ids = self.baseline.get_query_ids()

        # Analyzed (preprocessed) documents per query
        docs_per_query = {
            query_id: [reader.analyze(reader.doc(hit.docid).raw()) for hit in hits]
            for query_id, hits in self.batch_hits.items()
        }

        # Prepare bag-of-words dataset for gensim
        self.X = defaultdict(list)
        for query_id in self.query_ids:
            # Dictionary expects a list of documents, each itself a list of tokens
            dictionary = Dictionary(docs_per_query[query_id])
            self.X[query_id] = [dictionary.doc2bow(doc) for doc in docs_per_query[query_id]]
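With the bag-of-words corpora in self.X prepared, fitting the per-query LDA models described in the docstring could look roughly like the following sketch using gensim's LdaModel (the fit_lda helper is hypothetical, not part of the original class):

from gensim.corpora import Dictionary
from gensim.models import LdaModel

def fit_lda(analyzed_docs, num_topics=10, seed=2020, iterations=20):
    # analyzed_docs: list of token lists, e.g. docs_per_query[query_id] above
    dictionary = Dictionary(analyzed_docs)
    bows = [dictionary.doc2bow(doc) for doc in analyzed_docs]
    lda = LdaModel(corpus=bows, id2word=dictionary, num_topics=num_topics,
                   random_state=seed, iterations=iterations)
    # Per-document topic distributions, the input to the re-ranking step
    return [lda.get_document_topics(bow, minimum_probability=0.0) for bow in bows]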
Example #3
from pyserini.index import IndexReader
from pyserini.search import SimpleSearcher, get_topics


def main():
    try:
        # Location of the generated index
        index_loc = "indexes/msmarco-passage/lucene-index-msmarco"

        # Create a searcher object
        searcher = SimpleSearcher(index_loc)
        # Set the active scorer to BM25
        searcher.set_bm25(k1=0.9, b=0.4)
        # Fetch 3 results for the given test query
        results = searcher.search('this is a test query', k=3)
        # For all results print the docid and the score
        expected = ['5578280', '2016011', '7004677']
        docids = [x.docid for x in results]
        if expected != docids:
            raise Exception('Test query results do not match expected:',
                            expected, '(expected)', docids, '(actual)')
        # IndexReader can give information about the index
        indexer = IndexReader(index_loc)
        if indexer.stats()['total_terms'] != 352316036:
            raise Exception(
                'There is an unexpected number of terms in your index; perhaps something went wrong while downloading and indexing the dataset?'
            )
        topics = get_topics("msmarco-passage-dev-subset")
        if topics == {}:
            raise Exception(
                'Could not find msmarco-passage-dev-subset... Best approach is to retry indexing the dataset.'
            )
        first_query = topics[list(topics.keys())[0]]['title']
        if first_query != "why do people grind teeth in sleep":
            raise Exception(
                'Found a different first query than expected in the dataset. Did you download the right dataset?'
            )
        # Using the pyserini tokenizer/stemmer/etc. to create queries from scratch
        query = "This is a test query in which things are tested. Found using www.google.com of course!"
        # Tokenizing in pyserini is called Analyzing
        output = indexer.analyze(query)
        if len(output) != 9:
            raise Exception(
                'Tokenizer is not working correctly; something is probably wrong with Anserini. Perhaps try reinstalling Anserini.'
            )
    except Exception as inst:
        print('ERROR: something went wrong in the installation')
        print(inst)
    else:
        print("INSTALLATION OK")
from pyserini.index import IndexReader

# Instead of searching the index, we now retrieve a document directly from it
reader = IndexReader(index_loc)

# Retrieve a document using its docid
#id = 'd6ed7028c686e5756ceb0aa0c9b62e0d'
id = hits[0].docid

# See class Document in https://github.com/castorini/pyserini/blob/master/pyserini/search/_base.py
# properties: docid; id (alias); lucene_document; contents; raw
doc = reader.doc(id).raw()
#print(doc)

# Get analyzed form (tokenized, stemmed, stopwords removed)
analyzed = reader.analyze(doc)
#print(analyzed)

# The raw document vector (per-term frequencies) is also stored in the index
doc_vector = reader.get_document_vector(id)
utils.top_n_words(doc_vector, 10)
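top_n_words is a project-specific helper; given the {term: frequency} dictionary returned by get_document_vector, it presumably does something along these lines (assumed implementation):

def top_n_words(doc_vector, n=10):
    # Print the n most frequent analyzed terms of the document
    for term, freq in sorted(doc_vector.items(), key=lambda kv: kv[1], reverse=True)[:n]:
        print(f'{term}\t{freq}')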

# ----------------
# Topics
# ----------------
from pyserini.search import get_topics
topics = get_topics('core18')

# Get some information on all topics
utils.print_topic(topics)
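print_topic is likewise project-specific; iterating over the dictionary returned by get_topics directly gives the same kind of overview (the title field is the one used for querying above):

for topic_id, topic in topics.items():
    print(topic_id, topic['title'])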