Beispiel #1
0
def main(queries_file, qrels_file, output_file, write_negative):
    """Build a feature file from MS MARCO qrels.

    For every qrel line, computes features for the (query, docid) pair and
    writes one formatted line to *output_file*.  When *write_negative* is
    true, additionally samples one random negative document per qrel and
    writes it with target label 0.

    Args:
        queries_file: path to the topics file, loaded via read_topics().
        qrels_file: path to a TSV qrels file (qid, _, docid, target).
        output_file: path of the feature file to create.
        write_negative: whether to emit one sampled negative per qrel.
    """
    queries = read_topics(queries_file)
    index_reader = IndexReader('indexes/msmarco-passage')
    document_count = int(index_reader.stats()['documents'])

    # Open both files via context managers so they are closed even if
    # feature computation raises (the original leaked the qrels handle).
    with open(qrels_file, 'r') as qrels, \
            open(output_file, 'w') as output_file_handle:
        for line in qrels:
            fields = line.strip().split('\t')

            qid = int(fields[0])
            docid = fields[2]
            target = fields[3]
            query = queries[qid]['title']

            features = compute_features(index_reader, query, docid)
            output_file_handle.write(
                format_qrel_line(target, qid, features, docid))

            # The evaluation set doesn't need negative examples.
            if write_negative:
                negative_docid = str(get_negative_docid(document_count, docid))
                features = compute_features(index_reader, query,
                                            negative_docid)
                output_file_handle.write(
                    format_qrel_line(0, qid, features, negative_docid))
Beispiel #2
0
 def _compute_idf(index_path):
     """Return a dict mapping every index term to log(|C| / df(term))."""
     from pyserini.index import IndexReader
     reader = IndexReader(index_path)
     # Materialize the term stream once, then split into parallel arrays.
     all_terms = list(reader.terms())
     vocab = [entry.term for entry in all_terms]
     doc_freqs = np.array([entry.df for entry in all_terms])
     num_docs = reader.stats()['documents']
     return dict(zip(vocab, np.log(num_docs / doc_freqs)))
Beispiel #3
0
def compute_idf(query_terms: List[str],
                index_reader: IndexReader) -> np.ndarray:
    """BM25-style idf: log((|C| - df(term) + 0.5) / (df(term) + 0.5))."""
    num_docs = index_reader.stats()['documents']

    idf_values = np.zeros(len(query_terms))
    for position, query_term in enumerate(query_terms):
        # get_term_counts returns (df, cf); only the document frequency is used.
        df = index_reader.get_term_counts(query_term, analyzer=None)[0]
        idf_values[position] = np.log((num_docs - df + 0.5) / (df + 0.5))
    return idf_values
Beispiel #4
0
def main():
    """Smoke-test a local Pyserini installation against the prebuilt
    MS MARCO passage index.

    Runs a BM25 test query, then checks index statistics, topic loading
    and the analyzer; prints "INSTALLATION OK" only when every check
    passes, otherwise prints the failing check's message.
    """
    try:
        # Location of the generated index
        index_loc = "indexes/msmarco-passage/lucene-index-msmarco"

        # Create a searcher object
        searcher = SimpleSearcher(index_loc)
        # Set the active scorer to BM25
        searcher.set_bm25(k1=0.9, b=0.4)
        # Fetch 3 results for the given test query
        results = searcher.search('this is a test query', k=3)
        # Compare the returned docids against the expected ranking
        expected = ['5578280', '2016011', '7004677']
        docids = [x.docid for x in results]
        if expected != docids:
            # Fixed typo in the diagnostic label ('(expecteD)' -> '(expected)').
            raise Exception('Test query results do not match expected:',
                            expected, '(expected)', docids, '(actual)')
        # IndexReader can give information about the index
        indexer = IndexReader(index_loc)
        if indexer.stats()['total_terms'] != 352316036:
            raise Exception(
                'There are an unexpected number of terms in your index set, perhaps something went wrong while downloading and indexing the dataset?'
            )
        topics = get_topics("msmarco-passage-dev-subset")
        if topics == {}:
            raise Exception(
                'Could not find msmarco-passage-dev-subset... Best approach is to retry indexing the dataset.'
            )
        first_query = topics[list(topics.keys())[0]]['title']
        if first_query != "why do people grind teeth in sleep":
            raise Exception(
                'Found a different first query than expected in the dataset. Did you download the right dataset?'
            )
        # Using the pyserini tokenizer/stemmer/etc. to create queries from scratch
        query = "This is a test query in which things are tested. Found using www.google.com of course!"
        # Tokenizing in pyserini is called Analyzing
        output = indexer.analyze(query)
        if len(output) != 9:
            raise Exception(
                'Tokenizer is not working correctly, something is probably wrong in Anserini. Perhaps try to install Anserini again.'
            )
    except Exception as inst:
        print('ERROR: something went wrong in the installation')
        print(inst)
    else:
        print("INSTALLATION OK")
Beispiel #5
0
# CLI options for BM25 candidate generation over MS MARCO.
# NOTE(review): `parser` is created above this chunk — presumably an
# argparse.ArgumentParser; confirm against the full file.
parser.add_argument('--msmarco_dir', type=str, default="./data")
parser.add_argument('--index_dir', type=str, default="./data/index")
parser.add_argument('--output_dir', type=str, default="./data/bm25_result")
parser.add_argument('--bm25_k1', type=float, default=0.6)  # BM25 term-frequency saturation
parser.add_argument('--bm25_b', type=float, default=0.8)  # BM25 length normalization
parser.add_argument('--threads', type=int, default=4)
parser.add_argument('--sample', type=int, default=0)
args = parser.parse_args()

# Ensure the result directory exists before any per-query file is written.
if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

# Module-level retrieval objects shared by calculate_bm25().
indexer = IndexReader(args.index_dir)
searcher = SimpleSearcher(args.index_dir)
searcher.set_bm25(k1=args.bm25_k1, b=args.bm25_b)
# Request every document in the index per query, i.e. a full ranking.
num_candidates = indexer.stats()['documents']

def calculate_bm25(query):
    """Run BM25 retrieval for one (qid, text) pair and write every hit to
    <output_dir>/<qid>.tsv as docid<TAB>score lines."""
    query_id, query_text = query
    result_path = os.path.join(args.output_dir, f"{query_id}.tsv")
    with open(result_path, 'w') as result_file:
        hits = searcher.search(query_text, k=num_candidates)
        result_file.writelines(
            f"{hit.docid}\t{hit.score}\n" for hit in hits)


if __name__ == "__main__":
    # load the queries
    queries = dict()
    for line in open(os.path.join(args.msmarco_dir, f"queries.dev.tsv"), 'r'):
        qid, query = line.split('\t')
Beispiel #6
0
    'cf': 1005023
}, {
    'term': 'also',
    'cf': 991428
}, {
    'term': 'mai',
    'cf': 955836
}, {
    'term': 'most',
    'cf': 927327
}, {
    'term': 'about',
    'cf': 909980
}]

# Total number of term occurrences (tokens) in the whole collection; used
# as the collection length in the Dirichlet-smoothed score in dirich().
total_words = index_reader.stats()['total_terms']


def dirich(freq_term_in_doc,
           total_words_in_doc,
           freq_term_in_collection,
           total_words,
           mu=1000,
           log=True):
    output = 0

    if log:
        output = math.log(
            (freq_term_in_doc + mu * (freq_term_in_collection / total_words)) /
            (total_words_in_doc + mu))
    else: