Beispiel #1
0
def compute_idf(query_terms: List[str],
                index_reader: IndexReader) -> np.ndarray:
    """Return one IDF weight per query term.

    Each weight is ``log((|C| - df + 0.5) / (df + 0.5))``, where ``|C|`` is
    the ``'documents'`` entry of ``index_reader.stats()`` and ``df`` is the
    first element returned by ``index_reader.get_term_counts`` for the term.

    Args:
        query_terms: Terms to score; each is looked up with ``analyzer=None``.
        index_reader: Source of the collection statistics and term counts.

    Returns:
        Float array with one entry per term, in input order.
    """
    num_docs = index_reader.stats()['documents']

    # Gather every document frequency first, then apply the formula to the
    # whole vector at once rather than filling the result slot by slot.
    doc_freqs = np.array(
        [index_reader.get_term_counts(term, analyzer=None)[0]
         for term in query_terms],
        dtype=float,
    )
    return np.log((num_docs - doc_freqs + 0.5) / (doc_freqs + 0.5))
Beispiel #2
0
from pyserini.index import IndexReader
import math, numpy


# Reader opened on the index named 'marcoindex'.
index_reader = IndexReader('marcoindex')

# Hard-coded collection statistics read by the functions in this module.
# NOTE(review): presumably the document count and total term occurrences of
# the index opened above — confirm against index_reader.stats().
number_of_docs = 8_841_823
number_of_all_terms = 491_404_850

def IDF(term):
    """Return the base-10 inverse document frequency of ``term``.

    Computed as ``log10(number_of_docs / df)``, where ``df`` is the first
    value returned by the module-level ``index_reader.get_term_counts`` and
    ``number_of_docs`` is the module-level collection size.

    Args:
        term: Term to look up in the index.

    Returns:
        The IDF value as a float.

    Raises:
        ZeroDivisionError: If the reader reports a document frequency of 0.
    """
    # Only the document frequency is needed; the collection frequency is
    # unpacked and ignored.
    df, _cf = index_reader.get_term_counts(term)
    # NOTE(review): the operator was missing in the original text
    # ("number_of_docsdf"); division restored per the usual IDF definition.
    return math.log10(number_of_docs / df)

def ictf(term):
    """Return the base-10 inverse collection term frequency of ``term``.

    Computed as ``log10(number_of_all_terms / cf)``, where ``cf`` is the
    second value returned by the module-level
    ``index_reader.get_term_counts`` and ``number_of_all_terms`` is the
    module-level total term count.

    Args:
        term: Term to look up in the index.

    Returns:
        The ICTF value as a float.

    Raises:
        ZeroDivisionError: If the reader reports a collection frequency of 0.
    """
    # Only the collection frequency is needed here.
    _df, cf = index_reader.get_term_counts(term)
    # NOTE(review): the operator between the two operands was missing in the
    # original text; division restored per the usual ICTF definition.
    return math.log10(number_of_all_terms / cf)

def SCS(query):
    """Return the simplified clarity score of a whitespace-separated query.

    The score is ``log10(1 / n) + mean(ictf(t) for t in terms)``, where
    ``terms`` is ``query.split()`` and ``n`` is the number of terms.

    Args:
        query: Query string; split on whitespace into terms.

    Returns:
        The score as a float.

    Raises:
        ZeroDivisionError: If ``query`` contains no terms.
    """
    q_terms = query.split()
    # ictf() takes only the term; it reads the module-level index reader
    # itself, so the reader must not be passed as an argument.
    ictf_values = [ictf(t) for t in q_terms]

    # NOTE(review): the operator in "1   len(q_terms)" was missing in the
    # original text; division restored (uniform query-term probability).
    part_A = math.log10(1 / len(q_terms))
    part_B = numpy.mean(ictf_values)
    return part_A + part_B

def SCQ(term):
    """Return the collection-query similarity (SCQ) score of ``term``.

    Computed as ``(1 + log10(cf)) * IDF(term)``, where ``cf`` is the second
    value returned by the module-level ``index_reader.get_term_counts``.

    Args:
        term: Term to look up in the index.

    Returns:
        The score as a float.

    Raises:
        ValueError: If the reader reports a collection frequency of 0
            (``math.log10(0)`` is undefined).
    """
    # Only the collection frequency is used directly; IDF() looks up the
    # document frequency on its own.
    _df, cf = index_reader.get_term_counts(term)
    part_A = 1 + math.log10(cf)
    # IDF() takes only the term, so the reader is not passed along.
    part_B = IDF(term)
    # NOTE(review): the operator between part_A and part_B was missing in the
    # original text; multiplication restored per the usual SCQ definition —
    # confirm.
    return part_A * part_B