def compute_idf(query_terms: List[str], index_reader: "IndexReader") -> np.ndarray:
    """Return the probabilistic IDF of every query term.

    For each term the score is::

        log((|C| - df(term) + 0.5) / (df(term) + 0.5))

    where ``|C|`` is ``index_reader.stats()['documents']`` and ``df(term)``
    is the first element returned by ``index_reader.get_term_counts``.
    The natural logarithm is used.

    Args:
        query_terms: Terms to score, in the order the scores are returned.
        index_reader: Object exposing ``stats()`` and
            ``get_term_counts(term, analyzer=...)``.

    Returns:
        A 1-D float array with one IDF value per entry of ``query_terms``
        (shape ``(0,)`` for an empty list). Values are negative for terms
        whose df exceeds half of ``|C|``.
    """
    num_docs = index_reader.stats()['documents']

    # analyzer=None is forwarded as in the original call.
    # NOTE(review): presumably this looks the term up without stemming or
    # stop-word removal -- confirm against the pyserini version in use.
    doc_freqs = np.array(
        [index_reader.get_term_counts(term, analyzer=None)[0] for term in query_terms],
        dtype=float,
    )

    # The +0.5 smoothing keeps the denominator non-zero when df == 0.
    return np.log((num_docs - doc_freqs + 0.5) / (doc_freqs + 0.5))
import math

import numpy
from pyserini.index import IndexReader

# NOTE(review): the pasted source had lost its ':' characters and the
# arithmetic operators between operands. Divisions were restored from the
# usual IDF / ICTF / SCS definitions; see SCQ for the one assumed product.

# Opened once at import time and shared by every function below.
index_reader = IndexReader('marcoindex')

# Hard-coded collection statistics used as the numerators of IDF and ICTF.
# NOTE(review): presumably the document count and total token count of the
# 'marcoindex' collection -- confirm they match the index actually opened.
number_of_docs = 8841823
number_of_all_terms = 491404850


def IDF(term: str) -> float:
    """Return the inverse document frequency of *term*: log10(N / df).

    ``N`` is ``number_of_docs`` and ``df`` is the first value returned by
    ``index_reader.get_term_counts(term)``.

    Raises:
        ZeroDivisionError: If the index reports a document frequency of 0.
    """
    df, _cf = index_reader.get_term_counts(term)
    return math.log10(number_of_docs / df)


def ictf(term: str) -> float:
    """Return the inverse collection term frequency: log10(T / cf).

    ``T`` is ``number_of_all_terms`` and ``cf`` is the second value returned
    by ``index_reader.get_term_counts(term)``.

    Raises:
        ZeroDivisionError: If the index reports a collection frequency of 0.
    """
    _df, cf = index_reader.get_term_counts(term)
    return math.log10(number_of_all_terms / cf)


def SCS(query: str) -> float:
    """Return the simplified clarity score of a whitespace-separated query.

    Computed as ``log10(1 / len(terms)) + mean(ictf(t) for t in terms)``.

    Raises:
        ZeroDivisionError: If *query* contains no terms, or if any term has
            a collection frequency of 0 (propagated from ``ictf``).
    """
    q_terms = query.split()
    # ictf() takes only the term; the reader is the module-level one.
    term_ictfs = [ictf(t) for t in q_terms]
    part_a = math.log10(1 / len(q_terms))
    part_b = numpy.mean(term_ictfs)
    return part_a + part_b


def SCQ(term: str) -> float:
    """Return the collection-query similarity of *term*.

    Computed as ``(1 + log10(cf)) * IDF(term)``.

    Raises:
        ValueError: If the index reports a collection frequency of 0
            (``math.log10(0)`` is a domain error).
        ZeroDivisionError: If the document frequency is 0 (from ``IDF``).
    """
    _df, cf = index_reader.get_term_counts(term)
    part_a = 1 + math.log10(cf)
    part_b = IDF(term)
    # NOTE(review): the operator between the two factors was missing in the
    # source; the product is assumed from the usual SCQ definition -- confirm.
    return part_a * part_b