def test_okapi(self):
    """Smoke-test Okapi BM25 ranking against known index contents."""
    okapi_env = pyndri.OkapiQueryEnvironment(self.index)

    # Single-hit query: exactly one document matches 'ipsum'.
    self.assertEqual(
        okapi_env.query('ipsum'),
        ((1, 0.691753771033259),),
    )

    # Multi-hit query: results come back ordered by descending score.
    self.assertEqual(
        okapi_env.query('his'),
        ((3, -0.3292246306130194), (2, -0.7195255702901702)),
    )
def main(argv):
    """Build a training query set with BM25 pseudo-relevance labels.

    Reads AOL-style query-log files (``user-ct-test-collection-*``) from the
    folder given as the first CLI argument, drops URL-like queries, strips
    punctuation, then writes:

    * ``training_query_set.txt``     -- one ``<id> <query>`` line per query
    * ``training_pseudo_labels.txt`` -- ``<id> <docno> <score>`` lines for the
      top 1000 results of each query that returns at least 10 documents.

    Args:
        argv: command-line arguments; ``argv[1]`` must be the query-log folder.
    """
    # argv[0] is the script name, so a folder argument needs len(argv) >= 2.
    # The original `< 1` check could never fire and argv[1] would raise.
    if len(argv) < 2:
        print("Invalid configuration file.")
        sys.exit(1)  # non-zero exit: this is an error, not success
    query_log_fold = argv[1]

    print("Generating candidate queries")
    candidate_queries = set()  # build the set directly; de-dupes as we go
    # Strip every character that is neither word nor whitespace (plus '_').
    pattern = re.compile(r'([^\s\w]|_)+')
    for query_log_file in glob.glob(query_log_fold + "user-ct-test-collection-*"):
        # Context manager closes each gzip file as soon as it is consumed
        # (the original only closed the last file, after the loop).
        with gzip.open(query_log_file) as f:
            f.readline()  # skip the TSV header line
            for line in f:
                fields = line.decode("utf-8").split("\t")
                query_string = fields[1]
                if is_url_substring(query_string):
                    continue
                candidate_queries.add(pattern.sub('', query_string))
    print("Found {} candidate queries".format(len(candidate_queries)))

    print("Generating pseudo labels")
    # NOTE(review): `config` is presumably a module-level dict loaded
    # elsewhere in this file -- confirm it is set before main() is called.
    with open("training_query_set.txt", encoding='utf-8', mode="w") as f_query, \
            open("training_pseudo_labels.txt", "w") as f_label, \
            pyndri.open(config["index"]) as index:
        i = 0
        # NOTE(review): bm25_query_env is constructed but the queries below
        # go through `index.query` -- confirm whether BM25 scoring was meant.
        bm25_query_env = pyndri.OkapiQueryEnvironment(
            index, k1=1.2, b=0.75, k3=1000)
        for candidate_query in candidate_queries:
            try:
                results = index.query(candidate_query, results_requested=1000)
            except Exception:  # was a bare except; keep the best-effort skip
                print(candidate_query)
                continue
            if len(results) < 10:
                continue  # too few hits to make useful pseudo labels
            f_query.write("{} {}\n".format(i, candidate_query))
            for docid, score in results:
                docno, _ = index.document(docid)
                f_label.write("{} {} {}\n".format(i, docno, score))
            i += 1
    print("Finished with {} queries".format(i))
# Dump index statistics as shell `export` lines.
# NOTE(review): `prefix`, `mean`, `min_`, `max_`, `std`, and `index` are
# defined earlier in the file (outside this view) -- confirm before reuse.
print('export {}LENGTH_MEAN={}'.format(prefix, mean))
print('export {}LENGTH_MIN={}'.format(prefix, min_))
print('export {}LENGTH_MAX={}'.format(prefix, max_))
print('export {}LENGTH_STD={}'.format(prefix, std))
print('export {}TOTAL_TERMS={}'.format(prefix, index.total_terms()))
print('export {}UNIQUE_TERMS={}'.format(prefix, index.unique_terms()))

# Demo: open the index given on the command line and run the same query
# through three different retrieval models.
with pyndri.open(sys.argv[1]) as index:
    # Constructs a QueryEnvironment that uses a
    # language model with Dirichlet smoothing.
    lm_query_env = pyndri.QueryEnvironment(
        index, rules=('method:dirichlet,mu:5000', ))
    # NOTE(review): results_requested=-5 looks like a typo (a negative
    # result count) -- confirm the intended value.
    print(
        lm_query_env.query('hello world',
                           results_requested=-5,
                           include_snippets=True))

    # Constructs a QueryEnvironment that uses the TF-IDF retrieval model.
    #
    # See "Baseline (non-LM) retrieval"
    # (https://lemurproject.org/doxygen/lemur/html/IndriRunQuery.html)
    tfidf_query_env = pyndri.TFIDFQueryEnvironment(index)
    print(tfidf_query_env.query('hello world'))

    # Constructs a QueryEnvironment that uses the Okapi BM25 retrieval model.
    #
    # See "Baseline (non-LM) retrieval"
    # (https://lemurproject.org/doxygen/lemur/html/IndriRunQuery.html)
    bm25_query_env = pyndri.OkapiQueryEnvironment(index)
    print(bm25_query_env.query('hello world'))
t = time.time()

# Read the query validation set (one query per line).
filename = "data/validation_set/query_validation_set.txt"
base_filename, file_extension = os.path.splitext(filename)
output = f'{base_filename}.csv'
# Context manager releases the handle deterministically, and we avoid
# shadowing the builtin `input` (the original did both).
with open(filename, "r") as query_file:
    lines = query_file.readlines()

# Open the corpus index.
index = pyndri.Index('Vol45/Vol45-index')

# BM25 query environment with standard Okapi parameters.
bm25_query_env = pyndri.OkapiQueryEnvironment(index, k1=1.2, b=0.75, k3=1000)

# Retrieve documents and BM25 scores for every query.
# Collect per-query frames and concatenate once: repeated pd.concat inside
# the loop is quadratic in the total number of rows.
frames = []
for line in lines:
    query = line.rstrip()
    list_of_series = getDocuments(index, bm25_query_env, query)
    frames.append(pd.DataFrame(list_of_series))
df = pd.concat(frames)
df.columns = ['topic', 'query', 'document_name', 'document_score']

# uncomment if you want to write queries and documents to csv
#df.to_csv(output, index=False, chunksize=1000)

# format output for trec_eval
input_size = df.shape[0]