Example #1
    def test_okapi(self):
        env = pyndri.OkapiQueryEnvironment(self.index)

        self.assertEqual(env.query('ipsum'), ((1, 0.691753771033259), ))

        self.assertEqual(env.query('his'),
                         ((3, -0.3292246306130194), (2, -0.7195255702901702)))
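
The tuples returned by OkapiQueryEnvironment.query() pair an internal document id with a BM25 score. As a minimal sketch (the index path below is a placeholder), the internal ids can be mapped back to external document identifiers with index.document():

import pyndri

with pyndri.open('path/to/index') as index:  # placeholder path
    bm25_env = pyndri.OkapiQueryEnvironment(index)
    for int_doc_id, score in bm25_env.query('ipsum'):
        # index.document() returns (external document id, term id sequence).
        ext_doc_id, _ = index.document(int_doc_id)
        print(ext_doc_id, score)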
Example #2
import glob
import gzip
import re
import sys

import pyndri


def main(argv):

    if len(argv) < 2:
        print("Invalid configuration file.")
        sys.exit(1)

    query_log_fold = argv[1]

    print("Generating candidate queries")
    candidate_queries = []
    pattern = re.compile(r'([^\s\w]|_)+')
    for query_log_file in glob.glob(query_log_fold +
                                    "user-ct-test-collection-*"):
        f = gzip.open(query_log_file)
        # skip first line
        f.readline()
        for line in f:
            line = line.decode("utf-8").split("\t")
            query_string = line[1]
            # is_url_substring is a helper defined elsewhere in the original module.
            if is_url_substring(query_string):
                continue
            query_string = pattern.sub('', query_string)
            candidate_queries.append(query_string)
        f.close()
    candidate_queries = set(candidate_queries)
    print("Found {} candidate queries".format(len(candidate_queries)))

    print("Generating pseudo labels")
    f_query = open("training_query_set.txt", encoding='utf-8', mode="w")
    f_label = open("training_pseudo_labels.txt", "w")
    # 'config' is assumed to be populated elsewhere in the original module,
    # e.g. from the configuration file passed on the command line.
    with pyndri.open(config["index"]) as index:
        i = 0
        bm25_query_env = pyndri.OkapiQueryEnvironment(index,
                                                      k1=1.2,
                                                      b=0.75,
                                                      k3=1000)
        for candidate_query in candidate_queries:
            try:
                results = bm25_query_env.query(candidate_query,
                                               results_requested=1000)
            except Exception:
                print(candidate_query)
                continue
            if len(results) < 10:
                continue
            f_query.write("{} {}\n".format(i, candidate_query))
            for docid, score in results:
                docno, _ = index.document(docid)
                f_label.write("{} {} {}\n".format(i, docno, score))
            i += 1
        f_query.close()
        f_label.close()
        print("Finished with {} queries".format(i))
Example #3
    # prefix, mean, min_, max_ and std are length statistics computed earlier
    # in the original script; they are printed here as shell export statements.
    print('export {}LENGTH_MEAN={}'.format(prefix, mean))
    print('export {}LENGTH_MIN={}'.format(prefix, min_))
    print('export {}LENGTH_MAX={}'.format(prefix, max_))
    print('export {}LENGTH_STD={}'.format(prefix, std))
    print('export {}TOTAL_TERMS={}'.format(prefix, index.total_terms()))
    print('export {}UNIQUE_TERMS={}'.format(prefix, index.unique_terms()))

    with pyndri.open(sys.argv[1]) as index:
        # Constructs a QueryEnvironment that uses a
        # language model with Dirichlet smoothing.
        lm_query_env = pyndri.QueryEnvironment(
            index, rules=('method:dirichlet,mu:5000', ))
        print(
            lm_query_env.query('hello world',
                               results_requested=5,
                               include_snippets=True))

        # Constructs a QueryEnvironment that uses the TF-IDF retrieval model.
        #
        # See "Baseline (non-LM) retrieval"
        # (https://lemurproject.org/doxygen/lemur/html/IndriRunQuery.html)
        tfidf_query_env = pyndri.TFIDFQueryEnvironment(index)
        print(tfidf_query_env.query('hello world'))

        # Constructs a QueryEnvironment that uses the Okapi BM25 retrieval model.
        #
        # See "Baseline (non-LM) retrieval"
        # (https://lemurproject.org/doxygen/lemur/html/IndriRunQuery.html)
        bm25_query_env = pyndri.OkapiQueryEnvironment(index)
        print(bm25_query_env.query('hello world'))
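
The environments above use Indri's default parameters; as other snippets in this collection show, the Okapi environment also accepts explicit k1, b and k3 values. A minimal sketch (placeholder index path):

import pyndri

with pyndri.open('path/to/index') as index:  # placeholder path
    tuned_bm25_env = pyndri.OkapiQueryEnvironment(index, k1=1.2, b=0.75, k3=1000)
    print(tuned_bm25_env.query('hello world', results_requested=10))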
Example #4
import os
import time

import pandas as pd
import pyndri

t = time.time()

# read query validation set
filename = "data/validation_set/query_validation_set.txt"
base_filename, file_extension = os.path.splitext(filename)
output = f'{base_filename}.csv'
with open(filename, "r") as input_file:
    lines = input_file.readlines()

# index of corpus
index = pyndri.Index('Vol45/Vol45-index')

# define bm25 query environment
bm25_query_env = pyndri.OkapiQueryEnvironment(index, k1=1.2, b=0.75, k3=1000)

# retrieve documents and bm25 score
df = pd.DataFrame()
for line in lines:
    query = line.rstrip()
    # getDocuments is a helper defined elsewhere in the original script; it is
    # assumed to return one row per retrieved document for the given query.
    list_of_series = getDocuments(index, bm25_query_env, query)
    df = pd.concat([df, pd.DataFrame(list_of_series)])

df.columns = ['topic', 'query', 'document_name', 'document_score']

# uncomment if you want to write queries and documents to csv
#df.to_csv(output, index=False, chunksize=1000)

# format output for trec_eval
input_size = df.shape[0]
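
The trec_eval run format expects, per line: topic id, the literal "Q0", document name, rank, score, and a run tag. A minimal sketch of writing df in that format, assuming the column names defined above and a hypothetical run tag "bm25":

run_output = f'{base_filename}.run'  # hypothetical output path
with open(run_output, "w") as run_file:
    for topic, group in df.groupby('topic', sort=False):
        ranked = group.sort_values('document_score', ascending=False)
        for rank, row in enumerate(ranked.itertuples(index=False), start=1):
            run_file.write(f'{topic} Q0 {row.document_name} {rank} {row.document_score} bm25\n')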