def test_read_results_topic_merging(self):
    """Results read back from disk should gain a 'query' column when a dataset or topics frame is supplied."""
    run = pd.DataFrame({
        'qid': ['1', '1', '2'],
        'docno': ['A', 'B', 'C'],
        'score': [0.8, 0.4, 0.6],
        'rank': [1, 2, 1],
    })
    pt.io.write_results(run, 'test.res')
    q1_text = 'measurement of dielectric constant of liquids by the use of microwave techniques'
    q2_text = 'mathematical analysis and design details of waveguide fed microwave radiations'
    # the dataset can be named, passed as an object, or supplied as a topics frame
    loaded_variants = [
        pt.io.read_results('test.res', dataset='vaswani'),
        pt.io.read_results('test.res', dataset=pt.get_dataset('vaswani')),
        pt.io.read_results('test.res', topics=pt.get_dataset('vaswani').get_topics()),
    ]
    for loaded in loaded_variants:
        self.assertEqual(loaded.iloc[0].query, q1_text)
        self.assertEqual(loaded.iloc[1].query, q1_text)
        self.assertEqual(loaded.iloc[2].query, q2_text)
def test_cache_compose_cache(self):
    """A cache wrapping a pipeline that itself contains a cache should hit on repeated queries, and list_cache should report it."""
    pt.cache.CACHE_DIR = self.test_dir
    import pandas as pd
    one_query = pd.DataFrame([["q1", "chemical"]], columns=["qid", "query"])
    tfidf = pt.BatchRetrieve(pt.get_dataset("vaswani").get_index(), wmodel="TF_IDF")
    bm25 = pt.BatchRetrieve(pt.get_dataset("vaswani").get_index(), wmodel="BM25")
    cached_pipe = ~(~tfidf >> bm25)
    self.assertEqual(0, len(cached_pipe.chest._keys))
    # first call populates the cache, second call hits it -> 50% hit rate
    cached_pipe(one_query)
    cached_pipe(one_query)
    self.assertEqual(0.5, cached_pipe.stats())
    # lets see if another cache of the same object would see the same cache entries.
    cached_pipe2 = ~(~tfidf >> bm25)
    cached_pipe2(one_query)
    self.assertEqual(1, cached_pipe2.stats())
    # check that the cache report works
    full_report = pt.cache.list_cache()
    self.assertTrue(len(full_report) > 0)
    entry = list(full_report.values())[0]
    self.assertEqual(1, entry["queries"])
    for key in ("transformer", "size", "lastmodified"):
        self.assertTrue(key in entry)
    pt.cache.CACHE_DIR = None
def test_save_trec_generator(self):
    """write_results should accept a generator of result frames when writing TREC format."""
    engine = pt.BatchRetrieve(pt.get_dataset("vaswani").get_index(), wmodel="TF_IDF")
    out_path = os.path.join(self.test_dir, "test.res")
    topics_sample = pt.get_dataset("vaswani").get_topics().head()
    pt.io.write_results(engine.transform_gen(topics_sample), out_path, format="trec")
def test_fetch_text_docid(self):
    """get_text should resolve docids to metadata for any index-like input."""
    query_df = pd.DataFrame([["q1", "a query", 1]], columns=["qid", "query", "docid"])
    # directory, indexref, str, Index
    index_variants = [
        pt.get_dataset("vaswani").get_index(),
        pt.IndexRef.of(pt.get_dataset("vaswani").get_index()),
        pt.IndexRef.of(pt.get_dataset("vaswani").get_index()).toString(),
        pt.IndexFactory.of(pt.get_dataset("vaswani").get_index()),
    ]
    for indexlike in index_variants:
        fetcher = pt.text.get_text(indexlike, "docno")
        self.assertTrue(isinstance(fetcher, pt.transformer.TransformerBase))
        out = fetcher.transform(query_df)
        self.assertTrue(isinstance(out, pd.DataFrame))
        self.assertTrue("docno" in out.columns)
def test_variants(self):
    """get_topics should expose all topic fields, map a chosen field to 'query' (tokenised or raw), and reject unknown fields."""
    dataset = pt.get_dataset('irds:clueweb09/catb/trec-web-2009')
    with self.subTest('all fields'):
        topics = dataset.get_topics()
        self.assertEqual(
            ['qid', 'query', 'description', 'type', 'subtopics'],
            list(topics.columns))
    with self.subTest('specific field'):
        topics = dataset.get_topics('description')
        # description mapped to query
        self.assertEqual(['qid', 'query'], list(topics.columns))
        self.assertEqual(
            topics.iloc[0]['query'],
            'find information on president barack obama s family history including genealogy national origins places and dates of birth etc'
        )
    # BUG FIX: this subTest previously reused the label 'specific field',
    # making its failures indistinguishable from the tokenised case above
    with self.subTest('specific field, untokenised'):
        topics = dataset.get_topics('description', tokenise_query=False)
        # description mapped to query
        self.assertEqual(['qid', 'query'], list(topics.columns))
        self.assertEqual(
            topics.iloc[0]['query'],
            "Find information on President Barack Obama's family\n history, including genealogy, national origins, places and dates of\n birth, etc.\n "
        )
    with self.subTest('field named query'):
        topics = dataset.get_topics('query')
        self.assertEqual(['qid', 'query'], list(topics.columns))
        self.assertEqual(topics.iloc[0]['query'], 'obama family tree')
    with self.assertRaises(AssertionError):
        dataset.get_topics('field_that_does_not_exist')
def test_save_docs_CE(self):
    """stash_results/reset_results should let Bo1 act as a re-ranker, and the stashed column should pass through pipelines."""
    index = pt.get_dataset("vaswani").get_index()
    dph = pt.BatchRetrieve(index, wmodel="DPH")
    pipe = dph \
        >> pt.rewrite.stash_results() \
        >> pt.BatchRetrieve(index, wmodel="BM25") \
        >> pt.rewrite.Bo1QueryExpansion(index) \
        >> pt.rewrite.reset_results() \
        >> dph
    rtr1 = dph.search("chemical reactions")
    rtr2 = pipe.search("chemical reactions")
    # Bo1 should be applied as a re-ranker, hence the
    # number of docs in rtr1 and rtr2 should be equal
    self.assertEqual(len(rtr1), len(rtr2))

    # check columns are passed through where we expect
    pipeP3 = dph \
        >> pt.rewrite.stash_results() \
        >> pt.BatchRetrieve(index, wmodel="BM25")
    res3 = pipeP3.search("chemical reactions")
    self.assertIn("stashed_results_0", res3.columns)

    pipeP4 = dph \
        >> pt.rewrite.stash_results() \
        >> pt.BatchRetrieve(index, wmodel="BM25") \
        >> pt.rewrite.Bo1QueryExpansion(index)
    # BUG FIX: previously searched pipeP3 again here, so pipeP4 was never exercised
    res4 = pipeP4.search("chemical reactions")
    self.assertIn("stashed_results_0", res4.columns)
def test_scoring_text(self):
    """Index the corpus with a stored 'body' meta field, then score that text with several weighting models."""
    pt.logging("DEBUG")
    dataset = pt.get_dataset("vaswani")
    # The tags from which to save the text. ELSE is special tag name, which means anything not consumed by other tags.
    indexer = pt.TRECCollectionIndexer(
        self.test_dir,
        meta={'docno': 26, 'body': 2048},
        meta_tags={'body': 'ELSE'})
    indexref = indexer.index(dataset.get_corpus())
    index = pt.IndexFactory.of(indexref)
    meta_index = index.getMetaIndex()
    self.assertTrue("body" in meta_index.getKeys())
    self.assertTrue("compact memories have" in meta_index.getItem("body", 0))
    print(meta_index.getItem("body", 1047))
    for wmodel in [
            "org.terrier.python.TestModel$Constant",
            "Tf",
            "org.terrier.python.TestModel$TFOverN",
            "org.terrier.python.TestModel$F",
            "org.terrier.python.TestModel$Nt",
            "DPH"]:
        self._test_scoring_text(dataset, index, wmodel)
def test_parallel_joblib_experiment(self):
    """A parallelised BatchRetrieve must produce the same MAP as the sequential one."""
    self.skip_windows()
    dataset = pt.get_dataset("vaswani")
    retriever = pt.BatchRetrieve(dataset.get_index())
    report = pt.Experiment(
        [retriever, retriever.parallel(3)],
        dataset.get_topics(),
        dataset.get_qrels(),
        ["map", "mrt"])
    self.assertEqual(report.iloc[0]["map"], report.iloc[1]["map"])
def test_scoring_text(self):
    """Configure abstract saving via Terrier properties, then score the stored text with several weighting models."""
    pt.logging("DEBUG")
    dataset = pt.get_dataset("vaswani")
    indexer = pt.TRECCollectionIndexer(self.test_dir)
    abstract_props = {
        "TaggedDocument.abstracts": "body",
        # The tags from which to save the text. ELSE is special tag name, which means anything not consumed by other tags.
        "TaggedDocument.abstracts.tags": "ELSE",
        # The max lengths of the abstracts. Abstracts will be cropped to this length. Defaults to empty.
        "TaggedDocument.abstracts.lengths": "2048",
        "indexer.meta.forward.keys": "docno,body",
        "indexer.meta.forward.keylens": "26,2048",
    }
    indexer.setProperties(**abstract_props)
    indexref = indexer.index(dataset.get_corpus())
    index = pt.IndexFactory.of(indexref)
    meta_index = index.getMetaIndex()
    self.assertTrue("body" in meta_index.getKeys())
    self.assertTrue("compact memories have" in meta_index.getItem("body", 0))
    print(meta_index.getItem("body", 1047))
    for wmodel in [
            "org.terrier.python.TestModel$Constant",
            "Tf",
            "org.terrier.python.TestModel$TFOverN",
            "org.terrier.python.TestModel$F",
            "org.terrier.python.TestModel$Nt",
            "DPH"]:
        self._test_scoring_text(dataset, index, wmodel)
def test_monot5_vaswani(self):
    """Pin the expected monoT5 scores and ordering on a BM25 top-20 for the vaswani corpus."""
    if not pt.started():
        pt.init()
    bm25 = pt.BatchRetrieve(pt.get_dataset('vaswani').get_index(), wmodel='BM25')
    monoT5 = pyterrier_t5.MonoT5ReRanker()
    pipeline = bm25 % 20 >> pt.text.get_text(pt.get_dataset('irds:vaswani'), 'text') >> monoT5
    reranked = pipeline.search('fluid dynamics')
    # (row position, docno, score, rank) at the top, second, and bottom positions
    expectations = [
        (0, '11216', -2.186261, 0),
        (1, '5299', -8.078399, 1),
        (-1, '3442', -12.725513, 19),
    ]
    for pos, docno, score, rank in expectations:
        row = reranked.iloc[pos]
        self.assertEqual(row['docno'], docno)
        self.assertAlmostEqual(row['score'], score, places=4)
        self.assertEqual(row['rank'], rank)
def test_cache_compose(self):
    """Caching a composed pipeline should hit the cache on repeated identical queries."""
    pt.cache.CACHE_DIR = self.test_dir
    import pandas as pd
    one_query = pd.DataFrame([["q1", "chemical"]], columns=["qid", "query"])
    tfidf = pt.BatchRetrieve(pt.get_dataset("vaswani").get_index(), wmodel="TF_IDF")
    bm25 = pt.BatchRetrieve(pt.get_dataset("vaswani").get_index(), wmodel="BM25")
    cached_pipe = ~(tfidf >> bm25)
    self.assertEqual(0, len(cached_pipe.chest._keys))
    # first call populates the cache, second call hits it -> 50% hit rate
    cached_pipe(one_query)
    cached_pipe(one_query)
    self.assertEqual(0.5, cached_pipe.stats())
    # lets see if another cache of the same object would see the same cache entries.
    cached_pipe2 = ~(tfidf >> bm25)
    cached_pipe2(one_query)
    self.assertEqual(1, cached_pipe2.stats())
    pt.cache.CACHE_DIR = None
def test_sliding(self):
    """Sliding-window passaging should yield more indexed passages than source documents."""
    passager = pt.text.sliding("text", 10, 10, prepend_attr=None)
    indexer = pt.IterDictIndexer(self.test_dir)
    pipeline = passager >> indexer
    dataset = pt.get_dataset("irds:vaswani")
    indexref = pipeline.index(dataset.get_corpus_iter())
    self.assertIsNotNone(indexref)
    index = pt.IndexFactory.of(indexref)
    num_passages = index.getCollectionStatistics().getNumberOfDocuments()
    self.assertTrue(num_passages > len(dataset.get_corpus_iter()))
def test_duot5_vaswani(self):
    """Pin the expected duoT5 scores and ordering on a BM25 top-10 for the vaswani corpus."""
    if not pt.started():
        pt.init()
    bm25 = pt.BatchRetrieve(pt.get_dataset('vaswani').get_index(), wmodel='BM25')
    duoT5 = pyterrier_t5.DuoT5ReRanker()
    pipeline = bm25 % 10 >> pt.text.get_text(pt.get_dataset('irds:vaswani'), 'text') >> duoT5
    reranked = pipeline.search('fluid dynamics')
    # (row position, docno, score, rank) at the top, second, and bottom positions
    expectations = [
        (0, '9731', 44.621585, 0),
        (1, '7045', 27.716750, 1),
        (-1, '4767', -9.916206, 9),
    ]
    for pos, docno, score, rank in expectations:
        row = reranked.iloc[pos]
        self.assertEqual(row['docno'], docno)
        self.assertAlmostEqual(row['score'], score, places=4)
        self.assertEqual(row['rank'], rank)
def test_parallel_joblib_experiment_br_callback(self):
    """A BatchRetrieve using a Python callback weighting model must parallelise to the same MAP."""
    self.skip_windows()
    dataset = pt.get_dataset("vaswani")
    # a raw term-frequency weighting model expressed as a Python callback
    tf_model = lambda keyFreq, posting, entryStats, collStats: posting.getFrequency()
    retriever = pt.BatchRetrieve(dataset.get_index(), wmodel=tf_model)
    report = pt.Experiment(
        [retriever, retriever.parallel(3)],
        dataset.get_topics(),
        dataset.get_qrels(),
        ["map", "mrt"])
    self.assertEqual(report.iloc[0]["map"], report.iloc[1]["map"])
def test_TREC_indexing_bad_files_type(self):
    """TRECCollectionIndexer.index should raise ValueError for inputs that are not filename specifications."""
    print("Writing index to " + self.test_dir)
    indexer = pt.TRECCollectionIndexer(self.test_dir)
    with self.assertRaises(ValueError):
        # an integer is not a valid corpus specification
        # (FIX: dropped the unused indexRef assignment — assertRaises means it was never bound)
        indexer.index(5)
    indexer = pt.TRECCollectionIndexer(self.test_dir)
    with self.assertRaises(ValueError):
        # an iterator of dicts is also invalid for a TREC collection indexer
        indexer.index(pt.get_dataset("vaswani").get_corpus_iter())
def test_baseline(self):
    """Experiment with baseline=0 should add per-measure significance columns."""
    dataset = pt.get_dataset("vaswani")
    systems = [
        pt.BatchRetrieve(dataset.get_index(), wmodel="BM25"),
        pt.BatchRetrieve(dataset.get_index(), wmodel="DPH"),
    ]
    report = pt.Experiment(
        systems,
        dataset.get_topics().head(10),
        dataset.get_qrels(),
        eval_metrics=["map", "ndcg"],
        baseline=0)
    for col in ("map +", "map -", "map p-value"):
        self.assertTrue(col in report.columns)
def test_threading_selfupgrade(self):
    """A plain IndexRef should be upgradable to a concurrent one when threads > 1."""
    if not pt.check_version("5.5"):
        self.skipTest("Requires Terrier 5.5")
    topics = pt.get_dataset("vaswani").get_topics().head(10)
    # this test ensures we can upgrade the indexref to be concurrent
    JIR = pt.autoclass('org.terrier.querying.IndexRef')
    indexref = JIR.of(self.here + "/fixtures/index/data.properties")
    retr = pt.BatchRetrieve(indexref, threads=5)
    # FIX: dropped the unused 'result' binding — we only check the threaded
    # transform completes without raising
    retr.transform(topics)
def test_fetch_text_irds(self):
    """get_text over an ir_datasets source should attach document text keyed by docno."""
    queries = pd.DataFrame([["q1", "a query", "4"]], columns=["qid", "query", "docno"])
    fetcher = pt.text.get_text(pt.get_dataset('irds:vaswani'), "text")
    self.assertTrue(isinstance(fetcher, pt.transformer.TransformerBase))
    enriched = fetcher.transform(queries)
    self.assertTrue(isinstance(enriched, pd.DataFrame))
    self.assertTrue("text" in enriched.columns)
    self.assertTrue(
        "the british computer society report of a conference held in cambridge\njune\n" in enriched.iloc[0].text)
def test_parallel_joblib_ops(self):
    """Various composed pipelines must return the same number of results when parallelised."""
    dataset = pt.get_dataset("vaswani")
    topics = dataset.get_topics().head(3)
    dph = pt.BatchRetrieve(dataset.get_index())
    tf = pt.BatchRetrieve(dataset.get_index(), wmodel="Tf")
    # cover plain retrieval, rank cutoff, composition, linear combination, and a query rewrite
    candidate_pipes = [
        dph,
        dph % 10,
        dph >> tf,
        dph + tf,
        pt.apply.query(lambda row: row["query"] + " chemical") >> dph,
    ]
    for pipe in candidate_pipes:
        sequential_res = pipe(topics)
        parallel_res = pipe.parallel(3)(topics)
        self.assertEqual(len(sequential_res), len(parallel_res))
def test_gridsearch(self):
    """GridSearch should select the best PL2 'c' value on a handful of topics."""
    dataset = pt.get_dataset("vaswani")
    pl2 = pt.BatchRetrieve(dataset.get_index(), wmodel="PL2", controls={'c': 1})
    tuned = pt.pipelines.GridSearch(
        pl2,
        {pl2: {'c': [0.1, 1, 5, 10, 20, 100]}},
        dataset.get_topics().head(5),
        dataset.get_qrels())
    self.assertEqual(100, tuned.get_parameter("c"))
def test_webtrack_cw09(self):
    """Each ClueWeb09 web-track year should provide topics that match its adhoc qrels."""
    import pyterrier as pt
    for year in ["trec-wt-2009", "trec-wt-2010", "trec-wt-2011", "trec-wt-2012"]:
        ds = pt.get_dataset(year)
        topics = ds.get_topics()
        qrels = ds.get_qrels("adhoc")
        # check that the qrels match the topics.
        joined = topics.merge(qrels, on=["qid"])
        self.assertTrue(len(joined) > 0)
def test_gridscan_1param(self):
    """GridScan over a single parameter should return one entry per candidate value."""
    dataset = pt.get_dataset("vaswani")
    pl2 = pt.BatchRetrieve(dataset.get_index(), wmodel="PL2", controls={'c': 1})
    self.assertEqual(1, pl2.get_parameter('c'))
    c_values = [0.1, 1, 5, 10, 20, 100]
    settings = pt.GridScan(
        pl2,
        {pl2: {'c': c_values}},
        dataset.get_topics().head(5),
        dataset.get_qrels(),
        dataframe=False)
    self.assertEqual(len(c_values), len(settings))
def test_save_docs_QE(self):
    """With clear=False, stash_results keeps the first-pass docs so Bo1 re-ranks rather than re-retrieves."""
    index = pt.get_dataset("vaswani").get_index()
    dph = pt.BatchRetrieve(index, wmodel="DPH")
    reranking_pipe = (
        dph
        >> pt.rewrite.stash_results(clear=False)
        >> pt.rewrite.Bo1QueryExpansion(index)
        >> pt.rewrite.reset_results()
        >> dph)
    baseline_res = dph.search("chemical reactions")
    reranked_res = reranking_pipe.search("chemical reactions")
    # Bo1 should be applied as a re-ranker, hence the
    # number of docs in both result sets should be equal
    self.assertEqual(len(baseline_res), len(reranked_res))
def test_sliding_title_one(self):
    """A two-token document with window=2 should produce exactly one title-prepended passage."""
    corpus = [{"docno": "d1", "text": "A B", "title": "this is a title"}]
    slider = pt.text.sliding("text", 2, 1, prepend_attr="title")
    indexer = pt.IterDictIndexer(self.test_dir)
    pipeline = slider >> indexer
    # FIX: removed an unused pt.get_dataset("irds:vaswani") lookup — the test
    # indexes the in-memory corpus above, not the dataset
    indexref = pipeline.index(corpus)
    self.assertIsNotNone(indexref)
    index = pt.IndexFactory.of(indexref)
    # we should get 1 passage in the resulting index
    self.assertEqual(1, index.getCollectionStatistics().getNumberOfDocuments())
def test_baseline_and_tests(self):
    """Significance columns should appear for the default t-test, wilcoxon, and a user-supplied test function.

    Uses pre-computed result frames (res1/res2) rather than transformers,
    exercising the results-as-input path of pt.Experiment.
    """
    dataset = pt.get_dataset("vaswani")
    numt=10
    res1 = pt.BatchRetrieve(dataset.get_index(), wmodel="BM25")(dataset.get_topics().head(numt))
    res2 = pt.BatchRetrieve(dataset.get_index(), wmodel="DPH")(dataset.get_topics().head(numt))

    # t-test (the default paired test); warnings suppressed as few topics are used
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        df = pt.Experiment(
            [res1, res2],
            dataset.get_topics().head(numt),
            dataset.get_qrels(),
            eval_metrics=["map", "ndcg"],
            baseline=0)
    self.assertTrue("map +" in df.columns)
    self.assertTrue("map -" in df.columns)
    self.assertTrue("map p-value" in df.columns)

    # wilcoxon signed-rank test
    df = pt.Experiment(
        [res1, res2],
        dataset.get_topics().head(numt),
        dataset.get_qrels(),
        eval_metrics=["map", "ndcg"],
        test='wilcoxon',
        baseline=0)
    self.assertTrue("map +" in df.columns)
    self.assertTrue("map -" in df.columns)
    self.assertTrue("map p-value" in df.columns)

    # user-specified TOST
    # TOST will omit warnings here, due to low numbers of topics
    import statsmodels.stats.weightstats
    # custom test returning (statistic, p-value) as pt.Experiment expects
    fn = lambda X,Y: (0, statsmodels.stats.weightstats.ttost_ind(X, Y, -0.01, 0.01)[0])
    # NOTE(review): this filter doesn't appear to suppress the TOST warnings
    with warnings.catch_warnings(record=True) as w:
        warnings.filterwarnings("always")
        df = pt.Experiment(
            [res1, res2],
            dataset.get_topics().head(numt),
            dataset.get_qrels(),
            eval_metrics=["map", "ndcg"],
            test=fn,
            baseline=0)
        print(w)
        self.assertTrue("map +" in df.columns)
        self.assertTrue("map -" in df.columns)
        self.assertTrue("map p-value" in df.columns)
def test_baseline_corrected(self):
    """Each supported multiple-testing correction should add corrected p-value and reject columns."""
    dataset = pt.get_dataset("vaswani")
    for corr in ['hs', 'bonferroni', 'holm-sidak']:
        df = pt.Experiment(
            [pt.BatchRetrieve(dataset.get_index(), wmodel="BM25"),
             pt.BatchRetrieve(dataset.get_index(), wmodel="DPH")],
            dataset.get_topics().head(10),
            dataset.get_qrels(),
            eval_metrics=["map", "ndcg"],
            baseline=0,
            # BUG FIX: was hard-coded to correction='hs', so the loop never
            # actually tested 'bonferroni' or 'holm-sidak'
            correction=corr)
        self.assertTrue("map +" in df.columns)
        self.assertTrue("map -" in df.columns)
        self.assertTrue("map p-value" in df.columns)
        self.assertTrue("map p-value corrected" in df.columns)
        self.assertTrue("map reject" in df.columns)
def test_fetch_text_irds(self):
    """get_text should handle repeated docnos, attaching the correct text to every row."""
    queries = pd.DataFrame([
        ["q1", "a query", "4"],
        ["q1", "a query", "1"],
        ["q1", "a query", "4"],
    ], columns=["qid", "query", "docno"])
    fetcher = pt.text.get_text(pt.get_dataset('irds:vaswani'), "text")
    self.assertTrue(isinstance(fetcher, pt.transformer.TransformerBase))
    enriched = fetcher.transform(queries)
    self.assertTrue(isinstance(enriched, pd.DataFrame))
    self.assertTrue("text" in enriched.columns)
    doc4_text = "the british computer society report of a conference held in cambridge\njune\n"
    doc1_text = "compact memories have flexible capacities a digital data storage\nsystem with capacity up to bits and random and or sequential access\nis described\n"
    # rows 0 and 2 reference the same document and must receive the same text
    self.assertTrue(doc4_text in enriched.iloc[0].text)
    self.assertTrue(doc1_text in enriched.iloc[1].text)
    self.assertTrue(doc4_text in enriched.iloc[2].text)
def test_scoring_manual_background(self):
    """TextScorer should score supplied text against a background index's statistics."""
    # FIX (idiom): renamed local from 'input', which shadowed the builtin
    df_input = pd.DataFrame([["q1", "fox", "d1", "all the fox were fox"]],
                            columns=["qid", "query", "docno", "body"])
    from pyterrier.batchretrieve import TextScorer
    scorer = TextScorer(
        wmodel="Tf",
        background_index=pt.get_dataset("vaswani").get_index())
    rtr = scorer(df_input)
    self.assertEqual(1, len(rtr))
    self.assertTrue("score" in rtr.columns)
    # "fox" occurs twice in the document body
    self.assertEqual(2, rtr.iloc[0]["score"])

    index_background = pt.IndexFactory.of(
        pt.get_dataset("vaswani").get_index())
    scorer = TextScorer(wmodel="org.terrier.python.TestModel$TFOverN",
                        background_index=index_background)
    rtr = scorer(df_input)
    self.assertEqual(1, len(rtr))
    self.assertTrue("score" in rtr.columns)
    # TF/N: term frequency divided by the background collection's document count
    self.assertEqual(
        2 / index_background.getCollectionStatistics().getNumberOfDocuments(),
        rtr.iloc[0]["score"])
def test_webtrack_gov(self):
    """GOV web-track datasets should provide matching topics and qrels for each task type."""
    import pyterrier as pt
    for k in ["trec-wt-2002", "trec-wt-2003", "trec-wt-2004"]:
        ds = pt.get_dataset(k)
        for t in ["np", "td", "hp"]:
            # BUG FIX: the guard previously read `if k != "trec-wt-2004": continue`,
            # which skipped ALL task types for the 2002/2003 datasets, leaving them
            # completely untested. Per the original comment, only homepage finding
            # ("hp") is specific to the 2004 edition.
            if t == "hp" and k != "trec-wt-2004":
                continue
            topics = ds.get_topics(t)
            qrels = ds.get_qrels(t)
            # check that the qrels qid match the topics.
            join = topics.merge(qrels, on=["qid"])
            self.assertTrue(len(join) > 0)
def test_add_dup(self):
    """Duplicating every document before indexing should double the indexed document count."""
    def _duplicate(df):
        # emit each document twice, the copy with a distinct "bis" docno suffix
        clone = df.copy()
        clone["docno"] = clone["docno"] + "bis"
        return pd.concat([df, clone])

    pipeline = pt.apply.generic(_duplicate) >> pt.IterDictIndexer(self.test_dir)
    dataset = pt.get_dataset("irds:vaswani")
    indexref = pipeline.index(dataset.get_corpus_iter())
    self.assertIsNotNone(indexref)
    index = pt.IndexFactory.of(indexref)
    self.assertEqual(
        index.getCollectionStatistics().getNumberOfDocuments(),
        2 * len(dataset.get_corpus_iter()))