def _test_vaswani(self, mhs, TWO_D=False):
    """Index the Vaswani TREC corpus into ``mhs`` and exercise its similarity API.

    Args:
        mhs: object exposing ``index(iter)`` and ``transform(df)``; when
            TWO_D=True it must also expose ``pairwise_sim()``,
            ``getTriangleIndex(i, j)`` and ``docNames``.
        TWO_D(bool): also scan the full pairwise similarity matrix for the
            most similar document pair.
    """
    corpus_file = pt.datasets.get_dataset("vaswani").get_corpus()[0]
    Arrays = pt.autoclass("java.util.Arrays")
    corpus_file_list = Arrays.asList(corpus_file)
    # configure Terrier's TREC collection parser tags before opening the corpus
    trec_properties = {
        "TrecDocTags.doctag": "DOC",
        "TrecDocTags.idtag": "DOCNO",
        "TrecDocTags.skip": "DOCHDR",
        "TrecDocTags.casesensitive": "false",
        "trec.collection.class": "TRECCollection",
    }
    for k, v in trec_properties.items():
        pt.ApplicationSetup.setProperty(k, v)
    corpus = pt.autoclass("org.terrier.indexing.TRECCollection")(
        corpus_file_list,
        pt.autoclass("org.terrier.utility.TagSet").TREC_DOC_TAGS, "", "")

    def _get_text(d):
        # concatenate every term of a Terrier Document into a single string
        terms = []
        while not d.endOfDocument():
            t = d.getNextTerm()
            if t is None:
                continue
            terms.append(t)
        return " ".join(terms)

    def _corpus_iter():
        # yield (docno, text) pairs for each document in the collection
        while corpus.nextDocument():
            doc = corpus.getDocument()
            text = _get_text(doc)
            docno = doc.getProperty("docno")
            yield docno, text

    mhs.index(_corpus_iter())
    if TWO_D:
        # brute-force scan of the condensed (1-D triangular) similarity
        # matrix for the most similar document pair
        oneDmatrix = mhs.pairwise_sim()
        maxPos = None
        maxSim = -1
        numDocs = len(mhs.docNames)
        for i in range(0, numDocs):
            # For each of the other test documents...
            for j in range(i + 1, numDocs):
                if oneDmatrix[mhs.getTriangleIndex(i, j)] > maxSim:
                    maxSim = oneDmatrix[mhs.getTriangleIndex(i, j)]
                    maxPos = (i, j)
        print("Most similar pair is %s, with sim %f" % (str(maxPos), maxSim))
    input = pd.DataFrame([["17"]], columns=["docno"])
    rtr = mhs.transform(input)
    print(rtr)
def test_num_manual_wmodel(self):
    """A Java weighting-model instance should be usable as the wmodel argument."""
    tf_model = pt.autoclass("org.terrier.matching.models.Tf")()
    index_ref = pt.autoclass('org.terrier.querying.IndexRef').of(
        self.here + "/fixtures/index/data.properties")
    from jnius import JavaException
    try:
        retriever = pt.BatchRetrieve(index_ref, wmodel=tf_model)
        topics = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
        retriever.transform(topics)
    except JavaException as ja:
        print(ja.stacktrace)
        raise ja
def test_fbr_reranking(self):
    """FeaturesBatchRetrieve re-ranking (ScoringMatchingWithFat) must produce
    the same feature values as the equivalent ``**`` feature-union pipeline."""
    if not pt.check_version("5.3"):
        self.skipTest("Requires Terrier 5.3")
    # this test examines the use of ScoringMatchingWithFat
    JIR = pt.autoclass('org.terrier.querying.IndexRef')
    indexref = JIR.of(self.here + "/fixtures/index/data.properties")
    # we only want a candidate set of 2 documents
    firstpass = pt.BatchRetrieve(indexref, wmodel="BM25") % 2
    pipe = firstpass >> pt.FeaturesBatchRetrieve(
        indexref, features=["WMODEL:DPH", "WMODEL:PL2"])
    input = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
    result = pipe.transform(input)
    self.assertTrue("qid" in result.columns)
    self.assertTrue("docno" in result.columns)
    self.assertTrue("score" in result.columns)
    self.assertTrue("features" in result.columns)
    self.assertEqual(2, len(result))
    self.assertEqual(result.iloc[0]["features"].size, 2)
    pipe_simple = firstpass >> (pt.BatchRetrieve(indexref, wmodel="DPH")
                                ** pt.BatchRetrieve(indexref, wmodel="PL2"))
    # BUG FIX: this previously ran `pipe` a second time, so the comparison
    # below trivially compared pipe's features with themselves and
    # pipe_simple was never executed
    result2 = pipe_simple.transform(input)
    import numpy as np
    f1 = np.stack(result["features"].values)
    f2 = np.stack(result2["features"].values)
    self.assertTrue(np.array_equal(f1, f2))
def test_num_results(self):
    """num_results should cap the size of the result set."""
    index_ref = pt.autoclass('org.terrier.querying.IndexRef').of(
        self.here + "/fixtures/index/data.properties")
    topics = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
    res = pt.BatchRetrieve(index_ref, num_results=10).transform(topics)
    self.assertEqual(len(res), 10)
def get_topics(self, variant=None, tokenise_query=True):
    """
    Returns the topics, as a dataframe, ready for retrieval.

    Args:
        variant(str): which query field to expose as the "query" column,
            for datasets whose queries carry several text fields. Defaults
            to None, which auto-selects when exactly one field exists.
        tokenise_query(bool): whether to apply Terrier's tokeniser to the
            query text so it plays well with BatchRetrieve. Defaults to True.
    """
    ds = self.irds_ref()
    assert ds.has_queries(), f"{self._irds_id} doesn't support get_topics"
    qcls = ds.queries_cls()
    assert variant is None or variant in qcls._fields[1:], f"{self._irds_id} only supports the following topic variants {qcls._fields[1:]}"
    df = pd.DataFrame(ds.queries_iter())
    df.rename(columns={"query_id": "qid"}, inplace=True) # pyterrier uses "qid"
    if variant is not None:
        df.rename(columns={variant: "query"}, inplace=True) # user specified which version of the query they want
        # BUG FIX: axis must be passed by keyword — positional axis for
        # DataFrame.drop was deprecated in pandas 1.x and removed in 2.0
        df.drop(df.columns.difference(['qid', 'query']), axis=1, inplace=True)
    elif len(qcls._fields) == 2:
        # auto-rename single query field to "query" if there's only query_id and that field
        df.rename(columns={qcls._fields[1]: "query"}, inplace=True)
    else:
        print(f'There are multiple query fields available: {qcls._fields[1:]}. To use with pyterrier, provide variant or modify dataframe to add query column.')
    # apply pyterrier tokenisation (otherwise the queries may not play well with batchretrieve)
    if tokenise_query and 'query' in df:
        import pyterrier as pt
        tokeniser = pt.autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()
        def pt_tokenise(text):
            return ' '.join(tokeniser.getTokens(text))
        df['query'] = df['query'].apply(pt_tokenise)
    return df
def test_two_term_query_correct_qid_docid_score(self):
    """Retrieval output must agree with the stored fixture, whether the qid
    column holds strings or integers."""
    index_ref = pt.autoclass('org.terrier.querying.IndexRef').of(
        self.here + "/fixtures/index/data.properties")
    retr = pt.BatchRetrieve(index_ref)
    fixture_path = os.path.dirname(os.path.realpath(__file__)) + "/fixtures/two_queries_result"

    def _verify(result):
        expected = parse_res_file(fixture_path)
        for idx, row in result.iterrows():
            self.assertEqual(str(row['qid']), expected[idx][0])
            self.assertEqual(row['docno'], expected[idx][1])
            self.assertAlmostEqual(row['score'], expected[idx][2])

    # first with string qids, then with integer qids
    for qids in (["1", "2"], [1, 2]):
        topics = pd.DataFrame(list(zip(qids, ["Stability", "Generator"])),
                              columns=['qid', 'query'])
        _verify(retr.transform(topics))
def __init__(self, *args, fb_terms=10, fb_docs=3, **kwargs):
    """
    Args:
        index_like: the Terrier index to use
        fb_terms(int): number of terms to add to the query
        fb_docs(int): number of feedback documents to consider
    """
    # AxiomaticQE lives in the terrier-prf package, which must be on the
    # JVM classpath before we can instantiate it
    import jnius_config
    prf_found = False
    for j in jnius_config.get_classpath():
        if "terrier-prf" in j:
            prf_found = True
            break
    assert prf_found, 'terrier-prf jar not found: you should start Pyterrier with '\
        + 'pt.init(boot_packages=["org.terrier:terrier-prf:0.0.1-SNAPSHOT"])'
    rm = pt.autoclass("org.terrier.querying.AxiomaticQE")()
    kwargs["qeclass"] = rm
    # BUG FIX: fb_terms/fb_docs were previously assigned to self *before*
    # super().__init__, whose own defaults then overwrote them; pass them
    # through to the superclass instead (mirrors the RM3 wrapper)
    super().__init__(*args, fb_terms=fb_terms, fb_docs=fb_docs, **kwargs)
def get_index(self, variant=None):
    """Return a Terrier IndexRef for this dataset's pre-built index."""
    import pyterrier as pt
    # the 50pct dataset defaults to its "ex1" index variant
    if self.name == "50pct" and variant is None:
        variant = "ex1"
    index_dir = self._get_all_files("index", variant=variant)
    properties_path = os.path.join(index_dir, "data.properties")
    return pt.autoclass("org.terrier.querying.IndexRef").of(properties_path)
def test_fbr_empty(self):
    """An empty query should be skipped with a warning and produce no rows."""
    index_ref = pt.autoclass('org.terrier.querying.IndexRef').of(
        self.here + "/fixtures/index/data.properties")
    retr = pt.FeaturesBatchRetrieve(index_ref, ["WMODEL:PL2"], wmodel="DPH")
    topics = pd.DataFrame([["1", ""]], columns=['qid', 'query'])
    with warnings.catch_warnings(record=True) as w:
        res = retr.transform(topics)
    assert "Skipping empty query" in str(w[-1].message)
    self.assertTrue(len(res) == 0)
def __init__(self, index_like, fb_terms=10, fb_docs=3,
             qeclass="org.terrier.querying.QueryExpansion", verbose=0, **kwargs):
    """Construct a query-expansion transformer over the given index.

    Args:
        index_like: index reference, Index object, or path
        fb_terms(int): number of expansion terms to add to the query
        fb_docs(int): number of feedback documents to consider
        qeclass: Java class name, or a pre-built Java QE instance
        verbose(int): verbosity level
    """
    super().__init__(**kwargs)
    self.verbose = verbose
    # accept either a class name or an already-constructed Java QE object
    self.qe = pt.autoclass(qeclass)() if isinstance(qeclass, str) else qeclass
    self.indexref = parse_index_like(index_like)
    self.fb_terms = fb_terms
    self.fb_docs = fb_docs
    self.manager = pt.autoclass(
        "org.terrier.querying.ManagerFactory")._from_(self.indexref)
def get_text(indexlike, metadata: Union[str, List[str]] = "body",
             by_query: bool = False, verbose: bool = False) -> TransformerBase:
    """
    A utility transformer for obtaining the text of documents (or other
    document metadata) from Terrier's MetaIndex or an IRDSDataset docstore.

    Arguments:
        indexlike: a Terrier index or IRDSDataset to retrieve the metadata from
        metadata(list(str) or str): metadata key(s) to retrieve from the
            index. Defaults to "body".
        by_query(bool): process the dataframe one query at a time rather
            than fetching all document metadata at once. Defaults to False.
        verbose(bool): show a tqdm progress bar. Defaults to False; has no
            effect when by_query=False.

    Example::

        pipe = pt.BatchRetrieve(index, wmodel="DPH") \
            >> pt.text.get_text(index) \
            >> pt.text.scorer(wmodel="DPH")

    """
    import pyterrier as pt
    JIR = pt.autoclass('org.terrier.querying.IndexRef')
    JI = pt.autoclass('org.terrier.structures.Index')

    metakeys = [metadata] if isinstance(metadata, str) else metadata

    # dispatch on the flavour of index we were handed
    if isinstance(indexlike, (str, JIR)):
        resolved = pt.IndexFactory.of(indexlike)
        add_text_fn = _add_text_terrier_metaindex(resolved, metakeys)
    elif isinstance(indexlike, JI):
        add_text_fn = _add_text_terrier_metaindex(indexlike, metakeys)
    elif isinstance(indexlike, IRDSDataset):
        add_text_fn = _add_text_irds_docstore(indexlike, metakeys)
    else:
        raise ValueError(
            "indexlike %s of type %s not supported. Pass a string, an IndexRef, an Index, or an IRDSDataset"
            % (str(indexlike), type(indexlike)))

    if by_query:
        return pt.apply.by_query(add_text_fn, verbose=verbose)
    return pt.apply.generic(add_text_fn)
def test_threading_selfupgrade(self):
    """BatchRetrieve(threads=N) should upgrade a plain indexref to a
    concurrent one internally."""
    if not pt.check_version("5.5"):
        self.skipTest("Requires Terrier 5.5")
    topics = pt.get_dataset("vaswani").get_topics().head(10)
    index_ref = pt.autoclass('org.terrier.querying.IndexRef').of(
        self.here + "/fixtures/index/data.properties")
    retriever = pt.BatchRetrieve(index_ref, threads=5)
    retriever.transform(topics)
def _test_it(self, type):
    """Index two small corpora with DFIndexer and check that
    IndexWithBackground reports collection statistics and term frequencies
    from the background index, not the local one.

    Args:
        type: the indexing type to use for both DFIndexers.
    """
    import pyterrier as pt
    import pandas as pd
    df1 = pd.DataFrame({
        'docno': ['1', '2', '3'],
        'url': ['url1', 'url2', 'url3'],
        'text': [
            'He ran out of money, so he had to stop playing',
            'The wave were crash on the shore; it was a',
            'The body may perhaps compensates for the loss'
        ]
    })
    pd_indexer1 = pt.DFIndexer(tempfile.mkdtemp(), type=type)
    indexref1 = pd_indexer1.index(df1["text"], df1["docno"])
    df2 = pd.DataFrame({'docno': ['14'], 'text': ['test wave']})
    from jnius import JavaException
    try:
        pd_indexer2 = pt.DFIndexer(tempfile.mkdtemp(), type=type)
        indexref2 = pd_indexer2.index(df2["text"], df2["docno"])
        index1 = pt.IndexFactory.of(indexref1)
        self.assertEqual(
            3, index1.getCollectionStatistics().getNumberOfDocuments())
        index2 = pt.IndexFactory.of(indexref2)
        self.assertEqual(
            1, index2.getCollectionStatistics().getNumberOfDocuments())
        # index2 is the "local" index; index1 supplies background statistics
        index_combined = pt.autoclass(
            "org.terrier.python.IndexWithBackground")(index2, index1)
        self.assertEqual(
            3, index_combined.getCollectionStatistics().getNumberOfDocuments(
            ))
        self.assertEqual(
            1, index_combined.getLexicon()["test"].getFrequency())
        # this is 1 as we used the background index for the background
        # WITHOUT adding the statistics of the local index
        self.assertEqual(
            1, index_combined.getLexicon()["wave"].getFrequency())
    except JavaException as ja:
        print(ja.stacktrace)
        raise ja
def test_fbr_ltr(self):
    """Features produced by FeaturesBatchRetrieve should be directly usable
    to fit a scikit-learn classifier."""
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    index_ref = pt.autoclass('org.terrier.querying.IndexRef').of(
        self.here + "/fixtures/index/data.properties")
    retr = pt.FeaturesBatchRetrieve(index_ref, ["WMODEL:PL2"], wmodel="DPH")
    topics = pt.io.read_topics(
        self.here + "/fixtures/vaswani_npl/query-text.trec").head(3)
    qrels = pt.io.read_qrels(self.here + "/fixtures/vaswani_npl/qrels")
    res = retr.transform(topics)
    res = res.merge(qrels, on=['qid', 'docno'], how='left').fillna(0)
    RandomForestClassifier(n_estimators=10).fit(
        np.stack(res["features"]), res["label"])
def __init__(self, *args, fb_terms=10, fb_docs=3, fb_lambda=0.6, **kwargs):
    """
    Args:
        index_like: the Terrier index to use
        fb_terms(int): number of terms to add to the query. Terrier's
            default setting is 10 expansion terms.
        fb_docs(int): number of feedback documents to consider. Terrier's
            default setting is 3 feedback documents.
        fb_lambda(float): presumably the interpolation weight between the
            original and expanded query — confirm against RM3. Defaults
            to 0.6.
    """
    # terrier-prf jar must be on the classpath before instantiating RM3
    _check_terrier_prf()
    rm = pt.autoclass("org.terrier.querying.RM3")()
    # NOTE(review): fb_lambda is stored on self but not handed to the Java
    # RM3 object here — presumably applied elsewhere (e.g. per-transform);
    # confirm in the full source.
    self.fb_lambda = fb_lambda
    kwargs["qeclass"] = rm
    super().__init__(*args, fb_terms=fb_terms, fb_docs=fb_docs, **kwargs)
def transform(self, topics_and_res):
    """Rewrite each distinct query using the dependence model, returning the
    rewritten queries in Terrier matchop-QL form, merged back onto the other
    columns of the input (previous query columns are shifted by push_queries)."""
    results = []
    from .model import query_columns, push_queries
    # reduce to one row per distinct query, dropping rows missing query columns
    queries = topics_and_res[query_columns(
        topics_and_res, qid=True)].dropna(
            axis=0, subset=query_columns(topics_and_res, qid=False)).drop_duplicates()
    # instantiate the DependenceModelPreProcess, specifying a proximity model if specified
    sdm = DependenceModelPreProcess(
    ) if self.prox_model is None else DependenceModelPreProcess(
        self.prox_model)
    for row in tqdm(queries.itertuples(), desc=self.name,
                    total=queries.shape[0],
                    unit="q") if self.verbose else queries.itertuples():
        qid = row.qid
        query = row.query
        # parse the querying into a MQT
        rq = pt.autoclass("org.terrier.querying.Request")()
        rq.setQueryID(qid)
        rq.setOriginalQuery(query)
        TerrierQLParser.process(None, rq)
        TerrierQLToMatchingQueryTerms.process(None, rq)
        if self.remove_stopwords:
            self.ApplyTermPipeline_stopsonly.process(None, rq)
        # rewrite the query
        sdm.expandQuery(rq.getMatchingQueryTerms(), rq)
        new_query = ""
        # put the query back into a matchopql form that Terrier can parse later
        for me in rq.getMatchingQueryTerms():
            term = me.getKey().toString()
            w = me.getValue().getWeight()
            prefix = ""
            # wrap in #combine when a non-default weight or a term model applies
            if w != 1.0 or me.getValue().termModels.size() > 0:
                prefix = "#combine"
                if w != 1:
                    prefix += ":0=" + str(w)
                if me.getValue().termModels.size() == 1:
                    prefix += ":wmodel=" + me.getValue(
                    ).termModels[0].getClass().getName()
                term = prefix + "(" + term + ")"
            new_query += term + " "
        # trim the trailing space
        new_query = new_query[:-1]
        results.append([qid, new_query])
    new_queries = pd.DataFrame(results, columns=["qid", "query"])
    # restore any other columns, e.g. put back docs if we are re-ranking
    return new_queries.merge(push_queries(topics_and_res, inplace=True), on="qid")
def test_wmodel_dunders(self):
    """BM25 weighting-model instances must support the pickle protocol
    via __reduce__/__getstate__."""
    wmodel = pt.autoclass("org.terrier.matching.models.BM25")()
    wmodel.__reduce__()
    wmodel.__getstate__()
    reduced = wmodel.__reduce__()
    # rebuild the object from its reduce tuple and cast back to BM25
    pt.cast("org.terrier.matching.models.BM25", reduced[0](*reduced[1]))
    import pickle
    #import dill as pickle
    # the serialized byte array itself must be picklable
    print(reduced[1][0])
    pickle.dumps(reduced[1][0])
    pickle.dumps(wmodel)
def __init__(self, verbose=0, remove_stopwords=True, prox_model=None, **kwargs):
    """
    Args:
        verbose(int): verbosity level used for the progress bar in transform()
        remove_stopwords(bool): whether to apply Terrier's stopword-removal
            stage to the parsed query before rewriting
        prox_model: optional proximity model passed to
            DependenceModelPreProcess; None uses its default
    """
    super().__init__(**kwargs)
    # BUG FIX: verbose was previously hard-coded to 0, silently ignoring
    # the caller's argument
    self.verbose = verbose
    self.prox_model = prox_model
    self.remove_stopwords = remove_stopwords
    from . import check_version
    # the Stopwords-only ApplyTermPipeline stage requires Terrier 5.3+
    assert check_version("5.3")
    self.ApplyTermPipeline_stopsonly = pt.autoclass(
        "org.terrier.querying.ApplyTermPipeline")("Stopwords")
def __init__(self, index_like, fb_terms=10, fb_docs=3,
             qeclass="org.terrier.querying.QueryExpansion", verbose=0,
             properties={}, **kwargs):
    """Construct a query-expansion transformer over the given index.

    Args:
        index_like: index reference, Index object, or path
        fb_terms(int): number of expansion terms to add to the query
        fb_docs(int): number of feedback documents to consider
        qeclass: Java class name, or a pre-built Java QE instance
        verbose(int): verbosity level
        properties(dict): extra Terrier properties to apply before use
    """
    super().__init__(**kwargs)
    self.verbose = verbose
    # accept either a class name or an already-constructed Java QE object
    self.qe = pt.autoclass(qeclass)() if isinstance(qeclass, str) else qeclass
    self.indexref = _parse_index_like(index_like)
    for key, value in properties.items():
        pt.ApplicationSetup.setProperty(key, str(value))
    self.applytp = pt.autoclass("org.terrier.querying.ApplyTermPipeline")()
    self.fb_terms = fb_terms
    self.fb_docs = fb_docs
    self.manager = pt.autoclass(
        "org.terrier.querying.ManagerFactory")._from_(self.indexref)
def test_fbr_reranking2(self):
    """Check that FeaturesBatchRetrieve re-ranking preserves the first-pass
    scores, and that its feature values match a direct PL2 BatchRetrieve."""
    if not pt.check_version("5.4"):
        self.skipTest("Requires Terrier 5.4")
    # this test examines the use of ScoringMatchingWithFat, using a particular case known to with Terrier 5.3
    JIR = pt.autoclass('org.terrier.querying.IndexRef')
    indexref = JIR.of(self.here + "/fixtures/index/data.properties")
    # we only want a candidate set of 3 documents
    firstpass = pt.BatchRetrieve(indexref, wmodel="BM25") % 3
    pipe1 = firstpass >> pt.FeaturesBatchRetrieve(indexref, features=["WMODEL:PL2"])
    pipe2 = firstpass >> pt.BatchRetrieve(indexref, wmodel="PL2")
    input = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
    result0 = firstpass.transform(input)
    result1 = pipe1.transform(input)
    result2 = pipe2.transform(input)
    # expose the single feature (PL2) as its own column for comparison
    result1["feature0"] = result1.apply(lambda row: row["features"][0], axis=1)
    #BM25 score
    result0_map = {row.docno: row.score for row in result0.itertuples()}
    result1S_map = {row.docno: row.score for row in result1.itertuples()}
    #PL2 score
    result1F_map = {
        row.docno: row.feature0
        for row in result1.itertuples()
    }
    result2_map = {row.docno: row.score for row in result2.itertuples()}
    print(result1F_map)
    print(result2_map)
    # check features scores
    # NB: places can go no less than 4, as two documents have similar PL2 scores
    for rank, row in enumerate(result0.itertuples()):
        docno = row.docno
        # check that score is unchanged
        self.assertAlmostEqual(
            result1S_map[docno], result0_map[docno],
            msg="input score mismatch at rank %d for docno %s" % (rank, docno),
            places=4)
        # check that feature score is correct
        self.assertAlmostEqual(
            result1F_map[docno], result2_map[docno],
            msg="feature score mismatch at rank %d for docno %s" % (rank, docno),
            places=4)
def test_fbr(self):
    """FeaturesBatchRetrieve should emit qid/docno/score/features columns,
    and a plain BatchRetrieve must not inherit the fat matching controls."""
    index_ref = pt.autoclass('org.terrier.querying.IndexRef').of(
        self.here + "/fixtures/index/data.properties")
    retr = pt.FeaturesBatchRetrieve(index_ref, ["WMODEL:PL2"], wmodel="DPH")
    topics = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
    res = retr.transform(topics)
    for col in ("qid", "docno", "score", "features"):
        self.assertTrue(col in res.columns)
    self.assertTrue(len(res) > 0)
    self.assertEqual(res.iloc[0]["features"].size, 1)
    plain = pt.BatchRetrieve(index_ref)
    if "matching" in plain.controls:
        self.assertNotEqual(
            plain.controls["matching"],
            "FatFeaturedScoringMatching,org.terrier.matching.daat.FatFull")
def test_threading_manualref(self):
    """An explicitly-concurrent indexref should work with threaded
    retrieval, including with a Python callback weighting model."""
    if not pt.check_version("5.5"):
        self.skipTest("Requires Terrier 5.5")
    topics = pt.get_dataset("vaswani").get_topics().head(10)
    index_ref = pt.autoclass('org.terrier.querying.IndexRef').of(
        "concurrent:" + self.here + "/fixtures/index/data.properties")
    pt.BatchRetrieve(index_ref, threads=5).transform(topics)

    # a Python-side callback wmodel must also survive multi-threaded use
    def tf_model(keyFreq, posting, entryStats, collStats):
        return posting.getFrequency()

    pt.BatchRetrieve(index_ref, threads=5, wmodel=tf_model).transform(topics)
def transform(self, input):
    """Find near-duplicate documents for each input document.

    Args:
        input: dataframe with a "docid" and/or "docno" column identifying
            the documents to look up.

    Returns:
        dataframe with columns docno_x, docid_x, docno_y, docid_y, score —
        one row per (input document, similar document) pair whose
        normalised score exceeds self.sim_threshold.

    Raises:
        KeyError: if a supplied docno cannot be resolved to a docid.
    """
    docid_provided = "docid" in input.columns
    docno_provided = "docno" in input.columns
    assert docid_provided or docno_provided
    import pyterrier as pt
    from pyterrier import autoclass
    index = pt.IndexFactory.of(self.indexref)
    for k, v in pt.BatchRetrieve.default_properties.items():
        pt.ApplicationSetup.setProperty(k, v)
    ManagerFactory = autoclass("org.terrier.querying.ManagerFactory")
    manager = ManagerFactory._from_(self.indexref)
    rtr = []
    for row in tqdm(input.itertuples(), total=len(input)):
        # resolve whichever of docid/docno was not supplied
        if docid_provided:
            docid = row.docid
            # BUG FIX: when both columns were present, docno was previously
            # never assigned in this branch (NameError); take it from the row
            docno = row.docno if docno_provided \
                else index.getMetaIndex().getItem("docno", docid)
        else:
            docno = row.docno
            docid = index.getMetaIndex().getDocument("docno", docno)
            if docid == -1:
                raise KeyError("Could not convert docno %s to a docid" % docno)
        q = self.get_query(index, docid)
        srq = manager.newSearchRequest(docno, q)
        srq.setControl("wmodel", "Tf")
        manager.runSearchRequest(srq)
        results = srq.getResults()
        for r in results:
            # normalise the raw Tf score by the number of minhash hashes
            score = r.getScore() / self.numHashes
            if score > self.sim_threshold and r.getDocid() != docid:
                rtr.append([
                    docno, docid, r.getMetadata("docno"), r.getDocid(), score
                ])
    # BUG FIX: the last column was previously mis-labelled "docid_x",
    # duplicating the second column's name
    return pd.DataFrame(
        rtr, columns=["docno_x", "docid_x", "docno_y", "docid_y", "score"])
def __init__(self, *args, fb_terms=10, fb_docs=3, **kwargs):
    """
    Args:
        index_like: the Terrier index to use
        fb_terms(int): number of terms to add to the query
        fb_docs(int): number of feedback documents to consider
    """
    # RM3 lives in the terrier-prf package, which must be on the JVM
    # classpath before we can instantiate it
    import jnius_config
    prf_found = False
    for j in jnius_config.get_classpath():
        if "terrier-prf" in j:
            prf_found = True
            break
    assert prf_found, 'terrier-prf jar not found: you should start Pyterrier with '\
        + 'pt.init(boot_packages=["org.terrier:terrier-prf:0.0.1-SNAPSHOT"])'
    rm = pt.autoclass("org.terrier.querying.RM3")()
    kwargs["qeclass"] = rm
    # BUG FIX: fb_terms/fb_docs were previously assigned to self *before*
    # super().__init__, whose own defaults then overwrote them; pass them
    # through to the superclass instead
    super().__init__(*args, fb_terms=fb_terms, fb_docs=fb_docs, **kwargs)
def test_callable_wmodel_dunders(self):
    """A Python-callable weighting model wrapped via _function2wmodel must
    round-trip through dill serialization and pickling, scoring identically
    before and after."""
    testPosting = pt.autoclass(
        "org.terrier.structures.postings.BasicPostingImpl")(0, 1)
    from pyterrier.batchretrieve import _function2wmodel
    lambdafn = lambda keyFreq, posting, entryStats, collStats: posting.getFrequency(
    )
    callback, wmodel = _function2wmodel(lambdafn)
    from pyterrier.bootstrap import javabytebuffer2array
    # extract the serialized bytes of the scoring function from the Java side
    byterep = javabytebuffer2array(wmodel.scoringClass.serializeFn())
    import dill as pickle
    from dill import extend
    #see https://github.com/SeldonIO/alibi/issues/447#issuecomment-881552005
    extend(use_dill=False)
    fn = pickle.loads(byterep)
    # the deserialized function must behave like the original lambda
    self.assertEqual(
        lambdafn(1, testPosting, None, None),
        fn(1, testPosting, None, None),
    )
    wmodel.__getstate__()
    rtr = wmodel.__reduce__()
    #check the byte array is picklable
    pickle.dumps(rtr[1][0])
    #check object is picklable
    pickle.dumps(wmodel)
    #check can be unpickled too
    wmodel2 = pickle.loads(pickle.dumps(wmodel))
    score1 = wmodel.score(testPosting)
    score2 = wmodel2.score(testPosting)
    self.assertEqual(score1, score2)
    #check newly unpickled can still be pickled
    pickle.dumps(wmodel2)
    wmodel3 = pickle.loads(pickle.dumps(wmodel2))
    score3 = wmodel3.score(testPosting)
    self.assertEqual(score1, score3)
def _test2_manual(self, type):
    """Index a single Vaswani document manually, then verify its postings —
    both standalone and when wrapped with IndexWithBackground over the full
    Vaswani index — agree with the full index's statistics and Tf retrieval
    scores.

    Args:
        type: the indexing type to use for the DFIndexer.
    """
    import pyterrier as pt
    #pt.logging("INFO")
    import pandas as pd
    # the body text of docno 1048 from the Vaswani collection
    df1 = pd.DataFrame({
        'docno': ['1048'],
        'body': [
            'h f noise radiators in ground flashes of tropical lightning a ' +
            'detailed analysis of h f noise sources in tropical ground flashes ' +
            'v l f phase characteristics deduced from atmospheric waveforms'
        ]
    })
    pd_indexer1 = pt.DFIndexer(tempfile.mkdtemp(), type=type)
    indexref1 = pd_indexer1.index(df1["body"], df1["docno"])
    index1 = pt.IndexFactory.of(indexref1)
    has_direct1 = index1.hasIndexStructure("direct")
    indexref_big = pt.get_dataset("vaswani").get_index()
    index_big = pt.IndexFactory.of(indexref_big)
    from pyterrier import autoclass
    stopwords = autoclass("org.terrier.terms.Stopwords")(None)
    stemmer = autoclass("org.terrier.terms.PorterStemmer")(None)
    q = "MATHEMATICAL ANALYSIS AND DESIGN DETAILS OF WAVEGUIDE FED MICROWAVE RADIATIONS"
    # docid 1047 in the full index corresponds to docno "1048" indexed above
    self.assertEqual("1048",
                     index_big.getMetaIndex().getItem("docno", 1047))
    contents_big = TestBackground.get_contents(1047, index_big)

    def _check_index(index_small):
        # compare the small index's postings for its docid 0 against the
        # big index's statistics for docid 1047
        if has_direct1:
            contents1 = TestBackground.get_contents(0, index_small)
            self.assertEqual(contents1, contents_big)
        inv1 = index_small.getInvertedIndex()
        print(inv1.getClass().getName())
        lex1 = index_small.getLexicon()
        for t in contents_big:
            pointer = lex1[t]
            print(pointer.toString())
            p = inv1.getPostings(pointer)
            print(p.getClass().getName())
            rtr = p.next()
            self.assertEqual(0, rtr)
            self.assertEqual(
                p.getDocumentLength(),
                index_big.getDocumentIndex().getDocumentLength(1047))
            self.assertEqual(contents_big[t], p.getFrequency())
            self.assertEqual(p.next(), p.EOL)
        from jnius import JavaException
        try:
            br1 = pt.BatchRetrieve(index_small, wmodel="Tf")
            brall = pt.BatchRetrieve(index_big, wmodel="Tf")
            with_doc = pd.DataFrame(
                [["q1", q, "1048", 1047]],
                columns=["qid", "query", "docno", "docid"])
            rtr1 = br1.transform(q)
        except JavaException as ja:
            print(ja.stacktrace)
            raise ja
        rtrall = brall(with_doc)
        # Tf scores from the small index must equal those from the big index
        self.assertTrue(
            np.array_equal(rtr1["score"].values, rtrall["score"].values))

    _check_index(index1)
    _check_index(
        pt.autoclass("org.terrier.python.IndexWithBackground")(index1,
                                                               index_big))
def get_index(self):
    """Return a Terrier IndexRef for this dataset's pre-built index."""
    import pyterrier as pt
    properties_path = os.path.join(self._get_all_files("index"), "data.properties")
    return pt.autoclass("org.terrier.querying.IndexRef").of(properties_path)
import pyterrier as pt
from jnius import cast
import pandas as pd
from .batchretrieve import _parse_index_like
from .transformer import TransformerBase, Symbol
from . import tqdm
from warnings import warn
from typing import List

# Java-side singletons used when parsing/rewriting Terrier queries
TerrierQLParser = pt.autoclass("org.terrier.querying.TerrierQLParser")()
TerrierQLToMatchingQueryTerms = pt.autoclass(
    "org.terrier.querying.TerrierQLToMatchingQueryTerms")()
QueryResultSet = pt.autoclass("org.terrier.matching.QueryResultSet")
DependenceModelPreProcess = pt.autoclass(
    "org.terrier.querying.DependenceModelPreProcess")

# caches whether the terrier-prf jar has been located on the classpath
_terrier_prf_package_loaded = False
_terrier_prf_message = 'terrier-prf jar not found: you should start PyTerrier with '\
    + 'pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])'


def _check_terrier_prf():
    """Scan the JVM classpath for the terrier-prf jar, caching the result.

    NOTE(review): as visible here this only sets the cached flag; callers
    such as RM3.__init__ appear to rely on this call to fail when the jar
    is absent (see _terrier_prf_message) — confirm whether an assert
    follows in the full source.
    """
    import jnius_config
    global _terrier_prf_package_loaded
    if _terrier_prf_package_loaded:
        return
    for j in jnius_config.get_classpath():
        if "terrier-prf" in j:
            _terrier_prf_package_loaded = True
            break
def test_br_pickle_straightwmodel(self):
    """A BatchRetrieve built with a Java wmodel instance should pickle."""
    bm25 = pt.autoclass("org.terrier.matching.models.BM25")()
    self._br(pickle, wmodel=bm25)