def test_cacm(self):
    # We append a random value to the downloaded file names to avoid collisions:
    r = randint(0, 10000000)
    collection_url = 'https://github.com/castorini/anserini/blob/master/src/main/resources/cacm/cacm.tar.gz?raw=true'
    tarball_name = 'cacm{}.tar.gz'.format(r)
    collection_dir = 'collection{}/'.format(r)
    filename, headers = urlretrieve(collection_url, tarball_name)

    tarball = tarfile.open(tarball_name)
    tarball.extractall(collection_dir)
    tarball.close()

    collection = pycollection.Collection('HtmlCollection', collection_dir)
    generator = pygenerator.Generator('JsoupGenerator')

    cnt = 0
    for (i, fs) in enumerate(collection):
        for (j, doc) in enumerate(fs):
            parsed = generator.create_document(doc)
            docid = parsed.get('id')           # FIELD_ID
            raw = parsed.get('raw')            # FIELD_RAW
            contents = parsed.get('contents')  # FIELD_BODY
            self.assertTrue(docid != '')
            self.assertTrue(raw != '')
            self.assertTrue(contents != '')
            cnt += 1

    self.assertEqual(cnt, 3204)

    # Clean up
    os.remove(tarball_name)
    shutil.rmtree(collection_dir)
def __documents_in_collection(config):
    collection = pycollection.Collection(config['pyserini_collection'],
                                         config['collection_directory'])
    generator = pygenerator.Generator(config['pyserini_generator'])

    for fs in collection:
        for document in fs:
            if not __document_is_parsable(document):
                continue
            ret = generator.create_document(document)
            if ret is not None:
                yield ret
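# Hypothetical sketch of the __document_is_parsable helper referenced above; its
# real implementation is not part of the original snippet. The assumption here is
# that a document counts as parsable when its id and contents can be read without
# raising, mirroring how the fields are accessed elsewhere in these snippets.
def __document_is_parsable(document):
    try:
        return document.id is not None and document.contents is not None
    except Exception:
        return False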
def spawn_child_process_to_read_docs(data):
    target_doc_ids = data["doc_ids"]
    path = data["rootdir"]
    ctype = data["ctype"]
    shared_dict = data["shared_dict"]
    local_dict = {}
    start = time.time()

    from pyserini.collection import pycollection
    from pyserini.index import pygenerator

    collection = pycollection.Collection(ctype, path)
    generator = pygenerator.Generator("JsoupGenerator")

    for file_segment in collection:
        doc_ids, doc_contents = read_file_segment(target_doc_ids, file_segment, generator)
        for doc_id, contents in zip(doc_ids, doc_contents):
            local_dict[doc_id] = contents

    shared_dict.update(local_dict)
    print("PID: {0}, Done getting documents from disk: {1} for path: {2}".format(
        os.getpid(), time.time() - start, path))
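# A minimal sketch of how spawn_child_process_to_read_docs might be driven from a
# parent process. The multiprocessing setup, paths, collection type, and worker
# count below are assumptions for illustration, not values taken from the original code.
from multiprocessing import Manager, Pool

def read_docs_in_parallel(doc_ids, paths, ctype='TrecCollection', workers=4):
    manager = Manager()
    shared_dict = manager.dict()  # proxy dict visible to all child processes
    jobs = [{"doc_ids": doc_ids, "rootdir": p, "ctype": ctype,
             "shared_dict": shared_dict} for p in paths]
    with Pool(workers) as pool:
        pool.map(spawn_child_process_to_read_docs, jobs)
    return dict(shared_dict)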
def __iter__(self):
    from pyserini.collection import pycollection
    from pyserini.index import pygenerator

    path, ctype, gtype = self.get_path_and_types()
    # TODO: change on pyserini upgrade
    if gtype == "WashingtonPostGenerator":
        gtype = "WapoGenerator"

    collection = pycollection.Collection(ctype, path)
    generator = pygenerator.Generator(gtype)

    for fs in collection:
        for doc in fs:
            parsed = None
            try:
                parsed = generator.create_document(doc)
            except Exception:
                # Skip documents that fail to parse
                pass
            if parsed:
                yield (parsed.get("id"), parsed.get("title"), parsed.get("contents"))
from pyserini.collection import pycollection

# Use a raw string for the Windows path so the backslashes are not treated as escapes.
collection = pycollection.Collection(
    'Cord19AbstractCollection',
    r'C:\Users\noekn\Documents\Information_Retrieval')

cnt = 0
full_text = {True: 0, False: 0}

articles = collection.next()
for (i, d) in enumerate(articles):
    article = pycollection.Cord19Article(d.raw)
    cnt = cnt + 1
    full_text[article.is_full_text()] += 1
    if cnt % 1000 == 0:
        print(f'{cnt} articles read...')
def IterCollection(input_path, collection_class, generator_class, output_path,
                   threads=1, tokenize=None, tokenmin=0, raw=False):
    """
    Parameters
    ----------
    input_path : str
        Path to input collection.
    collection_class : str
        Anserini collection class to use.
    generator_class : str
        Anserini generator class to use.
    output_path : str
        Path to output resulting json collection.
    threads : int
        Maximum number of threads.
    tokenize : str
        Name of tokenizer method to use for document-to-passage splitting.
    tokenmin : int
        Minimum limit argument (e.g. minword) passed to the tokenizer.
    raw : bool
        True to use raw (FIELD_RAW) document contents, False to use parsed
        (FIELD_BODY) document contents.
    """
    start = time.time()
    logger.info("Begin reading collection.")

    # Check and create tokenizer
    tokenizer = None
    if tokenize is not None:
        try:
            tokenizer = DocumentTokenizer(tokenize)
        except Exception:
            raise ValueError(tokenize)

    collection = pycollection.Collection(collection_class, input_path)
    generator = pygenerator.Generator(generator_class)

    if not os.path.exists(output_path):
        logger.info("making directory...")
        os.mkdir(output_path)

    with ThreadPoolExecutor(max_workers=threads) as executor:
        for (seg_num, fs) in enumerate(collection):
            executor.submit(IterSegment, fs, generator, output_path, tokenizer, tokenmin, raw)

    end = time.time()
    elapsed = end - start

    print("all threads complete")
    logger.info("# Final Counter Values")
    logger.info("indexable:   {:12d}".format(collection.counters.indexable.value))
    logger.info("unindexable: {:12d}".format(collection.counters.unindexable.value))
    logger.info("skipped:     {:12d}".format(collection.counters.skipped.value))
    logger.info("errors:      {:12d}".format(collection.counters.errors.value))
    logger.info("Total duration: %s", str(datetime.timedelta(seconds=elapsed)))
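# A minimal sketch of how IterCollection above might be invoked; the paths, class
# names, and thread count are assumptions for illustration, not values taken from
# the original code.
if __name__ == '__main__':
    IterCollection(input_path='collections/cacm/',
                   collection_class='HtmlCollection',
                   generator_class='JsoupGenerator',
                   output_path='collections/cacm-json/',
                   threads=4)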