def get_documents_from_disk(self, doc_ids):
    """
    Does not make use of the index. We use pyserini's disk traversal methods to retrieve documents.
    This allows us to get away with much smaller index sizes on disk, since the index no longer has
    to store the documents themselves.
    """
    start = time.time()
    logger.info("Starting to get documents from disk")
    document_type = self.collection.config["documents"]["type"]
    if document_type == "trec":
        ctype = "TrecCollection"
    elif document_type == "trecweb":
        ctype = "TrecwebCollection"
    else:
        # For clueweb12, the document type in the yaml is the same as Anserini's - ClueWeb12Collection
        ctype = document_type

    rootdir = self.collection.config["documents"]["path"]
    p = subprocess.run(
        ["python", get_crawl_collection_script(), rootdir, ctype],
        stdout=subprocess.PIPE,
        input=",".join(doc_ids),
        check=True,
        encoding="utf-8",
    )

    with open(
        "{0}/disk_crawl_temp_dump.json".format(os.getenv("CAPREOLUS_CACHE", get_default_cache_dir())), "rt"
    ) as fp:
        fetched_docs = json.load(fp)

    return [fetched_docs.get(doc_id, []) for doc_id in doc_ids]
def crawl():
    """
    Iterates through every document in a collection and looks for the doc ids passed on stdin as a
    comma-separated list (the collection root and the Anserini collection class are the command line
    arguments). Spawns multiple processes to do this for us. A ClueWeb12 crawl completes in
    approximately 42 hours with 8 processes.
    See `get_documents_from_disk()` in anserini.py to see how this file is used.
    """
    rootdir = sys.argv[1]
    ctype = sys.argv[2]
    doc_ids = set(input().split(","))
    manager = Manager()
    shared_dict = manager.dict()
    multiprocess_start = time.time()
    logger.debug("Start multiprocess")

    args_list = []
    for subdir in os.listdir(rootdir):
        if os.path.isdir(rootdir + "/" + subdir):
            args_list.append(
                {"doc_ids": doc_ids, "rootdir": rootdir + "/" + subdir, "ctype": ctype, "shared_dict": shared_dict}
            )

    pool = Pool(processes=8)
    pool.map(spawn_child_process_to_read_docs, args_list)
    logger.debug("Getting all documents from disk took: {0}".format(time.time() - multiprocess_start))

    # TODO: This will fail if multiple crawls are running at the same time
    with open("{0}/disk_crawl_temp_dump.json".format(os.getenv("CAPREOLUS_CACHE", get_default_cache_dir())), "w") as fp:
        json.dump(shared_dict.copy(), fp)
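# Hedged usage sketch (not part of the library): how the crawl script above is driven.
# get_documents_from_disk() launches it as a subprocess, passing the collection root and the
# Anserini collection class as argv and the comma-separated doc ids on stdin, then reads the
# result back from disk_crawl_temp_dump.json in the cache directory. The script path, collection
# root, doc ids, and cache fallback below are hypothetical placeholders; the library resolves the
# real values via get_crawl_collection_script() and get_default_cache_dir().
import json
import os
import subprocess

crawl_script = "capreolus/collection/crawl_collection.py"  # hypothetical script location
rootdir = "/data/collections/robust04"                     # hypothetical collection root
ctype = "TrecCollection"
doc_ids = ["FBIS3-10082", "LA010189-0001"]                 # hypothetical doc ids

subprocess.run(
    ["python", crawl_script, rootdir, ctype],
    input=",".join(doc_ids),
    check=True,
    encoding="utf-8",
)

cache_dir = os.getenv("CAPREOLUS_CACHE", "/tmp/capreolus_cache")  # hypothetical fallback
with open("{0}/disk_crawl_temp_dump.json".format(cache_dir)) as fp:
    fetched_docs = json.load(fp)
print({doc_id: len(fetched_docs.get(doc_id, [])) for doc_id in doc_ids})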
def get_available_indices():
    cache_path = os.environ.get("CAPREOLUS_CACHE", get_default_cache_dir())
    index_dirs = search_files_or_folders_in_directory(cache_path, "index")
    # Only keep index directories that contain a "done" marker, i.e. indexing finished successfully
    index_dirs_with_done = [
        index_dir for index_dir in index_dirs if len(search_files_or_folders_in_directory(index_dir, "done"))
    ]

    return index_dirs_with_done
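# Hedged illustration (not part of the library): get_available_indices() above only returns index
# directories that contain a "done" marker. The cache layout below is a made-up example following
# the cache_path/collection/index_key/index nesting used by get_paths(); directory names are
# hypothetical.
import os
import tempfile

cache = tempfile.mkdtemp()
finished = os.path.join(cache, "robust04", "anserini_porter", "index")
unfinished = os.path.join(cache, "gov2", "anserini_porter", "index")
os.makedirs(finished)
os.makedirs(unfinished)
open(os.path.join(finished, "done"), "w").close()  # marker written once indexing completes

os.environ["CAPREOLUS_CACHE"] = cache
# get_available_indices() would now return only the "finished" directory, not "unfinished"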
def get_paths(self, config):
    """
    Returns a dictionary of various paths
    :param config: A sacred config
    :return: A dict. Eg:
        {
            "collection_path": "path",
            "base_path": "path",
            "cache_path": "path",
            "index_key": "path",
            "index_path": "path",
            "run_path": "path",
            "model_path": "path",
            "trained_weight_path": "path"
        }
    """
    expid = config["expid"]
    collection_path = self.module2cls["collection"].basepath
    base_path = os.environ.get("CAPREOLUS_RESULTS", get_default_results_dir())
    cache_path = os.environ.get("CAPREOLUS_CACHE", get_default_cache_dir())
    index_key = os.path.join(cache_path, config["collection"], self.module_key("index"))
    index_path = os.path.join(index_key, "index")
    run_path = os.path.join(index_key, "searcher", self.module_key("searcher"))
    model_path = os.path.join(
        base_path,
        expid,
        config["collection"],
        self.module_key("index"),
        self.module_key("searcher"),
        self.module_key("benchmark"),
        self.module_key("pipeline"),
        self.module_key("reranker") + "_" + self.module_key("extractor"),
    )
    trained_weight_path = os.path.join(model_path, config["fold"], "weights", "dev")

    return {
        "collection_path": collection_path,
        "base_path": base_path,
        "cache_path": cache_path,
        "index_path": index_path,
        "index_key": index_key,
        "run_path": run_path,
        "model_path": model_path,
        "trained_weight_path": trained_weight_path,
    }
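# Hedged illustration (not from the library): how the paths produced by get_paths() above nest for
# a hypothetical experiment. The expid, module_key() values, and root directories are made-up
# placeholders standing in for the values derived from the sacred config.
import os

cache_path = "/cache"    # CAPREOLUS_CACHE
base_path = "/results"   # CAPREOLUS_RESULTS
index_key = os.path.join(cache_path, "robust04", "anserini_porter")        # cache_path/collection/index key
index_path = os.path.join(index_key, "index")                              # the Anserini index itself
run_path = os.path.join(index_key, "searcher", "bm25_k1-0.9_b-0.4")        # searcher runs live under the index key
model_path = os.path.join(
    base_path, "myexp", "robust04", "anserini_porter", "bm25_k1-0.9_b-0.4",
    "robust04.title", "maxdoclen-800", "KNRM_embedtext",
)
trained_weight_path = os.path.join(model_path, "s1", "weights", "dev")     # per-fold weights chosen on dev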
def __init__(self, embedding_name):
    """
    Download the requested Magnitude embedding (expensive; cached under CAPREOLUS_CACHE) and
    initialize the token-to-id and id-to-token maps used to build the vocabulary.
    """
    self.embedding_name = embedding_name
    self.embedding = Magnitude(
        MagnitudeUtils.download_model(
            self.SUPPORTED_EMBEDDINGS[embedding_name],
            download_dir=os.environ.get("CAPREOLUS_CACHE", get_default_cache_dir()),
        ),
        lazy_loading=-1,
        blocking=True,
    )
    self.stoi = {self.PAD: 0}  # string to integer. Associates an integer value with every token
    self.itos = {0: self.PAD}  # integer to string. The reverse mapping of stoi
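# Hedged sketch (assumptions flagged): how stoi/itos maps like the ones built in __init__ above can
# be extended and turned into an embedding matrix with pymagnitude. The model key, pad token, and
# example tokens are assumptions; the real keys come from SUPPORTED_EMBEDDINGS.
import numpy as np
from pymagnitude import Magnitude, MagnitudeUtils

embedding = Magnitude(
    MagnitudeUtils.download_model("glove/light/glove.6B.300d"),  # assumed model key
    lazy_loading=-1,
    blocking=True,
)

stoi = {"<pad>": 0}  # "<pad>" stands in for self.PAD
itos = {0: "<pad>"}
for tok in ["neural", "retrieval"]:  # in the extractor, tokens come from queries and documents
    if tok not in stoi:
        stoi[tok] = len(stoi)
        itos[stoi[tok]] = tok

# One row per known token; row 0 (the pad token) stays all-zero, and query() falls back to
# Magnitude's out-of-vocabulary vector for unseen words.
matrix = np.zeros((len(stoi), embedding.dim), dtype=np.float32)
for tok, idx in stoi.items():
    if tok != "<pad>":
        matrix[idx] = embedding.query(tok)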
def create(self):
    self.tokenizer = BertTokenizer.from_pretrained(
        self.tokmodel, cache_dir=os.environ.get("CAPREOLUS_CACHE", get_default_cache_dir())
    )
    self.vocab = self.tokenizer.vocab
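# Hedged usage sketch: once create() above has run, the tokenizer converts text into WordPiece ids.
# The import, the "bert-base-uncased" model name (standing in for self.tokmodel), the cache
# fallback, and the example sentence are all assumptions.
import os
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased", cache_dir=os.environ.get("CAPREOLUS_CACHE", "/tmp/capreolus_cache")
)
tokens = tokenizer.tokenize("neural ranking models")  # WordPiece tokens
token_ids = tokenizer.convert_tokens_to_ids(tokens)   # integer ids drawn from tokenizer.vocab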