def get_documents_from_disk(self, doc_ids):
    """
    Does not make use of the index. We use pyserini's disk traversal methods to retrieve documents.
    This allows us to get away with much smaller index sizes on disk, since the index no longer has
    to store the documents themselves.
    """
    start = time.time()
    logger.info("Starting to get documents from disk")
    document_type = self.collection.config["documents"]["type"]
    if document_type == "trec":
        ctype = "TrecCollection"
    elif document_type == "trecweb":
        ctype = "TrecwebCollection"
    else:
        # For clueweb12, the document type in the yaml is the same as Anserini's - ClueWeb12Collection
        ctype = document_type

    rootdir = self.collection.config["documents"]["path"]
    p = subprocess.run(
        ["python", get_crawl_collection_script(), rootdir, ctype],
        stdout=subprocess.PIPE,
        input=",".join(doc_ids),
        check=True,
        encoding="utf-8",
    )

    with open(
        "{0}/disk_crawl_temp_dump.json".format(os.getenv("CAPREOLUS_CACHE", get_default_cache_dir())), "rt"
    ) as fp:
        fetched_docs = json.load(fp)

    return [fetched_docs.get(doc_id, []) for doc_id in doc_ids]
def crawl():
    """
    Iterates through every document in a collection and looks for the doc ids passed on stdin as a
    comma-separated list (the collection root and the Anserini collection class are the command line
    arguments). Spawns multiple processes to do this for us. A ClueWeb12 crawl completes in
    approximately 42 hours with 8 processes.
    See `get_documents_from_disk()` in anserini.py to see how this file is used.
    """
    rootdir = sys.argv[1]
    ctype = sys.argv[2]
    doc_ids = set(input().split(","))
    manager = Manager()
    shared_dict = manager.dict()
    multiprocess_start = time.time()
    logger.debug("Start multiprocess")

    args_list = []
    for subdir in os.listdir(rootdir):
        if os.path.isdir(rootdir + "/" + subdir):
            args_list.append(
                {"doc_ids": doc_ids, "rootdir": rootdir + "/" + subdir, "ctype": ctype, "shared_dict": shared_dict}
            )

    pool = Pool(processes=8)
    pool.map(spawn_child_process_to_read_docs, args_list)
    logger.debug("Getting all documents from disk took: {0}".format(time.time() - multiprocess_start))

    # TODO: This will fail if multiple crawls are running at the same time
    with open("{0}/disk_crawl_temp_dump.json".format(os.getenv("CAPREOLUS_CACHE", get_default_cache_dir())), "w") as fp:
        json.dump(shared_dict.copy(), fp)
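# Hedged usage sketch (not part of the library): how the crawl script above is driven.
# get_documents_from_disk() launches it as a subprocess, passing the collection root and the
# Anserini collection class as argv and the comma-separated doc ids on stdin, then reads the
# result back from disk_crawl_temp_dump.json in the cache directory. The script path, collection
# root, doc ids, and cache fallback below are hypothetical placeholders; the library resolves the
# real values via get_crawl_collection_script() and get_default_cache_dir().
import json
import os
import subprocess

crawl_script = "capreolus/collection/crawl_collection.py"  # hypothetical script location
rootdir = "/data/collections/robust04"                     # hypothetical collection root
ctype = "TrecCollection"
doc_ids = ["FBIS3-10082", "LA010189-0001"]                 # hypothetical doc ids

subprocess.run(
    ["python", crawl_script, rootdir, ctype],
    input=",".join(doc_ids),
    check=True,
    encoding="utf-8",
)

cache_dir = os.getenv("CAPREOLUS_CACHE", "/tmp/capreolus_cache")  # hypothetical fallback
with open("{0}/disk_crawl_temp_dump.json".format(cache_dir)) as fp:
    fetched_docs = json.load(fp)
print({doc_id: len(fetched_docs.get(doc_id, [])) for doc_id in doc_ids})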
def get_available_indices():
    cache_path = os.environ.get("CAPREOLUS_CACHE", get_default_cache_dir())
    index_dirs = search_files_or_folders_in_directory(cache_path, "index")
    # Only keep index directories that contain a "done" marker, i.e. indexing finished successfully
    index_dirs_with_done = [
        index_dir for index_dir in index_dirs if len(search_files_or_folders_in_directory(index_dir, "done"))
    ]

    return index_dirs_with_done
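# Hedged illustration (not part of the library): get_available_indices() above only returns index
# directories that contain a "done" marker. The cache layout below is a made-up example following
# the cache_path/collection/index_key/index nesting used by get_paths(); directory names are
# hypothetical.
import os
import tempfile

cache = tempfile.mkdtemp()
finished = os.path.join(cache, "robust04", "anserini_porter", "index")
unfinished = os.path.join(cache, "gov2", "anserini_porter", "index")
os.makedirs(finished)
os.makedirs(unfinished)
open(os.path.join(finished, "done"), "w").close()  # marker written once indexing completes

os.environ["CAPREOLUS_CACHE"] = cache
# get_available_indices() would now return only the "finished" directory, not "unfinished"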
def get_paths(self, config):
    """
    Returns a dictionary of various paths
    :param config: A sacred config
    :return: A dict. Eg:
        {
            "collection_path": "path",
            "base_path": "path",
            "cache_path": "path",
            "index_key": "path",
            "index_path": "path",
            "run_path": "path",
            "model_path": "path",
            "trained_weight_path": "path"
        }
    """
    expid = config["expid"]
    collection_path = self.module2cls["collection"].basepath
    base_path = os.environ.get("CAPREOLUS_RESULTS", get_default_results_dir())
    cache_path = os.environ.get("CAPREOLUS_CACHE", get_default_cache_dir())
    index_key = os.path.join(cache_path, config["collection"], self.module_key("index"))
    index_path = os.path.join(index_key, "index")
    run_path = os.path.join(index_key, "searcher", self.module_key("searcher"))
    model_path = os.path.join(
        base_path,
        expid,
        config["collection"],
        self.module_key("index"),
        self.module_key("searcher"),
        self.module_key("benchmark"),
        self.module_key("pipeline"),
        self.module_key("reranker") + "_" + self.module_key("extractor"),
    )
    trained_weight_path = os.path.join(model_path, config["fold"], "weights", "dev")

    return {
        "collection_path": collection_path,
        "base_path": base_path,
        "cache_path": cache_path,
        "index_path": index_path,
        "index_key": index_key,
        "run_path": run_path,
        "model_path": model_path,
        "trained_weight_path": trained_weight_path,
    }
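# Hedged illustration (not from the library): how the paths produced by get_paths() above nest for
# a hypothetical experiment. The expid, module_key() values, and root directories are made-up
# placeholders standing in for the values derived from the sacred config.
import os

cache_path = "/cache"    # CAPREOLUS_CACHE
base_path = "/results"   # CAPREOLUS_RESULTS
index_key = os.path.join(cache_path, "robust04", "anserini_porter")        # cache_path/collection/index key
index_path = os.path.join(index_key, "index")                              # the Anserini index itself
run_path = os.path.join(index_key, "searcher", "bm25_k1-0.9_b-0.4")        # searcher runs live under the index key
model_path = os.path.join(
    base_path, "myexp", "robust04", "anserini_porter", "bm25_k1-0.9_b-0.4",
    "robust04.title", "maxdoclen-800", "KNRM_embedtext",
)
trained_weight_path = os.path.join(model_path, "s1", "weights", "dev")     # per-fold weights chosen on dev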
def __init__(self, embedding_name):
    """
    Download the requested Magnitude embedding (expensive; cached under CAPREOLUS_CACHE) and
    initialize the token-to-id and id-to-token maps used to build the vocabulary.
    """
    self.embedding_name = embedding_name
    self.embedding = Magnitude(
        MagnitudeUtils.download_model(
            self.SUPPORTED_EMBEDDINGS[embedding_name],
            download_dir=os.environ.get("CAPREOLUS_CACHE", get_default_cache_dir()),
        ),
        lazy_loading=-1,
        blocking=True,
    )
    self.stoi = {self.PAD: 0}  # string to integer. Associates an integer value with every token
    self.itos = {0: self.PAD}  # integer to string. The reverse mapping of stoi
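# Hedged sketch (assumptions flagged): how stoi/itos maps like the ones built in __init__ above can
# be extended and turned into an embedding matrix with pymagnitude. The model key, pad token, and
# example tokens are assumptions; the real keys come from SUPPORTED_EMBEDDINGS.
import numpy as np
from pymagnitude import Magnitude, MagnitudeUtils

embedding = Magnitude(
    MagnitudeUtils.download_model("glove/light/glove.6B.300d"),  # assumed model key
    lazy_loading=-1,
    blocking=True,
)

stoi = {"<pad>": 0}  # "<pad>" stands in for self.PAD
itos = {0: "<pad>"}
for tok in ["neural", "retrieval"]:  # in the extractor, tokens come from queries and documents
    if tok not in stoi:
        stoi[tok] = len(stoi)
        itos[stoi[tok]] = tok

# One row per known token; row 0 (the pad token) stays all-zero, and query() falls back to
# Magnitude's out-of-vocabulary vector for unseen words.
matrix = np.zeros((len(stoi), embedding.dim), dtype=np.float32)
for tok, idx in stoi.items():
    if tok != "<pad>":
        matrix[idx] = embedding.query(tok)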
def create(self):
    self.tokenizer = BertTokenizer.from_pretrained(
        self.tokmodel, cache_dir=os.environ.get("CAPREOLUS_CACHE", get_default_cache_dir())
    )
    self.vocab = self.tokenizer.vocab
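# Hedged usage sketch: once create() above has run, the tokenizer converts text into WordPiece ids.
# The import, the "bert-base-uncased" model name (standing in for self.tokmodel), the cache
# fallback, and the example sentence are all assumptions.
import os
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased", cache_dir=os.environ.get("CAPREOLUS_CACHE", "/tmp/capreolus_cache")
)
tokens = tokenizer.tokenize("neural ranking models")  # WordPiece tokens
token_ids = tokenizer.convert_tokens_to_ids(tokens)   # integer ids drawn from tokenizer.vocab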