Beispiel #1
0
 def download(self):
     """Resolve ``self.embedding_file`` to a usable local embedding file.

     Resolution order:

     1. If ``is_file_correct`` accepts ``self.embedding_file`` as-is, it is
        returned unchanged.
     2. Otherwise the JSON cache index under ``self.data_download_cache``
        is consulted; a hit (unless ``self.cache_ignore`` is set) is
        resolved through ``self._get_embedding_file``.
     3. Otherwise ``self.embedding_file`` is treated as a URL, downloaded,
        passed through ``extractor``, recorded in the cache index and
        resolved through ``self._get_embedding_file``.

     :return: Whatever ``self.embedding_file`` / ``_get_embedding_file``
         yields for the resolved location
     :raises RuntimeError: If the value is not a valid URL, or if
         ``self.sha1`` is set and differs from the last path component of
         the extracted location
     """
     if is_file_correct(self.embedding_file):
         logger.info("embedding file location: {}".format(
             self.embedding_file))
         return self.embedding_file

     index_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
     cache_index = read_json(index_path)

     if self.embedding_file in cache_index and not self.cache_ignore:
         cached_loc = cache_index[self.embedding_file]
         logger.info("files for {} found in cache".format(
             self.embedding_file))
         return self._get_embedding_file(cached_loc, self.embedding_key)

     # Cache miss (or cache ignored): try to download the bundle and unzip
     url = self.embedding_file
     if not validate_url(url):
         raise RuntimeError("can not download from the given url")

     temp_file = web_downloader(url)
     # Only look up an extraction function when unzipping was requested
     if self.unzip_file:
         unzip_fn = Downloader.ZIPD.get(mime_type(temp_file))
     else:
         unzip_fn = None
     extracted_loc = extractor(filepath=temp_file,
                               cache_dir=self.data_download_cache,
                               extractor_func=unzip_fn)

     # The check compares against the final path component of the
     # extracted location (presumably named after the content hash by
     # `extractor` -- confirm against its implementation).
     expected_sha1 = self.sha1
     if expected_sha1 is not None and \
             os.path.basename(extracted_loc) != expected_sha1:
         raise RuntimeError(
             "The sha1 of the downloaded file does not match with the provided one"
         )

     cache_index[url] = extracted_loc
     write_json(cache_index, index_path)
     return self._get_embedding_file(extracted_loc, self.embedding_key)
Beispiel #2
0
 def download(self):
     """Return a local path for ``self.dataset_file``, downloading if needed.

     A value accepted by ``is_file_correct`` is returned unchanged. A valid
     URL is looked up in the JSON cache index under
     ``self.data_download_cache``; on a miss (or when ``self.cache_ignore``
     is set) it is downloaded, passed through ``extractor`` and recorded in
     the index.

     :return: The original value, the cached location, or the location
         returned by ``extractor``
     :raises RuntimeError: If the value is neither accepted by
         ``is_file_correct`` nor a valid URL
     """
     source = self.dataset_file
     if is_file_correct(source):
         return source

     if not validate_url(source):
         raise RuntimeError(
             "the file [{}] is not in cache and can not be downloaded".format(
                 source))

     # It is a web URL: check whether it already exists in the cache
     index_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
     cache_index = read_json(index_path)

     cache_hit = (
         source in cache_index
         and is_file_correct(cache_index[source], self.data_download_cache,
                             source)
         and not self.cache_ignore
     )
     if cache_hit:
         logger.info(
             "file for {} found in cache, not downloading".format(source))
         return cache_index[source]

     # Download the file into the cache and update the JSON index
     cache_root = self.data_download_cache
     logger.info("using {} as data/embeddings cache".format(cache_root))
     temp_file = web_downloader(source)
     unpack_fn = Downloader.ZIPD.get(mime_type(temp_file), None)
     local_path = extractor(filepath=temp_file,
                            cache_dir=cache_root,
                            extractor_func=unpack_fn)
     cache_index[source] = local_path
     write_json(cache_index, index_path)
     return local_path
Beispiel #3
0
    def download(self):
        """Resolve the files described by ``self.dataset_desc``.

        When the description has a ``"download"`` entry, that bundle is
        taken from the cache or downloaded and extracted, and the
        description is rewritten relative to the resulting directory via
        ``_update_md``. Without a bundle, each truthy ``*_file`` entry is
        resolved individually through ``SingleFileDownloader`` (skipped
        when ``self.enc_dec`` is set).

        :return: The updated description produced by ``_update_md``
        :raises RuntimeError: If the bundle is not a valid URL, or if the
            description carries a ``"sha1"`` that differs from the last
            path component of the extracted directory
        """
        bundle = self.dataset_desc.get("download", None)

        if bundle is None:
            # We have download links to every file, or they already exist
            resolved = _update_md(self.dataset_desc, None)
            if not self.enc_dec:
                per_file = {
                    key: SingleFileDownloader(
                        self.dataset_desc[key],
                        self.data_download_cache).download()
                    for key in self.dataset_desc
                    if key.endswith("_file") and self.dataset_desc[key]
                }
                resolved.update(per_file)
            return resolved

        # Bundle mode: a zip/tar/tar.gz directory holding the dataset files
        index_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
        cache_index = read_json(index_path)

        cache_hit = (
            bundle in cache_index
            and is_dir_correct(cache_index[bundle], self.dataset_desc,
                               self.data_download_cache, bundle,
                               self.enc_dec)
            and not self.cache_ignore
        )
        if cache_hit:
            bundle_dir = cache_index[bundle]
            logger.info(
                "files for {} found in cache, not downloading".format(
                    bundle))
            return _update_md(self.dataset_desc, bundle_dir)

        # Not cached: try to download the bundle and unzip it
        if not validate_url(bundle):
            raise RuntimeError("can not download from the given url")

        temp_file = web_downloader(bundle)
        unpack_fn = Downloader.ZIPD.get(mime_type(temp_file), None)
        bundle_dir = extractor(filepath=temp_file,
                               cache_dir=self.data_download_cache,
                               extractor_func=unpack_fn)

        # The declared sha1 is compared with the extracted directory's name
        if "sha1" in self.dataset_desc and \
                os.path.basename(bundle_dir) != self.dataset_desc["sha1"]:
            raise RuntimeError(
                "The sha1 of the downloaded file does not match with the provided one"
            )

        cache_index[bundle] = bundle_dir
        write_json(cache_index, index_path)
        return _update_md(self.dataset_desc, bundle_dir)
Beispiel #4
0
def get_file_or_url(file, cache=None):
    """
    Resolve `file` to a local filename, downloading it when it is a URL.

    :param file: A file name or a URL
    :param cache: A cache location; if falsy, the URL is fetched with
        `urlretrieve` instead of going through `SingleFileDownloader`
    :return: `file` itself when it is not a URL, else the downloaded filename
    """
    if not validate_url(file):
        return file

    logger.info(f'Downloading {file}')
    if cache:
        return SingleFileDownloader(file, cache).download()

    # No cache given: urlretrieve returns (local_filename, headers)
    local_name, _ = urlretrieve(file)
    return local_name
Beispiel #5
0
 def download(self):
     """Return a local path for the addon in ``self.dataset_file``.

     A value accepted by ``is_file_correct`` is returned unchanged. A valid
     URL is looked up in the JSON cache index under
     ``self.data_download_cache``; on a miss (or when ``self.cache_ignore``
     is set) it is downloaded into the ``AddonDownloader.ADDON_SUBPATH``
     sub-directory of the cache, keeping the URL's base name, and recorded
     in the index.

     :return: The original value, the cached location, or the path the
         addon was saved to
     :raises RuntimeError: If the value is neither accepted by
         ``is_file_correct`` nor a valid URL
     """
     file_loc = self.dataset_file
     if is_file_correct(file_loc):
         return file_loc

     if not validate_url(file_loc):
         raise RuntimeError(
             "the file [{}] is not in cache and can not be downloaded".format(
                 file_loc))

     # It is a web URL: check whether it already exists in the cache
     url = file_loc
     dcache_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
     dcache = read_json(dcache_path)
     if url in dcache and is_file_correct(
             dcache[url], self.data_download_cache,
             url) and not self.cache_ignore:
         logger.info(
             "file for {} found in cache, not downloading".format(url))
         return dcache[url]

     # Otherwise place it in the addons sub-directory of the cache
     cache_dir = self.data_download_cache
     addon_path = os.path.join(cache_dir, AddonDownloader.ADDON_SUBPATH)
     # exist_ok avoids the check-then-create race when several processes
     # download addons at the same time.
     os.makedirs(addon_path, exist_ok=True)
     path_to_save = os.path.join(addon_path, os.path.basename(file_loc))
     logger.info("using {} as data/addons cache".format(cache_dir))
     web_downloader(url, path_to_save)

     # Record the download so later calls can reuse it
     dcache.update({url: path_to_save})
     write_json(dcache, dcache_path)
     return path_to_save