Example 1
 def download(self):
     file_loc = self.dataset_file
     if is_file_correct(file_loc):
         return file_loc
     elif validate_url(file_loc):  # is it a web URL? check if exists in cache
         url = file_loc
         dcache_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
         dcache = read_json(dcache_path)
         if url in dcache and is_file_correct(dcache[url], self.data_download_cache, url) and not self.cache_ignore:
             print("file for {} found in cache, not downloading".format(url))
             return dcache[url]
         else:  # download the file in the cache, update the json
             cache_dir = self.data_download_cache
             print("using {} as data/embeddings cache".format(cache_dir))
             temp_file = web_downloader(url)
             dload_file = extractor(filepath=temp_file, cache_dir=cache_dir,
                                    extractor_func=Downloader.ZIPD.get(mime_type(temp_file), None))
             dcache.update({url: dload_file})
             write_json(dcache, os.path.join(self.data_download_cache, DATA_CACHE_CONF))
             return dload_file
     raise RuntimeError("the file [{}] is not in cache and can not be downloaded".format(file_loc))
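The snippet above leans on a pair of json helpers for the download cache. A minimal sketch of what they might look like follows (hypothetical signatures; the project ships its own read_json/write_json):

import json
import os

def read_json(filepath):
    # Return the parsed cache dict, or an empty cache when the file is absent.
    if not os.path.exists(filepath):
        return {}
    with open(filepath) as f:
        return json.load(f)

def write_json(content, filepath):
    # Persist the cache dict as json.
    with open(filepath, "w") as f:
        json.dump(content, f, indent=2)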
Example 2
    def download(self):
        dload_bundle = self.dataset_desc.get("download", None)
        if dload_bundle is not None:  # download a zip/tar/tar.gz bundle and look for the train, dev, test files inside it
            dcache_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
            dcache = read_json(dcache_path)
            if dload_bundle in dcache and \
                    is_dir_correct(dcache[dload_bundle], self.dataset_desc, self.data_download_cache, dload_bundle,
                                   self.enc_dec) and not self.cache_ignore:
                download_dir = dcache[dload_bundle]
                logger.info("files for {} found in cache, not downloading".format(dload_bundle))
                return {k: os.path.join(download_dir, self.dataset_desc[k]) for k in self.dataset_desc
                        if k.endswith("_file")}
            else:  # try to download the bundle and unzip
                if not validate_url(dload_bundle):
                    raise RuntimeError("can not download from the given url")
                else:
                    cache_dir = self.data_download_cache
                    temp_file = web_downloader(dload_bundle)

                    download_dir = extractor(filepath=temp_file, cache_dir=cache_dir,
                                             extractor_func=Downloader.ZIPD.get(mime_type(temp_file), None))
                    if "sha1" in self.dataset_desc:
                        if os.path.split(download_dir)[-1] != self.dataset_desc["sha1"]:
                            raise RuntimeError("The sha1 of the downloaded file does not match with the provided one")
                    dcache.update({dload_bundle: download_dir})
                    write_json(dcache, os.path.join(self.data_download_cache, DATA_CACHE_CONF))
                    return {k: os.path.join(download_dir, self.dataset_desc[k]) for k in self.dataset_desc
                            if k.endswith("_file")}
        else:  # we have download links for every file, or the files already exist
            if not self.enc_dec:
                return {k: SingleFileDownloader(self.dataset_desc[k], self.data_download_cache).download()
                        for k in self.dataset_desc if k.endswith("_file") and self.dataset_desc[k]}
            else:
                return {k: self.dataset_desc[k] for k in self.dataset_desc if k.endswith("_file")}
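For orientation, here is a hypothetical dataset_desc that would drive the method above: an optional "download" bundle URL plus *_file keys naming the files to resolve (all values are illustrative, not from the source):

dataset_desc = {
    "download": "https://example.com/data/sst2.tar.gz",  # hypothetical bundle URL
    "train_file": "sst2/train.txt",
    "valid_file": "sst2/valid.txt",
    "test_file": "sst2/test.txt",
    "sha1": "0123456789abcdef0123456789abcdef01234567",  # optional integrity check
}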
Example 3
 def download(self):
     if is_file_correct(self.embedding_file):
         logger.info("embedding file location: {}".format(self.embedding_file))
         return self.embedding_file
     dcache_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
     dcache = read_json(dcache_path)
     if self.embedding_file in dcache and not self.cache_ignore:
         download_loc = dcache[self.embedding_file]
         logger.info("files for {} found in cache".format(self.embedding_file))
         return self._get_embedding_file(download_loc, self.embedding_key)
     else:  # try to download the bundle and unzip
         url = self.embedding_file
         if not validate_url(url):
             raise RuntimeError("can not download from the given url")
         else:
             cache_dir = self.data_download_cache
             temp_file = web_downloader(url)
             download_loc = extractor(filepath=temp_file, cache_dir=cache_dir,
                                      extractor_func=Downloader.ZIPD.get(mime_type(temp_file), None))
             if self.sha1 is not None:
                 if os.path.split(download_loc)[-1] != self.sha1:
                     raise RuntimeError("The sha1 of the downloaded file does not match with the provided one")
             dcache.update({url: download_loc})
             write_json(dcache, os.path.join(self.data_download_cache, DATA_CACHE_CONF))
             return self._get_embedding_file(download_loc, self.embedding_key)
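The sha1 check above compares a directory name, not a file digest. The inference (from the code, not a documented contract) is that extractor() names its output directory after the archive's sha1, so the last path component can be matched against the expected digest:

import os

expected_sha1 = "0123456789abcdef0123456789abcdef01234567"  # hypothetical digest
download_loc = os.path.join("/tmp/data-cache", expected_sha1)  # hypothetical extractor output
assert os.path.split(download_loc)[-1] == expected_sha1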
Example 4
def _verify_file(file_loc):
    # dropbox doesn't give a 404 when a file does not exist; it produces an HTML page instead.
    # The actual files are never HTML.
    if not os.path.exists(file_loc):
        return False

    if os.path.isfile(file_loc) and mime_type(file_loc) == "text/html":
        return False

    return True
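Every example on this page dispatches on mime_type(). A minimal sniffing sketch based on magic numbers is given below for illustration (an assumption; the real baseline.mime_type may detect types differently or cover more of them). It handles only the types seen in these examples: gzip, zip, tar, HTML error pages, and plain text:

def mime_type(filename):
    with open(filename, "rb") as f:
        head = f.read(512)
    if head.startswith(b"\x1f\x8b"):
        return "application/gzip"
    if head.startswith(b"PK\x03\x04"):
        return "application/zip"
    if head[257:262] == b"ustar":  # the tar magic lives at offset 257
        return "application/x-tar"
    if head.lstrip().lower().startswith((b"<!doctype html", b"<html")):
        return "text/html"
    return "text/plain"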
Example 5
def extract_gzip(file_loc):
    temp_file = delete_old_copy("{}.1".format(file_loc))
    with gzip.open(file_loc, 'rb') as f_in:
        with open(temp_file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    if mime_type(temp_file) == "application/x-tar":
        return extract_tar(temp_file)
    else:
        shutil.move(temp_file, file_loc)
        return file_loc
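extract_gzip above relies on two companions. The sketches below are simplified stand-ins (hypothetical; the project defines its own versions): delete_old_copy clears a stale path and hands it back, and extract_tar unpacks an archive next to itself:

import os
import shutil
import tarfile

def delete_old_copy(file_name):
    # Remove a stale file or directory if present, then return the path.
    if os.path.isfile(file_name):
        os.remove(file_name)
    elif os.path.isdir(file_name):
        shutil.rmtree(file_name)
    return file_name

def extract_tar(file_loc):
    # Unpack the tar into "<file_loc>.1" and return the extracted directory.
    out_dir = delete_old_copy("{}.1".format(file_loc))
    with tarfile.open(file_loc, "r") as tar_ref:
        tar_ref.extractall(out_dir)
    return out_dir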
Example 6
    def _read_vectors(self, filename, idx, known_vocab, keep_unused, **kwargs):
        use_mmap = bool(kwargs.get('use_mmap', False))
        read_fn = self._read_word2vec_file
        is_glove_file = mime_type(filename) == 'text/plain'
        if use_mmap:
            if is_glove_file:
                read_fn = self._read_text_mmap
            else:
                read_fn = self._read_word2vec_mmap
        elif is_glove_file:
            read_fn = self._read_text_file

        return read_fn(filename, idx, known_vocab, keep_unused)
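The selection logic above reduces to a two-by-two table keyed on (use_mmap, is_glove_file): text/plain files are treated as GloVe, anything else as binary word2vec. Written as a standalone function for clarity (reader names follow the example; only the mapping matters):

def select_reader(use_mmap, is_glove_file):
    if use_mmap:
        return "_read_text_mmap" if is_glove_file else "_read_word2vec_mmap"
    return "_read_text_file" if is_glove_file else "_read_word2vec_file"

assert select_reader(False, True) == "_read_text_file"
assert select_reader(True, False) == "_read_word2vec_mmap"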
Example 7
def unzip_files(zip_path):
    if os.path.isdir(zip_path):
        return zip_path
    from baseline.mime_type import mime_type
    if mime_type(zip_path) == 'application/zip':
        with open(zip_path, 'rb') as f:
            sha1 = hashlib.sha1(f.read()).hexdigest()
            temp_dir = os.path.join("/tmp/", sha1)
            if not os.path.exists(temp_dir):
                logger.info("unzipping model")
                with zipfile.ZipFile(zip_path, "r") as zip_ref:
                    zip_ref.extractall(temp_dir)
            if len(os.listdir(temp_dir)) == 1:  # a directory was zipped rather than loose files
                temp_dir = os.path.join(temp_dir, os.listdir(temp_dir)[0])
        return temp_dir
    return zip_path
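A hypothetical usage of unzip_files (paths are illustrative): a model shipped as a zip is unpacked once per content hash under /tmp, and a second call with the same archive returns the cached directory:

model_dir = unzip_files("checkpoints/model.zip")  # hypothetical path
print(model_dir)  # e.g. /tmp/<sha1-of-zip> or /tmp/<sha1-of-zip>/<zipped-top-dir>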
Example 8
def unzip_model(path):
    """If the path for a model file is a zip file, unzip it in /tmp and return the unzipped path"""
    # Import inside function to avoid circular dep :(
    # TODO: a future fix is to move the export code to a different file so mime_type can import
    # from it rather than from here, which lets this module import mime_type
    if os.path.isdir(path):
        return path
    from baseline.mime_type import mime_type
    if mime_type(path) == 'application/zip':
        with open(path, 'rb') as f:
            sha1 = hashlib.sha1(f.read()).hexdigest()
        temp_dir = os.path.join("/tmp/", sha1)
        if not os.path.exists(temp_dir):
            logger.info("unzipping model")
            with zipfile.ZipFile(path, "r") as zip_ref:
                zip_ref.extractall(temp_dir)
    if len(os.listdir(temp_dir)) == 1:  # a directory was zipped rather than loose files
            temp_dir = os.path.join(temp_dir, os.listdir(temp_dir)[0])
        path = os.path.join(temp_dir, [x[:-6] for x in os.listdir(temp_dir) if 'index' in x][0])
    return path
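The final join reconstructs what looks like a TensorFlow checkpoint prefix. A small illustration of the x[:-6] slice (file names are assumed, not from the source): "index" selects the ".index" file and the slice strips that 6-character suffix:

files = ["model.ckpt.index", "model.ckpt.data-00000-of-00001"]
prefix = [x[:-6] for x in files if "index" in x][0]
assert prefix == "model.ckpt"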
Example 9
 def download(self):
     file_loc = self.dataset_file
     if is_file_correct(file_loc):
         return file_loc
     elif validate_url(file_loc):  # is it a web URL? check if exists in cache
         url = file_loc
         dcache_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
         dcache = read_json(dcache_path)
         if url in dcache and is_file_correct(dcache[url], self.data_download_cache, url) and not self.cache_ignore:
             logger.info("file for {} found in cache, not downloading".format(url))
             return dcache[url]
         else:  # download the file in the cache, update the json
             cache_dir = self.data_download_cache
             logger.info("using {} as data/embeddings cache".format(cache_dir))
             temp_file = web_downloader(url)
             dload_file = extractor(filepath=temp_file, cache_dir=cache_dir,
                                    extractor_func=Downloader.ZIPD.get(mime_type(temp_file), None))
             dcache.update({url: dload_file})
             write_json(dcache, os.path.join(self.data_download_cache, DATA_CACHE_CONF))
             return dload_file
     raise RuntimeError("the file [{}] is not in cache and can not be downloaded".format(file_loc))
Example 10
def is_file_correct(file_loc, data_dcache=None, key=None):
    """Check whether the file location recorded in the json cache is correct, i.e.,
    the file exists and is not corrupted. This is needed when the direct download link/path
    for a file changes and the user is unaware; such corruption is not tracked by sha1 either.
    If the check fails, the corrupted file is deleted. Additionally, if the file location is a
    URL that exists in the cache, its cache entry is removed so the file can be re-downloaded.

    Keyword arguments:
    file_loc -- location of the file
    data_dcache -- data download cache location (default None, for local file paths)
    key -- URL for download (default None, for local file paths)
    """
    if os.path.exists(file_loc) and os.path.isfile(file_loc) and not mime_type(file_loc) == "text/html":
        # dropbox doesn't give a 404 when a file does not exist; it produces an HTML page instead.
        # The actual files are never HTML.
        return True
    else:
        delete_old_copy(file_loc)
        if key is not None:  # cache file validation
            update_cache(key, data_dcache)
        return False
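is_file_correct calls update_cache to evict a stale URL entry. Building on the read_json/write_json sketches earlier, a hypothetical version (the project supplies the real one):

import os

def update_cache(key, data_download_cache):
    # Drop the stale key from the download-cache json so it is re-fetched next time.
    dcache_path = os.path.join(data_download_cache, DATA_CACHE_CONF)
    dcache = read_json(dcache_path)
    if key in dcache:
        del dcache[key]
        write_json(dcache, dcache_path)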
Example 11
def _verify_file(file_loc):
    # dropbox doesn't give a 404 when a file does not exist; it produces an HTML page instead.
    # The actual files are never HTML.
    return os.path.exists(file_loc) and os.path.isfile(file_loc) and not mime_type(file_loc) == "text/html"