Example 1
    pmode = RESET_MODE
    # get prefix for environment name
    envprefix = sys.argv[1]
    # get number of environments to create
    nenvs = sys.argv[2]
elif nargs > 1:
    # returns env to the list of environments
    pmode = WRITE_MODE
    # get name of environment to return
    env = sys.argv[1]
else:
    # gets name of an environment to use
    pmode = READ_MODE

# creates a lock for the file so it can only be accessed by one process at a time
lock = FileLock(lock_path, timeout=time_out_secs)

with lock:
    if pmode == RESET_MODE:
        # create a list (named clist) of nenvs environments with the
        # prefix envprefix
        clist = []
        nenvsnum = int(nenvs)
        for i in range(nenvsnum):
            clist.append(envprefix + str(i))
    else:
        # load hickle file
        clist = hickle.load(file_path)

        if pmode == WRITE_MODE:
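
The excerpt breaks off just as the WRITE_MODE branch begins. Purely as a rough guide, here is a minimal, self-contained sketch of the pool pattern the fragment appears to implement; the function name, the string mode values, and the trailing hickle.dump call are assumptions, not the original code:

from filelock import FileLock
import hickle

def with_env_pool(pmode, file_path, lock_path, timeout=10,
                  env=None, envprefix="env", nenvs=0):
    lock = FileLock(lock_path, timeout=timeout)
    with lock:
        if pmode == "RESET":
            # rebuild the pool: env0, env1, ..., env{nenvs-1}
            clist = [envprefix + str(i) for i in range(int(nenvs))]
        else:
            clist = hickle.load(file_path)
            if pmode == "WRITE":
                clist.append(env)        # return env to the pool
            else:                        # READ
                env = clist.pop(0)       # hand out the next free environment
        hickle.dump(clist, file_path)    # persist the updated pool
    return env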
Example 2
    def _instance_iterator(self, file_path: str) -> Iterable[Instance]:
        cache_file: Optional[str] = None
        if self._cache_directory:
            cache_file = self._get_cache_location_for_file_path(file_path)

        if cache_file is not None and os.path.exists(cache_file):
            cache_file_lock = FileLock(cache_file + ".lock",
                                       timeout=self.CACHE_FILE_LOCK_TIMEOUT)
            try:
                cache_file_lock.acquire()
                # We make an assumption here that if we can obtain the lock, no one will
                # be trying to write to the file anymore, so it should be safe to release the lock
                # before reading so that other processes can also read from it.
                cache_file_lock.release()
                logger.info("Reading instances from cache %s", cache_file)
                with open(cache_file) as data_file:
                    yield from self._multi_worker_islice(
                        data_file, transform=self.deserialize_instance)
            except Timeout:
                logger.warning(
                    "Failed to acquire lock on dataset cache file within %d seconds. "
                    "Cannot use cache to read instances.",
                    self.CACHE_FILE_LOCK_TIMEOUT,
                )
                yield from self._multi_worker_islice(self._read(file_path),
                                                     ensure_lazy=True)
        elif cache_file is not None and not os.path.exists(cache_file):
            instances = self._multi_worker_islice(self._read(file_path),
                                                  ensure_lazy=True)
            # The cache file doesn't exist so we'll try writing to it.
            if self.max_instances is not None:
                # But we don't write to the cache when max_instances is specified.
                logger.warning(
                    "Skipping writing to data cache since max_instances was specified."
                )
                yield from instances
            elif util.is_distributed() or (get_worker_info()
                                           and get_worker_info().num_workers):
                # We also shouldn't write to the cache if there's more than one process loading
                # instances since each worker only receives a partial share of the instances.
                logger.warning(
                    "Can't cache data instances when there are multiple processes loading data"
                )
                yield from instances
            else:
                try:
                    with FileLock(cache_file + ".lock",
                                  timeout=self.CACHE_FILE_LOCK_TIMEOUT):
                        with CacheFile(cache_file, mode="w+") as cache_handle:
                            logger.info("Caching instances to temp file %s",
                                        cache_handle.name)
                            for instance in instances:
                                cache_handle.write(
                                    self.serialize_instance(instance) + "\n")
                                yield instance
                except Timeout:
                    logger.warning(
                        "Failed to acquire lock on dataset cache file within %d seconds. "
                        "Cannot write to cache.",
                        self.CACHE_FILE_LOCK_TIMEOUT,
                    )
                    yield from instances
        else:
            # No cache.
            yield from self._multi_worker_islice(self._read(file_path),
                                                 ensure_lazy=True)
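
The acquire-then-release probe used above (take the lock only to prove no writer is active, then release it before reading) can be factored into a small helper; a sketch under assumed names, not AllenNLP API:

from filelock import FileLock, Timeout

def cache_is_readable(cache_file: str, lock_timeout: float = 10.0) -> bool:
    """Return True if no other process currently holds the write lock on cache_file."""
    lock = FileLock(cache_file + ".lock", timeout=lock_timeout)
    try:
        lock.acquire()
    except Timeout:
        return False
    else:
        # Obtaining the lock means no writer is active; release right away so
        # other readers are not blocked while we read the cache.
        lock.release()
        return True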
Example 3
def clear_csv(csv_path):
    # truncate the CSV file under a lock so no other process writes mid-clear
    with FileLock(csv_path + ".lock"):
        f = open(csv_path, "w+")
        csv_writer = csv.writer(f)  # writer is created but no rows are written
        f.close()
Example 4
    def initialize_control_file(self) -> None:
        with FileLock(LTIGradesSenderControlFile.lock_file):
            with Path(self.config_fullname).open('w+') as new_file:
                json.dump(LTIGradesSenderControlFile.cache_sender_data,
                          new_file)
                logger.debug('Control file initialized.')
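
Examples 3 and 4 share the same write-under-lock shape; a minimal sketch of that idiom for appending a CSV row (the path and row layout are placeholders):

import csv
from filelock import FileLock

def append_row(csv_path: str, row: list) -> None:
    # Serialize writers so concurrent processes cannot interleave rows.
    with FileLock(csv_path + ".lock"):
        with open(csv_path, "a", newline="") as f:
            csv.writer(f).writerow(row)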
Example 5
    def read(
        self, file_path: Union[Path, str]
    ) -> Union[AllennlpDataset, AllennlpLazyDataset]:
        """
        Returns a dataset containing all the instances that can be read from the file path.

        If `self.lazy` is `False`, this eagerly reads all instances from `self._read()`
        and returns an `AllennlpDataset`.

        If `self.lazy` is `True`, this returns an `AllennlpLazyDataset`, which internally
        relies on the generator created from `self._read()` to lazily produce `Instance`s.
        In this case your implementation of `_read()` must also be lazy
        (that is, not load all instances into memory at once), otherwise
        you will get a `ConfigurationError`.

        In either case, the returned `Iterable` can be iterated
        over multiple times. It's unlikely you want to override this function,
        but if you do your result should likewise be repeatedly iterable.
        """
        if not isinstance(file_path, str):
            file_path = str(file_path)

        lazy = getattr(self, "lazy", None)

        if lazy is None:
            warnings.warn(
                "DatasetReader.lazy is not set, "
                "did you forget to call the superclass constructor?",
                UserWarning,
            )

        if lazy:
            return AllennlpLazyDataset(self._instance_iterator, file_path)
        else:
            cache_file: Optional[str] = None
            if self._cache_directory:
                cache_file = self._get_cache_location_for_file_path(file_path)

            if cache_file is not None and os.path.exists(cache_file):
                try:
                    # Try to acquire a lock just to make sure another process isn't in the middle
                    # of writing to the cache.
                    cache_file_lock = FileLock(
                        cache_file + ".lock",
                        timeout=self.CACHE_FILE_LOCK_TIMEOUT)
                    cache_file_lock.acquire()
                    # We make an assumption here that if we can obtain the lock, no one will
                    # be trying to write to the file anymore, so it should be safe to release the lock
                    # before reading so that other processes can also read from it.
                    cache_file_lock.release()
                    logger.info("Reading instances from cache %s", cache_file)
                    instances = self._instances_from_cache_file(cache_file)
                except Timeout:
                    logger.warning(
                        "Failed to acquire lock on dataset cache file within %d seconds. "
                        "Cannot use cache to read instances.",
                        self.CACHE_FILE_LOCK_TIMEOUT,
                    )
                    instances = self._multi_worker_islice(
                        self._read(file_path))
            else:
                instances = self._multi_worker_islice(self._read(file_path))

            # Then some validation.
            if not isinstance(instances, list):
                instances = list(instances)

            if not instances:
                raise ConfigurationError(
                    "No instances were read from the given filepath {}. "
                    "Is the path correct?".format(file_path))

            # And finally we try writing to the cache.
            if cache_file is not None and not os.path.exists(cache_file):
                if self.max_instances is not None:
                    # But we don't write to the cache when max_instances is specified.
                    logger.warning(
                        "Skipping writing to data cache since max_instances was specified."
                    )
                elif util.is_distributed() or (get_worker_info() and
                                               get_worker_info().num_workers):
                    # We also shouldn't write to the cache if there's more than one process loading
                    # instances since each worker only receives a partial share of the instances.
                    logger.warning(
                        "Can't cache data instances when there are multiple processes loading data"
                    )
                else:
                    try:
                        with FileLock(cache_file + ".lock",
                                      timeout=self.CACHE_FILE_LOCK_TIMEOUT):
                            self._instances_to_cache_file(
                                cache_file, instances)
                    except Timeout:
                        logger.warning(
                            "Failed to acquire lock on dataset cache file within %d seconds. "
                            "Cannot write to cache.",
                            self.CACHE_FILE_LOCK_TIMEOUT,
                        )

            return AllennlpDataset(instances)
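
The cache-writing branch above boils down to "take the lock, write, and skip silently on Timeout"; a compact sketch of that idiom (write_fn is a placeholder callable):

from filelock import FileLock, Timeout

def try_write_cache(cache_file: str, write_fn, lock_timeout: float = 10.0) -> bool:
    """Attempt to populate cache_file; return False if another process holds the lock."""
    try:
        with FileLock(cache_file + ".lock", timeout=lock_timeout):
            write_fn(cache_file)
        return True
    except Timeout:
        return False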
Example 6
def get_from_cache(
    url,
    cache_dir=None,
    force_download=False,
    proxies=None,
    etag_timeout=10,
    resume_download=False,
    user_agent: Union[Dict, str, None] = None,
    local_files_only=False,
) -> Optional[str]:
    """
    Given a URL, look for the corresponding file in the local cache.
    If it's not there, download it. Then return the path to the cached file.

    Return:
        None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
        Local path (string) otherwise
    """
    if cache_dir is None:
        cache_dir = TRANSFORMERS_CACHE
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    os.makedirs(cache_dir, exist_ok=True)

    etag = None
    if not local_files_only:
        try:
            response = requests.head(url, allow_redirects=True, proxies=proxies, timeout=etag_timeout)
            if response.status_code == 200:
                etag = response.headers.get("ETag")
        except (EnvironmentError, requests.exceptions.Timeout):
            # etag is already None
            pass

    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    # etag is None = we don't have a connection, or url doesn't exist, or is otherwise inaccessible.
    # try to get the last downloaded one
    if etag is None:
        if os.path.exists(cache_path):
            return cache_path
        else:
            matching_files = [
                file
                for file in fnmatch.filter(os.listdir(cache_dir), filename + ".*")
                if not file.endswith(".json") and not file.endswith(".lock")
            ]
            if len(matching_files) > 0:
                return os.path.join(cache_dir, matching_files[-1])
            else:
                # If files cannot be found and local_files_only=True,
                # the models might've been found if local_files_only=False
                # Notify the user about that
                if local_files_only:
                    raise ValueError(
                        "Cannot find the requested files in the cached path and outgoing traffic has been"
                        " disabled. To enable model look-ups and downloads online, set 'local_files_only'"
                        " to False."
                    )
                return None

    # From now on, etag is not None.
    if os.path.exists(cache_path) and not force_download:
        return cache_path

    # Prevent parallel downloads of the same file with a lock.
    lock_path = cache_path + ".lock"
    with FileLock(lock_path):

        # If the download just completed while the lock was activated.
        if os.path.exists(cache_path) and not force_download:
            # Even if returning early like here, the lock will be released.
            return cache_path

        if resume_download:
            incomplete_path = cache_path + ".incomplete"

            @contextmanager
            def _resumable_file_manager():
                with open(incomplete_path, "a+b") as f:
                    yield f

            temp_file_manager = _resumable_file_manager
            if os.path.exists(incomplete_path):
                resume_size = os.stat(incomplete_path).st_size
            else:
                resume_size = 0
        else:
            temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False)
            resume_size = 0

        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with temp_file_manager() as temp_file:
            logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)

            http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent)

        logger.info("storing %s in cache at %s", url, cache_path)
        os.replace(temp_file.name, cache_path)

        logger.info("creating metadata file for %s", cache_path)
        meta = {"url": url, "etag": etag}
        meta_path = cache_path + ".json"
        with open(meta_path, "w") as meta_file:
            json.dump(meta, meta_file)

    return cache_path
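
get_from_cache checks the cache, takes the lock, checks again, then downloads to a temporary file and atomically renames it into place; a condensed sketch of that double-checked pattern (download_fn is a placeholder):

import os
import tempfile
from filelock import FileLock

def fetch_once(cache_path: str, download_fn) -> str:
    """Download to cache_path at most once, even with concurrent callers."""
    if os.path.exists(cache_path):
        return cache_path
    with FileLock(cache_path + ".lock"):
        # Re-check: another process may have finished while we waited for the lock.
        if not os.path.exists(cache_path):
            fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(cache_path) or ".")
            os.close(fd)
            download_fn(tmp_path)             # write the payload to the temp file
            os.replace(tmp_path, cache_path)  # atomically publish into the cache
    return cache_path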
Example 7
check_min_version("4.22.0.dev0")

require_version(
    "datasets>=1.8.0",
    "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

logger = logging.getLogger(__name__)

try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    if is_offline_mode():
        raise LookupError(
            "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
        )
    with FileLock(".lock") as lock:
        nltk.download("punkt", quiet=True)

# A list of all multilingual tokenizers that require the lang attribute.
MULTILINGUAL_TOKENIZERS = [
    MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast
]


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
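
Earlier in this example, the nltk punkt download is guarded by a shared ".lock" file so only one process performs the one-time download; a generic sketch of that guard (resource_ready and fetch_resource are placeholder callables):

from filelock import FileLock

def ensure_resource(lock_path: str, resource_ready, fetch_resource) -> None:
    """Run fetch_resource() at most once across cooperating processes."""
    if resource_ready():
        return
    with FileLock(lock_path):
        # Whichever process acquires the lock first does the work;
        # everyone else finds the resource already in place.
        if not resource_ready():
            fetch_resource()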
Example 8
    def __init__(
        self,
        name: str = None,
        process_id: int = 0,
        num_process: int = 1,
        data_dir: Optional[str] = None,
        experiment_id: Optional[str] = None,
        in_memory: bool = False,
        **kwargs,
    ):
        """ A Metrics is the base class and common API for all metrics.
            Args:
                process_id (int): specify the id of the node in a distributed settings between 0 and num_nodes-1
                    This can be used, to compute metrics on distributed setups
                    (in particular non-additive metrics like F1).
                data_dir (str): path to a directory in which temporary data will be stored.
                    This should be a shared file-system for distributed setups.
                experiment_id (str): Should be used if you perform several concurrent experiments using
                    the same caching directory (will be indicated in the raise error)
                in_memory (bool): keep all predictions and references in memory. Not possible in distributed settings.
        """
        # Safety checks
        assert isinstance(process_id, int) and process_id >= 0, "'process_id' should be a non-negative integer"
        assert (
            isinstance(num_process, int) and num_process > process_id
        ), "'num_process' should be a number greater than process_id"
        assert (
            process_id == 0 or not in_memory
        ), "Using 'in_memory' is not possible in distributed setting (process_id > 0)."

        # Metric name
        self.name = camelcase_to_snakecase(self.__class__.__name__)
        # Configuration name
        self.config_name = name

        self.process_id = process_id
        self.num_process = num_process
        self.in_memory = in_memory
        self.experiment_id = experiment_id if experiment_id is not None else "cache"
        self._version = "1.0.0"
        self._data_dir_root = os.path.expanduser(data_dir or HF_METRICS_CACHE)
        self.data_dir = self._build_data_dir()

        # prepare info
        info = self._info()
        info.metric_name = self.name
        info.config_name = self.config_name
        info.version = self._version
        self.info = info

        # Update 'compute' and 'add' docstring
        self.compute.__func__.__doc__ += self.info.inputs_description
        self.add.__func__.__doc__ += self.info.inputs_description

        self.arrow_schema = pa.schema(field for field in self.info.features.type)
        self.buf_writer = None
        self.writer = None
        self.writer_batch_size = None
        self.data = None

        # Check we can write on the cache file without competitors
        self.cache_file_name = os.path.join(self.data_dir, self._get_file_name(self.process_id))
        self.filelock = FileLock(self.cache_file_name + ".lock")
        try:
            self.filelock.acquire(timeout=1)
        except Timeout:
            raise ValueError(
                "Cannot acquire lock, caching file might be used by another process, "
                "you should setup a unique 'experiment_id' for this run."
            )
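
Here the lock is acquired with a short timeout and then held for the object's lifetime, turning "another process owns this cache" into an immediate error; a sketch of that ownership check (class and method names are illustrative, not the datasets API):

from filelock import FileLock, Timeout

class CacheOwner:
    """Claim exclusive ownership of a cache file for as long as the object lives."""

    def __init__(self, cache_file_name: str, timeout: float = 1.0):
        self.cache_file_name = cache_file_name
        self.filelock = FileLock(cache_file_name + ".lock")
        try:
            # A short timeout fails fast instead of waiting indefinitely.
            self.filelock.acquire(timeout=timeout)
        except Timeout:
            raise ValueError(
                "Cache file appears to be in use by another process; "
                "use a distinct experiment_id or cache path."
            )

    def close(self) -> None:
        self.filelock.release()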
Example 9
def cached_path(
    url_or_filename,
    cache_dir=None,
    force_download=False,
    proxies=None,
    resume_download=False,
    user_agent: Union[Dict, str, None] = None,
    extract_compressed_file=False,
    force_extract=False,
    local_files_only=False,
) -> Optional[str]:
    """
    Given something that might be a URL (or might be a local path),
    determine which. If it's a URL, download the file and cache it, and
    return the path to the cached file. If it's already a local path,
    make sure the file exists and then return the path.
    Args:
        cache_dir: specify a cache directory to save the file to (overrides the default cache dir).
        force_download: if True, re-download the file even if it's already cached in the cache dir.
        resume_download: if True, resume the download if an incompletely received file is found.
        user_agent: Optional string or dict that will be appended to the user-agent on remote requests.
        extract_compressed_file: if True and the path points to a zip or tar file, extract the compressed
            file in a folder along the archive.
        force_extract: if True when extract_compressed_file is True and the archive was already extracted,
            re-extract the archive and override the folder where it was extracted.

    Return:
        None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
        Local path (string) otherwise
    """
    if cache_dir is None:
        cache_dir = TRANSFORMERS_CACHE
    if isinstance(url_or_filename, Path):
        url_or_filename = str(url_or_filename)
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    if is_remote_url(url_or_filename):
        # URL, so get it from the cache (downloading if necessary)
        output_path = get_from_cache(
            url_or_filename,
            cache_dir=cache_dir,
            force_download=force_download,
            proxies=proxies,
            resume_download=resume_download,
            user_agent=user_agent,
            local_files_only=local_files_only,
        )
    elif os.path.exists(url_or_filename):
        # File, and it exists.
        output_path = url_or_filename
    elif urlparse(url_or_filename).scheme == "":
        # File, but it doesn't exist.
        raise EnvironmentError("file {} not found".format(url_or_filename))
    else:
        # Something unknown
        raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))

    if extract_compressed_file:
        if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path):
            return output_path

        # Path where we extract compressed archives
        # We avoid '.' in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/"
        output_dir, output_file = os.path.split(output_path)
        output_extract_dir_name = output_file.replace(".", "-") + "-extracted"
        output_path_extracted = os.path.join(output_dir, output_extract_dir_name)

        if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract:
            return output_path_extracted

        # Prevent parallel extractions
        lock_path = output_path + ".lock"
        with FileLock(lock_path):
            shutil.rmtree(output_path_extracted, ignore_errors=True)
            os.makedirs(output_path_extracted)
            if is_zipfile(output_path):
                with ZipFile(output_path, "r") as zip_file:
                    zip_file.extractall(output_path_extracted)
                    zip_file.close()
            elif tarfile.is_tarfile(output_path):
                tar_file = tarfile.open(output_path)
                tar_file.extractall(output_path_extracted)
                tar_file.close()
            else:
                raise EnvironmentError("Archive format of {} could not be identified".format(output_path))

        return output_path_extracted

    return output_path
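
The extraction branch locks on the archive path so concurrent callers don't extract over each other; a condensed sketch (extract_fn is a placeholder that does the actual unpacking):

import os
import shutil
from filelock import FileLock

def extract_once(archive_path: str, extract_fn, force: bool = False) -> str:
    """Extract archive_path next to itself, serializing concurrent extractions."""
    out_dir_name = os.path.basename(archive_path).replace(".", "-") + "-extracted"
    out_dir = os.path.join(os.path.dirname(archive_path), out_dir_name)
    if os.path.isdir(out_dir) and os.listdir(out_dir) and not force:
        return out_dir
    with FileLock(archive_path + ".lock"):
        shutil.rmtree(out_dir, ignore_errors=True)
        os.makedirs(out_dir)
        extract_fn(archive_path, out_dir)  # e.g. ZipFile(archive_path).extractall(out_dir)
    return out_dir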
Example 10
    def from_files(
        cls,
        directory: Union[str, os.PathLike],
        padding_token: Optional[str] = DEFAULT_PADDING_TOKEN,
        oov_token: Optional[str] = DEFAULT_OOV_TOKEN,
    ) -> "Vocabulary":
        """
        Loads a `Vocabulary` that was serialized either using `save_to_files` or inside
        a model archive file.

        # Parameters

        directory : `str`
            The directory or archive file containing the serialized vocabulary.
        """
        logger.info("Loading token dictionary from %s.", directory)
        padding_token = padding_token if padding_token is not None else DEFAULT_PADDING_TOKEN
        oov_token = oov_token if oov_token is not None else DEFAULT_OOV_TOKEN

        if not os.path.isdir(directory):
            base_directory = cached_path(directory, extract_archive=True)
            # For convenience we'll check for a 'vocabulary' subdirectory of the archive.
            # That way you can use model archives directly.
            vocab_subdir = os.path.join(base_directory, "vocabulary")
            if os.path.isdir(vocab_subdir):
                directory = vocab_subdir
            elif os.path.isdir(base_directory):
                directory = base_directory
            else:
                raise ConfigurationError(
                    f"{directory} is neither a directory nor an archive")

        # We use a lock file to avoid race conditions where multiple processes
        # might be reading/writing from/to the same vocab files at once.
        with FileLock(os.path.join(directory, ".lock")):
            with codecs.open(os.path.join(directory, NAMESPACE_PADDING_FILE),
                             "r", "utf-8") as namespace_file:
                non_padded_namespaces = [
                    namespace_str.strip() for namespace_str in namespace_file
                ]

            vocab = cls(
                non_padded_namespaces=non_padded_namespaces,
                padding_token=padding_token,
                oov_token=oov_token,
            )

            # Check every file in the directory.
            for namespace_filename in os.listdir(directory):
                if namespace_filename == NAMESPACE_PADDING_FILE:
                    continue
                if namespace_filename.startswith("."):
                    continue
                namespace = namespace_filename.replace(".txt", "")
                if any(
                        namespace_match(pattern, namespace)
                        for pattern in non_padded_namespaces):
                    is_padded = False
                else:
                    is_padded = True
                filename = os.path.join(directory, namespace_filename)
                vocab.set_from_file(filename,
                                    is_padded,
                                    namespace=namespace,
                                    oov_token=oov_token)

        return vocab
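
Example 10 places a single lock file inside the directory so one lock covers every vocabulary file read; a sketch of that directory-level guard (process_files is a placeholder):

import os
from filelock import FileLock

def read_directory_atomically(directory: str, process_files):
    """Read the files in `directory` while holding one directory-level lock."""
    with FileLock(os.path.join(directory, ".lock")):
        names = [n for n in os.listdir(directory) if not n.startswith(".")]
        return process_files(directory, names)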
Example 11
def main():
    if request.method != 'POST':
        abort(METHOD_NOT_ALLOWED)

    try:
        payload = request.get_json()
        if payload is None:
            raise RuntimeError
    except Exception:
        log.error('Failed to obtain payload data.')
        abort(BAD_REQUEST)

    event = request.headers.get('X-GitHub-Event')
    if event is None:
        log.error('DECLINED: no event provided.')
        abort(BAD_REQUEST)

    # Ping event
    if event == 'ping':
        return json.dumps({'msg': 'pong'})

    repo = payload['repository']['full_name']
    clone_url = payload['repository']['clone_url']
    idx = md5(clone_url)
    record_file_path = f'{DATABASE_DIRECTORY}/{idx}.json'
    if not os.path.isfile(record_file_path):
        log.error(
            f'DECLINED: repository "{repo}" has no record on the server.')
        abort(UNAUTHORIZED)

    with open(record_file_path, 'r') as fp:
        record = json.load(fp)
        log.info(f'Record file "{record_file_path}" loaded.')

    if not DEBUG_MODE:
        secret = record['secret']
        signature = request.headers.get('X-Hub-Signature')
        if signature is None:
            log.error('DECLINED: no signature provided.')
            abort(BAD_REQUEST)
        if not authenticate(secret, signature, request.data):
            log.error('DECLINED: failed to pass authentication.')
            abort(UNAUTHORIZED)

    if event != 'push':
        return json.dumps({'status': 'fail', 'reason': 'not a push event'})

    commits = payload['commits']
    if not commits:
        log.info('No new commits.')
        return json.dumps({'status': 'success'})

    branch = payload['ref'].rsplit('/', 1)[-1]
    commit = payload['head_commit']
    head = commit['id']
    folder_name = '%s/%s' % (repo, branch)
    folder = os.path.join(WEBPAGE_DIRECTORY, folder_name)
    if not os.path.exists(folder):
        os.makedirs(folder)
    status = os.path.join(folder, STATUS_FILE)
    index = os.path.join(folder, INDEX_FILE)
    output = os.path.join(folder, OUTPUT_FILE)
    status_url = WEBURL + os.path.join(folder_name, STATUS_FILE)
    index_url = WEBURL + os.path.join(folder_name, INDEX_FILE)

    index_lock = index + '.lock'
    with FileLock(index_lock):
        log.info(f'Copying {TEMPORARY_INDEX_FILE} to {index}')
        shutil.copyfile(TEMPORARY_INDEX_FILE, index)

        tmpfd, tmppath = tempfile.mkstemp()
        log.debug('tmppath = %s', tmppath)
        with os.fdopen(tmpfd, 'w') as fp:
            json.dump(record['checksums'], fp)

        log.info('Launching docmeld...')
        with open(status, 'w') as fp:
            fp.write(
                f'Current server time: {str(datetime.now())} (UTC{get_utc_offset()})\n'
            )
            fp.write(f'Build for commit #{head}: {commit["message"]}\n')
        with open(status, 'a') as fp:
            proc = subprocess.Popen([
                DOCMELD_EXECUTABLE, GIT_URL_START + clone_url, '-b', branch,
                '-s', head, '-c', tmppath, '-o', output,
                '-v' if DEBUG_MODE else '-q'
            ],
                                    stdout=fp,
                                    stderr=subprocess.STDOUT)

            try:
                proc.wait(timeout=COMPILE_TIME_LIMIT)
            except subprocess.TimeoutExpired as e:
                log.error('Time limit exceeded.')
                return json.dumps({
                    'status': 'fail',
                    'reason': f'time limit exceeded ({e.timeout}s)',
                    'detail': status_url
                })

        if proc.returncode != 0:
            log.error(
                f'docmeld execution failed with status code {proc.returncode}',
            )
            return json.dumps({
                'status': 'fail',
                'reason': f'docmeld failed with status code {proc.returncode}',
                'returncode': proc.returncode,
                'detail': status_url
            })

        log.info(f'Copying {output} to {index}...')
        shutil.copyfile(output, index)

    os.remove(tmppath)
    record['last_build'] = head
    with open(record_file_path, 'w') as fp:
        json.dump(record, fp, sort_keys=True, indent=4)

    return json.dumps({
        'status': 'success',
        'returncode': proc.returncode,
        'output_url': index_url,
        'detail': status_url
    })
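
The handler holds the index lock for the whole rebuild, so readers never see a half-written page; a stripped-down sketch of that shape (build_site is a placeholder for the docmeld run):

import shutil
from filelock import FileLock

def publish(index_path: str, placeholder_path: str, build_site) -> None:
    """Swap in a placeholder, rebuild, then publish the result, all under one lock."""
    with FileLock(index_path + ".lock"):
        shutil.copyfile(placeholder_path, index_path)  # show a temporary "building" page
        output_path = build_site()                     # long-running build step
        shutil.copyfile(output_path, index_path)       # publish the finished page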
Example 12
def cached_download(
    url: str,
    *,
    library_name: Optional[str] = None,
    library_version: Optional[str] = None,
    cache_dir: Union[str, Path, None] = None,
    user_agent: Union[Dict, str, None] = None,
    force_download: Optional[bool] = False,
    force_filename: Optional[str] = None,
    proxies: Optional[Dict] = None,
    etag_timeout: Optional[float] = 10,
    resume_download: Optional[bool] = False,
    use_auth_token: Union[bool, str, None] = None,
    local_files_only: Optional[bool] = False,
) -> Optional[str]:  # pragma: no cover
    """
    Download from a given URL and cache it if it's not already present in the
    local cache.

    Given a URL, this function looks for the corresponding file in the local
    cache. If it's not there, download it. Then return the path to the cached
    file.

    Args:
        url (`str`):
            The path to the file to be downloaded.
        library_name (`str`, *optional*):
            The name of the library to which the object corresponds.
        library_version (`str`, *optional*):
            The version of the library.
        cache_dir (`str`, `Path`, *optional*):
            Path to the folder where cached files are stored.
        user_agent (`dict`, `str`, *optional*):
            The user-agent info in the form of a dictionary or a string.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether the file should be downloaded even if it already exists in
            the local cache.
        force_filename (`str`, *optional*):
            Use this name instead of a generated file name.
        proxies (`dict`, *optional*):
            Dictionary mapping protocol to the URL of the proxy passed to
            `requests.request`.
        etag_timeout (`float`, *optional*, defaults to `10`):
            When fetching ETag, how many seconds to wait for the server to send
            data before giving up which is passed to `requests.request`.
        resume_download (`bool`, *optional*, defaults to `False`):
            If `True`, resume a previously interrupted download.
        use_auth_token (`bool`, `str`, *optional*):
            A token to be used for the download.
                - If `True`, the token is read from the HuggingFace config
                  folder.
                - If a string, it's used as the authentication token.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, avoid downloading the file and return the path to the
            local cached file if it exists.

    Returns:
        Local path (string) of file or if networking is off, last version of
        file cached on disk.

    <Tip>

    Raises the following errors:

        - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
          if `use_auth_token=True` and the token cannot be found.
        - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError)
          if ETag cannot be determined.
        - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
          if some parameter value is invalid

    </Tip>
    """
    if cache_dir is None:
        cache_dir = HUGGINGFACE_HUB_CACHE
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    os.makedirs(cache_dir, exist_ok=True)

    headers = {
        "user-agent": http_user_agent(
            library_name=library_name,
            library_version=library_version,
            user_agent=user_agent,
        )
    }
    if isinstance(use_auth_token, str):
        headers["authorization"] = f"Bearer {use_auth_token}"
    elif use_auth_token:
        token = HfFolder.get_token()
        if token is None:
            raise EnvironmentError(
                "You specified use_auth_token=True, but a huggingface token was not"
                " found."
            )
        headers["authorization"] = f"Bearer {token}"

    url_to_download = url
    etag = None
    if not local_files_only:
        try:
            r = _request_with_retry(
                method="HEAD",
                url=url,
                headers=headers,
                allow_redirects=False,
                proxies=proxies,
                timeout=etag_timeout,
            )
            r.raise_for_status()
            etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag")
            # We favor a custom header indicating the etag of the linked resource, and
            # we fallback to the regular etag header.
            # If we don't have any of those, raise an error.
            if etag is None:
                raise OSError(
                    "Distant resource does not have an ETag, we won't be able to"
                    " reliably ensure reproducibility."
                )
            # In case of a redirect,
            # save an extra redirect on the request.get call,
            # and ensure we download the exact atomic version even if it changed
            # between the HEAD and the GET (unlikely, but hey).
            if 300 <= r.status_code <= 399:
                url_to_download = r.headers["Location"]
        except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
            # Actually raise for those subclasses of ConnectionError
            raise
        except (
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
            OfflineModeIsEnabled,
        ):
            # Otherwise, our Internet connection is down.
            # etag is None
            pass

    filename = (
        force_filename if force_filename is not None else url_to_filename(url, etag)
    )

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    # etag is None == we don't have a connection or we passed local_files_only.
    # try to get the last downloaded one
    if etag is None:
        if os.path.exists(cache_path) and not force_download:
            return cache_path
        else:
            matching_files = [
                file
                for file in fnmatch.filter(
                    os.listdir(cache_dir), filename.split(".")[0] + ".*"
                )
                if not file.endswith(".json") and not file.endswith(".lock")
            ]
            if (
                len(matching_files) > 0
                and not force_download
                and force_filename is None
            ):
                return os.path.join(cache_dir, matching_files[-1])
            else:
                # If files cannot be found and local_files_only=True,
                # the models might've been found if local_files_only=False
                # Notify the user about that
                if local_files_only:
                    raise ValueError(
                        "Cannot find the requested files in the cached path and"
                        " outgoing traffic has been disabled. To enable model look-ups"
                        " and downloads online, set 'local_files_only' to False."
                    )
                else:
                    raise ValueError(
                        "Connection error, and we cannot find the requested files in"
                        " the cached path. Please try again or make sure your Internet"
                        " connection is on."
                    )

    # From now on, etag is not None.
    if os.path.exists(cache_path) and not force_download:
        return cache_path

    # Prevent parallel downloads of the same file with a lock.
    lock_path = cache_path + ".lock"

    # Some Windows versions do not allow for paths longer than 255 characters.
    # In this case, we must specify it is an extended path by using the "\\?\" prefix.
    if os.name == "nt" and len(os.path.abspath(lock_path)) > 255:
        lock_path = "\\\\?\\" + os.path.abspath(lock_path)

    if os.name == "nt" and len(os.path.abspath(cache_path)) > 255:
        cache_path = "\\\\?\\" + os.path.abspath(cache_path)

    with FileLock(lock_path):

        # If the download just completed while the lock was activated.
        if os.path.exists(cache_path) and not force_download:
            # Even if returning early like here, the lock will be released.
            return cache_path

        if resume_download:
            incomplete_path = cache_path + ".incomplete"

            @contextmanager
            def _resumable_file_manager() -> "io.BufferedWriter":
                with open(incomplete_path, "ab") as f:
                    yield f

            temp_file_manager = _resumable_file_manager
            if os.path.exists(incomplete_path):
                resume_size = os.stat(incomplete_path).st_size
            else:
                resume_size = 0
        else:
            temp_file_manager = partial(
                tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False
            )
            resume_size = 0

        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with temp_file_manager() as temp_file:
            logger.info("downloading %s to %s", url, temp_file.name)

            http_get(
                url_to_download,
                temp_file,
                proxies=proxies,
                resume_size=resume_size,
                headers=headers,
            )

        logger.info("storing %s in cache at %s", url, cache_path)
        os.replace(temp_file.name, cache_path)

        if force_filename is None:
            logger.info("creating metadata file for %s", cache_path)
            meta = {"url": url, "etag": etag}
            meta_path = cache_path + ".json"
            with open(meta_path, "w") as meta_file:
                json.dump(meta, meta_file)

    return cache_path
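
The Windows-specific branch only prefixes over-long absolute paths with the extended-path marker before locking; a small helper capturing that detail (a sketch, not huggingface_hub API):

import os

def extended_path(path: str, limit: int = 255) -> str:
    """On Windows, prefix paths longer than `limit` with the extended-path marker."""
    abspath = os.path.abspath(path)
    if os.name == "nt" and len(abspath) > limit:
        return "\\\\?\\" + abspath
    return path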
Example 13
def download_and_unpack_package(
    pkg_uri: str,
    base_directory: str,
    logger: Optional[logging.Logger] = default_logger,
) -> str:
    """Download the package corresponding to this URI and unpack it.

    Will be written to a directory named {base_directory}/{uri}.
    """
    pkg_file = Path(_get_local_path(base_directory, pkg_uri))
    with FileLock(str(pkg_file) + ".lock"):
        if logger is None:
            logger = default_logger

        logger.debug(f"Fetching package for URI: {pkg_uri}")

        local_dir = pkg_file.with_suffix("")
        assert local_dir != pkg_file, "Invalid pkg_file!"
        if local_dir.exists():
            assert local_dir.is_dir(), f"{local_dir} is not a directory"
        else:
            protocol, pkg_name = parse_uri(pkg_uri)
            if protocol == Protocol.GCS:
                # Download package from the GCS.
                code = _internal_kv_get(pkg_uri)
                if code is None:
                    raise IOError(f"Failed to fetch URI {pkg_uri} from GCS.")
                code = code or b""
                pkg_file.write_bytes(code)
                unzip_package(package_path=pkg_file,
                              target_dir=local_dir,
                              remove_top_level_directory=False,
                              unlink_zip=True,
                              logger=logger)
            elif protocol in Protocol.remote_protocols():
                # Download package from remote URI
                tp = None

                if protocol == Protocol.S3:
                    try:
                        from smart_open import open
                        import boto3
                    except ImportError:
                        raise ImportError(
                            "You must `pip install smart_open` and "
                            "`pip install boto3` to fetch URIs in s3 "
                            "bucket.")
                    tp = {"client": boto3.client("s3")}
                elif protocol == Protocol.GS:
                    try:
                        from smart_open import open
                        from google.cloud import storage  # noqa: F401
                    except ImportError:
                        raise ImportError(
                            "You must `pip install smart_open` and "
                            "`pip install google-cloud-storage` "
                            "to fetch URIs in Google Cloud Storage bucket.")
                else:
                    try:
                        from smart_open import open
                    except ImportError:
                        raise ImportError(
                            "You must `pip install smart_open` "
                            f"to fetch {protocol.value.upper()} URIs.")

                with open(pkg_uri, "rb", transport_params=tp) as package_zip:
                    with open(pkg_file, "wb") as fin:
                        fin.write(package_zip.read())

                unzip_package(package_path=pkg_file,
                              target_dir=local_dir,
                              remove_top_level_directory=True,
                              unlink_zip=True,
                              logger=logger)
            else:
                raise NotImplementedError(
                    f"Protocol {protocol} is not supported")

        return str(local_dir)
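
The whole download-and-unpack is wrapped in one lock keyed on the local package path, so only one worker per node materializes a given URI; a sketch of the skeleton (fetch and unpack are placeholder callables):

from pathlib import Path
from filelock import FileLock

def materialize(pkg_file: Path, fetch, unpack) -> Path:
    """Ensure the package behind pkg_file is downloaded and unpacked once per machine."""
    with FileLock(str(pkg_file) + ".lock"):
        local_dir = pkg_file.with_suffix("")
        if not local_dir.exists():
            fetch(pkg_file)              # write the archive to pkg_file
            unpack(pkg_file, local_dir)  # expand it next to the archive
    return local_dir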
Example 14
    def _writeStatsFile(self):
        toUpdate = False

        for st in self._stats:
            if st[3] == SStatus.ACQUIRED:
                toUpdate = True
                break

        if not toUpdate:
            return

        exstats = self._readStatsFile()
        lock = FileLock(Teprolin.statsLockFile)
        uprec = 0
        adrec = 0

        with lock:
            print(
                "PID {0}-{1}.{2}[{3}]: updating the statistics in the file...".
                format(os.getpid(),
                       Path(inspect.stack()[0].filename).stem,
                       inspect.stack()[0].function,
                       inspect.stack()[0].lineno),
                file=sys.stderr,
                flush=True)

            with open(Teprolin.statsFile, mode="w") as f:
                for x in exstats:
                    for i in range(len(self._stats)):
                        y = self._stats[i]

                        if y[0] == x[0]:
                            if y[3] == SStatus.ACQUIRED:
                                x[1] += y[1]
                                x[2] += y[2]
                                uprec += 1
                            # end if ACQUIRED
                            self._stats.pop(i)
                            break
                        # end if same day
                    # end for i
                    d = x[0]
                    t = x[1]
                    r = x[2]
                    f.write(" ".join(str(e) for e in d))
                    f.write(" ")
                    f.write(str(t))
                    f.write(" ")
                    f.write(str(r))
                    f.write("\n")
                # end for x

                for y in self._stats:
                    if y[3] == SStatus.ACQUIRED:
                        d = y[0]
                        t = y[1]
                        r = y[2]
                        f.write(" ".join(str(e) for e in d))
                        f.write(" ")
                        f.write(str(t))
                        f.write(" ")
                        f.write(str(r))
                        f.write("\n")
                        adrec += 1
                # end for y
                print(
                    "PID {0}-{1}.{2}[{3}]: updated {4} records and added {5} records."
                    .format(os.getpid(),
                            Path(inspect.stack()[0].filename).stem,
                            inspect.stack()[0].function,
                            inspect.stack()[0].lineno, uprec, adrec),
                    file=sys.stderr,
                    flush=True)
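
Example 14 reads the existing statistics, merges the in-memory records, and rewrites the file, all while holding the lock; a compact sketch of that read-modify-write cycle (merge is a placeholder):

from filelock import FileLock

def update_stats(stats_file: str, lock_file: str, new_records, merge) -> None:
    """Read-modify-write stats_file atomically with respect to other processes."""
    with FileLock(lock_file):
        try:
            with open(stats_file) as f:
                existing = [line.split() for line in f if line.strip()]
        except FileNotFoundError:
            existing = []
        merged = merge(existing, new_records)
        with open(stats_file, "w") as f:
            for record in merged:
                f.write(" ".join(str(v) for v in record) + "\n")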