Example 1
    def _upload_fobj(self, fobj, to_info):
        self.makedirs(to_info.parent)
        # Write into a temporary sibling first; the rename succeeds or
        # fails as a whole, so readers never see a partially written file.
        tmp_info = to_info.parent / tmp_fname("")
        try:
            copy_fobj_to_file(fobj, tmp_info)
            os.rename(tmp_info, to_info)
        except Exception:
            self.remove(tmp_info)
            raise
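
Every snippet on this page routes its write through the same helper, tmp_fname from dvc.utils (several examples import it explicitly), so data only appears at the final path once it is complete. A minimal sketch of the helper, assuming a name format of the original path plus a random suffix and a .tmp extension; the actual implementation may differ between DVC versions:

    import os
    from uuid import uuid4

    def tmp_fname(fname=""):
        # Sketch only: produce a unique temporary variant of `fname`
        # that cannot collide with the final destination path.
        return os.fspath(fname) + "." + uuid4().hex + ".tmp"
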
Example 2
    def copy(self, from_info, to_info):
        tmp_info = to_info.parent / tmp_fname(to_info.name)
        try:
            System.copy(from_info, tmp_info)
            os.chmod(tmp_info, self.file_mode)
            os.rename(tmp_info, to_info)
        except Exception:
            self.remove(tmp_info)
            raise
Example 3
    def __init__(self, **config):
        from fsspec.utils import infer_storage_options

        super().__init__(**config)

        self.url = config["url"]
        opts = infer_storage_options(self.url)

        if not opts["host"]:
            raise DvcException(
                "Empty GDrive URL '{}'. Learn more at {}".format(
                    config["url"],
                    format_link("https://man.dvc.org/remote/add"),
                )
            )

        self._bucket = opts["host"]
        self._path = opts["path"].lstrip("/")
        self._trash_only = config.get("gdrive_trash_only")
        self._use_service_account = config.get("gdrive_use_service_account")
        self._service_account_user_email = config.get(
            "gdrive_service_account_user_email"
        )
        self._service_account_json_file_path = config.get(
            "gdrive_service_account_json_file_path"
        )
        self._client_id = config.get("gdrive_client_id")
        self._client_secret = config.get("gdrive_client_secret")
        self._validate_config()

        tmp_dir = config["gdrive_credentials_tmp_dir"]
        assert tmp_dir

        self._gdrive_service_credentials_path = tmp_fname(
            os.path.join(tmp_dir, "")
        )
        self._gdrive_user_credentials_path = (
            tmp_fname(os.path.join(tmp_dir, ""))
            if os.getenv(GDriveFileSystem.GDRIVE_CREDENTIALS_DATA)
            else config.get(
                "gdrive_user_credentials_file",
                os.path.join(tmp_dir, self.DEFAULT_USER_CREDENTIALS_FILE),
            )
        )
Example 4
    def download(
        self,
        from_infos,
        to_infos,
        no_progress_bar=False,
        names=None,
        resume=False,
    ):
        names = self._verify_path_args(from_infos, to_infos, names)

        s3 = self.s3

        for to_info, from_info, name in zip(to_infos, from_infos, names):
            if from_info.scheme != "s3":
                raise NotImplementedError

            if to_info.scheme == "s3":
                self.copy(from_info, to_info, s3=s3)
                continue

            if to_info.scheme != "local":
                raise NotImplementedError

            msg = "Downloading '{}/{}' to '{}'".format(from_info.bucket,
                                                       from_info.path,
                                                       to_info.path)
            logger.debug(msg)

            tmp_file = tmp_fname(to_info.path)
            if not name:
                name = os.path.basename(to_info.path)

            makedirs(os.path.dirname(to_info.path), exist_ok=True)

            try:
                if no_progress_bar:
                    cb = None
                else:
                    total = s3.head_object(Bucket=from_info.bucket,
                                           Key=from_info.path)["ContentLength"]
                    cb = Callback(name, total)

                s3.download_file(from_info.bucket,
                                 from_info.path,
                                 tmp_file,
                                 Callback=cb)
            except Exception:
                msg = "failed to download '{}/{}'".format(
                    from_info.bucket, from_info.path)
                logger.exception(msg)
                continue

            move(tmp_file, to_info.path)

            if not no_progress_bar:
                progress.finish_target(name)
Example 5
    def download(
        self,
        from_infos,
        to_infos,
        no_progress_bar=False,
        names=None,
        resume=False,
    ):
        names = self._verify_path_args(from_infos, to_infos, names)

        gs = self.gs

        for to_info, from_info, name in zip(to_infos, from_infos, names):
            if from_info["scheme"] != "gs":
                raise NotImplementedError

            if to_info["scheme"] == "gs":
                self.copy(from_info, to_info, gs=gs)
                continue

            if to_info["scheme"] != "local":
                raise NotImplementedError

            msg = "Downloading '{}/{}' to '{}'".format(
                from_info["bucket"], from_info["path"], to_info["path"]
            )
            logger.debug(msg)

            tmp_file = tmp_fname(to_info["path"])
            if not name:
                name = os.path.basename(to_info["path"])

            if not no_progress_bar:
                # percent_cb is not available for download_to_filename, so
                # let's at least update progress at the start and finish
                progress.update_target(name, 0, None)

            makedirs(os.path.dirname(to_info["path"]), exist_ok=True)

            try:
                bucket = gs.bucket(from_info["bucket"])
                blob = bucket.get_blob(from_info["path"])
                blob.download_to_filename(tmp_file)
            except Exception:
                msg = "failed to download '{}/{}' to '{}'"
                logger.exception(
                    msg.format(
                        from_info["bucket"], from_info["path"], to_info["path"]
                    )
                )
                continue

            move(tmp_file, to_info["path"])

            if not no_progress_bar:
                progress.finish_target(name)
Example 6
    def copy_fobj(self, fobj, to_info):
        self.makedirs(to_info.parent)
        tmp_info = to_info.parent / tmp_fname(to_info.name)
        try:
            copy_fobj_to_file(fobj, tmp_info)
            os.chmod(tmp_info, self.file_mode)
            os.rename(tmp_info, to_info)
        except Exception:
            self.remove(tmp_info)
            raise
Example 7
File: local.py Project: nik123/dvc
    def put_file(self,
                 from_file,
                 to_info,
                 callback=DEFAULT_CALLBACK,
                 **kwargs):
        parent = self.path.parent(to_info)
        makedirs(parent, exist_ok=True)
        tmp_file = self.path.join(parent, tmp_fname())
        copyfile(from_file, tmp_file, callback=callback)
        # os.replace overwrites an existing destination atomically,
        # even on Windows, where os.rename would raise instead.
        os.replace(tmp_file, to_info)
Example 8
    def _upload(
        self, from_file, to_info, name=None, no_progress_bar=False, **_kwargs
    ):
        makedirs(to_info.parent, exist_ok=True)

        tmp_file = tmp_fname(to_info)
        copyfile(
            from_file, tmp_file, name=name, no_progress_bar=no_progress_bar
        )
        os.rename(tmp_file, fspath_py35(to_info))
Example 9
    def upload(self, src, dest, no_progress_bar=False, progress_title=None):
        self.makedirs(posixpath.dirname(dest))
        tmp_file = tmp_fname(dest)
        if not progress_title:
            progress_title = posixpath.basename(dest)

        with Tqdm(desc=progress_title, disable=no_progress_bar,
                  bytes=True) as pbar:
            self.sftp.put(src, tmp_file, callback=pbar.update_to)

        self.sftp.rename(tmp_file, dest)
Example 10
    def client(self):
        import pydata_google_auth
        from gdrivefs import GoogleDriveFileSystem

        tmp_path = tmp_fname()
        with open(tmp_path, "w") as stream:
            stream.write(os.getenv(GDriveFileSystem.GDRIVE_CREDENTIALS_DATA))

        GoogleDriveFileSystem._connect_cache = partial(
            pydata_google_auth.load_user_credentials, tmp_path)
        return GoogleDriveFileSystem(token="cache")
Example 11
    def _get_dir_info_checksum(self, dir_info):
        tmp = tempfile.NamedTemporaryFile(delete=False).name
        with open(tmp, "w+") as fobj:
            json.dump(dir_info, fobj, sort_keys=True)

        from_info = PathInfo(tmp)
        to_info = self.cache.path_info / tmp_fname("")
        self.cache.upload(from_info, to_info, no_progress_bar=True)

        checksum = self.get_file_checksum(to_info) + self.CHECKSUM_DIR_SUFFIX
        return checksum, to_info
Example 12
    def __init__(self, **config):
        super().__init__(**config)

        self.path_info = self.PATH_CLS(config["url"])

        if not self.path_info.bucket:
            raise DvcException(
                "Empty GDrive URL '{}'. Learn more at {}".format(
                    config["url"],
                    format_link("https://man.dvc.org/remote/add"),
                )
            )

        self._bucket = self.path_info.bucket
        self._path = self.path_info.path
        self._trash_only = config.get("gdrive_trash_only")
        self._use_service_account = config.get("gdrive_use_service_account")
        self._service_account_user_email = config.get(
            "gdrive_service_account_user_email"
        )
        self._service_account_json_file_path = config.get(
            "gdrive_service_account_json_file_path"
        )
        self._client_id = config.get("gdrive_client_id")
        self._client_secret = config.get("gdrive_client_secret")
        self._validate_config()

        tmp_dir = config["gdrive_credentials_tmp_dir"]
        assert tmp_dir

        self._gdrive_service_credentials_path = tmp_fname(
            os.path.join(tmp_dir, "")
        )
        self._gdrive_user_credentials_path = (
            tmp_fname(os.path.join(tmp_dir, ""))
            if os.getenv(GDriveFileSystem.GDRIVE_CREDENTIALS_DATA)
            else config.get(
                "gdrive_user_credentials_file",
                os.path.join(tmp_dir, self.DEFAULT_USER_CREDENTIALS_FILE),
            )
        )
Example 13
def _upload_file(path_info, fs, odb, upload_odb):
    from dvc.utils import tmp_fname
    from dvc.utils.stream import HashedStreamReader

    tmp_info = upload_odb.path_info / tmp_fname()
    with fs.open(path_info, mode="rb", chunk_size=fs.CHUNK_SIZE) as stream:
        stream = HashedStreamReader(stream)
        size = fs.getsize(path_info)
        upload_odb.fs.upload(stream, tmp_info, desc=path_info.name, total=size)

    odb.add(tmp_info, upload_odb.fs, stream.hash_info)
    return path_info, odb.get(stream.hash_info)
Example 14
    def copy(self, from_info, to_info, **_kwargs):
        with self.hdfs(to_info) as hdfs:
            # NOTE: this is how `hadoop fs -cp` works too: it copies through
            # your local machine.
            with hdfs.open(from_info.path, "rb") as from_fobj:
                tmp_info = to_info.parent / tmp_fname(to_info.name)
                try:
                    with hdfs.open(tmp_info.path, "wb") as tmp_fobj:
                        tmp_fobj.upload(from_fobj)
                    hdfs.rename(tmp_info.path, to_info.path)
                except Exception:
                    self.remove(tmp_info)
                    raise
Example 15
File: hdfs.py Project: ye-man/dvc
    def _upload(self, from_file, to_info, **_kwargs):
        self.hadoop_fs("mkdir -p {}".format(to_info.parent.url),
                       user=to_info.user)

        tmp_file = tmp_fname(to_info.url)

        self.hadoop_fs(
            "copyFromLocal {} {}".format(from_file, tmp_file),
            user=to_info.user,
        )

        self.hadoop_fs("mv {} {}".format(tmp_file, to_info.url),
                       user=to_info.user)
Example 16
    def _download_file(self, from_info, to_info, name, no_progress_bar):
        makedirs(to_info.parent, exist_ok=True)

        logger.debug("Downloading '%s' to '%s'", from_info, to_info)
        name = name or to_info.name

        tmp_file = tmp_fname(to_info)

        self._download(  # noqa, pylint: disable=no-member
            from_info, tmp_file, name=name, no_progress_bar=no_progress_bar
        )

        move(tmp_file, to_info)
Example 17
    def upload(self, src, dest, no_progress_bar=False, progress_title=None):
        self.makedirs(posixpath.dirname(dest))
        tmp_file = tmp_fname(dest)

        if no_progress_bar:
            self.sftp.put(src, tmp_file)
        else:
            if not progress_title:
                progress_title = posixpath.basename(dest)

            self.sftp.put(src, tmp_file, callback=create_cb(progress_title))
            progress.finish_target(progress_title)

        self.sftp.rename(tmp_file, dest)
Example 18
def _upload_file(path_info, fs, odb):
    from dvc.utils import tmp_fname
    from dvc.utils.stream import HashedStreamReader

    tmp_info = odb.path_info / tmp_fname()
    with fs.open(path_info, mode="rb", chunk_size=fs.CHUNK_SIZE) as stream:
        stream = HashedStreamReader(stream)
        odb.fs.upload_fobj(stream,
                           tmp_info,
                           desc=path_info.name,
                           size=fs.getsize(path_info))

    obj = HashFile(tmp_info, odb.fs, stream.hash_info)
    return path_info, obj
Example 19
File: base.py Project: kss682/dvc
    def download(
        self,
        from_info,
        to_info,
        name=None,
        no_progress_bar=False,
        file_mode=None,
        dir_mode=None,
    ):
        if not hasattr(self, "_download"):
            raise RemoteActionNotImplemented("download", self.scheme)

        if from_info.scheme != self.scheme:
            raise NotImplementedError

        if to_info.scheme == self.scheme != "local":
            self.copy(from_info, to_info)
            return 0

        if to_info.scheme != "local":
            raise NotImplementedError

        logger.debug("Downloading '{}' to '{}'".format(from_info, to_info))

        name = name or to_info.name

        if not no_progress_bar:
            # real progress is not always available,
            # let's at least show start and finish
            progress.update_target(name, 0, None)

        makedirs(to_info.parent, exist_ok=True, mode=dir_mode)
        tmp_file = tmp_fname(to_info)

        try:
            self._download(from_info,
                           tmp_file,
                           name=name,
                           no_progress_bar=no_progress_bar)
        except Exception:
            msg = "failed to download '{}' to '{}'"
            logger.exception(msg.format(from_info, to_info))
            return 1  # 1 fail

        move(tmp_file, to_info, mode=file_mode)

        if not no_progress_bar:
            progress.finish_target(name)

        return 0
Example 20
    def init_drive(self):
        self.client_id = self.config.get(Config.SECTION_GDRIVE_CLIENT_ID, None)
        self.client_secret = self.config.get(
            Config.SECTION_GDRIVE_CLIENT_SECRET, None)
        if not self.client_id or not self.client_secret:
            raise DvcException("Please specify Google Drive's client id and "
                               "secret in DVC's config. Learn more at "
                               "https://man.dvc.org/remote/add.")
        self.gdrive_user_credentials_path = (
            tmp_fname(".dvc/tmp/")
            if os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA) else
            self.config.get(
                Config.SECTION_GDRIVE_USER_CREDENTIALS_FILE,
                self.DEFAULT_USER_CREDENTIALS_FILE,
            ))
Example 21
    def copy(self, from_info, to_info, **_kwargs):
        # NOTE: hdfs.copy_file is not supported yet in pyarrow
        with self.hdfs(to_info) as hdfs:
            # NOTE: this is how `hadoop fs -cp` works too: it copies through
            # your local machine.
            with closing(hdfs.open_input_stream(from_info.path)) as from_fobj:
                tmp_info = to_info.parent / tmp_fname(to_info.name)
                try:
                    with closing(hdfs.open_output_stream(
                            tmp_info.path)) as tmp_fobj:
                        shutil.copyfileobj(from_fobj, tmp_fobj)
                    hdfs.move(tmp_info.path, to_info.path)
                except Exception:
                    self.remove(tmp_info)
                    raise
Example 22
    def _get_dir_info_hash(self, dir_info):
        # Sorting the list by path to ensure reproducibility
        dir_info = sorted(dir_info, key=itemgetter(self.PARAM_RELPATH))

        tmp = tempfile.NamedTemporaryFile(delete=False).name
        with open(tmp, "w+") as fobj:
            json.dump(dir_info, fobj, sort_keys=True)

        tree = self.cache.tree
        from_info = PathInfo(tmp)
        to_info = tree.path_info / tmp_fname("")
        tree.upload(from_info, to_info, no_progress_bar=True)

        typ, hash_ = tree.get_file_hash(to_info)
        return typ, hash_ + self.CHECKSUM_DIR_SUFFIX, to_info
Example 23
    def _download_file(self, from_info, to_info, name, no_progress_bar,
                       file_mode, dir_mode):
        makedirs(to_info.parent, exist_ok=True, mode=dir_mode)

        logger.debug("Downloading '%s' to '%s'", from_info, to_info)
        name = name or to_info.name

        tmp_file = tmp_fname(to_info)

        self._download(from_info,
                       tmp_file,
                       name=name,
                       no_progress_bar=no_progress_bar)

        move(tmp_file, to_info, mode=file_mode)
Example 24
    def _get_dir_info_checksum(self, dir_info, path_info):
        to_info = copy(path_info)
        to_info.path = self.cache.ospath.join(self.cache.prefix, tmp_fname(""))

        tmp = tempfile.NamedTemporaryFile(delete=False).name
        with open(tmp, "w+") as fobj:
            json.dump(dir_info, fobj, sort_keys=True)

        from_info = PathLOCAL(path=tmp)
        self.cache.upload([from_info], [to_info], no_progress_bar=True)

        checksum = self.get_file_checksum(to_info) + self.CHECKSUM_DIR_SUFFIX
        from_info = copy(to_info)
        to_info.path = self.cache.checksum_to_path(checksum)
        return checksum, from_info, to_info
Example 25
    def digest(self):
        from dvc.fs.memory import MemoryFileSystem
        from dvc.path_info import PathInfo
        from dvc.utils import tmp_fname

        memfs = MemoryFileSystem()
        path_info = PathInfo(tmp_fname(""))
        with memfs.open(path_info, "wb") as fobj:
            fobj.write(self.as_bytes())
        self.fs = memfs
        self.path_info = path_info
        self.hash_info = get_file_hash(path_info, memfs, "md5")
        self.hash_info.value += ".dir"
        self.hash_info.size = self.size
        self.hash_info.nfiles = len(self)
Example 26
def as_atomic(fs, to_info):
    from dvc.utils import tmp_fname

    tmp_info = fs.path.join(fs.path.parent(to_info), tmp_fname())
    try:
        yield tmp_info
    except BaseException:
        # Handle stuff like KeyboardInterrupt
        # as well as other errors that might
        # arise during file transfer.
        with suppress(FileNotFoundError):
            fs.remove(tmp_info)
        raise
    else:
        fs.move(tmp_info, to_info)
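
Example 26 distills the pattern that the other snippets spell out by hand. Its yield-based body suggests it is meant to be used via contextlib.contextmanager; a hypothetical usage sketch (the decorator, the fs object, and the paths are assumptions, not taken from the source above):

    # Hypothetical usage; assumes as_atomic is decorated with
    # @contextlib.contextmanager, as its yield-based body suggests.
    with as_atomic(fs, "data/model.bin") as tmp_path:
        fs.upload("model.bin", tmp_path)
    # On success the temporary file is moved to "data/model.bin"; on any
    # failure (even KeyboardInterrupt) the partial file is cleaned up.
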
Example 27
File: tree.py Project: skshetry/dvc
    def digest(self, hash_info: Optional["HashInfo"] = None):
        from dvc.fs.memory import MemoryFileSystem
        from dvc.utils import tmp_fname

        memfs = MemoryFileSystem()
        fs_path = "memory://{}".format(tmp_fname(""))
        with memfs.open(fs_path, "wb") as fobj:
            fobj.write(self.as_bytes())
        self.fs = memfs
        self.fs_path = fs_path
        if hash_info:
            self.hash_info = hash_info
        else:
            _, self.hash_info = get_file_hash(fs_path, memfs, "md5")
            assert self.hash_info.value
            self.hash_info.value += ".dir"
Example 28
def _upload_file(from_fs_path, fs, odb, upload_odb):
    from dvc.utils import tmp_fname
    from dvc.utils.stream import HashedStreamReader

    fs_path = upload_odb.fs.path
    tmp_info = fs_path.join(upload_odb.fs_path, tmp_fname())
    with fs.open(from_fs_path, mode="rb", chunk_size=fs.CHUNK_SIZE) as stream:
        stream = HashedStreamReader(stream)
        size = fs.getsize(from_fs_path)
        upload_odb.fs.upload(
            stream, tmp_info, desc=fs_path.name(from_fs_path), total=size
        )

    odb.add(tmp_info, upload_odb.fs, stream.hash_info)
    meta = Meta(size=size)
    return from_fs_path, meta, odb.get(stream.hash_info)
Example 29
    def download(
        self,
        from_infos,
        to_infos,
        no_progress_bar=False,
        names=None,
        resume=False,
    ):
        names = self._verify_path_args(from_infos, to_infos, names)

        for to_info, from_info, name in zip(to_infos, from_infos, names):
            if from_info["scheme"] != self.scheme:
                raise NotImplementedError

            if to_info["scheme"] != "local":
                raise NotImplementedError

            bucket = from_info["bucket"]
            path = from_info["path"]

            logger.debug(
                "Downloading '{}/{}' to '{}'".format(
                    bucket, path, to_info["path"]
                )
            )

            tmp_file = tmp_fname(to_info["path"])
            if not name:
                name = os.path.basename(to_info["path"])

            cb = None if no_progress_bar else Callback(name)

            makedirs(os.path.dirname(to_info["path"]), exist_ok=True)

            try:
                self.blob_service.get_blob_to_path(
                    bucket, path, tmp_file, progress_callback=cb
                )
            except Exception:
                msg = "failed to download '{}/{}'".format(bucket, path)
                logger.warning(msg)
            else:
                move(tmp_file, to_info["path"])

                if not no_progress_bar:
                    progress.finish_target(name)
Example 30
    def init_drive(self):
        self.client_id = self.config.get(Config.SECTION_GDRIVE_CLIENT_ID, None)
        self.client_secret = self.config.get(
            Config.SECTION_GDRIVE_CLIENT_SECRET, None)
        if not self.client_id or not self.client_secret:
            raise DvcException(
                "Please specify Google Drive's client id and "
                "secret in DVC's config. Learn more at "
                "{}.".format(format_link("https://man.dvc.org/remote/add")))
        self.gdrive_user_credentials_path = (
            tmp_fname(os.path.join(self.repo.tmp_dir, ""))
            if os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA) else
            self.config.get(
                Config.SECTION_GDRIVE_USER_CREDENTIALS_FILE,
                os.path.join(self.repo.tmp_dir,
                             self.DEFAULT_USER_CREDENTIALS_FILE),
            ))