Example #1
0
    def checkout(self, path_info, checksum_info):
        # Materialize the file described by checksum_info at path_info by
        # copying it out of the remote ssh cache. No-op when the data is
        # already up to date or when the cache entry is unavailable.
        if path_info['scheme'] != 'ssh':
            raise NotImplementedError

        md5 = checksum_info.get(self.PARAM_MD5, None)
        if not md5:
            # No known checksum for this output; nothing to check out.
            return

        if not self.changed(path_info, checksum_info):
            msg = "Data '{}' didn't change."
            logger.info(msg.format(self.to_string(path_info)))
            return

        if self.changed_cache(md5):
            # Cache entry is missing or corrupted; warn instead of failing.
            msg = "Cache '{}' not found. File '{}' won't be created."
            logger.warn(msg.format(md5, self.to_string(path_info)))
            return

        if self.exists([path_info])[0]:
            msg = "Data '{}' exists. Removing before checkout."
            logger.warn(msg.format(self.to_string(path_info)))
            self.remove(path_info)
            # NOTE(review): returns right after removing, without copying the
            # cached file back into place -- confirm this early return is
            # intended and not a missed fall-through to the cp() below.
            return

        msg = "Checking out '{}' with cache '{}'."
        logger.info(msg.format(self.to_string(path_info), md5))

        # Cache layout: <prefix>/<first two md5 chars>/<remaining md5 chars>.
        src = path_info.copy()
        src['path'] = posixpath.join(self.prefix, md5[0:2], md5[2:])

        self.cp(src, path_info)
Example #2
0
    def status(self, checksum_infos, remote, jobs=None, show_checksums=False):
        """Return a list of (name, status) pairs describing, for each
        checksum, whether it is present locally and/or on the remote."""
        logger.info("Preparing to pull data from {}".format(remote.url))
        title = "Collecting information"

        progress.set_n_total(1)
        progress.update_target(title, 0, 100)

        progress.update_target(title, 10, 100)

        md5s, names = self._group(checksum_infos,
                                  show_checksums=show_checksums)

        progress.update_target(title, 20, 100)

        path_infos = remote.md5s_to_path_infos(md5s)

        progress.update_target(title, 30, 100)

        on_remote = remote.exists(path_infos)

        progress.update_target(title, 90, 100)

        # A checksum exists locally iff its cache file is intact.
        on_local = [not self.changed_cache_file(checksum) for checksum in md5s]

        progress.finish_target(title)

        ret = []
        for name, local, rem in zip(names, on_local, on_remote):
            ret.append((name, STATUS_MAP[local, rem]))
        return ret
Example #3
0
    def _save_dir(self, path_info):
        # Save every file of a directory into the cache and replace each
        # workspace file with a link back to its cached copy.
        path = path_info['path']
        md5, dir_info = self.state.update_info(path)
        dir_relpath = os.path.relpath(path)
        dir_size = len(dir_info)
        # Only show a progress bar for directories big enough to be slow.
        bar = dir_size > LARGE_DIR_SIZE

        logger.info("Linking directory '{}'.".format(dir_relpath))

        for processed, entry in enumerate(dir_info):
            relpath = entry[self.PARAM_RELPATH]
            m = entry[self.PARAM_MD5]
            p = os.path.join(path, relpath)
            c = self.get(m)

            if self.changed_cache(m):
                # Not in the cache yet: move the workspace file there.
                self._move(p, c)
            else:
                # Already cached: drop the workspace copy before linking.
                remove(p)

            self.link(c, p)

            if bar:
                progress.update_target(dir_relpath, processed, dir_size)

        self.state.update_link(path)

        if bar:
            progress.finish_target(dir_relpath)

        return {self.PARAM_MD5: md5}
Example #4
0
    def checkout(self, path_info, checksum_info):
        """Check out an s3 output from the remote cache, if needed."""
        if path_info['scheme'] != 's3':
            raise NotImplementedError

        etag = checksum_info.get(self.PARAM_ETAG, None)
        if not etag:
            return

        if not self.changed(path_info, checksum_info):
            logger.info(
                "Data '{}' didn't change.".format(self.to_string(path_info)))
            return

        if self.changed_cache(etag):
            logger.warn(
                "Cache '{}' not found. File '{}' won't be created.".format(
                    etag, self.to_string(path_info)))
            return

        if self.exists([path_info])[0]:
            logger.warn("Data '{}' exists. Removing before checkout.".format(
                self.to_string(path_info)))
            self.remove(path_info)
            return

        logger.info("Checking out '{}' with cache '{}'.".format(
            self.to_string(path_info), etag))

        # Cache objects live under <prefix>/<etag[:2]>/<etag[2:]>.
        cache_key = posixpath.join(self.prefix, etag[0:2], etag[2:])
        from_info = {'scheme': 's3', 'bucket': self.bucket, 'key': cache_key}

        self._copy(from_info, path_info)
Example #5
0
    def checkout(self, path_info, checksum_info):
        """Check out an hdfs output from the remote cache, if needed."""
        if path_info['scheme'] != 'hdfs':
            raise NotImplementedError

        assert path_info.get('url')

        checksum = checksum_info.get(self.PARAM_CHECKSUM, None)
        if not checksum:
            return

        if not self.changed(path_info, checksum_info):
            logger.info(
                "Data '{}' didn't change.".format(self.to_string(path_info)))
            return

        if self.changed_cache(checksum):
            logger.warn(
                "Cache '{}' not found. File '{}' won't be created.".format(
                    checksum, self.to_string(path_info)))
            return

        if self.exists([path_info])[0]:
            logger.warn("Data '{}' exists. Removing before checkout.".format(
                self.to_string(path_info)))
            self.remove(path_info)
            return

        logger.info("Checking out '{}' with cache '{}'.".format(
            self.to_string(path_info), checksum))

        # Cache objects live under <url>/<checksum[:2]>/<checksum[2:]>.
        source = path_info.copy()
        source['url'] = posixpath.join(self.url, checksum[0:2], checksum[2:])

        self.cp(source, path_info)
Example #6
0
    def loads(project=None,
              cmd=None,
              deps=None,
              outs=None,
              outs_no_cache=None,
              metrics_no_cache=None,
              fname=None,
              cwd=os.curdir,
              locked=False,
              add=False,
              overwrite=True,
              ignore_build_cache=False,
              remove_outs=False):
        """Construct a Stage from a command plus deps/outs descriptions.

        Returns:
            The configured Stage, or None when an up-to-date cached stage
            file already exists (and the build cache is not ignored).

        Raises:
            StageFileBadNameError: If fname contains subdirectories.
            StageFileAlreadyExistsError: If the stage file already exists
                and the user declines to overwrite it.
        """
        # NOTE: the list parameters previously used mutable [] defaults,
        # which are shared between calls and can be mutated accidentally;
        # default to None and normalize here instead.
        deps = [] if deps is None else deps
        outs = [] if outs is None else outs
        outs_no_cache = [] if outs_no_cache is None else outs_no_cache
        metrics_no_cache = [] if metrics_no_cache is None else metrics_no_cache

        stage = Stage(project=project,
                      cwd=cwd,
                      cmd=cmd,
                      locked=locked)

        stage.outs = output.loads_from(stage, outs, use_cache=True)
        stage.outs += output.loads_from(stage, outs_no_cache, use_cache=False)
        stage.outs += output.loads_from(stage, metrics_no_cache,
                                        use_cache=False, metric=True)
        stage.deps = dependency.loads_from(stage, deps)

        if fname is not None and os.path.basename(fname) != fname:
            msg = "Stage file name '{}' should not contain subdirectories. " \
                  "Use '-c|--cwd' to change location of the stage file."
            raise StageFileBadNameError(msg.format(fname))

        fname, cwd = Stage._stage_fname_cwd(fname, cwd, stage.outs, add=add)

        Stage._check_inside_project(project, cwd)

        cwd = os.path.abspath(cwd)
        path = os.path.join(cwd, fname)

        stage.cwd = cwd
        stage.path = path

        # NOTE: remove outs before we check build cache
        if remove_outs:
            stage.remove_outs(ignore_remove=False)
            project.logger.warn("Build cache is ignored when using "
                                "--remove-outs.")
            ignore_build_cache = True
        else:
            stage.unprotect_outs()

        if os.path.exists(path):
            if not ignore_build_cache and stage.is_cached():
                logger.info('Stage is cached, skipping.')
                return None

            msg = "'{}' already exists. Do you wish to run the command and " \
                  "overwrite it?".format(stage.relpath)
            if not overwrite and not project.prompt.prompt(msg, False):
                raise StageFileAlreadyExistsError(stage.relpath)

        return stage
Example #7
0
 def test_stdout(self, mock_stdout, mock_stderr):
     """Non-error messages must go to stdout only, never to stderr."""
     log = Logger(force=True)
     message = 'non-error message'
     log.info(message)
     self.assertEqual('', mock_stderr.getvalue())
     self.assertEqual('{}\n'.format(message), mock_stdout.getvalue())
Example #8
0
    def init(root_dir=os.curdir, no_scm=False, force=False):
        """
        Initiate dvc project in directory.

        Args:
            root_dir: Path to project's root directory.
            no_scm: Allow initialization even when root_dir is not tracked
                by a supported scm tool (e.g. git).
            force: Re-initialize even if a '.dvc' directory already exists.

        Returns:
            Project instance.

        Raises:
            InitError: If root_dir is not tracked by a supported scm tool
                (and no_scm is not set), or if the dvc directory already
                exists (and force is not set).
        """
        import colorama
        import shutil
        from dvc.scm import SCM, Base
        from dvc.config import Config
        from dvc.logger import logger

        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)
        scm = SCM(root_dir)
        # NOTE: exact type check on purpose -- a plain Base instance means
        # "no scm detected", while subclasses are real scm implementations.
        if type(scm) == Base and not no_scm:
            msg = "{} is not tracked by any supported scm tool(e.g. git)."
            raise InitError(msg.format(root_dir))

        if os.path.isdir(dvc_dir):
            if not force:
                msg = "'{}' exists. Use '-f' to force."
                raise InitError(msg.format(os.path.relpath(dvc_dir)))
            shutil.rmtree(dvc_dir)

        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        proj = Project(root_dir)

        # Stage the new config (and .gitignore, when the scm uses one) so
        # the user only has to commit.
        scm.add([config.config_file])
        if scm.ignore_file():
            scm.add([os.path.join(dvc_dir, scm.ignore_file())])

        logger.info('\nYou can now commit the changes to git.')

        logger.info(
            "\n"
            "{yellow}What's next?{nc}\n"
            "{yellow}------------{nc}\n"
            "- Check out the documentation: {blue}https://dvc.org/doc{nc}\n"
            "- Get help and share ideas: {blue}https://dvc.org/chat{nc}\n"
            "- Star us on GitHub: {blue}https://github.com/iterative/dvc{nc}"

            .format(yellow=colorama.Fore.YELLOW,
                    blue=colorama.Fore.BLUE,
                    nc=colorama.Fore.RESET)
        )

        return proj
Example #9
0
    def show(self, config, section, opt):
        """Print the value of a single config option, validating that both
        the section and the option exist."""
        if section not in config:
            raise ConfigError("Section '{}' doesn't exist".format(section))

        if opt not in config[section]:
            raise ConfigError(
                "Option '{}.{}' doesn't exist".format(section, opt))

        logger.info(config[section][opt])
Example #10
0
 def run(self):
     """List every configured remote as 'name<TAB>url'; always returns 0."""
     for section in self.configobj.keys():
         match = re.match(Config.SECTION_REMOTE_REGEX, section)
         if not match:
             continue
         name = match.group('name')
         url = self.configobj[section].get(Config.SECTION_REMOTE_URL, '')
         logger.info('{}\t{}'.format(name, url))
     return 0
Example #11
0
    def run(self):
        """Register a remote's url and, optionally, make it the default.

        Returns the exit code from the underlying config writes.
        """
        section = Config.SECTION_REMOTE_FMT.format(self.args.name)
        ret = self._set(section, Config.SECTION_REMOTE_URL, self.args.url)
        if ret != 0:
            return ret

        if self.args.default:
            logger.info(
                "Setting '{}' as a default remote.".format(self.args.name))
            ret = self._set(Config.SECTION_CORE,
                            Config.SECTION_CORE_REMOTE,
                            self.args.name)

        return ret
Example #12
0
    def dump(self, fname=None):
        """Serialize this stage to a dvcfile (defaults to self.path) and
        queue the file for 'git add'."""
        fname = fname or self.path

        self._check_dvc_filename(fname)

        relpath = os.path.relpath(fname)
        logger.info("Saving information to '{}'.".format(relpath))

        with open(fname, 'w') as fd:
            yaml.safe_dump(self.dumpd(), fd, default_flow_style=False)

        self.project._files_to_git_add.append(relpath)
Example #13
0
    def save(self, path_info):
        """Save a local file or directory into the cache; dispatches to the
        directory or single-file implementation."""
        if path_info['scheme'] != 'local':
            raise NotImplementedError

        path = path_info['path']

        logger.info("Saving '{}' to cache '{}'.".format(
            os.path.relpath(path), os.path.relpath(self.cache_dir)))

        if os.path.isdir(path):
            return self._save_dir(path_info)
        return self._save_file(path_info)
Example #14
0
    def collect_dir_cache(self, dname):
        """Compute checksums for every file under dname and return
        (directory md5, list of per-file {relpath, md5} entries)."""
        dir_info = []

        for root, dirs, files in os.walk(dname):
            bar = False

            # For big directories show a progress bar, since hashing every
            # file may take a while.
            if len(files) > LARGE_DIR_SIZE:
                msg = "Computing md5 for a large directory {}. " \
                      "This is only done once."
                logger.info(msg.format(os.path.relpath(root)))
                bar = True
                title = os.path.relpath(root)
                processed = 0
                total = len(files)
                progress.update_target(title, 0, total)

            for fname in files:
                path = os.path.join(root, fname)
                # Paths are stored relative to dname and in unix form so
                # the resulting directory md5 is platform-independent.
                relpath = self.unixpath(os.path.relpath(path, dname))

                if bar:
                    progress.update_target(title, processed, total)
                    processed += 1

                md5 = self.state.update(path)
                dir_info.append({
                    self.PARAM_RELPATH: relpath,
                    self.PARAM_MD5: md5
                })

            if bar:
                progress.finish_target(title)

        # NOTE: sorting the list by path to ensure reproducibility
        dir_info = sorted(dir_info, key=itemgetter(self.PARAM_RELPATH))

        # The directory's own checksum is derived from the sorted entry
        # list; the suffix marks it as a directory cache entry.
        md5 = dict_md5(dir_info) + self.MD5_DIR_SUFFIX
        if self.changed_cache(md5):
            self.dump_dir_cache(md5, dir_info)

        return (md5, dir_info)
Example #15
0
def file_md5(fname):
    """Return (md5 hexdigest, md5 digest) of *fname*, or (None, None) when
    the file does not exist."""
    if not os.path.exists(fname):
        return (None, None)

    hash_md5 = hashlib.md5()
    binary = not istextfile(fname)
    size = os.path.getsize(fname)
    bar = False
    if size >= LARGE_FILE_SIZE:
        # Hashing big files takes a while; show a progress bar.
        bar = True
        msg = "Computing md5 for a large file {}. This is only done once."
        logger.info(msg.format(os.path.relpath(fname)))
        name = os.path.relpath(fname)
        total = 0

    with open(fname, 'rb') as fobj:
        while True:
            data = fobj.read(LOCAL_CHUNK_SIZE)
            if not data:
                break

            if bar:
                total += len(data)
                progress.update_target(name, total, size)

            # Text files are hashed with normalized (unix) line endings so
            # the checksum is stable across platforms.
            chunk = data if binary else dos2unix(data)

            hash_md5.update(chunk)

    if bar:
        progress.finish_target(name)

    return (hash_md5.hexdigest(), hash_md5.digest())
Example #16
0
    def ignore(self, path):
        """Append an ignore entry for *path* to the relevant gitignore file,
        unless an identical entry is already present, and queue the
        gitignore for 'git add'."""
        entry, gitignore = self._get_gitignore(path)

        ignore_list = []
        if os.path.exists(gitignore):
            # NOTE: use a context manager so the file handle is closed;
            # the previous bare open() leaked it.
            with open(gitignore, 'r') as fobj:
                ignore_list = fobj.readlines()
            if any(line.strip() == entry.strip() for line in ignore_list):
                # Entry already present; nothing to do.
                return

        msg = "Adding '{}' to '{}'.".format(os.path.relpath(path),
                                            os.path.relpath(gitignore))
        logger.info(msg)

        content = entry
        if ignore_list:
            # Start the new entry on its own line after existing content.
            content = '\n' + content

        with open(gitignore, 'a') as fd:
            fd.write(content)

        if self.project is not None:
            self.project._files_to_git_add.append(os.path.relpath(gitignore))
Example #17
0
 def _check_dvc_file(fname):
     """Suggest the corresponding stage file when the user appears to have
     named the data file instead."""
     candidate = fname + Stage.STAGE_FILE_SUFFIX
     if Stage.is_stage_file(candidate):
         logger.info("Do you mean '{}'?".format(candidate))
Example #18
0
    def checkout(self, path_info, checksum_info, force=False):
        """Check out a file or directory from the local cache into the
        workspace, linking to the cache instead of copying."""
        path = path_info['path']
        md5 = checksum_info.get(self.PARAM_MD5)
        cache = self.get(md5)

        if not cache:
            msg = 'No cache info for \'{}\'. Skipping checkout.'
            logger.warn(msg.format(os.path.relpath(path)))
            return

        if not self.changed(path_info, checksum_info):
            # Workspace copy already matches the cache; nothing to do.
            msg = "Data '{}' didn't change."
            logger.info(msg.format(os.path.relpath(path)))
            return

        if self.changed_cache(md5):
            # Cache entry is missing or corrupted; remove the stale
            # workspace copy rather than leave outdated data behind.
            msg = u'Cache \'{}\' not found. File \'{}\' won\'t be created.'
            logger.warn(msg.format(md5, os.path.relpath(path)))
            remove(path)
            return

        msg = u'Checking out \'{}\' with cache \'{}\'.'
        logger.info(msg.format(os.path.relpath(path), md5))

        if not self.is_dir_cache(cache):
            # Single-file checkout: replace the workspace file with a link
            # to the cache. An existing file is only removed outright when
            # forced or when its content is already safely in the cache.
            if os.path.exists(path):
                if force or self._already_cached(path):
                    remove(path)
                else:
                    self._safe_remove(path)

            self.link(cache, path)
            self.state.update_link(path)
            return

        # Create dir separately so that dir is created
        # even if there are no files in it
        if not os.path.exists(path):
            os.makedirs(path)

        dir_info = self.load_dir_cache(md5)
        dir_relpath = os.path.relpath(path)
        dir_size = len(dir_info)
        # Only show a progress bar for directories big enough to be slow.
        bar = dir_size > LARGE_DIR_SIZE

        logger.info("Linking directory '{}'.".format(dir_relpath))

        for processed, entry in enumerate(dir_info):
            relpath = entry[self.PARAM_RELPATH]
            m = entry[self.PARAM_MD5]
            p = os.path.join(path, relpath)
            c = self.get(m)

            entry_info = {'scheme': path_info['scheme'], self.PARAM_PATH: p}

            entry_checksum_info = {self.PARAM_MD5: m}

            # Only relink entries whose workspace copy differs from cache.
            if self.changed(entry_info, entry_checksum_info):
                if os.path.exists(p):
                    if force or self._already_cached(p):
                        remove(p)
                    else:
                        self._safe_remove(p)

                self.link(c, p)

            if bar:
                progress.update_target(dir_relpath, processed, dir_size)

        # Drop workspace files that are not part of the cached directory.
        self._discard_working_directory_changes(path, dir_info, force=force)

        self.state.update_link(path)

        if bar:
            progress.finish_target(dir_relpath)
Example #19
0
    def push(self, checksum_infos, remote, jobs=None, show_checksums=False):
        """Upload cached files described by checksum_infos to the remote,
        skipping corrupted local entries and files already uploaded."""
        logger.info("Preparing to push data to {}".format(remote.url))
        title = "Collecting information"

        progress.set_n_total(1)
        progress.update_target(title, 0, 100)

        # NOTE: verifying that our cache is not corrupted
        def func(info):
            return not self.changed_cache_file(info[self.PARAM_MD5])

        checksum_infos = list(filter(func, checksum_infos))

        progress.update_target(title, 20, 100)

        # NOTE: filter files that are already uploaded
        md5s = [i[self.PARAM_MD5] for i in checksum_infos]
        exists = remote.exists(remote.md5s_to_path_infos(md5s))

        progress.update_target(title, 30, 100)

        # Keep only pairs whose file is missing on the remote (entry[0] is
        # the corresponding remote-existence flag).
        def func(entry):
            return not entry[0]

        assert len(exists) == len(checksum_infos)
        infos_exist = list(filter(func, zip(exists, checksum_infos)))
        checksum_infos = [i for e, i in infos_exist]

        progress.update_target(title, 70, 100)

        md5s, names = self._group(checksum_infos,
                                  show_checksums=show_checksums)
        # Local cache paths serve as the upload sources.
        cache = [{'scheme': 'local', 'path': self.get(md5)} for md5 in md5s]

        progress.update_target(title, 80, 100)

        path_infos = remote.md5s_to_path_infos(md5s)

        assert len(path_infos) == len(cache) == len(md5s) == len(names)

        progress.update_target(title, 90, 100)

        if jobs is None:
            jobs = remote.JOBS

        # Split the work into per-worker chunks of (dest, src, names).
        chunks = list(
            zip(to_chunks(path_infos, jobs), to_chunks(cache, jobs),
                to_chunks(names, jobs)))

        progress.finish_target(title)

        progress.set_n_total(len(names))

        if len(chunks) == 0:
            return

        # Upload chunks in parallel, then surface any worker exception via
        # Future.result().
        futures = []
        with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
            for to_infos, from_infos, names in chunks:
                res = executor.submit(remote.upload,
                                      from_infos,
                                      to_infos,
                                      names=names)
                futures.append(res)

        for f in futures:
            f.result()