Example 1
 def test_remove_corrupted_files(self):
     hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
     corrupted_file_path = os.path.join(self.tmp_dir, 'corrupted_file')
     open(corrupted_file_path, 'a').close()
     self.assertTrue(os.path.exists(corrupted_file_path))
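     # _remove_corrupted_files takes the list of corrupted paths and deletes them from the store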
     hfs._remove_corrupted_files([corrupted_file_path], True)
     self.assertFalse(os.path.exists(corrupted_file_path))
Example 2
 def test_put1024K_pathexistence_level3(self):
     hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024, levels=3)
     hfs.put('data/think-hires.jpg')
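     # with levels=3, the object is stored under three 2-character directories derived from its multihash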
     fullpath = os.path.join(
         self.tmp_dir, 'hashfs', 'aU', 'No', 'RA',
         'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
     self.assertTrue(os.path.exists(fullpath))
Example 3
    def test_remote_fsck(self):
        testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        hfspath = os.path.join(self.tmp_dir, 'objectsfs')
        ohfs = MultihashFS(hfspath)
        ohfs.put(HDATA_IMG_1)

        s3 = boto3.resource(
            's3',
            region_name='us-east-1',
            aws_access_key_id='fake_access_key',
            aws_secret_access_key='fake_secret_key',
        )

        s3.Object(testbucketname, 'zdj7WWsMkELZSGQGgpm5VieCWV8NxY5n5XEP73H4E7eeDMA3A').delete()
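        # deleting the object simulates a corrupted remote store before running remote_fsck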
        self.assertRaises(botocore.exceptions.ClientError, lambda: self.check_delete(s3, testbucketname))
        mdpath = os.path.join(self.tmp_dir, 'metadata-test')

        dataset_spec = get_sample_spec(testbucketname)
        specpath = os.path.join(mdpath, 'vision-computing', 'images', 'dataset-ex')
        ensure_path_exists(specpath)

        yaml_save(dataset_spec, os.path.join(specpath, 'dataset-ex.spec'))
        manifestpath = os.path.join(specpath, 'MANIFEST.yaml')

        yaml_save({'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh': {'imghires.jpg'}}, manifestpath)
        fullspecpath = os.path.join(specpath, 'dataset-ex.spec')
        spec = 'vision-computing__images__dataset-ex__5'
        c = yaml_load('hdata/config.yaml')
        r = LocalRepository(c, hfspath)
        ret = r.remote_fsck(mdpath, spec, fullspecpath, 2, True, True)
        self.assertTrue(ret)

        # remote_fsck repaired the bucket: load() returns None when the object exists again
        self.assertEqual(None, s3.Object(testbucketname, 'zdj7WWsMkELZSGQGgpm5VieCWV8NxY5n5XEP73H4E7eeDMA3A').load())
Example 4
 def __init__(self, spec, index_path, object_path, mutability=Mutability.STRICT.value, cache_path=None):
     self._spec = spec
     self._path = index_path
     self._hfs = MultihashFS(object_path)
     self._mf = self._get_index(index_path)
     self._full_idx = FullIndex(spec, index_path, mutability)
     self._cache = cache_path
Example 5
    def test_get_update_links_wspace(self):
        wspath = os.path.join(self.tmp_dir, 'wspace')

        hfspath = os.path.join(self.tmp_dir, 'objectsfs')
        ohfs = MultihashFS(hfspath)
        key = ohfs.put(HDATA_IMG_1)
        fidx = FullIndex(self.tmp_dir, self.tmp_dir)
        cachepath = os.path.join(self.tmp_dir, 'cachefs')
        cache = Cache(cachepath, '', '')

        testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        c = get_sample_config_spec(testbucketname, testprofile, testregion)

        r = LocalRepository(c, hfspath)
        r._update_cache(cache, key)

        mfiles = {}
        files = {DATA_IMG_1}
        r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')

        wspace_file = os.path.join(wspath, DATA_IMG_1)
        set_write_read(wspace_file)
        self.assertTrue(os.path.exists(wspace_file))
        self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))
        st = os.stat(wspace_file)
        fi = fidx.get_index()
        for k, v in fi.items():
            self.assertEqual(k, os.path.join('data', 'imghires.jpg'))
            self.assertEqual(v['hash'], 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh')
            self.assertEqual(v['status'], 'u')
            self.assertEqual(v['ctime'], st.st_ctime)
            self.assertEqual(v['mtime'], st.st_mtime)
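        # the workspace file is hard-linked to the cache entry, hence the two links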
        self.assertTrue(st.st_nlink == 2)
        self.assertEqual(mfiles, {DATA_IMG_1: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh'})
Example 6
 def test_get_simple(self):
     original_file = 'data/think-hires.jpg'
     dst_file = os.path.join(self.tmp_dir, 'think-hires.jpg')
     hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
     objkey = hfs.put(original_file)
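     # put returns the content-addressed key used to retrieve the file later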
     hfs.get(objkey, dst_file)
     self.assertEqual(self.md5sum(original_file), self.md5sum(dst_file))
Example 7
 def test_put1024K_toomany_levels(self):
     hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024, levels=23)
     hfs.put('data/think-hires.jpg')
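     # levels=23 asks for more nesting than the digest can supply, so the path stops at the available 2-character pairs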
     fullpath = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'No', 'RA', 'zc', 'iw', '2J', 'Ji', '69', 's2', 'Hj',
                             'fC',
                             'yz', 'Wt', '39', 'BH', 'Cu', 'cC', 'V2', 'Cs', 'AX', '6v', 'Sv',
                             'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
     self.assertTrue(os.path.exists(fullpath))
Example 8
 def test_corruption(self):
     original_file = 'data/think-hires.jpg'
     dst_file = os.path.join(self.tmp_dir, 'think-hires.jpg')
     hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
     objkey = hfs.put(original_file)
     chunk = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'No', 'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
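     # overwrite the stored chunk so its content no longer matches its multihash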
     with open(chunk, 'wb') as f:
         f.write(b'blabla')
     self.assertFalse(hfs.get(objkey, dst_file))
     self.assertFalse(os.path.exists(dst_file))
Example 9
    def test_get_update_cache(self):
        hfspath = os.path.join(self.tmp_dir, 'objectsfs')
        ohfs = MultihashFS(hfspath)
        key = ohfs.put(HDATA_IMG_1)

        cachepath = os.path.join(self.tmp_dir, 'cachefs')
        cache = Cache(cachepath, '', '')

        testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        c = get_sample_config_spec(testbucketname, testprofile, testregion)

        r = LocalRepository(c, hfspath)
        r._update_cache(cache, key)
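        # _update_cache materializes the object in the cache under its key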

        self.assertTrue(os.path.exists(cache.get_keypath(key)))
        self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(cache.get_keypath(key)))
Example 10
    def test_fsck(self):
        hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)

        original_file = 'data/think-hires.jpg'
        hfs.put(original_file)

        chunk = os.path.join(
            self.tmp_dir, 'hashfs', 'aU', 'No',
            'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')

        corrupted_files = hfs.fsck()
        self.assertTrue(len(corrupted_files) == 0)

        # Create a hard link that places the file in the wrong directory
        chunk_in_wrong_dir = os.path.join(
            self.tmp_dir, 'hashfs', 'aU', 'NB',
            'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
        os.makedirs(os.path.join(self.tmp_dir, 'hashfs', 'aU', 'NB'))
        os.link(chunk, chunk_in_wrong_dir)

        corrupted_files = hfs.fsck()
        self.assertTrue(len(corrupted_files) == 1)
        self.assertTrue('zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv' in
                        corrupted_files)

        with open(chunk, 'wb') as f:
            f.write(b'blabla')
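            # the chunk content no longer matches its hash, so fsck flags a second corruption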

        corrupted_files = hfs.fsck()
        self.assertTrue(len(corrupted_files) == 2)
        self.assertTrue('zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv' in
                        corrupted_files)
Example 11
 def commit_objects(self, index_path, ws_path):
     added_files = []
     deleted_files = []
     idx = MultihashFS(self._objects_path)
     fidx = FullIndex(self.__spec, index_path)
     findex = fidx.get_index()
     log_path = os.path.join(self._logpath, 'store.log')
     with open(log_path, 'a') as log_file:
         for k, v in findex.items():
             if not os.path.exists(os.path.join(ws_path, k)):
                 deleted_files.append(k)
             elif v['status'] == Status.a.name:
                 idx.fetch_scid(v['hash'], log_file)
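                 # once the hash is logged, flip the file status from 'added' to 'unchanged'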
                 v['status'] = Status.u.name
                 if 'previous_hash' in v:
                     added_files.append((v['previous_hash'], k))
     fidx.get_manifest_index().save()
     return added_files, deleted_files
Example 12
    def push(self, spec, retry=2, clear_on_fail=False):
        repo_type = self.__repo_type
        try:
            objects_path = get_objects_path(self.__config, repo_type)
            metadata_path = get_metadata_path(self.__config, repo_type)
            refs_path = get_refs_path(self.__config, repo_type)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return

        met = Metadata(spec, metadata_path, self.__config, repo_type)
        fields = met.git_user_config()
        if None in fields.values():
            log.error(
                'Your name and email address need to be configured in git. '
                'Please see the commands below:',
                class_name=REPOSITORY_CLASS_NAME)

            log.error('git config --global user.name \'Your Name\'',
                      class_name=REPOSITORY_CLASS_NAME)
            log.error('git config --global user.email [email protected]',
                      class_name=REPOSITORY_CLASS_NAME)
            return
        if met.fetch() is False:
            return

        ref = Refs(refs_path, spec, repo_type)
        tag, sha = ref.branch()
        categories_path = get_path_with_categories(tag)

        spec_path, spec_file = None, None
        try:
            spec_path, spec_file = search_spec_file(self.__repo_type, spec,
                                                    categories_path)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)

        if spec_path is None:
            return

        full_spec_path = os.path.join(spec_path, spec_file)

        repo = LocalRepository(self.__config, objects_path, repo_type)
        ret = repo.push(objects_path, full_spec_path, retry, clear_on_fail)
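        # a return value of 0 means every object reached the remote store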

        # ensure we are on master first
        met.checkout()
        if ret == 0:
            # push the metadata spec to the git repository
            try:
                met.push()
            except Exception as e:
                log.error(e, class_name=REPOSITORY_CLASS_NAME)
                return
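            # data and metadata both pushed; clear the accumulated store log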
            MultihashFS(objects_path).reset_log()
Example 13
    def test_get_update_links_wspace_with_duplicates(self):
        wspath = os.path.join(self.tmp_dir, 'wspace')

        hfspath = os.path.join(self.tmp_dir, 'objectsfs')
        ohfs = MultihashFS(hfspath)
        key = ohfs.put(HDATA_IMG_1)
        fidx = FullIndex(self.tmp_dir, self.tmp_dir)
        cachepath = os.path.join(self.tmp_dir, 'cachefs')
        cache = Cache(cachepath, '', '')

        testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        c = get_sample_config_spec(testbucketname, testprofile, testregion)

        r = LocalRepository(c, hfspath)
        r._update_cache(cache, key)

        mfiles = {}
        files = {DATA_IMG_1, DATA_IMG_2}
        r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')

        wspace_file = os.path.join(wspath, DATA_IMG_1)
        self.assertTrue(os.path.exists(wspace_file))
        self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))

        wspace_file = os.path.join(wspath, DATA_IMG_2)
        self.assertTrue(os.path.exists(wspace_file))
        self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))
        st = os.stat(wspace_file)
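        # two workspace copies plus the cache entry share a single inode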
        self.assertTrue(st.st_nlink == 3)
        self.assertEqual(mfiles, {DATA_IMG_1: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh',
                                  DATA_IMG_2: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh'})

        wspath = os.path.join(self.tmp_dir, 'wspace')
        ensure_path_exists(wspath)
        to_be_removed = os.path.join(wspath, 'to_be_removed')
        with open(to_be_removed, 'w') as f:
            f.write('DEAD\n')

        hfspath = os.path.join(self.tmp_dir, 'objectsfs')
        ohfs = MultihashFS(hfspath)
        key = ohfs.put(HDATA_IMG_1)
        fidx = FullIndex(self.tmp_dir, self.tmp_dir)
        cachepath = os.path.join(self.tmp_dir, 'cachefs')
        cache = Cache(cachepath, '', '')
        c = yaml_load('hdata/config.yaml')
        r = LocalRepository(c, hfspath)
        r._update_cache(cache, key)

        mfiles = {}
        files = {DATA_IMG_1, DATA_IMG_2}
        r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')
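        # workspace files absent from mfiles are treated as unused links and removed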
        r._remove_unused_links_wspace(wspath, mfiles)
        self.assertFalse(os.path.exists(to_be_removed))
Example 14
    def test_fsck_with_remove_corrupted(self):
        hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)

        original_file = 'data/think-hires.jpg'
        hfs.put(original_file)

        chunk = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'No', 'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
        corrupted_files = hfs.fsck()
        self.assertTrue(len(corrupted_files) == 0)

        chunk_in_wrong_dir = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'NB',
                                          'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')

        os.makedirs(os.path.join(self.tmp_dir, 'hashfs', 'aU', 'NB'))
        os.link(chunk, chunk_in_wrong_dir)
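        # the linked chunk now sits under a directory that does not match its hash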

        corrupted_files = hfs.fsck(remove_corrupted=True)
        self.assertTrue(len(corrupted_files) == 1)
        self.assertTrue('zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv' in corrupted_files)
        self.assertFalse(os.path.exists(chunk_in_wrong_dir))
Example 15
class MultihashIndex(object):
    def __init__(self,
                 spec,
                 index_path,
                 object_path,
                 mutability=MutabilityType.STRICT.value,
                 cache_path=None):
        self._spec = spec
        self._path = index_path
        self._hfs = MultihashFS(object_path)
        self._mf = self._get_index(index_path)
        self._full_idx = FullIndex(spec, index_path, mutability)
        self._cache = cache_path

    def _get_index(self, idxpath):
        metadatapath = os.path.join(idxpath, 'metadata', self._spec)
        ensure_path_exists(metadatapath)

        mfpath = os.path.join(metadatapath, 'MANIFEST.yaml')
        return Manifest(mfpath)

    def _add_dir(self,
                 dir_path,
                 manifest_path,
                 file_path='',
                 ignore_rules=None):
        self.manifestfiles = yaml_load(manifest_path)
        f_index_file = self._full_idx.get_index()
        all_files = []
        for root, dirs, files in os.walk(os.path.join(dir_path, file_path)):
            base_path = root[:len(dir_path) + 1:]
            relative_path = root[len(dir_path) + 1:]
            if '.' == root[0] or should_ignore_file(
                    ignore_rules, '{}/'.format(relative_path)):
                continue
            for file in files:
                file_path = os.path.join(relative_path, file)
                if ignore_rules is None or not should_ignore_file(
                        ignore_rules, file_path):
                    all_files.append(file_path)
            self.wp.progress_bar_total_inc(len(all_files))
            args = {
                'wp': self.wp,
                'base_path': base_path,
                'f_index_file': f_index_file,
                'all_files': all_files,
                'dir_path': dir_path
            }
            result = run_function_per_group(range(len(all_files)),
                                            10000,
                                            function=self._adding_dir_work,
                                            arguments=args)
            if not result:
                return False
        self._full_idx.save_manifest_index()
        self._mf.save()

    def add(self, path, manifestpath, files=[]):
        self.wp = pool_factory(pb_elts=0, pb_desc='files')
        ignore_rules = get_ignore_rules(path)
        if len(files) > 0:
            single_files = filter(
                lambda x: os.path.isfile(os.path.join(path, x)), files)
            self.wp.progress_bar_total_inc(len(list(single_files)))
            for f in files:
                fullpath = os.path.join(path, f)
                if os.path.isdir(fullpath):
                    self._add_dir(path,
                                  manifestpath,
                                  f,
                                  ignore_rules=ignore_rules)
                elif os.path.isfile(fullpath):
                    if not should_ignore_file(ignore_rules, path):
                        self._add_single_file(path, manifestpath, f)
                else:
                    log.warn(output_messages['WARN_NOT_FOUND'] % fullpath,
                             class_name=MULTI_HASH_CLASS_NAME)
        else:
            if os.path.isdir(path):
                self._add_dir(path, manifestpath, ignore_rules=ignore_rules)
        self.wp.progress_bar_close()

    def _adding_dir_work_future_process(self, futures, wp):
        for future in futures:
            scid, filepath, previous_hash = future.result()
            if scid is not None:
                self.update_index(scid, filepath, previous_hash)
        wp.reset_futures()

    def _adding_dir_work(self, files, args):
        for k in files:
            file_path = args['all_files'][k]
            if (SPEC_EXTENSION in file_path) or (file_path == 'README.md') or (
                    file_path == MLGIT_IGNORE_FILE_NAME):
                args['wp'].progress_bar_total_inc(-1)
                self.add_metadata(args['base_path'], file_path)
            else:
                args['wp'].submit(self._add_file, args['base_path'], file_path,
                                  args['f_index_file'])
        futures = self.wp.wait()
        try:
            self._adding_dir_work_future_process(futures, self.wp)
        except Exception as e:
            self._full_idx.save_manifest_index()
            self._mf.save()
            log.error(output_messages['ERROR_ADDING_DIR'] %
                      (args['dir_path'], e),
                      class_name=MULTI_HASH_CLASS_NAME)
            return False
        return True

    def _add_single_file(self, base_path, manifestpath, file_path):
        self.manifestfiles = yaml_load(manifestpath)

        f_index_file = self._full_idx.get_index()
        if (SPEC_EXTENSION in file_path) or ('README' in file_path) or (
                MLGIT_IGNORE_FILE_NAME in file_path):
            self.wp.progress_bar_total_inc(-1)
            self.add_metadata(base_path, file_path)
        else:
            self.wp.submit(self._add_file, base_path, file_path, f_index_file)
            futures = self.wp.wait()
            for future in futures:
                try:
                    scid, filepath, previous_hash = future.result()
                    if scid is not None:
                        self.update_index(scid, filepath, previous_hash)
                except Exception as e:
                    # save the manifest of files added to index so far
                    self._full_idx.save_manifest_index()
                    self._mf.save()
                    log.error(output_messages['ERROR_ADDING_DIR'] %
                              (base_path, e),
                              class_name=MULTI_HASH_CLASS_NAME)
                    return
            self.wp.reset_futures()
        self._full_idx.save_manifest_index()
        self._mf.save()

    def add_metadata(self, basepath, filepath, automatically_added=False):
        log.debug(output_messages['DEBUG_ADD_FILE'] % filepath,
                  class_name=MULTI_HASH_CLASS_NAME)
        fullpath = os.path.join(basepath, filepath)

        metadatapath = os.path.join(self._path, 'metadata', self._spec)
        ensure_path_exists(metadatapath)

        dstpath = os.path.join(metadatapath, filepath)
        if not os.path.exists(dstpath):
            shutil.copy2(fullpath, dstpath)
        else:
            os.unlink(dstpath)
            shutil.copy2(fullpath, dstpath)
        if automatically_added:
            log.info(output_messages['INFO_FILE_AUTOMATICALLY_ADDED'].format(
                filepath),
                     class_name=MULTI_HASH_CLASS_NAME)

    # TODO: add stat from the original file to MANIFEST
    def update_index(self, objectkey, filename, previous_hash=None):

        self._mf.add(objectkey, posix_path(filename), previous_hash)

    def remove_manifest(self):
        index_metadata_path = os.path.join(self._path, 'metadata', self._spec)
        try:
            os.unlink(os.path.join(index_metadata_path, 'MANIFEST.yaml'))
        except FileNotFoundError:
            pass

    def _save_index(self):
        self._mf.save()

    def get_index(self):
        return self._mf

    def _add_file(self, basepath, filepath, f_index_file):
        fullpath = os.path.join(basepath, filepath)
        metadatapath = os.path.join(self._path, 'metadata', self._spec)
        ensure_path_exists(metadatapath)

        scid = None
        check_file = f_index_file.get(posix_path(filepath))
        previous_hash = None
        if check_file is not None:
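            # file already tracked: re-hash it only when check_and_update detects a change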
            if self._full_idx.check_and_update(filepath, check_file, self._hfs,
                                               posix_path(filepath), fullpath,
                                               self._cache):
                scid = self._hfs.put(fullpath)

            updated_check = f_index_file.get(posix_path(filepath))
            if 'previous_hash' in updated_check:
                previous_hash = updated_check['previous_hash']
        else:
            scid = self._hfs.put(fullpath)
            self._full_idx.update_full_index(posix_path(filepath), fullpath,
                                             Status.a.name, scid)

        return scid, filepath, previous_hash

    def get(self, objectkey, path, file):
        log.info(output_messages['INFO_GETTING_FILE'] % file,
                 class_name=MULTI_HASH_CLASS_NAME)
        dirs = os.path.dirname(file)
        fulldir = os.path.join(path, dirs)
        ensure_path_exists(fulldir)

        dstfile = os.path.join(path, file)
        return self._hfs.get(objectkey, dstfile)

    def reset(self):
        shutil.rmtree(self._path)
        os.mkdir(self._path)

    def fsck(self, entity_path):
        return self._full_idx.fsck(entity_path, self._hfs, self._cache)

    def update_index_manifest(self, hash_files):
        for key in hash_files:
            values = list(hash_files[key])
            for e in values:
                self._mf.add(key, e)
        self._save_index()

    def get_index_yaml(self):
        return self._full_idx

    def remove_deleted_files_index_manifest(self, deleted_files):
        manifest = self.get_index()
        for file in deleted_files:
            manifest.rm_file(file)
        manifest.save()

    def get_hashes_list(self):
        idx_yaml = self._full_idx.get_index()
        hashes_list = []
        for value in idx_yaml:
            hashes_list.append(idx_yaml[value]['hash'])
        return hashes_list
Example 16
class MultihashIndex(object):

    def __init__(self, spec, index_path, object_path, mutability=Mutability.STRICT.value, cache_path=None):
        self._spec = spec
        self._path = index_path
        self._hfs = MultihashFS(object_path)
        self._mf = self._get_index(index_path)
        self._full_idx = FullIndex(spec, index_path, mutability)
        self._cache = cache_path

    def _get_index(self, idxpath):
        metadatapath = os.path.join(idxpath, 'metadata', self._spec)
        ensure_path_exists(metadatapath)

        mfpath = os.path.join(metadatapath, 'MANIFEST.yaml')
        return Manifest(mfpath)

    def add(self, path, manifestpath, files=[]):
        self.wp = pool_factory(pb_elts=0, pb_desc='files')
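        # an explicit file list adds entries one by one; otherwise the whole directory is walked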
        if len(files) > 0:
            single_files = filter(lambda x: os.path.isfile(os.path.join(path, x)), files)
            self.wp.progress_bar_total_inc(len(list(single_files)))
            for f in files:
                fullpath = os.path.join(path, f)
                if os.path.isdir(fullpath):
                    self._add_dir(path, manifestpath, f)
                elif os.path.isfile(fullpath):
                    self._add_single_file(path, manifestpath, f)
                else:
                    log.warn('[%s] Not found!' % fullpath, class_name=MULTI_HASH_CLASS_NAME)
        else:
            if os.path.isdir(path):
                self._add_dir(path, manifestpath)
        self.wp.progress_bar_close()

    def _adding_dir_work_future_process(self, futures, wp):
        for future in futures:
            scid, filepath, previous_hash = future.result()
            if scid is not None:
                self.update_index(scid, filepath, previous_hash)
        wp.reset_futures()

    def _adding_dir_work(self, files, args):
        for k in files:
            filepath = args['all_files'][k]
            if ('.spec' in filepath) or ('README' in filepath):
                args['wp'].progress_bar_total_inc(-1)
                self.add_metadata(args['basepath'], filepath)
            else:
                args['wp'].submit(self._add_file, args['basepath'], filepath, args['f_index_file'])
        futures = self.wp.wait()
        try:
            self._adding_dir_work_future_process(futures, self.wp)
        except Exception as e:
            self._full_idx.save_manifest_index()
            self._mf.save()
            log.error('Error adding dir [%s] -- [%s]' % (args['dirpath'], e), class_name=MULTI_HASH_CLASS_NAME)
            return False
        return True

    def _add_dir(self, dirpath, manifestpath, file_path='', trust_links=True):
        self.manifestfiles = yaml_load(manifestpath)
        f_index_file = self._full_idx.get_index()
        all_files = []
        for root, dirs, files in os.walk(os.path.join(dirpath, file_path)):
            if '.' == root[0]:
                continue
            basepath = root[:len(dirpath)+1:]
            relativepath = root[len(dirpath)+1:]
            for file in files:
                all_files.append(os.path.join(relativepath, file))
            self.wp.progress_bar_total_inc(len(all_files))
            args = {'wp': self.wp, 'basepath': basepath, 'f_index_file': f_index_file, 'all_files': all_files, 'dirpath': dirpath}
            result = run_function_per_group(range(len(all_files)), 10000, function=self._adding_dir_work, arguments=args)
            if not result:
                return False
        self._full_idx.save_manifest_index()
        self._mf.save()

    def _add_single_file(self, base_path, manifestpath, file_path):
        self.manifestfiles = yaml_load(manifestpath)

        f_index_file = self._full_idx.get_index()
        if ('.spec' in file_path) or ('README' in file_path):
            self.wp.progress_bar_total_inc(-1)
            self.add_metadata(base_path, file_path)
        else:
            self.wp.submit(self._add_file, base_path, file_path, f_index_file)
            futures = self.wp.wait()
            for future in futures:
                try:
                    scid, filepath, previous_hash = future.result()
                    if scid is not None:
                        self.update_index(scid, filepath, previous_hash)
                except Exception as e:
                    # save the manifest of files added to index so far
                    self._full_idx.save_manifest_index()
                    self._mf.save()
                    log.error('Error adding dir [%s] -- [%s]' % (base_path, e), class_name=MULTI_HASH_CLASS_NAME)
                    return
            self.wp.reset_futures()
        self._full_idx.save_manifest_index()
        self._mf.save()

    def add_metadata(self, basepath, filepath):
        log.debug('Add file [%s] to ml-git index' % filepath, class_name=MULTI_HASH_CLASS_NAME)
        fullpath = os.path.join(basepath, filepath)

        metadatapath = os.path.join(self._path, 'metadata', self._spec)
        ensure_path_exists(metadatapath)

        dstpath = os.path.join(metadatapath, filepath)
        if not os.path.exists(dstpath):
            shutil.copy2(fullpath, dstpath)
        else:
            os.unlink(dstpath)
            shutil.copy2(fullpath, dstpath)

    # TODO: add stat from the original file to MANIFEST
    def update_index(self, objectkey, filename, previous_hash=None):

        self._mf.add(objectkey, posix_path(filename), previous_hash)

    def remove_manifest(self):
        index_metadata_path = os.path.join(self._path, 'metadata', self._spec)
        try:
            os.unlink(os.path.join(index_metadata_path, 'MANIFEST.yaml'))
        except FileNotFoundError:
            pass

    def _save_index(self):
        self._mf.save()

    def get_index(self):
        return self._mf

    def _add_file(self, basepath, filepath, f_index_file):
        fullpath = os.path.join(basepath, filepath)
        metadatapath = os.path.join(self._path, 'metadata', self._spec)
        ensure_path_exists(metadatapath)

        scid = None
        check_file = f_index_file.get(posix_path(filepath))
        previous_hash = None
        if check_file is not None:
            if self._full_idx.check_and_update(filepath, check_file, self._hfs, posix_path(filepath), fullpath, self._cache):
                scid = self._hfs.put(fullpath)

            updated_check = f_index_file.get(posix_path(filepath))
            if 'previous_hash' in updated_check:
                previous_hash = updated_check['previous_hash']
        else:
            scid = self._hfs.put(fullpath)
            self._full_idx.update_full_index(posix_path(filepath), fullpath, Status.a.name, scid)

        return scid, filepath, previous_hash

    def get(self, objectkey, path, file):
        log.info('Getting file [%s] from local index' % file, class_name=MULTI_HASH_CLASS_NAME)
        dirs = os.path.dirname(file)
        fulldir = os.path.join(path, dirs)
        ensure_path_exists(fulldir)

        dstfile = os.path.join(path, file)
        return self._hfs.get(objectkey, dstfile)

    def reset(self):
        shutil.rmtree(self._path)
        os.mkdir(self._path)

    def fsck(self):
        return self._hfs.fsck()

    def update_index_manifest(self, hash_files):
        for key in hash_files:
            values = list(hash_files[key])
            for e in values:
                self._mf.add(key, e)
        self._save_index()

    def get_index_yalm(self):
        return self._full_idx

    def remove_deleted_files_index_manifest(self, deleted_files):
        manifest = self.get_index()
        for file in deleted_files:
            manifest.rm_file(file)
        manifest.save()
Example 17
 def test_put1024K(self):
     hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
     hfs.put('data/think-hires.jpg')
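     # every chunk written to the store must be one of the expected chunks1024 hashes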
     for files in hfs.walk():
         for file in files:
             self.assertTrue(file in chunks1024)
Example 18
    def reset(self, spec, reset_type, head):
        log.info(output_messages['INFO_INITIALIZING_RESET'] %
                 (reset_type, head),
                 class_name=REPOSITORY_CLASS_NAME)
        if (reset_type == '--soft'
                or reset_type == '--mixed') and head == HEAD:
            return
        try:
            repo_type = self.__repo_type
            metadata_path = get_metadata_path(self.__config, repo_type)
            index_path = get_index_path(self.__config, repo_type)
            refs_path = get_refs_path(self.__config, repo_type)
            object_path = get_objects_path(self.__config, repo_type)
            met = Metadata(spec, metadata_path, self.__config, repo_type)
            ref = Refs(refs_path, spec, repo_type)
            idx = MultihashIndex(spec, index_path, object_path)
            fidx = FullIndex(spec, index_path)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return

        # get tag before reset
        tag = met.get_current_tag()
        categories_path = get_path_with_categories(str(tag))
        # current manifest file before reset
        manifest_path = os.path.join(metadata_path, categories_path, spec,
                                     MANIFEST_FILE)
        _manifest = Manifest(manifest_path).load()

        if head == HEAD_1:  # HEAD~1
            try:
                # reset the repo
                met.reset()
            except Exception:
                return

        # get tag after reset
        tag_after_reset = met.get_current_tag()
        sha = met.sha_from_tag(tag_after_reset)

        # update ml-git ref HEAD
        ref.update_head(str(tag_after_reset), sha)

        # get path to reset workspace in case of --hard
        path, file = None, None
        try:
            path, file = search_spec_file(self.__repo_type, spec,
                                          categories_path)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)

        if reset_type == '--hard' and path is None:
            return

        # get manifest from metadata after reset
        _manifest_changed = Manifest(manifest_path)

        hash_files, file_names = _manifest_changed.get_diff(_manifest)
        idx_mf = idx.get_index().load()

        if reset_type == '--soft':
            # add in index/metadata/<entity-name>/MANIFEST
            idx.update_index_manifest(idx_mf)
            idx.update_index_manifest(hash_files)
            fidx.update_index_status(file_names, Status.a.name)

        else:  # --hard or --mixed
            # remove hash from index/hashfs/store.log
            file_names.update(*idx_mf.values())
            objs = MultihashFS(index_path)
            for key_hash in hash_files:
                objs.remove_hash(key_hash)
            idx.remove_manifest()
            fidx.remove_from_index_yaml(file_names)
            fidx.remove_uncommitted()

        if reset_type == '--hard':  # reset workspace
            remove_from_workspace(file_names, path, spec)