Example #1
def store_factory(config, store_string):
    stores = {
        StoreType.S3.value: S3Store,
        StoreType.S3H.value: S3MultihashStore,
        StoreType.AZUREBLOBH.value: AzureMultihashStore,
        StoreType.GDRIVEH.value: GoogleDriveMultihashStore,
        StoreType.GDRIVE.value: GoogleDriveStore
    }
    sp = store_string.split('/')
    config_bucket_name, bucket_name = None, None

    try:
        store_type = sp[0][:-1]
        bucket_name = sp[2]
        config_bucket_name = []
        log.debug('Store [%s] ; bucket [%s]' % (store_type, bucket_name),
                  class_name=STORE_FACTORY_CLASS_NAME)
        for k in config['store'][store_type]:
            config_bucket_name.append(k)
        if bucket_name not in config_bucket_name:
            log.warn(
                'Exception creating store -- Configuration not found for bucket [%s]. '
                'The available buckets in config file for store type [%s] are: %s'
                % (bucket_name, store_type, config_bucket_name),
                class_name=STORE_FACTORY_CLASS_NAME)
            return None
        bucket = config['store'][store_type][bucket_name]
        return stores[store_type](bucket_name, bucket)
    except ProfileNotFound as pfn:
        log.error(pfn, class_name=STORE_FACTORY_CLASS_NAME)
        return None
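For context, a hedged usage sketch of the factory above. The config layout and the 's3h' type value are assumptions inferred from how the function indexes config['store'][store_type][bucket_name]; the bucket settings are placeholders.

config = {
    'store': {
        's3h': {
            'mlgit-bucket': {'region': 'us-east-1'}  # placeholder settings
        }
    }
}
# 's3h://mlgit-bucket/key' splits on '/' so that sp[0][:-1] == 's3h'
# and sp[2] == 'mlgit-bucket'
store = store_factory(config, 's3h://mlgit-bucket/key')
# -> S3MultihashStore instance, assuming StoreType.S3H.value == 's3h'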
Example #2
    def _submit_fn(self, userfn, *args, **kwds):
        ctx = self._get_ctx()

        result = False
        retry_cnt = 0
        while True:
            try:
                if ctx is not None:
                    result = userfn(ctx, *args, **kwds)
                else:
                    result = userfn(*args, **kwds)
            except Exception as e:
                if retry_cnt < self._retry:
                    retry_cnt += 1
                    log.warn(output_messages['WARN_WORKER_EXCEPTION'] %
                             (e, retry_cnt),
                             class_name=POOL_CLASS_NAME)
                    self._retry_wait(retry_cnt)
                    continue
                else:
                    log.error(output_messages['ERROR_WORKER_FAILURE'] %
                              (e, retry_cnt),
                              class_name=POOL_CLASS_NAME)
                    self._release_ctx(ctx)
                    raise e
            break

        log.debug(output_messages['DEBUG_WORKER_SUCESS'] % (retry_cnt + 1),
                  class_name=POOL_CLASS_NAME)
        self._release_ctx(ctx)
        self._progress()

        return result
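The loop above is a generic retry wrapper: run the user function with an optional context, retry up to self._retry times with a wait between attempts, and re-raise once the budget is exhausted. A minimal self-contained sketch of the same pattern (the names and the linear backoff are illustrative, not ml-git's):

import time

def call_with_retry(fn, retries=3, base_wait=1.0):
    """Call fn(), retrying on any exception with linear backoff."""
    attempt = 0
    while True:
        try:
            return fn()
        except Exception:
            if attempt >= retries:
                raise  # retry budget exhausted: propagate the error
            attempt += 1
            time.sleep(base_wait * attempt)  # wait longer each time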
Example #3
    def put(self, key_path, file_path):

        if not self.drive_path_id:
            log.error('Drive path [%s] not found.' % self._drive_path,
                      class_name=GDRIVE_STORE)
            return False

        if self.key_exists(key_path):
            log.debug('Key path [%s] already exists in drive path [%s].' %
                      (key_path, self._drive_path),
                      class_name=GDRIVE_STORE)
            return True

        if not os.path.exists(file_path):
            log.error('[%s] not found.' % file_path, class_name=GDRIVE_STORE)
            return False

        file_metadata = {'name': key_path, 'parents': [self.drive_path_id]}
        try:
            media = MediaFileUpload(file_path, chunksize=-1, resumable=True)
            self._store.files().create(body=file_metadata,
                                       media_body=media).execute()
        except Exception:
            raise RuntimeError('The file could not be uploaded: [%s]' %
                               file_path)

        return True
Example #4
    def _execute(self, command, change_dir=True):
        cwd = None
        if change_dir:
            cwd = self._path
        proc = subprocess.run(command,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT,
                              universal_newlines=True,
                              shell=True,
                              cwd=cwd)
        input_known_errors = [
            'Permission denied (publickey)',
            '/dev/tty: No such device or address',
            'Host key verification failed.'
        ]
        if any(error in proc.stdout for error in input_known_errors):
            proc = subprocess.run(command,
                                  stdout=subprocess.PIPE,
                                  universal_newlines=True,
                                  shell=True,
                                  cwd=cwd)

        log.debug(output_messages['DEBUG_EXECUTING_COMMAND'] % command,
                  class_name=GIT_CLIENT_CLASS_NAME)
        self._check_output(proc)
        return proc
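The first run above merges stderr into stdout so the combined output can be scanned for known authentication failures; on a match the command is re-run with stderr left attached to the terminal, so tools like ssh can prompt the user. A stripped-down standalone sketch of that fallback, using the same known-error strings:

import subprocess

def run_with_interactive_fallback(command, cwd=None):
    # First pass: capture stdout and stderr together for inspection.
    proc = subprocess.run(command, stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          universal_newlines=True, shell=True, cwd=cwd)
    known_errors = ('Permission denied (publickey)',
                    '/dev/tty: No such device or address',
                    'Host key verification failed.')
    if any(err in proc.stdout for err in known_errors):
        # Second pass: stderr stays on the terminal so the underlying
        # tool can interact with the user.
        proc = subprocess.run(command, stdout=subprocess.PIPE,
                              universal_newlines=True, shell=True, cwd=cwd)
    return proc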
Example #5
    def put(self, key_path, file_path):
        self._storage.put(file_path, self._bucket + '/' + key_path)
        version = None
        log.debug(output_messages['INFO_FILE_STORED_IN_BUCKET'] %
                  (file_path, self._bucket, key_path, version),
                  class_name=SFTPSTORE_NAME)
        return key_path
Example #6
    def __commit_metadata(self, full_metadata_path, index_path, metadata,
                          specs, ws_path):
        idx_path = os.path.join(index_path, 'metadata', self._spec)
        log.debug(output_messages['DEBUG_COMMIT_SPEC'] % self._spec,
                  class_name=METADATA_CLASS_NAME)
        # saves README.md if any
        readme = 'README.md'
        src_readme = os.path.join(idx_path, readme)
        self._copy_to_metadata_path(src_readme, full_metadata_path,
                                    'README.md')
        src_ignore_path = os.path.join(idx_path, MLGIT_IGNORE_FILE_NAME)
        self._copy_to_metadata_path(src_ignore_path, full_metadata_path,
                                    MLGIT_IGNORE_FILE_NAME)
        amount, workspace_size = self._get_amount_and_size_of_workspace_files(
            full_metadata_path, ws_path)
        # saves metadata and commit

        entity_spec_key = get_spec_key(self.__repo_type)
        metadata[entity_spec_key]['manifest']['files'] = MANIFEST_FILE
        metadata[entity_spec_key]['manifest']['size'] = humanize.naturalsize(
            workspace_size)
        metadata[entity_spec_key]['manifest']['amount'] = amount
        storage = metadata[entity_spec_key]['manifest'][STORAGE_SPEC_KEY]

        manifest = metadata[entity_spec_key]['manifest']
        PluginCaller(manifest).call(ADD_METADATA, ws_path, manifest)

        # Add metadata specific to labels ML entity type
        self._add_associate_entity_metadata(metadata, specs)
        self.__commit_spec(full_metadata_path, metadata)

        return storage
Example #7
def storage_factory(config, storage_string):
    storages = {StorageType.S3.value: S3Storage, StorageType.S3H.value: S3MultihashStorage,
                StorageType.AZUREBLOBH.value: AzureMultihashStorage,
                StorageType.GDRIVEH.value: GoogleDriveMultihashStorage,
                StorageType.GDRIVE.value: GoogleDriveStorage,
                StorageType.SFTPH.value: SFtpStorage}
    sp = storage_string.split('/')
    config_bucket_name, bucket_name = None, None

    try:
        storage_type = sp[0][:-1]
        bucket_name = sp[2]
        config_bucket_name = []
        log.debug(output_messages['DEBUG_STORAGE_AND_BUCKET'] % (storage_type, bucket_name), class_name=STORAGE_FACTORY_CLASS_NAME)
        for k in config[STORAGE_CONFIG_KEY][storage_type]:
            config_bucket_name.append(k)
        if bucket_name not in config_bucket_name:
            log.warn(output_messages['WARN_EXCPETION_CREATING_STORAGE'] % (
                bucket_name, storage_type, config_bucket_name), class_name=STORAGE_FACTORY_CLASS_NAME)
            return None
        bucket = config[STORAGE_CONFIG_KEY][storage_type][bucket_name]
        return storages[storage_type](bucket_name, bucket)
    except ProfileNotFound as pfn:
        log.error(pfn, class_name=STORAGE_FACTORY_CLASS_NAME)
        return None
Example #8
    def __commit_metadata(self, full_metadata_path, index_path, metadata,
                          specs, ws_path):
        idx_path = os.path.join(index_path, 'metadata', self._spec)
        log.debug(output_messages['DEBUG_COMMIT_SPEC'] % self._spec,
                  class_name=METADATA_CLASS_NAME)
        # saves README.md if any
        readme = 'README.md'
        src_readme = os.path.join(idx_path, readme)
        if os.path.exists(src_readme):
            dst_readme = os.path.join(full_metadata_path, readme)
            try:
                shutil.copy2(src_readme, dst_readme)
            except Exception as e:
                log.error(output_messages['ERROR_COULD_NOT_FIND_README'],
                          class_name=METADATA_CLASS_NAME)
                raise e
        amount, workspace_size = self._get_amount_and_size_of_workspace_files(
            full_metadata_path, ws_path)
        # saves metadata and commit

        entity_spec_key = get_spec_key(self.__repo_type)
        metadata[entity_spec_key]['manifest']['files'] = MANIFEST_FILE
        metadata[entity_spec_key]['manifest']['size'] = humanize.naturalsize(
            workspace_size)
        metadata[entity_spec_key]['manifest']['amount'] = amount
        storage = metadata[entity_spec_key]['manifest'][STORAGE_SPEC_KEY]

        manifest = metadata[entity_spec_key]['manifest']
        PluginCaller(manifest).call(ADD_METADATA, ws_path, manifest)

        # Add metadata specific to labels ML entity type
        self._add_associate_entity_metadata(metadata, specs)
        self.__commit_spec(full_metadata_path, metadata)

        return storage
Example #9
    def get_entities(self):
        """Get a list of entities found in config.yaml.

        Returns:
            list of class Entity.
        """
        entities = []
        metadata_repository = namedtuple(
            'Repository',
            ['private', 'full_name', 'ssh_url', 'html_url', 'owner'])
        metadata_owner = namedtuple('Owner', ['email', 'name'])
        try:
            for type_entity in EntityType:
                self.__init_manager(type_entity.value)
                if not self._manager:
                    continue
                repository = metadata_repository(False, '', '', '',
                                                 metadata_owner('', ''))
                for obj in Repo(
                        self._manager.path).head.commit.tree.traverse():
                    if SPEC_EXTENSION in obj.name:
                        entity_spec = yaml_load_str(
                            io.BytesIO(obj.data_stream.read()))
                        entity = Entity(repository, entity_spec)
                        if entity.type in type_entity.value and entity not in entities:
                            entities.append(entity)
        except Exception as error:
            log.debug(
                output_messages['DEBUG_ENTITIES_RELATIONSHIP'].format(error),
                class_name=LocalEntityManager.__name__)

        return entities
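The traversal above relies on GitPython: Repo(path).head.commit.tree.traverse() yields every blob and tree in the checked-out commit, and obj.data_stream gives file-like access to a blob's content. A minimal standalone sketch of that idiom; the repository path is a placeholder, and the '.spec' extension is an assumption standing in for SPEC_EXTENSION:

from git import Repo

repo = Repo('/path/to/metadata/repo')  # placeholder path
for obj in repo.head.commit.tree.traverse():
    if obj.type == 'blob' and obj.name.endswith('.spec'):
        content = obj.data_stream.read()  # raw bytes of the spec file
        print(obj.path, len(content))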
Example #10
    def __commit_metadata(self, full_metadata_path, index_path, metadata,
                          specs, ws_path):
        idx_path = os.path.join(index_path, 'metadata', self._spec)
        log.debug('Commit spec [%s] to ml-git metadata' % self._spec,
                  class_name=METADATA_CLASS_NAME)
        # saves README.md if any
        readme = 'README.md'
        src_readme = os.path.join(idx_path, readme)
        if os.path.exists(src_readme):
            dst_readme = os.path.join(full_metadata_path, readme)
            try:
                shutil.copy2(src_readme, dst_readme)
            except Exception as e:
                log.error(
                    'Could not find file README.md. Entity repository must have README.md file',
                    class_name=METADATA_CLASS_NAME)
                raise e
        amount, workspace_size = self._get_amount_and_size_of_workspace_files(
            full_metadata_path, ws_path)
        # saves metadata and commit
        metadata[self.__repo_type]['manifest']['files'] = MANIFEST_FILE
        metadata[self.__repo_type]['manifest']['size'] = humanize.naturalsize(
            workspace_size)
        metadata[self.__repo_type]['manifest']['amount'] = amount
        store = metadata[self.__repo_type]['manifest']['store']

        manifest = metadata[self.__repo_type]['manifest']
        PluginCaller(manifest).call(ADD_METADATA, ws_path, manifest)

        # Add metadata specific to labels ML entity type
        self._add_associate_entity_metadata(metadata, specs)
        self.__commit_spec(full_metadata_path, metadata)

        return store
Example #11
    def _log(self, objkey, links=None, log_file=None):
        log.debug('Update log for key [%s]' % objkey,
                  class_name=HASH_FS_CLASS_NAME)
        log_file.write("%s\n" % objkey)
        for link in links or []:
            h = link['Hash']
            log_file.write("%s\n" % h)
Example #12
    def _get(self, file, key_path):
        bucket = self._bucket
        s3_resource = self._store

        res = s3_resource.Object(bucket, key_path).get()
        c = res['Body']
        log.debug('Get - downloading [%s] from bucket [%s] into file [%s]' %
                  (key_path, bucket, file),
                  class_name=S3_MULTI_HASH_STORE_NAME)
        with open(file, 'wb') as f:
            m = hashlib.sha256()
            while True:
                chunk = c.read(self._blk_size)
                if not chunk:
                    break
                m.update(chunk)
                f.write(chunk)
            h = m.hexdigest()
            mh = multihash.encode(bytes.fromhex(h), 'sha2-256')
            cid = CIDv1('dag-pb', mh)
            ncid = str(cid)
            if self.check_integrity(key_path, ncid) is False:
                return False
        c.close()
        return True
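The integrity check above recomputes the object's content identifier from the downloaded bytes: SHA-256 digest, wrapped as a multihash, wrapped as a CIDv1 with the 'dag-pb' codec, then compared against the storage key. The same derivation as a standalone sketch, reusing the py-multihash and py-cid calls from the method:

import hashlib

import multihash
from cid import CIDv1

def content_id(data):
    """Return the CIDv1 string that identifies `data` in the store."""
    digest = hashlib.sha256(data).hexdigest()
    mh = multihash.encode(bytes.fromhex(digest), 'sha2-256')
    return str(CIDv1('dag-pb', mh))

# A downloaded object is intact when content_id(object_bytes)
# equals the key_path it was fetched under.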
Example #13
    def fetch_scid(self, key, log_file=None):
        log.debug(output_messages['DEBUG_BUILDING_STORAGE_LOG'], class_name=HASH_FS_CLASS_NAME)
        if self._exists(key):
            links = self.load(key)
            self._log(key, links['Links'], log_file)
        else:
            log.debug(output_messages['DEBUG_BLOB_ALREADY_COMMITED'] % key, class_name=HASH_FS_CLASS_NAME)
Example #14
    def _check_integrity(self, cid, data):
        cid0 = self._digest(data)
        if cid == cid0:
            log.debug(output_messages['DEBUG_CHECKSUM_VERIFIED'] % cid, class_name=HASH_FS_CLASS_NAME)
            return True
        log.debug(output_messages['DEBUG_CORRUPTION_DETECTED'] % (cid, cid0), class_name=HASH_FS_CLASS_NAME)
        return False
Example #15
    def get(self, object_key, dst_file_path):
        size = 0
        descriptor = json_load(self._get_hashpath(object_key))
        json_objects = json.dumps(descriptor).encode()
        is_corrupted = not self._check_integrity(object_key, json_objects)
        if is_corrupted:
            return size
        successfully_wrote = True
        # concat all chunks to dstfile
        try:
            with open(dst_file_path, 'wb') as dst_file:
                for chunk in descriptor['Links']:
                    chunk_hash = chunk['Hash']
                    blob_size = chunk['Size']
                    log.debug(output_messages['DEBUG_GET_CHUNK'] % (chunk_hash, blob_size), class_name=HASH_FS_CLASS_NAME)
                    size += int(blob_size)

                    successfully_wrote = self._write_chunk_in_file(chunk_hash, dst_file)
                    if not successfully_wrote:
                        break
        except Exception as e:
            if os.path.exists(dst_file_path):
                os.remove(dst_file_path)
            raise e

        if not successfully_wrote:
            size = 0
            os.unlink(dst_file_path)
        return size
Example #16
    def _update_file_status(self, cache, filepath, fullpath, scid, st, value):
        status = Status.a.name
        prev_hash = value['hash']
        scid_ret = scid
        is_flexible = self._mutability == MutabilityType.FLEXIBLE.value
        is_strict = self._mutability == MutabilityType.STRICT.value
        not_unlocked = value['mtime'] != st.st_mtime and 'untime' not in value
        bare_mode = os.path.exists(
            os.path.join(self._path, 'metadata', self._spec, 'bare'))
        if (is_flexible and not_unlocked) or is_strict:
            if value['status'] == Status.c.name and 'previous_hash' in value:
                prev_hash = value['previous_hash']
                if scid == prev_hash:
                    prev_hash = None
                    status = Status.u.name
                    log.debug(output_messages['DEBUG_RESTORED_FILE'].format(
                        posix_path(filepath)),
                        class_name=MULTI_HASH_CLASS_NAME)
            else:
                status = Status.c.name
                scid_ret = None
                file_path = Cache(cache).get_keypath(value['hash'])
                if os.path.exists(file_path):
                    os.unlink(file_path)
        elif bare_mode and self._mutability == MutabilityType.MUTABLE.value:
            print('\n')
            log.warn(output_messages['WARN_FILE_EXISTS_IN_REPOSITORY'] %
                     filepath,
                     class_name=MULTI_HASH_CLASS_NAME)
        self.update_full_index(posix_path(filepath), fullpath, status, scid,
                               prev_hash)
        return scid_ret
Example #17
    def update_index_unlock(self, filename):
        findex = self.get_index()
        try:
            findex[filename]['untime'] = time.time()
        except Exception:
            log.debug('The file [{}] is not in the index'.format(filename), class_name=MULTI_HASH_CLASS_NAME)
        self._fidx.save()
Example #18
    def _submit_fn(self, userfn, *args, **kwds):
        ctx = self._get_ctx()

        result = False
        retry_cnt = 0
        while True:
            try:
                if ctx is not None:
                    result = userfn(ctx, *args, **kwds)
                else:
                    result = userfn(*args, **kwds)
            except Exception as e:
                if retry_cnt < self._retry:
                    retry_cnt += 1
                    log.warn('Worker exception - [%s] -- retry [%d]' %
                             (e, retry_cnt),
                             class_name=POOL_CLASS_NAME)
                    self._retry_wait(retry_cnt)
                    continue
                else:
                    log.error('Worker failure - [%s] -- [%d] attempts' %
                              (e, retry_cnt),
                              class_name=POOL_CLASS_NAME)
                    self._release_ctx(ctx)
                    raise e
            break

        log.debug('Worker success at attempt [%d]' % (retry_cnt + 1),
                  class_name=POOL_CLASS_NAME)
        self._release_ctx(ctx)
        self._progress()

        return result
Example #19
    def _delete(self, key_path):
        bucket = self._bucket
        s3_resource = self._store
        log.debug('Delete - deleting [%s] from bucket [%s]' %
                  (key_path, bucket),
                  class_name=S3_MULTI_HASH_STORE_NAME)
        return s3_resource.Object(bucket, key_path).delete()
Example #20
def search_spec_file(repotype, spec, categories_path):
    root_path = get_root_path()
    dir_with_cat_path = os.path.join(root_path, repotype, categories_path, spec)
    dir_without_cat_path = os.path.join(root_path, repotype, spec)

    files = None
    dir_files = None

    try:
        files = os.listdir(dir_with_cat_path)
        dir_files = dir_with_cat_path
    except Exception:
        try:
            files = os.listdir(dir_without_cat_path)
            dir_files = dir_without_cat_path
        except Exception:  # TODO: search '.' path as well
            # if 'files' is still None, neither directory could be found,
            # which means that the entity name passed is wrong
            if files is None:
                raise SearchSpecException('The entity name passed is wrong. Please check again')

    if len(files) > 0:
        for file in files:
            if spec in file:
                log.debug('search spec file: found [%s]-[%s]' % (dir_files, file), class_name=ML_GIT_PROJECT_NAME)
                return dir_files, file
    raise SearchSpecException('The entity name passed is wrong. Please check again')
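In other words, the lookup probes two workspace layouts and returns the first match. A hypothetical call; the entity and category names below are made up for illustration:

# search_spec_file('dataset', 'imagenet8', 'vision-computing/images')
# probes, in order:
#   <root>/dataset/vision-computing/images/imagenet8/  (with categories path)
#   <root>/dataset/imagenet8/                          (without categories path)
# and returns (directory, filename) for the first entry whose name
# contains 'imagenet8' -- typically the 'imagenet8.spec' file.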
Example #21
    def _delete(self, key_path, version=None):
        bucket = self._bucket
        s3_resource = self._storage
        log.debug(output_messages['DEBUG_DELETING_FROM_BUCKET'] % (key_path, bucket), class_name=S3STORAGE_NAME)
        if version is not None:
            return s3_resource.Object(bucket, key_path).delete(VersionId=version)
        else:
            return s3_resource.Object(bucket, key_path).delete()
Example #22
    def update_log(self, files_to_keep):
        log.debug(output_messages['DEBUG_UPDATE_LOG_LIST_FILES'], class_name=HASH_FS_CLASS_NAME)
        fullpath = os.path.join(self._logpath, STORAGE_LOG)
        if not os.path.exists(fullpath):
            return None
        with open(fullpath, 'w') as log_file:
            for file in files_to_keep:
                log_file.write("%s\n" % file)
Example #23
    def update_index_unlock(self, filename):
        findex = self.get_index()
        try:
            findex[filename]['untime'] = time.time()
        except Exception:
            log.debug(output_messages['DEBUG_FILE_NOT_INDEX'].format(filename),
                      class_name=MULTI_HASH_CLASS_NAME)
        self._fidx.save()
Example #24
    def connect(self):
        log.debug('Connect - Storage [%s] ;' % self._store_type,
                  class_name=AZURE_STORE_NAME)
        try:
            self._store = BlobServiceClient.from_connection_string(
                self._account, connection_timeout=300)
        except Exception:
            raise RuntimeError('Unable to connect to the Azure storage.')
Example #25
    def check_integrity(self, cid, ncid):
        if cid == ncid:
            log.debug(output_messages['DEBUG_CHECKSUM_VERIFIED'] % cid,
                      class_name=MULTI_HASH_STORAGE_NAME)
            return True
        log.debug(output_messages['DEBUG_CORRUPTION_DETECTED'] % (cid, ncid),
                  class_name=MULTI_HASH_STORAGE_NAME)
        return False
Example #26
    def bucket_exists(self):
        container = ContainerClient.from_connection_string(self._account, self._bucket, connection_timeout=300)
        try:
            container.get_container_properties()
            log.debug(output_messages['DEBUG_CONTAINER_ALREADY_EXISTS'] % self._bucket, class_name=AZURE_STORAGE_NAME)
            return True
        except Exception:
            return False
Example #27
    def check_integrity(self, cid, ncid):
        if cid == ncid:
            log.debug('Checksum verified for chunk [%s]' % cid,
                      class_name=MULTI_HASH_STORE_NAME)
            return True
        log.error('Corruption detected for chunk [%s] - got [%s]' %
                  (cid, ncid),
                  class_name=MULTI_HASH_STORE_NAME)
        return False
Example #28
    def check_exists(self):
        log.debug('Metadata check existence [%s] @ [%s]' %
                  (self.__git, self.__path),
                  class_name=METADATA_MANAGER_CLASS_NAME)
        try:
            Repo(self.__path)
        except Exception:
            return False
        return True
Example #29
    def update_log(self, files_to_keep):
        log.debug('Update hashfs log with a list of files to keep',
                  class_name=HASH_FS_CLASS_NAME)
        fullpath = os.path.join(self._logpath, 'store.log')
        if not os.path.exists(fullpath):
            return None
        with open(fullpath, 'w') as log_file:
            for file in files_to_keep:
                log_file.write("%s\n" % file)
Example #30
    def fetch_scid(self, key, log_file=None):
        log.debug('Building the store.log with these added files',
                  class_name=HASH_FS_CLASS_NAME)
        if self._exists(key):
            links = self.load(key)
            self._log(key, links['Links'], log_file)
        else:
            log.debug('Blob %s already committed' % key,
                      class_name=HASH_FS_CLASS_NAME)
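Taken together with update_log (Example #29) and _log (Example #11), this method maintains the hashfs store.log: fetch_scid appends a pushed key and its chunk hashes, while update_log later rewrites the file to keep only surviving entries. A hedged usage sketch; 'objects', 'pushed_keys', and 'keys_to_keep' are assumed names standing in for any instance and key lists:

import os

# Hypothetical flow -- 'objects' is any object exposing the methods above.
log_path = os.path.join(objects._logpath, 'store.log')
with open(log_path, 'a') as log_file:
    for key in pushed_keys:                # keys just sent to the store
        objects.fetch_scid(key, log_file)  # appends key + chunk hashes
objects.update_log(keys_to_keep)           # later: prune to surviving keys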