def store_path(self, artifact, path, name=None, checksum=True, max_objects=None):
    self.init_gcs()
    bucket, key = self._parse_uri(path)
    max_objects = max_objects or DEFAULT_MAX_OBJECTS
    if not checksum:
        return [ArtifactManifestEntry(name or key, path, digest=path)]

    start_time = None
    obj = self._client.bucket(bucket).get_blob(key)
    # If the key does not resolve to a single blob, treat it as a prefix and
    # track every object underneath it (up to max_objects).
    multi = obj is None
    if multi:
        start_time = time.time()
        termlog(
            'Generating checksum for up to %i objects with prefix "%s"... '
            % (max_objects, key),
            newline=False,
        )
        objects = self._client.bucket(bucket).list_blobs(
            prefix=key, max_results=max_objects
        )
    else:
        objects = [obj]

    entries = [
        self._entry_from_obj(obj, path, name, prefix=key, multi=multi)
        for obj in objects
    ]
    if start_time is not None:
        termlog("Done. %.1fs" % (time.time() - start_time), prefix=False)
    if len(entries) >= max_objects:
        raise ValueError(
            "Exceeded %i objects tracked, pass max_objects to add_reference"
            % max_objects
        )
    return entries
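# The sketch below is illustrative and not part of the handler: it shows how
# this GCS code path is normally reached through the public Artifact API by
# adding a gs:// reference. The project, artifact, and bucket names are
# hypothetical placeholders.
def _example_track_gcs_reference():
    import wandb

    run = wandb.init(project="demo")
    artifact = wandb.Artifact("raw-images", type="dataset")
    # A single object yields one manifest entry; a prefix fans out to up to
    # max_objects entries via the multi branch above.
    artifact.add_reference("gs://my-bucket/images/", max_objects=10000)
    run.log_artifact(artifact)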
def add_dir(self, local_path: str, name: Optional[str] = None):
    self._ensure_can_add()
    if not os.path.isdir(local_path):
        raise ValueError("Path is not a directory: %s" % local_path)

    termlog(
        "Adding directory to artifact (%s)... "
        % os.path.join(".", os.path.normpath(local_path)),
        newline=False,
    )
    start_time = time.time()

    paths = []
    for dirpath, _, filenames in os.walk(local_path, followlinks=True):
        for fname in filenames:
            physical_path = os.path.join(dirpath, fname)
            logical_path = os.path.relpath(physical_path, start=local_path)
            if name is not None:
                logical_path = os.path.join(name, logical_path)
            paths.append((logical_path, physical_path))

    def add_manifest_file(log_phy_path):
        logical_path, physical_path = log_phy_path
        self._add_local_file(logical_path, physical_path)

    import multiprocessing.dummy  # this uses threads

    NUM_THREADS = 8
    pool = multiprocessing.dummy.Pool(NUM_THREADS)
    pool.map(add_manifest_file, paths)
    pool.close()
    pool.join()

    termlog("Done. %.1fs" % (time.time() - start_time), prefix=False)
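# Illustrative only: a hedged sketch of calling add_dir through the public
# Artifact API. The project name and local directory are hypothetical; the
# directory contents are hashed by the thread pool above and uploaded when the
# artifact is logged.
def _example_add_local_directory():
    import wandb

    run = wandb.init(project="demo")
    artifact = wandb.Artifact("training-data", type="dataset")
    # Files land in the manifest under "images/<path relative to the dir>".
    artifact.add_dir("./data/images", name="images")
    run.log_artifact(artifact)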
def store_path(self, artifact, path, name=None, checksum=True, max_objects=None):
    url = urlparse(path)
    local_path = "%s%s" % (url.netloc, url.path)
    max_objects = max_objects or DEFAULT_MAX_OBJECTS

    # We have a single file or directory.
    # Note: we follow symlinks for files contained within the directory.
    entries = []
    if not checksum:
        return [
            ArtifactManifestEntry(name or os.path.basename(path), path, digest=path)
        ]

    if os.path.isdir(local_path):
        i = 0
        start_time = time.time()
        termlog(
            'Generating checksum for up to %i files in "%s"...\n'
            % (max_objects, local_path),
            newline=False,
        )
        for root, dirs, files in os.walk(local_path):
            for sub_path in files:
                i += 1
                if i >= max_objects:
                    raise ValueError(
                        "Exceeded %i objects tracked, pass max_objects to add_reference"
                        % max_objects
                    )
                physical_path = os.path.join(root, sub_path)
                logical_path = os.path.relpath(physical_path, start=local_path)
                entry = ArtifactManifestEntry(
                    logical_path,
                    os.path.join(path, logical_path),
                    size=os.path.getsize(physical_path),
                    digest=md5_file_b64(physical_path),
                )
                entries.append(entry)
        termlog("Done. %.1fs" % (time.time() - start_time), prefix=False)
    elif os.path.isfile(local_path):
        name = name or os.path.basename(local_path)
        entry = ArtifactManifestEntry(
            name,
            path,
            size=os.path.getsize(local_path),
            digest=md5_file_b64(local_path),
        )
        entries.append(entry)
    else:
        # TODO: update error message if we don't allow directories.
        raise ValueError('Path "%s" must be a valid file or directory path' % path)
    return entries
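# A minimal sketch of what a helper like md5_file_b64 could look like, assuming
# it returns the base64-encoded MD5 digest of a file read in chunks. This is an
# illustration of the checksum used above, not the library's actual
# implementation.
def _example_md5_file_b64(path, chunk_size=1024 * 1024):
    import base64
    import hashlib

    hasher = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)
    return base64.b64encode(hasher.digest()).decode("ascii")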
def store_path(self, artifact, path, name=None, checksum=True, max_objects=None):
    self.init_boto()
    bucket, key = self._parse_uri(path)
    max_objects = max_objects or DEFAULT_MAX_OBJECTS
    if not checksum:
        return [ArtifactManifestEntry(name or key, path, digest=path)]

    objs = [self._s3.Object(bucket, key)]
    start_time = None
    multi = False
    try:
        objs[0].load()
    except self._botocore.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "404":
            multi = True
            start_time = time.time()
            termlog(
                'Generating checksum for up to %i objects with prefix "%s"... '
                % (max_objects, key),
                newline=False,
            )
            objs = (
                self._s3.Bucket(bucket)
                .objects.filter(Prefix=key)
                .limit(max_objects)
            )
        else:
            raise CommError(
                "Unable to connect to S3 (%s): %s"
                % (e.response["Error"]["Code"], e.response["Error"]["Message"])
            )

    # Weird iterator scoping makes us assign this to a local function
    size = self._size_from_obj
    entries = [
        self._entry_from_obj(obj, path, name, prefix=key, multi=multi)
        for obj in objs
        if size(obj) > 0
    ]
    if start_time is not None:
        termlog("Done. %.1fs" % (time.time() - start_time), prefix=False)
    if len(entries) >= max_objects:
        raise ValueError(
            "Exceeded %i objects tracked, pass max_objects to add_reference"
            % max_objects
        )
    return entries
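# A hedged sketch of a helper like _size_from_obj, assuming boto3's s3.Object
# exposes content_length while ObjectSummary (yielded by a filtered listing)
# exposes size. Illustrative only; the real helper may differ.
def _example_size_from_obj(obj):
    return obj.size if hasattr(obj, "size") else obj.content_length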