def __init__(self, project: str = "brain-deepviz", bucket: str = "lucid-flow") -> None:
    """Store the project and bucket names and create a local scratch directory.

    The cached bucket handle and file listing are both initialised to None.
    """
    self._bucket = None
    self._file_list = None
    # Fresh temporary directory, wrapped as an AbsolutePath.
    self.tempdir = AbsolutePath(mkdtemp())
    self.bucket_name = bucket
    self.project_name = project
def _writing(self, path: AbsolutePath, mode: str = "w+b") -> IO:
    """Yield a local scratch file, then upload it to the bucket blob for `path`.

    The file is opened under `self.tempdir` at the relative form of `path`
    (parent directories are created as needed) and handed to the caller.

    Args:
        path: Absolute path identifying the destination blob.
        mode: Mode used to open the local scratch file.

    Yields:
        The open local file object to write into.

    If the caller's block raises, the scratch file is still closed, the
    exception propagates, and nothing is uploaded.
    """
    relative_path = path.as_relative_path()
    blob = storage.blob.Blob(relative_path, self.bucket)
    local_path = self.tempdir.append(relative_path)
    makedirs(dirname(local_path), exist_ok=True)
    writing_file = localfs_open(local_path, mode=mode)
    try:
        yield writing_file
    finally:
        # Always release the handle, even when the caller's block failed.
        writing_file.close()
    # Only reached on success: upload the fully written, closed file.
    blob.upload_from_filename(local_path)
def load(path: str, transform: str = "None") -> Sequence:
    """Load the file at the absolute `path` and optionally split it into lines.

    Args:
        path: Path to load; must start with "/" (enforced by an assert).
        transform: When equal to "lines", the loaded result is split on "\\n";
            any other value returns the loaded result untouched.

    Returns:
        Whatever `lucid_io_load` produced, or its "\\n"-split list.
    """
    assert path.startswith("/")
    # path = PathTemplate.path_template_prefix + raw_path  # TODO: rethink
    with io.reading(AbsolutePath(path)) as handle:
        loaded = lucid_io_load(handle)
    if transform != "lines":
        return loaded
    return loaded.split("\n")
def _glob(self, glob_path: AbsolutePath) -> List[AbsolutePath]:
    """Return all paths in the cached file list matching `glob_path`.

    The pattern is tried both with and without a trailing slash, and the
    matches of both attempts are concatenated in that order.

    Args:
        glob_path: Glob pattern expressed as an AbsolutePath.

    Returns:
        The matching paths; the result is neither sorted nor de-duplicated.
    """
    # GCS returns folders iff a trailing slash is specified, so we try both:
    if glob_path.endswith("/"):
        other_path = AbsolutePath(glob_path[:-1])
    else:
        other_path = AbsolutePath(glob_path + "/")

    matched_paths: List[AbsolutePath] = []
    for glob_string in (glob_path, other_path):
        matched_paths += self.file_list.glob(glob_string)
    # No sorted(set(...)) pass: the two attempts should already be unique.
    return matched_paths
def value_for_output(cls, output: str) -> str:
    """Convert a task output into the value that should be recorded for it.

    Strings starting with "/" are turned into an AbsoluteGCSURL built from the
    corresponding AbsolutePath; any other string is returned unchanged.

    Raises:
        NotImplementedError: if `output` is not a string.
    """
    if not isinstance(output, str):
        raise NotImplementedError
    # TODO: decide if we can't find a better way to represent paths
    # E.g. maybe we should just not allow tasks to save their own results.
    # or provide a write_handle or sth.
    if not output.startswith("/"):
        return output
    return AbsoluteGCSURL.from_absolute_path(AbsolutePath(output))
def normpath(self, path: str) -> AbsolutePath:
    """Strip a leading "gs://" scheme and bucket name and wrap the rest.

    Args:
        path: Either a plain path or a "gs://<bucket>/..." style string.

    Returns:
        An AbsolutePath built from what remains; a leading slash is kept.
    """
    scheme = "gs://"
    remainder = path[len(scheme):] if path.startswith(scheme) else path
    bucket_prefix = self.bucket_name
    if remainder.startswith(bucket_prefix):
        remainder = remainder[len(bucket_prefix):]
    # The leading "/" is deliberately not removed here.
    return AbsolutePath(remainder)
def test_joining_absolute_paths():
    """Prepending a prefix to a relative path equals appending that path to the prefix."""
    suffix_source = AbsolutePath("/an/absolute/path.ext")
    prefix = AbsolutePath("/prefix/absolute/path")
    suffix = suffix_source.as_relative_path()

    via_prepend = suffix.prepend(prefix)
    via_append = prefix.append(suffix)

    assert via_prepend == via_append
def value_for_input(cls, input: object) -> object:
    """Resolve a declared task input into the value handed to the task.

    * str: strings starting with "/" become an AbsoluteGCSURL, others pass through.
    * dict with exactly one entry: treated as an aggregating input spec whose
      key is a comma-separated placeholder list and whose value is a path
      template. The template's glob is expanded, and every matching path is
      stored under the tuple of its placeholder values.
    * any other dict, or int/float/tuple/list/set: returned unchanged.

    Raises:
        NotImplementedError: for any other kind of input.
    """
    if isinstance(input, str):
        # TODO: here is where we would add convenience loading etc.
        if not input.startswith("/"):
            return input
        return AbsoluteGCSURL.from_absolute_path(AbsolutePath(input))

    if isinstance(input, dict):
        if len(input) != 1:
            logging.debug(
                "input dict had multiple entries, simply setting value.")
            return input

        # A single entry signifies an aggregating input spec; resolve it.
        logging.debug("input dict had one entry, assuming AggregatingIS")
        (placeholder_string, path_template_string), = input.items()
        path_template = PathTemplate(path_template_string)
        placeholders = placeholder_string.split(",")
        assert placeholders == path_template.placeholders

        resolved = {}
        for candidate in io.glob(path_template.glob):
            matches = path_template.match(candidate)
            if not matches:
                continue
            key = tuple(matches[ph] for ph in placeholders)
            resolved[key] = AbsoluteGCSURL.from_absolute_path(
                AbsolutePath(candidate))
        return resolved

    if not isinstance(input, (int, float, tuple, list, dict, set)):
        raise NotImplementedError
    return input
def test_absolute_path_relative():
    """Building an AbsolutePath from a relative path must raise ValueError."""
    relative_string = "a/relative/path.ext"
    with pytest.raises(ValueError):
        AbsolutePath(relative_string)
def test_absolute_path():
    """Building an AbsolutePath from a "/"-rooted string must not raise."""
    AbsolutePath("/an/absolute/path.ext")
def exist(self, paths: List[AbsolutePath]) -> List[bool]:
    """Normalise every path, then delegate the existence check to `_exist`.

    Args:
        paths: Paths to check.

    Returns:
        Whatever `_exist` returns for the normalised paths.
    """
    normalized = []
    for raw_path in paths:
        normalized.append(AbsolutePath(self.normpath(raw_path)))
    return self._exist(normalized)
def test_file_list_glob_negative():
    """A glob under a different directory must not return the stored path."""
    stored = AbsolutePath("/an/absolute/path.ext")
    pattern = AbsolutePath("/a/different/*/path.ext")
    listing = FileList(paths=[stored])

    matches = listing.glob(pattern)

    assert stored not in matches
def test_absolute_path_basename():
    """`basename` is the last path component, extension included."""
    sample = AbsolutePath("/an/absolute/path.ext")
    assert sample.basename == "path.ext"
class TaskSpec(object):
    """Data object describing which inputs a task expects."""

    # Glob that a path must match to be considered a task specification.
    task_specification_glob = AbsolutePath("/tasks/*.py")
    # Maps each declared variable to the input specs that declare it.
    variable_to_input_spec: Mapping[Variable, List[InputSpec]]

    def __init__(self, inputs: List[InputSpec], output: OutputSpec,
                 src_path: str, name: str) -> None:
        """Store the specs and index the input specs by declared variable.

        Args:
            inputs: Input specs of the task.
            output: Output spec of the task.
            src_path: Path of the task's source file.
            name: Name of the task.
        """
        self.input_specs = inputs
        self.output_spec = output
        self.src_path = src_path
        self.name = name
        self._verify_placeholders()
        self.variable_to_input_spec = defaultdict(list)
        for input_spec in inputs:
            for variable in input_spec.declared_variables():
                self.variable_to_input_spec[variable].append(input_spec)

    def __repr__(self) -> str:
        ios: List[Spec] = cast(List[Spec], self.input_specs) + [self.output_spec]
        reprs = ", ".join([repr(input) for input in ios])
        return "<TaskSpec {}, ({})>".format(self.name, reprs)

    def _verify_placeholders(self) -> None:
        """Currently a no-op; the intended check is kept below for reference."""
        return
        # TODO: this currently doesn't correctly cover all cases. Disabled for now.
        # outputs = set(self.output_spec.placeholders)
        # inputs = set.union(*[input_spec.depends_on() for input_spec in self.input_specs])
        # inputs = inputs.union(set(input_spec.name for input_spec in self.input_specs))
        # if not outputs == inputs:
        #     if outputs.issuperset(inputs):
        #         difference = outputs - inputs
        #         raise ValueError("Placeholders '{}' in task_spec '{}' do not have input variables that could replace them. (Inputs: {})".format(difference, self.name, inputs))
        #     else:  # TODO: this covers both subset and entirely disjoint. is the error message bringing that across? no.
        #         difference = inputs - outputs
        #         raise ValueError("Input variables '{}' in task_spec '{}' do not have any corresponding output placeholders. (Inputs: {})".format(difference, self.name, inputs))

    @classmethod
    def is_task_path(cls, path: str) -> bool:
        """Return whether `path` matches `task_specification_glob`."""
        return fnmatch.fnmatch(path, cls.task_specification_glob)

    @staticmethod
    def estimate_cost(num_jobs: int, example_runs: List[JobSpec]) -> None:
        """Print naive CPU time and cost estimates based on supplied sample runs."""
        # NOTE(review): the message says "95% conf. intervals", but the values
        # printed below are mean ± one standard deviation scaled by num_jobs.
        print(
            "Estimates are 95% conf. intervals based on std of supplied runs. Only reasonable if colab instance has similar specs as requested AE instances!"
        )
        durations = [run.execution_duration for run in example_runs]
        duration_mean, duration_std = mean(durations), std(durations)
        cpu_time_mean = timedelta(seconds=num_jobs * duration_mean)
        cpu_time_std = timedelta(seconds=num_jobs * duration_std)
        print(
            f"Expecting to use {format_timedelta(cpu_time_mean)}±{format_timedelta(cpu_time_std)} of CPU time."
        )
        price_per_hour_in_usd = (
            0.0526 + 2 * 0.0071
        )  # 1CPU, 2GB RAM, https://cloud.google.com/appengine/pricing#flexible-environment-instances
        total_price_mean = price_per_hour_in_usd * (
            cpu_time_mean.total_seconds() / (60 * 60))
        total_price_std = price_per_hour_in_usd * (
            cpu_time_std.total_seconds() / (60 * 60))
        print(
            f"Expecting to cost ${total_price_mean:.2f}±{total_price_std:.2f}."
        )

    @property
    def manifest_path(self) -> str:
        """The source path with its ".py" extension swapped for ".json"."""
        suffix = ".py"
        if self.src_path.endswith(suffix):
            # Swap only the extension: a blanket str.replace would also rewrite
            # ".py" occurring earlier in the path (e.g. a ".pyenv" directory).
            return self.src_path[:-len(suffix)] + ".json"
        # No ".py" extension: keep the historical substitution unchanged.
        return self.src_path.replace(".py", ".json")

    @property
    def input_names(self) -> List[str]:
        """Names of all input specs, in declaration order."""
        return [input_spec.name for input_spec in self.input_specs]

    @property
    def dependencies(self) -> Mapping[Variable, Set[Variable]]:
        """Map each input spec's name to the result of its `depends_on()`."""
        return {spec.name: spec.depends_on() for spec in self.input_specs}

    def to_job_spec(self, bindings: Bindings) -> "JobSpec":
        """Build the JobSpec for one set of bindings."""
        str_bindings = stringify_bindings(bindings)
        output_path = self.output_spec.with_replacements(str_bindings)
        return JobSpec(bindings, output_path, self.src_path)

    def to_job_specs(self, initial_bindings: Optional[Bindings] = None
                     ) -> Iterable[JobSpec]:
        """Lazily map every resolved binding set to a JobSpec.

        Args:
            initial_bindings: Bindings to start from; None means no bindings.
        """
        return map(self.to_job_spec, self.all_bindings(initial_bindings))

    def all_bindings(self, initial_bindings: Optional[Bindings] = None
                     ) -> Sequence[Bindings]:
        """Expand the initial bindings with every value each input spec yields.

        Variables are processed in topological order of `self.dependencies`.
        For each input spec of a variable, every binding set collected so far
        is extended once per value returned by `input_spec.values`.

        Args:
            initial_bindings: Bindings to start from. None means a fresh empty
                dict, so no dict is shared between calls.

        Returns:
            The list of fully expanded binding dicts.
        """
        # TODO: return empty list if self.dependencies is empty???
        if initial_bindings is None:
            initial_bindings = {}
        all_bindings = [initial_bindings]
        sorted_dependencies = toposort_flatten(self.dependencies)
        logging.debug("Sorted sorted_dependencies: %s", sorted_dependencies)
        for variable_name in sorted_dependencies:
            variable = Variable(variable_name)
            input_specs = self.variable_to_input_spec[variable]
            for input_spec in input_specs:
                relevant_vars = input_spec.depends_on() | set(
                    [input_spec.name])
                logging.debug(
                    f"Resolving '{variable}' via {input_spec} on relevant vars {relevant_vars}."
                )
                new_bindings: List[Bindings] = []
                # Cache of values keyed by the bindings of the relevant
                # variables only, so equal keys reuse one `values` call.
                memoized_values: Dict[FrozenSet[Tuple[str, str]],
                                      Set[Value]] = {}
                for bindings in all_bindings:
                    relevant_bs = frozenset((var, str(value))
                                            for var, value in bindings.items()
                                            if var in relevant_vars)
                    if relevant_bs in memoized_values:
                        values = memoized_values[relevant_bs]
                        logging.debug(
                            "Found cached values %s for bindings %s",
                            list(values),
                            bindings,
                        )
                    else:
                        values = input_spec.values(variable, bindings)
                        memoized_values[relevant_bs] = values
                        logging.debug("Memoized values %s for bindings %s",
                                      list(values), bindings)
                    for value in values:
                        # Existing bindings win over the newly resolved value.
                        value_binding = {variable: value}
                        value_binding.update(bindings)
                        new_bindings.append(value_binding)
                # logging.debug("New bindings: %s", new_bindings)
                all_bindings = new_bindings
            logging.debug("Done resolving: %s", variable)
        # TODO: what if new_bindings empty because values empty?
        return all_bindings

    def matching_input_spec(self, src_path: str) -> Optional[InputSpec]:
        """Return the first input spec whose `matches(src_path)` is truthy, else None."""
        for input_spec in self.input_specs:
            if input_spec.matches(src_path):
                return input_spec
        return None

    def should_handle_file(self, src_path: str) -> bool:
        """Return whether any input spec matches `src_path`."""
        return self.matching_input_spec(src_path) is not None

    def manifest(self, all_bindings: Optional[List[Bindings]] = None) -> Dict:
        """Build the manifest dict: the output template plus sorted binding values.

        Args:
            all_bindings: Binding sets to describe. When None or empty they
                are computed via `self.all_bindings()`.
        """
        bindings = all_bindings or self.all_bindings()
        keys = self.output_spec.placeholders
        assignments = sorted([binding[key] for key in keys]
                             for binding in bindings)
        return {
            "output": {
                "template": self.output_spec.path_template.template
            },
            "bindings": {
                "values": assignments
            },
        }

    def preflight(self, num_tried_jobs: int = 3) -> None:
        """Execute a random sample of jobs, then print a cost estimate.

        Args:
            num_tried_jobs: Upper bound on how many jobs to run. Fewer are run
                when the task produces fewer jobs than this.

        Raises:
            ValueError: if the task produces no jobs at all.
        """
        logging.info(f"Starting preflight, running {num_tried_jobs} jobs...")
        job_specs = list(self.to_job_specs())
        if not job_specs:
            raise ValueError(
                "Cannot run preflight: the task spec produced no jobs.")
        # random.sample raises if asked for more items than exist, so cap it.
        sample_size = min(num_tried_jobs, len(job_specs))
        preflight_jobs = sample(job_specs, sample_size)
        for job in preflight_jobs:
            job.execute()
            logging.info("Job completed without error.")
        self.estimate_cost(len(job_specs), preflight_jobs)

    def deploy(self, preflight: bool = True) -> None:
        """Optionally run the preflight, then upload the source to "tasks/<name>"."""
        if preflight:
            self.preflight()
        remote_path = f"tasks/{self.name}"
        io.upload(self.src_path, remote_path)
def absolute_path():
    """Provide the sample AbsolutePath "/an/absolute/path.ext"."""
    sample_path = AbsolutePath("/an/absolute/path.ext")
    return sample_path
class GCStorageAdapter(IOAdapter):
    """IOAdapter that stages files in a local temp directory and syncs them with a storage bucket.

    The bucket handle and the file listing are created lazily on first access
    and cached on the instance.
    """

    # Cached listing used for glob/exists queries; None until first use.
    _file_list: Optional[FileList]
    # Cached bucket handle; None until first use.
    _bucket: Optional[Any]

    def __init__(self, project: str = "brain-deepviz",
                 bucket: str = "lucid-flow") -> None:
        """Store the project and bucket names and create the scratch directory."""
        self.project_name = project
        self.bucket_name = bucket
        self.tempdir = AbsolutePath(mkdtemp())
        self._file_list = None
        self._bucket = None

    @property
    def bucket(self) -> Any:
        """The bucket handle, created together with its client on first access."""
        if not self._bucket:
            self._client = storage.Client(project=self.project_name)
            self._bucket = self._client.bucket(self.bucket_name)
        return self._bucket

    @property
    def file_list(self) -> FileList:
        """The FileList for this bucket, populated on first access."""
        if not self._file_list:
            self._file_list = FileList(project=self.project_name,
                                       bucket=self.bucket_name)
            self._file_list._get_all_gcs_files()
        return self._file_list

    def normpath(self, path: str) -> AbsolutePath:
        """Strip a leading "gs://" scheme and bucket name; keep the leading slash."""
        if path.startswith("gs://"):
            path = path[5:]
        if path.startswith(self.bucket_name):
            path = path[len(self.bucket_name):]
        return AbsolutePath(path)

    @contextmanager
    def _reading(self, path: AbsolutePath, mode: str = "r+b") -> IO:
        """Download `path` and yield the local copy opened with `mode`.

        The file is closed when the with-block exits, including on error.
        """
        local_path = self._download(path)
        reading_file = localfs_open(local_path, mode=mode)
        try:
            yield reading_file
        finally:
            # Release the handle even if the caller's block raised.
            reading_file.close()

    @contextmanager
    def _writing(self, path: AbsolutePath, mode: str = "w+b") -> IO:
        """Yield a local scratch file, then upload it to the blob for `path`.

        If the caller's block raises, the file is still closed, the exception
        propagates, and nothing is uploaded.
        """
        relative_path = path.as_relative_path()
        blob = storage.blob.Blob(relative_path, self.bucket)
        local_path = self.tempdir.append(relative_path)
        makedirs(dirname(local_path), exist_ok=True)
        writing_file = localfs_open(local_path, mode=mode)
        try:
            yield writing_file
        finally:
            # Release the handle even if the caller's block raised.
            writing_file.close()
        # Only reached on success: upload the fully written, closed file.
        blob.upload_from_filename(local_path)

    def _makedirs(self, path: str) -> None:
        """Do nothing; this adapter performs no directory creation here."""
        pass

    def _glob(self, glob_path: AbsolutePath) -> List[AbsolutePath]:
        """Return cached-listing matches for `glob_path`, with and without a trailing slash."""
        # GCS returns folders iff a trailing slash is specified, so we try both:
        if glob_path.endswith("/"):
            other_path = AbsolutePath(glob_path[:-1])
        else:
            other_path = AbsolutePath(glob_path + "/")

        matched_paths: List[AbsolutePath] = []
        for glob_string in (glob_path, other_path):
            matched_paths += self.file_list.glob(glob_string)
        # No sorted(set(...)) pass: the two attempts should already be unique.
        return matched_paths

    def _exist(self, paths: List[AbsolutePath]) -> List[bool]:
        """Return, per path, what the cached file list's `exists` reports."""
        return [self.file_list.exists(path) for path in paths]

    def _download(self, path: AbsolutePath) -> AbsolutePath:
        """Download the blob for `path` into the scratch directory and return the local path."""
        local_path = self.tempdir.append(path.as_relative_path())
        makedirs(dirname(local_path), exist_ok=True)
        blob = storage.blob.Blob(path.as_relative_path(), self.bucket)
        blob.download_to_filename(local_path)
        return local_path

    def _upload(self, local_path: str, remote_path: RelativePath) -> None:
        """Upload the local file to the blob named `remote_path` (must not start with "/")."""
        assert not remote_path.startswith("/")
        blob = storage.blob.Blob(remote_path, self.bucket)
        blob.upload_from_filename(local_path)
def test_gcs_connection(empty_file_list):
    """After fetching the GCS listing, "/data/noop" must be reported as existing."""
    empty_file_list._get_all_gcs_files()
    probe = AbsolutePath("/data/noop")
    assert empty_file_list.exists(probe)
def test_file_list_invalid_glob_fails_loudly():
    """Globbing with an AbsoluteURL instead of an AbsolutePath must raise ValueError."""
    url_pattern = AbsoluteURL("gs://lucid-flow/an/*/path.ext")
    stored = AbsolutePath("/an/absolute/path.ext")
    listing = FileList(paths=[stored])

    with pytest.raises(ValueError):
        listing.glob(url_pattern)
def test_absolute_path_url():
    """Building an AbsolutePath from a "gs://" URL must raise ValueError."""
    url_string = "gs://an/absolute/url.ext"
    with pytest.raises(ValueError):
        AbsolutePath(url_string)
def test_absolute_path_dirname():
    """`dirname` is the path without its final component."""
    sample = AbsolutePath("/an/absolute/path.ext")
    assert sample.dirname == "/an/absolute"
def _download(self, path: AbsolutePath) -> AbsolutePath:
    """Fetch the blob for `path` into the scratch directory.

    Args:
        path: Absolute path identifying the blob to fetch.

    Returns:
        The local path (under `self.tempdir`) the blob was downloaded to.
    """
    destination = self.tempdir.append(path.as_relative_path())
    # Make sure the parent directory exists before downloading into it.
    parent_dir = dirname(destination)
    makedirs(parent_dir, exist_ok=True)
    remote_blob = storage.blob.Blob(path.as_relative_path(), self.bucket)
    remote_blob.download_to_filename(destination)
    return destination
def glob(self, glob_string: AbsolutePath) -> List[AbsolutePath]:
    """Return the stored paths that match the fnmatch pattern `glob_string`.

    Args:
        glob_string: Pattern to match; must be an AbsolutePath instance.

    Returns:
        The matching entries of `self.paths`, each wrapped as an AbsolutePath.

    Raises:
        ValueError: if `glob_string` is not an AbsolutePath.
    """
    if isinstance(glob_string, AbsolutePath):
        matching = fnmatch.filter(self.paths, glob_string)
        return list(map(AbsolutePath, matching))
    raise ValueError("Can only use AbsolutePath objects with FileList!")
def test_file_list_glob():
    """A wildcard glob covering the stored path must return that path."""
    stored = AbsolutePath("/an/absolute/path.ext")
    pattern = AbsolutePath("/an/*/path.ext")
    listing = FileList(paths=[stored])

    matches = listing.glob(pattern)

    assert stored in matches