def run_flow(cls, name, context='default') -> Run:
    """
    Runs a Metaflow flow and returns its Run object.

    :param name: filename.flowname (e.g. bidding.AdNetworkOptimizationFlow)
    :param context: command context (environment variables, arguments, etc.)
    :return: metaflow.client.Run
    """
    context = cls.load_contexts()[context]
    run_id_file = f"unittest-{str(uuid.uuid4())}"
    python_path = os.environ['_']
    file_name, flow_name = name.split('.')
    cmd = [
        python_path,
        '-B',
        f"{join(APP_DIR, file_name)}.py",
    ]
    cmd.extend(context['args'])
    cmd.extend(['run', f'--run-id-file={run_id_file}'])
    env = dict(os.environ)
    env.update(context['env'])
    subprocess.check_call(" ".join(cmd), env=env, cwd=APP_DIR, shell=True)
    with open(join(APP_DIR, run_id_file)) as f:
        run_id = f.read()
    # clean up the Metaflow run-id file
    os.remove(join(APP_DIR, run_id_file))
    return Run(f"{flow_name}/{run_id}")
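# A minimal usage sketch for the helper above, assuming it is attached as a
# classmethod on a test base class. FlowTest, the context shape returned by
# load_contexts(), and the test below are illustrative assumptions, not part
# of the original snippet.
import os
import subprocess
import uuid
from os.path import join

from metaflow import Run

APP_DIR = os.path.dirname(os.path.abspath(__file__))  # assumed project layout

class FlowTest:
    @classmethod
    def load_contexts(cls):
        # hypothetical: a single context with no extra CLI args or env overrides
        return {'default': {'args': [], 'env': {}}}

    run_flow = classmethod(run_flow)  # attach the helper defined above

def test_ad_network_optimization_flow():
    run = FlowTest.run_flow('bidding.AdNetworkOptimizationFlow')
    assert run.successful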
def execute(cls,
            message=None,
            keys=None,
            existing_keys={},
            stream_output=None,
            invalidate_cache=False,
            **kwargs):
    results = {}
    flow_id = message['flow_id']
    run_number = message['run_number']
    result_key = [key for key in keys if key.startswith('dag:result')][0]

    with streamed_errors(stream_output):
        run = Run("{}/{}".format(flow_id, run_number))
        param_step = Step("{}/_parameters".format(run.pathspec))
        try:
            dag = DataArtifact(
                "{}/_graph_info".format(param_step.task.pathspec)).data
        except MetaflowNotFound:
            dag = generate_dag(run)

        results[result_key] = json.dumps(dag)

    return results
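# Usage sketch: execute() above reads like a cache-action classmethod, so
# GetDag here is a hypothetical stand-in for its class, and the message/keys
# shapes simply mirror what the body dereferences.
import json

class GetDag:
    execute = classmethod(execute)  # attach the action defined above

res = GetDag.execute(
    message={'flow_id': 'HelloFlow', 'run_number': '42'},
    keys=['dag:result:HelloFlow/42'],
    stream_output=print,  # receives streamed error output line by line
)
dag = json.loads(res['dag:result:HelloFlow/42'])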
def batch_recommend(self):
    from movie_model import load_model, recommend

    run = Run(self.model_run)
    model_ann, model_users_mtx, model_movies_mtx = load_model(run)
    self.recs = list(
        recommend(self.input, model_movies_mtx, model_users_mtx,
                  model_ann, self.num_recs))
    self.next(self.join)
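# Context sketch (hedged): model_run is presumably the pathspec of a training
# run handed to the flow as a Parameter. A hypothetical minimal framing:
from metaflow import FlowSpec, Parameter, Run, step

class BatchRecommendFlow(FlowSpec):  # hypothetical flow name
    model_run = Parameter(
        'model_run',
        help="Pathspec of the training run, e.g. 'MovieTrainFlow/1001'")
    num_recs = Parameter('num_recs', default=10)

    @step
    def start(self):
        # resolve the trained-model run before fanning out to recommend
        print("using model from", Run(self.model_run).pathspec)
        self.next(self.end)

    @step
    def end(self):
        pass

if __name__ == '__main__':
    BatchRecommendFlow()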
def wrapper(*args):
    run = args[0]
    if run.cache:
        namespace(None)
        run._run = Run(run.pathspec)
        run.cache = False
    return func(run)
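# Sketch of the enclosing decorator (hedged): the snippet above is only the
# inner wrapper, so the outer function below, its name, and the cache/_run
# attributes on the wrapped object are reconstructions, not the original code.
import functools

from metaflow import Run, namespace

def refresh_run(func):
    """Re-resolve the underlying metaflow.Run once before calling func."""
    @functools.wraps(func)
    def wrapper(*args):
        run = args[0]
        if run.cache:
            namespace(None)  # drop namespace scoping so any run resolves
            run._run = Run(run.pathspec)
            run.cache = False
        return func(run)
    return wrapper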
def DoltDT(
    run: Optional[Union[str, FlowSpec]] = None,
    audit: Optional[dict] = None,
    config: Optional[DoltConfig] = None,
):
    _run = Run(run) if isinstance(run, str) else run

    if config and audit:
        logger.warning("Both audit and config specified; using audit.")
        return DoltAuditDT(audit=audit, run=_run)
    elif audit:
        return DoltAuditDT(audit=audit, run=_run)
    elif config:
        return DoltBranchDT(_run, config)
    elif _run and hasattr(_run, "data") and hasattr(_run.data, "dolt"):
        return DoltAuditDT(audit=_run.data.dolt, run=_run)
    else:
        raise ValueError("Specify one of: audit, config")
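# Hedged usage sketch of the three dispatch modes above, using the DoltDT and
# DoltConfig names as defined in this listing. The flow name and run id are
# made up; `self` stands for a FlowSpec step, as in the start() examples below.
def start(self):
    config = DoltConfig(database="prices", branch="master")

    # config mode -> DoltBranchDT: read/write against a database branch
    with DoltDT(run=self, config=config) as dolt:
        df = dolt.read("inputs")

    # audit mode -> DoltAuditDT: replay the reads recorded by an earlier run
    audit = Run("HospitalPriceFlow/57").data.dolt
    with DoltDT(run=self, audit=audit) as dolt:
        df = dolt.read("inputs")

    self.next(self.end)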
def start(self):
    analysis_conf = DoltConfig(
        database=self.hospital_price_analysis_db,
        branch=self.hospital_price_analysis_db_branch)
    audit = (Run(self.historical_run_path).data.dolt
             if self.historical_run_path else None)

    with DoltDT(run=self, config=analysis_conf, audit=audit) as dolt:
        median_price_by_state = dolt.read("state_procedure_medians")
        variance_by_procedure = median_price_by_state.groupby(
            "code").var().reset_index()
        variance_by_procedure = variance_by_procedure[
            ~variance_by_procedure['code'].str.startswith('nan')]
        dolt.write(variance_by_procedure, "variance_by_procedure")

    self.next(self.end)
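# Hedged context sketch: the attributes referenced above are presumably flow
# Parameters. The class name and defaults below are invented; the parameter
# names are taken from the step body.
from metaflow import FlowSpec, Parameter

class HospitalPriceAnalysisFlow(FlowSpec):  # hypothetical flow name
    hospital_price_analysis_db = Parameter(
        'hospital_price_analysis_db', default='hospital_price_analysis')
    hospital_price_analysis_db_branch = Parameter(
        'hospital_price_analysis_db_branch', default='master')
    historical_run_path = Parameter(
        'historical_run_path', default=None,
        help="Optional pathspec of a past run whose Dolt audit to replay")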
def start(self):
    """
    Load the test data set.
    """
    from io import StringIO

    # Load the data set into a pandas dataframe.
    self.X = pd.read_csv(StringIO(self.test_data))

    print('run id: ', self.run_id)
    if self.run_id == 'latest_successful':
        self.train_run = Flow('TitanicModeling').latest_successful_run
    else:
        self.train_run = Run(f'TitanicModeling/{self.run_id}')

    # Compute our two recommendation types in parallel.
    self.next(self.categorical_prep, self.numerical_prep)
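# The branch above resolves the training run either by explicit id or by the
# most recent successful one. The same lookup, standalone (flow name from the
# snippet; the run id is illustrative):
from metaflow import Flow, Run, namespace

namespace(None)  # search across all namespaces, not just the current user's
train_run = Flow('TitanicModeling').latest_successful_run
# or pin an exact run instead:
train_run = Run('TitanicModeling/1612')
print(train_run.pathspec, train_run.finished_at)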
def get_formatted_steps(self):
    results = []
    for step in Run(f"{self.flow_name}/{self.run_id}").steps():
        results.append({
            "step": step.path_components[-1],
            "finished_at": step.finished_at,
            "created_at": step.created_at,
            # "tasks": self.get_formatted_tasks()
        })
    return results
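# Usage sketch (hedged): a minimal holder exposing the formatter above; the
# class, its attribute names, and the flow/run ids are hypothetical stand-ins.
from metaflow import namespace

class RunWrapper:
    def __init__(self, flow_name, run_id):
        self.flow_name = flow_name
        self.run_id = run_id

    get_formatted_steps = get_formatted_steps  # attach the method above

namespace(None)
for row in RunWrapper('TitanicModeling', '1612').get_formatted_steps():
    print(row['step'], row['finished_at'])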
def start(self):
    audit = Run(f"VersioningDemo/{self.read_run_id}").data.dolt
    with DoltDT(run=self, audit=audit) as dolt:
        df = dolt.read("bar")
    self.next(self.end)
def test_init_options(s3root, pathspecs, expected):
    [pathspec] = pathspecs
    flow_name, run_id = pathspec.split("/")
    plen = len(s3root)

    # option 1) s3root as prefix
    with S3(s3root=s3root) as s3:
        for url, exp in expected.items():
            # s3root should work as a prefix
            s3obj = s3.get(url[plen:])
            assert s3obj.key == url[plen:]
            assert_results([s3obj], {url: exp})
        with pytest.raises(MetaflowS3URLException):
            s3.get("s3://some/fake/address")

    # option 2) full url as s3root
    for url, exp in expected.items():
        with S3(s3root=url) as s3:
            s3obj = s3.get()
            assert_results([s3obj], {url: exp})

    # option 3) full urls
    with S3() as s3:
        for url, exp in expected.items():
            # full urls should be retrievable as-is
            s3obj = s3.get(url)
            assert s3obj.key == url
            assert_results([s3obj], {url: exp})
        with pytest.raises(MetaflowS3URLException):
            s3.get("suffix")
        with pytest.raises(MetaflowS3URLException):
            s3.get("s3://nopath")
        with pytest.raises(MetaflowS3URLException):
            s3.get_many(["suffixes"])
        with pytest.raises(MetaflowS3URLException):
            s3.get_recursive(["suffixes"])
        with pytest.raises(MetaflowS3URLException):
            s3.get_all()

    # option 4) 'current' environment (fake a running flow)
    flow = FakeFlow(use_cli=False)
    parsed = urlparse(s3root)

    with pytest.raises(MetaflowS3URLException):
        # current not set yet, so this should fail
        with S3(run=flow):
            pass

    current._set_env(
        FakeFlow(name=flow_name),
        run_id,
        "no_step",
        "no_task",
        "no_origin_run_id",
        "no_ns",
        "no_user",
    )

    with S3(bucket=parsed.netloc, prefix=parsed.path, run=flow) as s3:
        for url, exp in expected.items():
            name = url.split("/")[-1]
            s3obj = s3.get(name)
            assert s3obj.key == name
            assert_results([s3obj], {url: exp})
        names = [url.split("/")[-1] for url in expected]
        s3objs = s3.get_many(names)
        assert {e.key for e in s3objs} == set(names)
        assert_results(s3objs, expected)
        assert_results(s3.get_all(), expected, info_should_be_empty=True)

    # option 5) run object
    if DO_TEST_RUN:
        # Only works if a metadata service exists with the run in question.
        namespace(None)
        with S3(bucket=parsed.netloc, prefix=parsed.path,
                run=Run(pathspec)) as s3:
            names = [url.split("/")[-1] for url in expected]
            assert_results(s3.get_many(names), expected)
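# Condensed illustration of the five S3 init styles the test exercises
# (bucket and key names are illustrative):
from metaflow import Run, S3

with S3(s3root="s3://my-bucket/data/") as s3:  # 1) root as prefix
    obj = s3.get("part-0")
with S3(s3root="s3://my-bucket/data/part-0") as s3:  # 2) full url as root
    obj = s3.get()
with S3() as s3:  # 3) full urls per call
    obj = s3.get("s3://my-bucket/data/part-0")
# 4) S3(run=self) inside a running flow scopes keys to the current run
# 5) S3(run=Run("MyFlow/123")) scopes keys to a past run's datastore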
def lambda_handler(event, context):
    print(event)
    for record in event['Records']:
        key = record['s3']['object']['key']
        bucket_name = record['s3']['bucket']['name']

        os.environ['METAFLOW_HOME'] = '/tmp'
        os.environ['USERNAME'] = "******"
        obj = {
            'METAFLOW_DEFAULT_METADATA': 'service',
            'METAFLOW_DEFAULT_DATASTORE': 's3',
            'METAFLOW_DATASTORE_SYSROOT_S3': f"s3://{bucket_name}",
            'METAFLOW_SERVICE_AUTH_KEY': "yvhNDfEzcRa5fxKq2ZELda1zk8wNXxMs17Jt4OGs",
            'METAFLOW_SERVICE_URL': "https://5sqcgnuyte.execute-api.eu-west-1.amazonaws.com/api/"
        }
        with open('/tmp/config.json', 'w', encoding='utf-8') as f:
            json.dump(obj, f, ensure_ascii=False, indent=4)

        # import deferred until the Metaflow config file exists in /tmp
        from metaflow import Run, get_metadata, namespace
        namespace(None)
        print(get_metadata())

        step = key.split("/")[2]
        flow = key.split("/")[0]
        run_id = key.split("/")[1]
        run = Run(f"{flow}/{run_id}")
        dynamo_object = {
            "created_at": int(
                datetime.strptime(
                    run.created_at.split(".")[0],
                    '%Y-%m-%dT%H:%M:%S').timestamp()),
            "flow_name": flow,
            "run_id": int(run_id),
            "success": run.successful,
            "finished": run.finished,
            "finished_at": 0 if run.finished_at is None else int(
                datetime.strptime(
                    run.finished_at.split(".")[0],
                    '%Y-%m-%dT%H:%M:%S').timestamp()),
            "current_step": step,
            "user": _parse_tags(run.tags, "user"),
            "tags": run.tags,
            "bucket": bucket_name
        }
        print(dynamo_object)
        table = dynamodb.Table(EVENTS_RECORD_STORE)
        table.put_item(Item=dynamo_object)
    return
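# Hedged illustration of the S3 notification event shape the handler walks;
# only the fields it reads are shown, and all values are made up.
sample_event = {
    "Records": [{
        "s3": {
            "bucket": {"name": "my-metaflow-bucket"},
            "object": {"key": "HelloFlow/1612/start/1/0.attempt.json"}
        }
    }]
}
# key.split("/") -> ["HelloFlow", "1612", "start", ...]: flow, run_id, step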