Example #1
    def run_flow(cls, name, context='default') -> Run:
        """
        Run a Metaflow flow and return its Run object.
        :param name: filename.flowname (e.g. bidding.AdNetworkOptimizationFlow)
        :param context: command context (env variables, arguments, etc.)
        :return: metaflow.client.Run
        """
        context = cls.load_contexts()[context]
        run_id_file = f"unittest-{uuid.uuid4()}"
        # '_' holds the path of the interpreter that launched this process;
        # it is shell-dependent, so sys.executable would be a more robust choice.
        python_path = os.environ['_']
        file_name, flow_name = name.split('.')

        cmd = [
            python_path,
            '-B',
            f"{join(APP_DIR, file_name)}.py",
        ]
        cmd.extend(context['args'])
        cmd.extend(['run', f'--run-id-file={run_id_file}'])

        env = dict(os.environ)
        env.update(context['env'])

        # Pass the argument list directly instead of a joined shell string so
        # arguments containing spaces or shell metacharacters stay intact.
        subprocess.check_call(cmd, env=env, cwd=APP_DIR)

        with open(join(APP_DIR, run_id_file)) as f:
            run_id = f.read()

        # Clean up the Metaflow run-id file.
        os.remove(join(APP_DIR, run_id_file))

        return Run(f"{flow_name}/{run_id}")
Example #2
    def execute(cls,
                message=None,
                keys=None,
                existing_keys=None,  # None instead of a mutable {} default
                stream_output=None,
                invalidate_cache=False,
                **kwargs):
        results = {}
        flow_id = message['flow_id']
        run_number = message['run_number']

        # The caller passes exactly one 'dag:result...' key for this action.
        result_key = [key for key in keys if key.startswith('dag:result')][0]

        with streamed_errors(stream_output):
            run = Run("{}/{}".format(flow_id, run_number))
            param_step = Step("{}/_parameters".format(run.pathspec))
            try:
                # Prefer the '_graph_info' artifact when the run recorded one.
                dag = DataArtifact("{}/_graph_info".format(
                    param_step.task.pathspec)).data
            except MetaflowNotFound:
                # Older runs lack the artifact; rebuild the DAG instead.
                dag = generate_dag(run)

            results[result_key] = json.dumps(dag)

        return results
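The surrounding cache-action plumbing is not shown; judging from the body, the inputs have roughly this shape (the exact key string is an assumption):

# Hypothetical inputs, inferred from how execute() reads them.
message = {'flow_id': 'MyFlow', 'run_number': '42'}
keys = ['dag:result:MyFlow/42']
# execute() returns {'dag:result:MyFlow/42': '<JSON-encoded DAG>'}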
Example #3
    def batch_recommend(self):
        from movie_model import load_model, recommend
        # Load the ANN index and the user/movie factor matrices
        # from the training run referenced by self.model_run.
        run = Run(self.model_run)
        model_ann, model_users_mtx, model_movies_mtx = load_model(run)
        self.recs = list(
            recommend(self.input, model_movies_mtx, model_users_mtx, model_ann,
                      self.num_recs))
        self.next(self.join)
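For context, a minimal sketch of the flow shape this step implies: a foreach fan-out into batch_recommend followed by a join. Everything outside batch_recommend (flow name, parameters, batch contents) is an assumption:

from metaflow import FlowSpec, Parameter, step

class BatchRecommendFlow(FlowSpec):
    model_run = Parameter('model_run', help='pathspec of the training run')
    num_recs = Parameter('num_recs', default=10)

    @step
    def start(self):
        # Fan out over batches; each branch sees one batch as self.input.
        self.batches = [['user-1', 'user-2'], ['user-3']]
        self.next(self.batch_recommend, foreach='batches')

    @step
    def batch_recommend(self):
        # The step shown above would go here.
        self.recs = []
        self.next(self.join)

    @step
    def join(self, inputs):
        # Merge the recommendations produced by every branch.
        self.all_recs = [r for inp in inputs for r in inp.recs]
        self.next(self.end)

    @step
    def end(self):
        pass

if __name__ == '__main__':
    BatchRecommendFlow()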
Example #4
    def wrapper(*args):
        run = args[0]

        # On the first call with caching enabled, clear namespace scoping
        # and re-resolve the underlying Run, then drop the flag so later
        # calls reuse the refreshed object.
        if run.cache:
            namespace(None)
            run._run = Run(run.pathspec)
            run.cache = False

        return func(run)
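Only the inner wrapper is shown; a plausible reconstruction of the enclosing decorator, assuming it wraps methods of an object that carries pathspec, cache, and _run attributes:

from functools import wraps
from metaflow import Run, namespace

def refresh_if_cached(func):  # assumed decorator name
    @wraps(func)
    def wrapper(*args):
        run = args[0]
        if run.cache:
            namespace(None)
            run._run = Run(run.pathspec)
            run.cache = False
        return func(run)
    return wrapper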
Example #5
def DoltDT(
    run: Optional[Union[str, FlowSpec]] = None,
    audit: Optional[dict] = None,
    config: Optional[DoltConfig] = None,
):
    _run = Run(run) if isinstance(run, str) else run
    if config and audit:
        # Both modes given: warn and fall through to audit mode.
        logger.warning("Both audit and config specified; using audit.")
        return DoltAuditDT(audit=audit, run=_run)
    elif audit:
        return DoltAuditDT(audit=audit, run=_run)
    elif config:
        return DoltBranchDT(_run, config)
    elif _run and hasattr(_run, "data") and hasattr(_run.data, "dolt"):
        # A run whose data carries a dolt audit implies audit mode.
        return DoltAuditDT(audit=_run.data.dolt, run=_run)
    else:
        raise ValueError("Specify one of: audit, config")
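A hedged sketch of the two resulting usage modes, mirroring Examples 6 and 9 below (database name and pathspec are placeholders):

# Inside a @step method:
# Branch mode: read and write tables on the branch named by a config.
with DoltDT(run=self, config=DoltConfig(database='prices')) as dolt:
    df = dolt.read('state_procedure_medians')

# Audit mode: replay exactly the table versions a past run saw.
past = Run('HospitalPriceAnalysis/123')  # placeholder pathspec
with DoltDT(run=self, audit=past.data.dolt) as dolt:
    df = dolt.read('state_procedure_medians')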
Example #6

    def start(self):
        analysis_conf = DoltConfig(
            database=self.hospital_price_analysis_db,
            branch=self.hospital_price_analysis_db_branch)
        # Replay the Dolt state of a historical run when one is given.
        audit = (Run(self.historical_run_path).data.dolt
                 if self.historical_run_path else None)

        with DoltDT(run=self, config=analysis_conf, audit=audit) as dolt:
            median_price_by_state = dolt.read("state_procedure_medians")
            variance_by_procedure = median_price_by_state.groupby(
                "code").var().reset_index()
            # Drop rows whose procedure code parsed as 'nan'.
            variance_by_procedure = variance_by_procedure[
                ~variance_by_procedure['code'].str.startswith('nan')]
            dolt.write(variance_by_procedure, "variance_by_procedure")

        self.next(self.end)
Example #7
    def start(self):
        """
        Load test data set
        """
        from io import StringIO

        # Load the data set into a pandas dataframe.
        self.X = pd.read_csv(StringIO(self.test_data))

        print('run id: ', self.run_id)
        if self.run_id == 'latest_successful':
            self.train_run = Flow('TitanicModeling').latest_successful_run
        else:
            self.train_run = Run(f'TitanicModeling/{self.run_id}')

        # Compute our two recommendation types in parallel.
        self.next(self.categorical_prep, self.numerical_prep)
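The run_id checked above is presumably a flow Parameter; a minimal sketch of how it might be declared, with the default mirroring the 'latest_successful' sentinel (the flow name is an assumption):

from metaflow import FlowSpec, Parameter

class TitanicScoring(FlowSpec):  # assumed name for the scoring flow
    run_id = Parameter(
        'run_id',
        help="TitanicModeling run id, or 'latest_successful'",
        default='latest_successful')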
Example #8
    def get_formatted_steps(self):
        results = []

        # Summarize each step of the run by name and timestamps.
        for step in Run(f"{self.flow_name}/{self.run_id}").steps():
            results.append({
                "step": step.path_components[-1],
                "finished_at": step.finished_at,
                "created_at": step.created_at
            })
        return results
Example #9
    def start(self):
        audit = Run(f"VersioningDemo/{self.read_run_id}").data.dolt
        with DoltDT(run=self, audit=audit) as dolt:
            df = dolt.read("bar")

        self.next(self.end)
Example #10
def test_init_options(s3root, pathspecs, expected):
    [pathspec] = pathspecs
    flow_name, run_id = pathspec.split("/")
    plen = len(s3root)

    # option 1) s3root as prefix
    with S3(s3root=s3root) as s3:
        for url, exp in expected.items():
            # s3root should work as a prefix
            s3obj = s3.get(url[plen:])
            assert s3obj.key == url[plen:]
            assert_results([s3obj], {url: exp})
        with pytest.raises(MetaflowS3URLException):
            s3.get("s3://some/fake/address")

    # option 2) full url as s3root
    for url, exp in expected.items():
        with S3(s3root=url) as s3:
            s3obj = s3.get()
            assert_results([s3obj], {url: exp})

    # option 3) full urls
    with S3() as s3:
        for url, exp in expected.items():
            # full urls should be accepted as-is
            s3obj = s3.get(url)
            assert s3obj.key == url
            assert_results([s3obj], {url: exp})
        with pytest.raises(MetaflowS3URLException):
            s3.get("suffix")
        with pytest.raises(MetaflowS3URLException):
            s3.get("s3://nopath")
        with pytest.raises(MetaflowS3URLException):
            s3.get_many(["suffixes"])
        with pytest.raises(MetaflowS3URLException):
            s3.get_recursive(["suffixes"])
        with pytest.raises(MetaflowS3URLException):
            s3.get_all()

    # option 4) 'current' environment (fake a running flow)
    flow = FakeFlow(use_cli=False)

    parsed = urlparse(s3root)
    with pytest.raises(MetaflowS3URLException):
        # current not set yet, so this should fail
        with S3(run=flow):
            pass

    current._set_env(
        FakeFlow(name=flow_name),
        run_id,
        "no_step",
        "no_task",
        "no_origin_run_id",
        "no_ns",
        "no_user",
    )

    with S3(bucket=parsed.netloc, prefix=parsed.path, run=flow) as s3:
        for url, exp in expected.items():
            name = url.split("/")[-1]
            s3obj = s3.get(name)
            assert s3obj.key == name
            assert_results([s3obj], {url: exp})
        names = [url.split("/")[-1] for url in expected]
        s3objs = s3.get_many(names)
        assert {e.key for e in s3objs} == set(names)
        assert_results(s3objs, expected)
        assert_results(s3.get_all(), expected, info_should_be_empty=True)

    # option 5) run object
    if DO_TEST_RUN:
        # Only works if a metadata service exists with the run in question.
        namespace(None)
        with S3(bucket=parsed.netloc, prefix=parsed.path,
                run=Run(pathspec)) as s3:
            names = [url.split("/")[-1] for url in expected]
            assert_results(s3.get_many(names), expected)
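For reference, the addressing modes exercised above in a compact, hedged sketch (bucket and key are placeholders):

from metaflow import S3

# Prefix mode: s3root is the base, get() takes the relative key.
with S3(s3root='s3://my-bucket/prefix') as s3:
    obj = s3.get('data.txt')
    print(obj.key, obj.url)

# Full-URL mode: no s3root, get() takes complete s3:// URLs.
with S3() as s3:
    obj = s3.get('s3://my-bucket/prefix/data.txt')
    print(obj.text)  # object contents decoded as text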
Example #11
def lambda_handler(event, context):

    print(event)

    for record in event['Records']:
        key = record['s3']['object']['key']
        bucket_name = record['s3']['bucket']['name']

        os.environ['METAFLOW_HOME'] = '/tmp'
        os.environ['USERNAME'] = "******"

        # Write a Metaflow config pointing at the metadata service
        # and the datastore bucket that triggered this event.
        obj = {
            'METAFLOW_DEFAULT_METADATA': 'service',
            'METAFLOW_DEFAULT_DATASTORE': 's3',
            'METAFLOW_DATASTORE_SYSROOT_S3': f"s3://{bucket_name}",
            'METAFLOW_SERVICE_AUTH_KEY': "yvhNDfEzcRa5fxKq2ZELda1zk8wNXxMs17Jt4OGs",
            'METAFLOW_SERVICE_URL': "https://5sqcgnuyte.execute-api.eu-west-1.amazonaws.com/api/"
        }

        with open('/tmp/config.json', 'w', encoding='utf-8') as f:
            json.dump(obj, f, ensure_ascii=False, indent=4)

        # Import after METAFLOW_HOME is set so the config above is picked up.
        from metaflow import Run, get_metadata, namespace

        namespace(None)
        print(get_metadata())

        # Object keys look like <flow>/<run_id>/<step>/...
        flow, run_id, step = key.split("/")[:3]

        run = Run(f"{flow}/{run_id}")

        dynamo_object = {
            "created_at": int(
                datetime.strptime(run.created_at.split(".")[0],
                                  '%Y-%m-%dT%H:%M:%S').timestamp()),
            "flow_name": flow,
            "run_id": int(run_id),
            "success": run.successful,
            "finished": run.finished,
            "finished_at": 0 if run.finished_at is None else int(
                datetime.strptime(run.finished_at.split(".")[0],
                                  '%Y-%m-%dT%H:%M:%S').timestamp()),
            "current_step": step,
            "user": _parse_tags(run.tags, "user"),
            "tags": run.tags,
            "bucket": bucket_name
        }

        print(dynamo_object)

        table = dynamodb.Table(EVENTS_RECORD_STORE)
        table.put_item(Item=dynamo_object)

    return
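The _parse_tags helper is referenced but not shown; a plausible sketch, assuming Metaflow's '<prefix>:<value>' tag convention (e.g. 'user:jane'):

def _parse_tags(tags, prefix):
    # Return the value of the first '<prefix>:<value>' tag,
    # e.g. 'user:jane' -> 'jane'; None if absent.
    for tag in tags:
        if tag.startswith(f"{prefix}:"):
            return tag.split(":", 1)[1]
    return None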