Esempio n. 1
0
    def start(self):
        """
        Pull this flow's inputs from the latest successful MovieStatsFlow run.

        Two data artifacts are set on this flow: ``dataframe`` (read from
        the task of that run's ``start`` step) and ``genre_stats`` (read
        from the run's data). Execution then fans out to ``bonus_movie``
        and ``genre_movies``.

        NOTE(review): the original documentation says this step runs in a
        'conda' environment pinned to pandas==1.3.3; the decorator is not
        visible here -- confirm.

        """
        from metaflow import Flow, get_metadata

        # Report which metadata provider the client reads from.
        provider = get_metadata()
        print("Using metadata provider: %s" % provider)

        # Locate the upstream analysis run.
        stats_run = Flow("MovieStatsFlow").latest_successful_run
        print("Using analysis from '%s'" % str(stats_run))

        # The dataframe is taken from the start step (per the original
        # comment: before it was sliced into genre-specific dataframes).
        start_task = stats_run["start"].task
        self.dataframe = start_task.data.dataframe

        # The summary statistics come from the run's data.
        self.genre_stats = stats_run.data.genre_stats

        # Branch into the two recommendation steps.
        self.next(self.bonus_movie, self.genre_movies)
Esempio n. 2
0
 def eval(self):
     """
     Compute the test-set MSE of the model stored on this branch.

     The model implementation is looked up in MODELS by
     ``self.model_name``; the test data comes from the
     TaxiRegressionDataFlow run identified by ``self.data_run_id``. The
     result is stored in ``self.mse`` and the flow moves on to ``join``.
     """
     label = "Evaluating: %s" % self.model_name
     with profile(label):
         impl = MODELS[self.model_name]
         source_run = Flow('TaxiRegressionDataFlow')[self.data_run_id]
         # Rebuild the model object from its saved form.
         fitted = impl.load_model(self.model)
         test_data = source_run.data.test_data
         self.mse = impl.mse(fitted, test_data)
     self.next(self.join)
Esempio n. 3
0
 def train(self):
     """
     Fit the model named by this foreach branch's input.

     ``self.input`` is recorded as ``self.model_name``; the matching
     implementation from MODELS is fitted on the training data of the
     TaxiRegressionDataFlow run identified by ``self.data_run_id`` and
     its saved form is stored in ``self.model``. Continues to ``eval``.
     """
     self.model_name = self.input
     label = 'Training model: %s' % self.model_name
     with profile(label):
         impl = MODELS[self.model_name]
         source_run = Flow('TaxiRegressionDataFlow')[self.data_run_id]
         train_data = source_run.data.train_data
         fitted = impl.fit(train_data)
         # Store the saved form returned by save_model as the artifact.
         self.model = impl.save_model(fitted)
     self.next(self.eval)
Esempio n. 4
0
    def start(self):
        """
        Load the trained model and the flow configuration.

        Reads ``simple_rf`` from the latest successful TrainModels run into
        ``self.model``, parses ``self.config_file`` as YAML into a
        ``Config`` stored in ``self.config``, then transitions to ``end``.
        """
        # `latest_successful_run` is a run of the flow, hence the name.
        run = Flow('TrainModels').latest_successful_run
        print('using data from flow: %s' % run.id)

        self.model = run.data.simple_rf

        # yaml.load() without an explicit Loader raises TypeError on
        # PyYAML >= 6 and may construct arbitrary objects on older versions.
        # NOTE(review): assumes the config holds only plain YAML types.
        self.config = Config(**yaml.safe_load(self.config_file))

        self.next(self.end)
Esempio n. 5
0
 def start(self):
     """
     Choose which models to build from the latest data run.

     Records the id and ``features`` of the latest TaxiRegressionDataFlow
     run, keeps the names of the MODELS entries whose ``FEATURES`` are all
     present in those features, and fans out to ``train`` once per name.
     """
     latest = Flow('TaxiRegressionDataFlow').latest_run
     self.data_run_id = latest.id
     self.features = latest.data.features

     # A model is buildable only if every feature it needs is available.
     buildable = []
     for name, model in MODELS.items():
         if all(feat in self.features for feat in model.FEATURES):
             buildable.append(name)
     self.models = buildable

     print("Building models: %s" % ', '.join(self.models))
     self.next(self.train, foreach='models')
Esempio n. 6
0
    def start(self):
        """
        Gather the prediction inputs from three upstream flows.

        From the latest successful run of each flow this reads
        ``signals_df`` (PreprocessPaginate),
        ``recent_annoated_simple_features`` (PrepareFeatures) and ``model``
        (DeployModel). It then parses ``self.config_file`` as YAML into a
        ``Config`` and transitions to ``predict_vocab``.
        """
        signals_run = Flow('PreprocessPaginate').latest_successful_run
        print('using data from flow: %s' % signals_run.id)
        self.signals = signals_run.data.signals_df

        features_run = Flow('PrepareFeatures').latest_successful_run
        print('using data from flow: %s' % features_run.id)
        # The artifact name, spelling included, is set by the upstream flow.
        self.features = features_run.data.recent_annoated_simple_features

        model_run = Flow('DeployModel').latest_successful_run
        print('using data from flow: %s' % model_run.id)
        self.model = model_run.data.model

        # yaml.load() without an explicit Loader raises TypeError on
        # PyYAML >= 6 and may construct arbitrary objects on older versions.
        # NOTE(review): assumes the config holds only plain YAML types.
        self.config = Config(**yaml.safe_load(self.config_file))

        self.next(self.predict_vocab)
Esempio n. 7
0
    def start(self):
        """
        Read table "bar" twice through DoltDT and keep both results.

        The first read uses the ``dolt`` artifact of the latest successful
        VersioningDemo run as the ``audit`` argument; the second uses a
        ``DoltConfig`` for database "foo". Results are stored in
        ``self.df1`` and ``self.df2`` before moving on to ``end``.
        """
        previous_run = Flow("VersioningDemo").latest_successful_run
        audit = previous_run.data.dolt
        master_conf = DoltConfig(database="foo")

        # Read driven by the audit record of the earlier run.
        with DoltDT(run=self, audit=audit) as audited:
            self.df1 = audited.read("bar", as_key="bar1")

        # Read driven by an explicit database configuration.
        with DoltDT(run=self, config=master_conf) as configured:
            self.df2 = configured.read("bar", as_key="bar2")

        self.next(self.end)
Esempio n. 8
0
    def start(self):
        """
        Load the training features and the flow configuration.

        Reads ``annoated_simple_features`` from the latest successful
        PrepareFeatures run, parses ``self.config_file`` as YAML into a
        ``Config``, then transitions to ``train_simple_rf_model``.
        """
        # `latest_successful_run` is a run of the flow, hence the name.
        run = Flow('PrepareFeatures').latest_successful_run
        print('using data from flow: %s' % run.id)

        # NOTE(review): `fetures` looks like a typo for `features`; it is
        # left unchanged because later steps may read this artifact by name.
        self.fetures = run.data.annoated_simple_features

        # yaml.load() without an explicit Loader raises TypeError on
        # PyYAML >= 6 and may construct arbitrary objects on older versions.
        # NOTE(review): assumes the config holds only plain YAML types.
        self.config = Config(**yaml.safe_load(self.config_file))

        self.next(self.train_simple_rf_model)
Esempio n. 9
0
    def start(self):
        """
        Load the signal dataframes and the flow configuration.

        Reads ``signals_df`` and ``clean_signals_df`` from the latest
        successful PreprocessPaginate run, parses ``self.config_file`` as
        YAML into a ``Config``, then transitions to
        ``prepare_simple_features``.
        """
        # `latest_successful_run` is a run of the flow, hence the name.
        run = Flow('PreprocessPaginate').latest_successful_run
        print('using data from flow: %s' % run.id)

        self.signals_df = run.data.signals_df
        self.clean_signals_df = run.data.clean_signals_df

        # yaml.load() without an explicit Loader raises TypeError on
        # PyYAML >= 6 and may construct arbitrary objects on older versions.
        # NOTE(review): assumes the config holds only plain YAML types.
        self.config = Config(**yaml.safe_load(self.config_file))

        self.next(self.prepare_simple_features)
Esempio n. 10
0
    def start(self):
        """
        Load the downloaded data and the flow configuration.

        Reads ``books`` and ``logs`` from the latest successful Download
        run, parses ``self.config_file`` as YAML into a ``Config``, then
        transitions to ``preprocess_pages_df``.
        """
        # `latest_successful_run` is a run of the flow, hence the name.
        run = Flow('Download').latest_successful_run
        print('using data from flow: %s' % run.id)

        self.books = run.data.books
        self.logs = run.data.logs

        # yaml.load() without an explicit Loader raises TypeError on
        # PyYAML >= 6 and may construct arbitrary objects on older versions.
        # NOTE(review): assumes the config holds only plain YAML types.
        self.config = Config(**yaml.safe_load(self.config_file))

        self.next(self.preprocess_pages_df)
Esempio n. 11
0
def resolve_task_from_pathspec(flow_name, pathspec):
    """
    Resolve a task object for the pathspec query given on the CLI.

    Args:
        flow_name (str): name of the flow.
        pathspec (str): one of `stepname`, `runid/stepname` or
            `runid/stepname/taskid`.

    Returns:
        The resolved task: ``Step(...).task`` / ``latest_run[step].task``
        for the first two forms, ``Task(...)`` for the third.

    Raises:
        CommandException: if `pathspec` has more than three components.
        TaskNotFoundException: if no task could be resolved.
    """
    from metaflow import Flow, Step, Task
    from metaflow.exception import MetaflowNotFound

    depth = len(pathspec.split("/"))
    if depth not in (1, 2, 3):
        # Reject anything that is not one of the three supported forms.
        raise CommandException(
            msg=
            "The PATHSPEC argument should be of the form 'stepname' Or '<runid>/<stepname>' Or '<runid>/<stepname>/<taskid>'"
        )

    found = None
    run_id = None
    if depth == 1:
        # Bare step name: look it up in the latest run of the flow.
        origin = "stepname"
        newest = Flow(flow_name).latest_run
        if newest is not None:
            run_id = newest.pathspec
            try:
                found = newest[pathspec].task
            except KeyError:
                pass
    else:
        # Explicit run id: namespace(None) is called before the lookup.
        namespace(None)
        qualified = "/".join([flow_name, pathspec])
        if depth == 2:
            # runid/stepname
            origin = "step_pathspec"
            try:
                found = Step(qualified).task
            except MetaflowNotFound:
                pass
        else:
            # runid/stepname/taskid
            origin = "task_pathspec"
            try:
                found = Task(qualified)
            except MetaflowNotFound:
                pass

    if found is None:
        # Nothing matched the query; report how resolution was attempted.
        raise TaskNotFoundException(pathspec, origin, run_id=run_id)

    return found
Esempio n. 12
0
 def start(self):
     """
     Prepare batches of top-movie pairs from the latest trained model.

     Reads ``movie_names`` and ``model_users_mtx`` from the ``start`` step
     of the latest successful MovieTrainFlow run, records that run's
     pathspec in ``self.model_run``, computes ``self.top_movies`` and
     batches all 2-combinations of them into ``self.pairs``. Fans out to
     ``batch_recommend`` once per batch.
     """
     from movie_recs_util import make_batches, top_movies

     train_run = Flow('MovieTrainFlow').latest_successful_run
     start_task = train_run['start'].task

     self.movie_names = start_task['movie_names'].data
     self.model_run = train_run.pathspec
     print('Using model from', self.model_run)

     users_mtx = start_task['model_users_mtx'].data
     self.top_movies = top_movies(users_mtx, self.num_top_movies)

     # Every unordered pair of top movies, grouped into batches.
     movie_pairs = combinations(self.top_movies, 2)
     self.pairs = make_batches(movie_pairs)
     self.next(self.batch_recommend, foreach='pairs')
def list_many_cards(
    ctx,
    type=None,
    hash=None,
    card_id=None,
    follow_resumed=None,
    as_json=None,
):
    """
    List the cards of every task in the latest run of the current flow.

    Each task of each step is passed to ``resolve_card`` with the given
    filters. Tasks without a matching card (``CardNotPresentException``)
    are skipped. With ``as_json`` the per-task results are collected and
    printed as one JSON list; otherwise headers are echoed through
    ``ctx.obj.echo``.

    Raises:
        CardNotPresentException: if no task in the run had a matching card.
    """
    from metaflow import Flow

    latest = Flow(ctx.obj.flow.name).latest_run
    human_readable = not as_json
    if human_readable:
        ctx.obj.echo("Listing cards for run %s" % latest.pathspec, fg="green")

    num_found = 0
    json_entries = []
    for step in latest:
        # The step header is echoed at most once, on its first card hit.
        header_shown = False
        for task in step:
            # Query without the first pathspec component.
            task_query = "/".join(task.pathspec.split("/")[1:])
            try:
                card_paths, datastore, resolved = resolve_card(
                    ctx,
                    task_query,
                    type=type,
                    hash=hash,
                    card_id=card_id,
                    follow_resumed=follow_resumed,
                    no_echo=True,
                )
                if human_readable and not header_shown:
                    ctx.obj.echo("Step : %s" % step.id, fg="green")
                    header_shown = True

                entry = list_available_cards(
                    ctx,
                    resolved,
                    card_paths,
                    datastore,
                    command=None,
                    show_list_as_json=as_json,
                    list_many=True,
                )
                if as_json:
                    json_entries.append(entry)
                num_found += 1
            except CardNotPresentException:
                # This task has no matching card; try the next one.
                pass

    if num_found == 0:
        raise CardNotPresentException(latest.pathspec,
                                      card_hash=hash,
                                      card_type=type,
                                      card_id=card_id)
    if as_json:
        print(json.dumps(json_entries, indent=4))
def get_run_stats(min_num_epochs=100,min_demos=100):
    """
    Collect the JSON form of ``final_data`` from qualifying runs.

    Iterates over the runs of TrainingSimulatorFlow, skips unfinished
    ones, and keeps a run only if the ``num_demos`` and ``num_epochs``
    read from the task data of the last element of ``run.steps()`` reach
    the given minimums.

    Args:
        min_num_epochs: minimum ``num_epochs`` a run must have.
        min_demos: minimum ``num_demos`` a run must have.

    Returns:
        list: ``to_json()`` of every item in ``final_data`` of each
        qualifying run, in iteration order.
    """
    collected = []
    for run in Flow('TrainingSimulatorFlow').runs():
        if not run.finished:
            continue
        # NOTE(review): presumably the flow's first step, given the
        # original variable name `flow_init_datum` -- confirm.
        params = list(run.steps())[-1].task.data
        qualifies = (params.num_demos >= min_demos
                     and params.num_epochs >= min_num_epochs)
        if not qualifies:
            continue
        collected.extend(item.to_json() for item in run.data.final_data)

    return collected
Esempio n. 15
0
    def start(self):
        """
        Gather the inputs for statistics generation from upstream flows.

        From the latest successful run of each flow this reads ``users``
        and ``vocab_skills`` (Download), ``known_words_df`` (PredictVocab)
        and ``signals_df`` / ``clean_pages_df`` (PreprocessPaginate). It
        then parses ``self.config_file`` as YAML into a ``Config`` and
        transitions to ``generate_stats``.
        """
        download_run = Flow('Download').latest_successful_run
        print('using users data from flow: %s' % download_run.id)
        self.users = download_run.data.users
        self.vocab_skills = download_run.data.vocab_skills

        vocab_run = Flow('PredictVocab').latest_successful_run
        print('using vocab data from flow: %s' % vocab_run.id)
        self.known_words_df = vocab_run.data.known_words_df

        signals_run = Flow('PreprocessPaginate').latest_successful_run
        print('using signals data from flow: %s' % signals_run.id)
        self.signals_df = signals_run.data.signals_df
        self.clean_pages_df = signals_run.data.clean_pages_df

        import yaml
        # yaml.load() without an explicit Loader raises TypeError on
        # PyYAML >= 6 and may construct arbitrary objects on older versions.
        # NOTE(review): assumes the config holds only plain YAML types.
        self.config = Config(**yaml.safe_load(self.config_file))

        self.next(self.generate_stats)
Esempio n. 16
0
 def start(self):
     """
     Determine the input files and split them into NUM_SHARDS shards.

     The paths come from the latest TaxiETLFlow run when ``self.use_ctas``
     is set, otherwise from a recursive S3 listing of URLS. The first
     NUM_SHARDS - 1 shards each hold ``shard_size`` paths and the last
     shard takes whatever remains, so shards can be empty when there are
     few paths. Fans out to ``preprocess_data`` once per shard.
     """
     if self.use_ctas:
         etl_run = Flow('TaxiETLFlow').latest_run
         self.paths = etl_run.data.paths
     else:
         with S3() as s3:
             listing = s3.list_recursive(URLS)
             self.paths = [entry.url for entry in listing]
     print("Processing %d Parquet files" % len(self.paths))

     # At least one path per shard, even with fewer paths than shards.
     shard_size = max(round(len(self.paths) / NUM_SHARDS), 1)
     last = NUM_SHARDS - 1
     leading = [
         self.paths[k * shard_size:(k + 1) * shard_size] for k in range(last)
     ]
     remainder = self.paths[last * shard_size:]
     self.shards = leading + [remainder]
     self.next(self.preprocess_data, foreach='shards')
Esempio n. 17
0
    def start(self):
        """
        Read table 'bar' and check that SucceedsFirstDemo ran recently.

        Reads table 'bar' from the "master" branch of Dolt database 'foo'
        into ``self.df``, then checks that the latest successful
        SucceedsFirstDemo run finished within the last minute before
        transitioning to ``middle``.

        Raises:
            Exception: if that run finished more than one minute ago.
        """
        with DoltDT(run=self, database='foo', branch="master") as dolt:
            self.df = dolt.read_table('bar')

        first_run = Flow("SucceedsFirstDemo").latest_successful_run
        first_run_ts = datetime.datetime.strptime(first_run.finished_at,
                                                  "%Y-%m-%dT%H:%M:%SZ")

        # The original added a fixed 8 hours to local time, which equals
        # UTC only on a machine at UTC-8. Use the real current UTC time,
        # made naive so it compares with the naive strptime() result.
        # NOTE(review): assumes `finished_at` is UTC, as its "Z" suffix
        # indicates -- confirm.
        utc_now = datetime.datetime.now(
            datetime.timezone.utc).replace(tzinfo=None)
        one_minute_ago = utc_now - datetime.timedelta(minutes=1)
        if first_run_ts < one_minute_ago:
            raise Exception(
                "Run `FirstDemo` within one minute of `SecondDemo`")

        self.next(self.middle)
Esempio n. 18
0
def list_flows(names="all"):
    """
    Return a DataFrame listing the runs of the given Metaflow flows.

    Args:
        names: an iterable of flow names, a single flow name, or the
            string "all" (default) to list every flow in
            ``Metaflow().flows``.

    Returns:
        pandas.DataFrame with columns ``flow``, ``id``, ``start`` and
        ``finish``, one row per run, sorted by ``flow`` then ``id``.
    """
    columns = ["flow", "id", "start", "finish"]
    if isinstance(names, str):
        if names == "all":
            names = [flow.pathspec for flow in Metaflow().flows]
        else:
            # A bare string would otherwise be iterated per character.
            names = [names]

    # Build the frame once from collected rows: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for name in names:
        for run in Flow(name):
            rows.append((
                name,
                run.path_components[1],  # the run id component
                run.created_at,
                run.finished_at,
            ))
    df = pd.DataFrame(rows, columns=columns)
    return df.sort_values(by=columns[:2])
Esempio n. 19
0
    def start(self):
        """
        Load the test data set and select the training run to use.

        ``self.test_data`` is parsed as CSV into ``self.X``.
        ``self.train_run`` is the latest successful TitanicModeling run
        when ``self.run_id`` is 'latest_successful', otherwise the run with
        that id. Execution then fans out to ``categorical_prep`` and
        ``numerical_prep``.
        """
        from io import StringIO

        # Parse the CSV text into a pandas dataframe.
        csv_buffer = StringIO(self.test_data)
        self.X = pd.read_csv(csv_buffer)

        print('run id: ', self.run_id)
        use_latest = self.run_id == 'latest_successful'
        self.train_run = (
            Flow('TitanicModeling').latest_successful_run
            if use_latest
            else Run(f'TitanicModeling/{self.run_id}')
        )

        # Run the two preparation steps in parallel.
        self.next(self.categorical_prep, self.numerical_prep)
Esempio n. 20
0
    def start(self):
        """
        Pull ``genre_stats`` from the latest successful MovieStatsFlow run.

        The value is stored as the ``genre_stats`` data artifact of this
        flow, after which execution fans out to ``bonus_movie`` and
        ``genre_movies``.

        """
        from metaflow import Flow, get_metadata

        # Report which metadata provider the client reads from.
        provider = get_metadata()
        print("Using metadata provider: %s" % provider)

        # Locate the upstream analysis run.
        stats_run = Flow("MovieStatsFlow").latest_successful_run
        print("Using analysis from '%s'" % str(stats_run))

        self.genre_stats = stats_run.data.genre_stats

        # Branch into the two recommendation steps.
        self.next(self.bonus_movie, self.genre_movies)
Esempio n. 21
0
 def __init__(self):
     """
     Load the model from the latest successful MovieTrainFlow run.

     ``load_model`` returns three values for that run, stored as
     ``model_ann``, ``model_users_mtx`` and ``model_movies_mtx``. The
     movie names come from ``load_movie_names``.
     """
     self.run = Flow('MovieTrainFlow').latest_successful_run
     ann, users_mtx, movies_mtx = load_model(self.run)
     self.model_ann = ann
     self.model_users_mtx = users_mtx
     self.model_movies_mtx = movies_mtx
     self.names = load_movie_names()
Esempio n. 22
0
from metaflow import Flow, get_metadata

# Print which metadata provider the Metaflow client is using.
print("Using metadata provider: %s" % get_metadata())

# Fetch the latest successful run of GenreStatsFlow.
run = Flow('GenreStatsFlow').latest_successful_run
print("Using analysis from '%s'" % str(run))

# Read the `genre_stats` artifact from that run and print it.
genre_stats = run.data.genre_stats
print(genre_stats)
Esempio n. 23
0
 def start(self):
     """
     Copy the ``model`` artifact from the latest FirstFlow run.

     The value is stored in ``self.model`` and printed, then the flow
     transitions to ``end``.
     """
     upstream = Flow('FirstFlow').latest_run
     self.model = upstream.data.model
     print('model:', self.model)
     self.next(self.end)
Esempio n. 24
0
 def start(self):
     """
     Load the model from the latest ClassifierTrainFlow run.

     Records that run's pathspec in ``self.train_run_id`` and reads
     ``model`` from the task data of its ``end`` step into ``self.model``.
     Prints ``self.vector`` and transitions to ``end``.
     """
     train_run = Flow('ClassifierTrainFlow').latest_run
     self.train_run_id = train_run.pathspec
     end_task = train_run['end'].task
     self.model = end_task.data.model
     print("Input vector", self.vector)
     self.next(self.end)
Esempio n. 25
0
from metaflow import Flow, get_metadata
from metaflow.datatools.dolt import DoltDT
from doltpy.core import Dolt


def print_data_map(data_map):
    """
    Print one "<key>, <table>" line per table for every key of `data_map`.

    Args:
        data_map: mapping whose values are iterables of tables (iterating
            a dict value yields its keys).
    """
    for run_step, tables in data_map.items():
        for table in tables:
            print('{}, {}'.format(run_step, table))


# Print which metadata provider the Metaflow client is using.
print("Current metadata provider: %s" % get_metadata())
# Path used by the commented-out DoltDT examples further down.
doltdb_path = './imdb-reviews'
# Select the latest successful run of IMDBSentimentsFlow.
flow = Flow('IMDBSentimentsFlow')
run = flow.latest_successful_run
print("Using run: %s" % str(run))
'''
Ex 1: Get all the inputs used by a specific run of a flow
'''
# doltdt = DoltDT(run, doltdb_path, 'master')
# data_map_for_run = doltdt.get_reads(steps=['start'])
# print_data_map(data_map_for_run)
'''
Ex 2: Get all the inputs used by a specific step of a run of a flow
'''
# doltdt = DoltDT(run, doltdb_path, 'vinai/add-rotten-data')
# data_map_for_run = doltdt.get_reads(steps=['start'])
# print_data_map(data_map_for_run)
'''
Ex 3 Outputs are handled identically