Example #1
class ForecastFlow(FlowSpec):

    appid = Parameter('appid', required=True)
    location = Parameter('location', default='36.1699,115.1398')

    @conda(python='3.8.10', libraries={'sktime': '0.6.1'})
    @step
    def start(self):
        from openweatherdata import get_historical_weather_data, series_to_list
        lat, lon = map(float, self.location.split(','))
        self.pd_past5days = get_historical_weather_data(self.appid, lat, lon)
        self.past5days = series_to_list(self.pd_past5days)
        self.next(self.plot)

    @conda(python='3.8.10', libraries={'sktime': '0.6.1', 'seaborn': '0.11.1'})
    @step
    def plot(self):
        from sktime.utils.plotting import plot_series
        from io import BytesIO
        buf = BytesIO()
        fig, _ = plot_series(self.pd_past5days, labels=['past5days'])
        fig.savefig(buf)
        self.plot = buf.getvalue()
        self.next(self.end)

    @conda(python='3.8.10')
    @step
    def end(self):
        pass
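A note on boilerplate: these examples show only the flow classes. Each one additionally assumes the standard Metaflow imports at the top of its file and the usual entry point at the bottom, roughly as sketched below (the exact decorator imports vary per example, and helper modules such as openweatherdata are project-local and not shown):

from metaflow import FlowSpec, Parameter, conda, step

# ... flow class definition as above ...

if __name__ == '__main__':
    ForecastFlow()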
Example #2
class CoocFlow(FlowSpec):

    algo = Parameter('algo', help='Co-oc Algorithm', default='plain')
    num_cpu = Parameter('num-cpu', help='Number of CPU cores', default=32)
    num_docs = Parameter('num-docs', help='Number of documents', default=1000)

    @resources(memory=4000)
    @step
    def start(self):
        import scale_data
        docs = scale_data.load_yelp_reviews(self.num_docs)
        self.mtx, self.cols = scale_data.make_matrix(docs, binary=True)
        print("matrix size: %dx%d" % self.mtx.shape)
        self.next(self.compute_cooc)

    @resources(cpu=32, memory=64000)
    @step
    def compute_cooc(self):
        module = import_module('cooc_%s' % self.algo)
        with profile('Computing co-occurrences with the %s algorithm' %
                     self.algo):
            self.cooc = module.compute_cooc(self.mtx, self.num_cpu)
        self.next(self.end)

    @step
    def end(self):
        pass
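compute_cooc is resolved dynamically: import_module('cooc_%s' % self.algo) (import_module comes from importlib) loads a module named after the algo parameter, so the default 'plain' expects a cooc_plain module. A minimal, hypothetical sketch of that contract follows; the module only needs a compute_cooc(mtx, num_cpu) function, and for a binary documents-by-terms matrix plain co-occurrence counts are just the matrix product:

# cooc_plain.py -- hypothetical, minimal implementation of the plug-in contract
def compute_cooc(mtx, num_cpu):
    # Term-by-term co-occurrence counts for a binary docs x terms matrix;
    # num_cpu is unused in this naive single-threaded version.
    return mtx.T @ mtx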
Example #3
class WandbExampleFlowDecoClass(FlowSpec):
    # Not obvious how to support metaflow.IncludeFile
    seed = Parameter("seed", default=1337)
    test_size = Parameter("test_size", default=0.2)
    raw_data = Parameter(
        "raw_data",
        default=
        "https://gist.githubusercontent.com/tijptjik/9408623/raw/b237fa5848349a14a14e5d4107dc7897c21951f5/wine.csv",
        help="path to the raw data",
    )

    @step
    def start(self):
        self.raw_df = pd.read_csv(self.raw_data)
        self.next(self.split_data)

    @step
    def split_data(self):
        X = self.raw_df.drop("Wine", axis=1)
        y = self.raw_df[["Wine"]]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.seed)
        self.next(self.train)

    @step
    def train(self):
        self.clf = RandomForestClassifier(random_state=self.seed)
        self.clf.fit(self.X_train, self.y_train)
        self.next(self.end)

    @step
    def end(self):
        self.preds = self.clf.predict(self.X_test)
        self.accuracy = accuracy_score(self.y_test, self.preds)
Example #4
class CSPageRankFinder(FlowSpec):
    '''
    Load the citation graph JSON from S3, compute PageRank over it,
    and save the resulting ranks back to S3.
    '''
    tolerance = Parameter('tolerance',
                          default=1e-8,
                          help='Error tolerance for PageRank convergence')

    max_iter = Parameter('max_iter', default=100, help='Max Iterations to Run')

    @batch(cpu=16, memory=256000)
    @conda(python='3.7.2', libraries=CONDA_DEPS)
    @step
    def start(self):
        graph_json = self.load_graph()
        import networkx as nx
        import page_rank
        G = nx.from_dict_of_dicts(graph_json, create_using=nx.DiGraph)
        print(f"Size of the Graph is {len(graph_json)}")
        del graph_json
        rank_dict, err_log = page_rank.pagerank(G,
                                                tol=self.tolerance,
                                                max_iter=self.max_iter)
        self.error_log = err_log
        self.rank_save_path = self.save_json(
            rank_dict, save_name=f'page-rank-{current.run_id}.json')
        print(f"Saved Rank at {self.rank_save_path}")
        self.next(self.end)

    def save_json(self,
                  data_json,
                  tmp_pth='temp_save.json',
                  save_name='data.json'):
        from metaflow import S3
        import shutil
        final_path = os.path.join(SAVE_PROCESSED_DATA_PATH,
                                  self.__class__.__name__)
        with S3(s3root=final_path) as s3:
            print("Saving data_json to S3")
            with open(tmp_pth, 'w') as f:
                json.dump(data_json, f)
            put_pth = s3.put_files([(save_name, tmp_pth)])[0][1]

        return put_pth

    def load_graph(self):
        from metaflow import S3
        import json
        with S3(s3root=PROCESSED_CS_PATH) as s3:
            s3_resp = s3.get('citation_network_graph.json')
            return json.loads(s3_resp.text)

    @step
    def end(self):
        print("Done Computation")
Example #5
class HospitalPriceStateMedians(FlowSpec):

    hospital_price_db = Parameter("hospital-price-db",
                                  help="Database of hospital procedure prices",
                                  required=True)

    hospital_price_db_branch = Parameter(
        "hospital-price-db-branch",
        help="Specify branch version",
        default="master",
    )

    historical_run_path = Parameter(
        "historical-run-path",
        help="Read the same data as a path to a previous run")

    hospital_price_analysis_db = Parameter(
        "hospital-price-analysis-db",
        help="Dolt database to write analysis to",
        required=True)

    hospital_price_analysis_db_branch = Parameter(
        "hospital-price-analysis-db-branch",
        help="Specify branch version",
        default="master",
    )

    @step
    def start(self):
        read_conf = DoltConfig(database=self.hospital_price_db,
                               branch=self.hospital_price_db_branch)

        with DoltDT(run=self.historical_run_path or self,
                    config=read_conf) as dolt:
            prices_sql = "SELECT npi_number, code, payer, price FROM prices"
            prices = dolt.sql(prices_sql,
                              as_key='prices').set_index('npi_number')
            hospitals_sql = "SELECT state, npi_number FROM hospitals"
            hospitals = dolt.sql(hospitals_sql,
                                 as_key='hospitals').set_index('npi_number')

        prices_by_state = prices.join(hospitals, how='left', on='npi_number')
        median_price_by_state = prices_by_state.groupby(
            ['state', 'code']).median().reset_index()

        write_conf = DoltConfig(database=self.hospital_price_analysis_db,
                                branch=self.hospital_price_analysis_db_branch)
        with DoltDT(run=self, config=write_conf) as dolt:
            dolt.write(median_price_by_state, "state_procedure_medians",
                       ["state", "code"])

        self.next(self.end)

    @step
    def end(self):
        pass
Example #6
class MovieRecsFlow(FlowSpec):

    num_recs = Parameter('num_recs',
                         help="Number of recommendations per user",
                         default=3)
    num_top_movies = Parameter('num_top',
                               help="Produce recs for num_top movies",
                               default=100)

    @resources(memory=10000)
    @step
    def start(self):
        from movie_recs_util import make_batches, top_movies
        run = Flow('MovieTrainFlow').latest_successful_run
        self.movie_names = run['start'].task['movie_names'].data
        self.model_run = run.pathspec
        print('Using model from', self.model_run)
        model_users_mtx = run['start'].task['model_users_mtx'].data
        self.top_movies = top_movies(model_users_mtx, self.num_top_movies)
        self.pairs = make_batches(combinations(self.top_movies, 2))
        self.next(self.batch_recommend, foreach='pairs')

    @resources(memory=10000)
    @step
    def batch_recommend(self):
        from movie_model import load_model, recommend
        run = Run(self.model_run)
        model_ann, model_users_mtx, model_movies_mtx = load_model(run)
        self.recs = list(
            recommend(self.input, model_movies_mtx, model_users_mtx, model_ann,
                      self.num_recs))
        self.next(self.join)

    @step
    def join(self, inputs):
        import movie_db
        self.model_run = inputs[0].model_run
        names = inputs[0].movie_names
        top = inputs[0].top_movies
        recs = chain.from_iterable(inp.recs for inp in inputs)
        name_data = [(movie_id, int(movie_id in top), name)
                     for movie_id, name in names.items()]
        self.db_version = movie_db.save(current.run_id, recs, name_data)
        self.next(self.end)

    @step
    def end(self):
        pass
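make_batches and top_movies come from the project-local movie_recs_util module and are not shown here. Purely as an illustration of what the foreach expects, a hypothetical make_batches could chunk the (movie, movie) pairs into fixed-size lists so that each batch_recommend task receives one list via self.input:

from itertools import islice

def make_batches(pairs, batch_size=10000):
    # Hypothetical helper: split an iterator of pairs into lists of at
    # most batch_size items; each list becomes one foreach task input.
    pairs = iter(pairs)
    batches = []
    while True:
        batch = list(islice(pairs, batch_size))
        if not batch:
            return batches
        batches.append(batch)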
Example #7
class FlowSpec(MetaFlowSpec):
    auth_file = IncludeFile('auth_file',
                            is_text=True,
                            help='My input',
                            default=Configuration.auth["file"])
    auth_env = Parameter('auth_env',
                         type=str,
                         default=Configuration.auth["env"])

    def databases(self):
        for auth_type in [self.auth_file, self.auth_env]:
            try:
                return Databases(connections=json.loads(auth_type))
            except:
                continue
        else:
            raise AttributeError("No authentication provided")

    @classmethod
    def hacky_run(cls):
        cmd = [
            'python',
            inspect.getfile(cls),  # The name of the file to be run
            '--no-pylint',
            'run',
        ]
        result = subprocess.run(cmd, capture_output=False)
        return result.returncode == 0
Example #8
class MultiFlowDemo2(FlowSpec):

    flow_dep = Parameter('flow-dep',
                         help="Specifc the tag for the input version",
                         required=True)

    @step
    def start(self):
        flow, run = self.flow_dep.split("/")
        d = DoltRun(flow_name=flow, run_id=run)
        f_input = d.reads[0]
        f_output = d.writes[0]
        with DoltDT(run=self) as dolt:
            self.inp1 = dolt.read_table(f_input.table_name,
                                        commit=f_input.commit)
            self.inp2 = dolt.read_table(f_output.table_name,
                                        commit=f_output.commit)

        self.next(self.middle)

    @step
    def middle(self):
        with DoltDT(run=self) as dolt:

            df = self.inp1 + self.inp2

            dolt.write_table(table_name='baz', df=df, pks=['index'])

        self.next(self.end)

    @step
    def end(self):
        pass
Example #9
class ParallelTest(FlowSpec):
    """
    Test flow to test @parallel.
    """

    num_parallel = Parameter("num_parallel",
                             help="Number of nodes in cluster",
                             default=3)

    @step
    def start(self):
        self.next(self.parallel_step, num_parallel=self.num_parallel)

    @parallel
    @step
    def parallel_step(self):
        self.node_index = current.parallel.node_index
        self.num_nodes = current.parallel.num_nodes
        print("parallel_step: node {} finishing.".format(self.node_index))
        self.next(self.multinode_end)

    @step
    def multinode_end(self, inputs):
        j = 0
        for input in inputs:
            assert input.node_index == j
            assert input.num_nodes == self.num_parallel
            j += 1
        assert j == self.num_parallel
        self.next(self.end)

    @step
    def end(self):
        pass
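For reference, the cluster size is just a run-time parameter, so a local run exercising the @parallel fan-out with four nodes would look roughly like this (the file name is assumed):

python parallel_test.py run --num_parallel 4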
Example #10
class VersioningDemo(FlowSpec):
    bar_version = Parameter('bar-version',
                            help="Specifc the tag for the input version",
                            required=True)

    @step
    def start(self):
        with DoltDT(run=self, database='foo', branch="master") as dolt:
            self.df = dolt.read_table('bar', commit=self.bar_version)

        self.next(self.middle)

    @step
    def middle(self):
        with DoltDT(run=self, database='foo', branch="master") as dolt:

            df = self.df
            df["B"] = df["B"].map(lambda x: x * 2)

            dolt.write_table(table_name='baz', df=df, pks=['index'])

        self.next(self.end)

    @step
    def end(self):
        pass
Example #11
class SucceedsSecondDemo(FlowSpec):

    bar_version = Parameter('bar-version',
                            help="Specifc the tag for the input version",
                            required=True)

    @step
    def start(self):
        with DoltDT(run=self, database='foo', branch="master") as dolt:
            self.df = dolt.read_table('bar')

        first_run = Flow("SucceedsFirstDemo").latest_successful_run
        first_run_ts = datetime.datetime.strptime(first_run.finished_at,
                                                  "%Y-%m-%dT%H:%M:%SZ")
        one_minute_ago = datetime.datetime.now() + datetime.timedelta(
            hours=8) - datetime.timedelta(minutes=1)
        if first_run_ts < one_minute_ago:
            raise Exception(
                "Run `SucceedsFirstDemo` within one minute of `SucceedsSecondDemo`")

        self.next(self.middle)

    @step
    def middle(self):
        with DoltDT(run=self, database='foo', branch="master") as dolt:
            df = self.df
            df["B"] = df["B"].map(lambda x: x * 2)

            dolt.write_table(table_name='baz', df=df, pks=['index'])

        self.next(self.end)

    @step
    def end(self):
        pass
Example #12
class KmeansFlow(FlowSpec):

    num_docs = Parameter('num-docs',
                         help='Number of documents',
                         default=1000000)

    @resources(memory=4000)
    @step
    def start(self):
        import scale_data
        docs = scale_data.load_yelp_reviews(self.num_docs)
        self.mtx, self.cols = scale_data.make_matrix(docs)
        print("matrix size: %dx%d" % self.mtx.shape)
        self.next(self.train_kmeans)

    @resources(cpu=16, memory=4000)
    @step
    def train_kmeans(self):
        from sklearn.cluster import KMeans
        with profile('k-means'):
            kmeans = KMeans(n_clusters=10, verbose=1, n_init=1)
            kmeans.fit(self.mtx)
        self.clusters = kmeans.labels_
        self.next(self.end)

    @step
    def end(self):
        pass
Example #13
class MF_Common_Base(object):
    VALID_KEYRUNECODES = ["M21", "IKO", "THB", "ELD", "M20", "WAR"]
    keyruneCodes = Parameter("keyruneCodes",
                             default=",".join(VALID_KEYRUNECODES))

    VALID_CLEANUP = ["all", "models", "temp", "text_features"]

    file_json = f"{config.DATASET}/%s.json"
    file_parquet = f"{config.OUTPUT_DATASET}/%s_cards.parquet"

    def parse_keyruneCodes(self, ftype="json"):
        """Return the requested keyrune codes whose data files exist on disk."""
        self.list_keyruneCodes = list()
        result = list()

        keyruneCodes = self.keyruneCodes
        if "," not in keyruneCodes:
            keyruneCodes += ","

        fpattern = (MF_Common_Base.file_json
                    if ftype == "json" else MF_Common_Base.file_parquet)

        for code in keyruneCodes.split(","):
            if code in MF_Common_Base.VALID_KEYRUNECODES:
                check_file = fpattern % (code)

                if os.path.exists(check_file):
                    result.append(code)

        self.list_keyruneCodes = result

        return result

    def load_parquet_for_keyrune(self, code):
        """ Load a Parquet file into a DataFrame. """
        parquet_file = f"{config.OUTPUT_DATASET}/{code}_cards.parquet"

        spark = preprocess_fn.spark_session()

        df = spark.read.parquet(parquet_file)

        return df

    def cleanUp_for_code(self, code, what="all"):
        tmp_dir = f"{config.TEMP}/{code}_cards.parquet"
        models_dir = f"{config.SPARK_MODELS}/{code}"
        dataset_dir = f"{config.OUTPUT_DATASET}/{code}_cards.parquet"
        text_features_dir = f"{config.OUTPUT_DATASET}/{code}_cards_text_.parquet"

        clean_dirs = [tmp_dir, models_dir, dataset_dir, text_features_dir]
        for clean_whichdir in clean_dirs:
            print(f"Will clean {clean_whichdir}...")
            shutil.rmtree(clean_whichdir, ignore_errors=True)

    def prepare_dirs_for_code(self, code):
        dir_spark_models = f"{config.SPARK_MODELS}/{code}/"

        if not os.path.exists(dir_spark_models):
            os.mkdir(f"{config.SPARK_MODELS}/{code}/")
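MF_Common_Base is a plain object that only carries shared Parameters and helper methods rather than being a flow itself; presumably a concrete flow mixes it in next to FlowSpec, along these lines (class name and step bodies are illustrative only):

from metaflow import FlowSpec, step

class MTGCardsFlow(MF_Common_Base, FlowSpec):

    @step
    def start(self):
        # Parameters and helpers defined on the mixin resolve as usual.
        codes = self.parse_keyruneCodes(ftype="parquet")
        print("Sets with data on disk:", codes)
        self.next(self.end)

    @step
    def end(self):
        pass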
Example #14
class WandbForeachFlow(FlowSpec):
    seed = Parameter("seed", default=1337)
    test_size = Parameter("test_size", default=0.2)
    raw_data = Parameter(
        "raw_data",
        default=
        "https://gist.githubusercontent.com/tijptjik/9408623/raw/b237fa5848349a14a14e5d4107dc7897c21951f5/wine.csv",
        help="path to the raw data",
    )

    @step
    def start(self):
        self.models = ["RandomForestClassifier", "GradientBoostingClassifier"]
        self.raw_df = pd.read_csv(self.raw_data)
        self.next(self.split_data)

    @wandb_log(datasets=True, models=True, others=True)
    @step
    def split_data(self):
        X = self.raw_df.drop("Wine", axis=1)
        y = self.raw_df[["Wine"]]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.seed)
        self.next(self.train, foreach="models")

    @step
    def train(self):
        self.model_name = self.input
        # self.clf = RandomForestClassifier(random_state=self.seed)
        self.clf = setup_model(self.model_name, random_state=self.seed)
        self.clf.fit(self.X_train, self.y_train)
        self.preds = self.clf.predict(self.X_test)
        self.accuracy = accuracy_score(self.y_test, self.preds)
        self.next(self.join_train)

    @step
    def join_train(self, inputs):
        self.results = [{
            "model_name": input.model_name,
            "preds": input.preds,
            "accuracy": input.accuracy,
        } for input in inputs]
        self.next(self.end)

    @step
    def end(self):
        pass
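setup_model is not shown in this example; going by the commented-out RandomForestClassifier line and the model names listed in start, a plausible sketch simply maps each name to a configured sklearn estimator:

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

def setup_model(model_name, random_state=None):
    # Hypothetical helper: turn a model name from the foreach list into
    # a configured sklearn classifier.
    models = {
        "RandomForestClassifier": RandomForestClassifier,
        "GradientBoostingClassifier": GradientBoostingClassifier,
    }
    return models[model_name](random_state=random_state)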
Example #15
class TaxiPlotterFlow(FlowSpec):

    use_ctas = Parameter('use_ctas_data', help='Use CTAS data', default=False)

    @conda(python='3.8.10')
    @step
    def start(self):
        if self.use_ctas:
            self.paths = Flow('TaxiETLFlow').latest_run.data.paths
        else:
            with S3() as s3:
                objs = s3.list_recursive([URL])
                self.paths = [obj.url for obj in objs]
        #self.paths = athena_ctas()
        print("Processing %d Parquet files" % len(self.paths))
        n = round(len(self.paths) / NUM_SHARDS)
        self.shards = [
            self.paths[i * n:(i + 1) * n] for i in range(NUM_SHARDS - 1)
        ]
        self.shards.append(self.paths[(NUM_SHARDS - 1) * n:])
        self.next(self.preprocess_data, foreach='shards')

    @resources(memory=16000)
    @conda(python='3.8.10', libraries={'pyarrow': '5.0.0'})
    @step
    def preprocess_data(self):
        with S3() as s3:
            from pyarrow.parquet import ParquetDataset
            if self.input:
                objs = s3.get_many(self.input)
                orig_table = ParquetDataset([obj.path for obj in objs]).read()
                self.num_rows_before = orig_table.num_rows
                table = process_data(orig_table)
                self.num_rows_after = table.num_rows
                print('selected %d/%d rows'\
                      % (self.num_rows_after, self.num_rows_before))
                self.lat = table['pickup_latitude'].to_numpy()
                self.lon = table['pickup_longitude'].to_numpy()
        self.next(self.join)

    @resources(memory=16000)
    @conda(python='3.8.10',
           libraries={
               'pyarrow': '5.0.0',
               'datashader': '0.13.0'
           })
    @step
    def join(self, inputs):
        import numpy
        lat = numpy.concatenate([inp.lat for inp in inputs])
        lon = numpy.concatenate([inp.lon for inp in inputs])
        print("Plotting %d locations" % len(lat))
        self.image = taxiviz.visualize(lat, lon)
        self.next(self.end)

    @conda(python='3.8.10')
    @step
    def end(self):
        pass
Example #16
class DiscussionFlow(FlowSpec):
    """
    Parse the discussions scraped from canvas
    """

    remove_index = Parameter('remove',
                             help="The index of the assignment description",
                             default=2)

    @step
    def start(self):
        """
        Load the discussions
        """

        # Load the discussions
        discussions = pd.read_csv('canvas.csv')

        # Drop unused columns
        discussions = discussions.drop(
            ['web-scraper-order', 'web-scraper-start-url'], axis=1)

        # Remove the discussion description
        self.discussions = discussions.drop(
            discussions.index[self.remove_index]).reset_index(drop=True)

        self.next(self.clean)

    @step
    def clean(self):
        """
        Clean the discussion posts
        """

        # Remove all the text after the 'Edited by' text
        def clean_string(text):
            return re.sub(r'(?=Edited).*$', ' ', text.replace('\n',
                                                              ' ')).strip()

        # Clean the text
        self.discussions['discussion'] = self.discussions[
            'discussion_subentries'].apply(clean_string)

        # Drop the discussion_subentries column
        self.discussions = self.discussions.drop(['discussion_subentries'],
                                                 axis=1)

        self.next(self.end)

    @step
    def end(self):
        """
        Store the dataframe
        """

        print(f'Parsed {len(self.discussions)} discussions')

        self.discussions.to_csv('canvas_parsed.csv', index=False)
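For reference, the clean_string regex drops everything from the first 'Edited' marker onward after collapsing newlines, so on a typical subentry it behaves like this (the sample text is made up):

import re

def clean_string(text):
    return re.sub(r'(?=Edited).*$', ' ', text.replace('\n', ' ')).strip()

print(clean_string("Great point!\nEdited by Jane Doe on Oct 2"))  # -> "Great point!"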
Example #17
class JsonCleaningFlow(FlowSpec):
    """
    A flow to clean JSON files and persist them in S3.

    The flow performs the following steps:
    1) Load input files from S3
    2) Clean each dataframe
    3) Save the cleaned dataframes
    4) Join the branches and end

    """

    inputFile = Parameter("inputFile", help="file uri to read with pandas")
    outputDir = Parameter("outputDir",  help="uri of a directory", default="s3://...")

    @batch(cpu=1, memory=500)
    @step
    def start(self):
        """
        Load files from S3
        """
        self.prefixDataframeTuple = []
        self.next(self.clean_dataframe, foreach="prefixDataframeTuple")

    @batch(cpu=1, memory=500)
    @retry
    @step
    def clean_dataframe(self):
        """clean"""
        self.next(self.save_dataframe)

    @step
    def save_dataframe(self):
        """save"""
        self.next(self.join)

    @step
    def join(self, inputs):
        """ This does nothing but is unfortunately required by metaflow"""
        self.next(self.end)

    @step
    def end(self):
        """
        End the flow.
        """
        pass
Example #18
class ForecastFlow(FlowSpec):

    appid = Parameter('appid', default='6e5da44abe65e3320be635abfb9b0aa5')
    location = Parameter('location', default='36.1699,115.1398')

    @conda(python='3.8.10', libraries={'sktime': '0.6.1'})
    @step
    def start(self):
        from openweatherdata import get_historical_weather_data, series_to_list
        lat, lon = map(float, self.location.split(','))
        self.pd_past5days = get_historical_weather_data(self.appid, lat, lon)
        self.past5days = series_to_list(self.pd_past5days)
        self.next(self.forecast)

    @conda(python='3.8.10', libraries={'sktime': '0.6.1'})
    @step
    def forecast(self):
        from openweatherdata import series_to_list
        from sktime.forecasting.theta import ThetaForecaster
        import numpy
        forecaster = ThetaForecaster(sp=48)
        forecaster.fit(self.pd_past5days)
        self.pd_predictions = forecaster.predict(numpy.arange(1, 48))
        self.predictions = series_to_list(self.pd_predictions)
        self.next(self.plot)

    @conda(python='3.8.10', libraries={'sktime': '0.6.1', 'seaborn': '0.11.1'})
    @step
    def plot(self):
        from sktime.utils.plotting import plot_series
        from io import BytesIO
        buf = BytesIO()
        fig, _ = plot_series(self.pd_past5days,
                             self.pd_predictions,
                             labels=['past5days', 'predictions'])
        fig.savefig(buf)
        self.plot = buf.getvalue()
        self.next(self.end)

    @conda(python='3.8.10')
    @step
    def end(self):
        pass
Example #19
class ProphetFlow(FlowSpec):
    """
    ProphetFlow uses Facebook Prophet to predict future values of a
    time series.
    """
    data_file = IncludeFile('datafile',
                            is_text=True,
                            help='Time series data file - csv file format',
                            default='data/daily-min-temperatures.txt')
    columns_mapping = Parameter(
        'columns',
        default={
            'Date': 'ds',
            'Temp': 'y'
        },
        help="Rename columns according to Prophet standards")

    @step
    def start(self):
        """
        Raw data is loaded and prepared
        """
        # Load csv in pandas dataframe
        self.df = pd.read_csv(StringIO(self.data_file))

        # Rename columns to meet Prophet input dataframe standards
        self.df.rename(columns=self.columns_mapping, inplace=True)

        # Convert Date column to datetime64 dtype
        self.df['ds'] = pd.to_datetime(self.df['ds'],
                                       infer_datetime_format=True)

        self.next(self.train)

    @step
    def train(self):
        """
        A new Prophet model is fitted.
        """
        # Fit a new model using defaults
        self.m = Prophet()
        self.m.fit(self.df)

        self.next(self.end)

    @step
    def end(self):
        """
        Last step, process is finished
        """
        print("ProphetFlow is all done.")
Example #20
class HospitalProcedurePriceVarianceByState(FlowSpec):

    hospital_price_analysis_db = Parameter(
        "hospital-price-analysis-db",
        help="Dolt database to write analysis to",
        required=True)

    hospital_price_analysis_db_branch = Parameter(
        "hospital-price-analysis-db-branch",
        help="Specify branch version",
        default="master",
    )

    historical_run_path = Parameter(
        "historical-run-path",
        help="Read the same data as a path to a previous run")

    @step
    def start(self):
        analysis_conf = DoltConfig(
            database=self.hospital_price_analysis_db,
            branch=self.hospital_price_analysis_db_branch)
        audit = Run(self.historical_run_path
                    ).data.dolt if self.historical_run_path else None

        with DoltDT(run=self, config=analysis_conf, audit=audit) as dolt:
            median_price_by_state = dolt.read("state_procedure_medians")
            variance_by_procedure = median_price_by_state.groupby(
                "code").var().reset_index()
            variance_by_procedure = variance_by_procedure[
                ~variance_by_procedure['code'].str.startswith('nan')]
            dolt.write(variance_by_procedure, "variance_by_procedure")

        self.next(self.end)

    @step
    def end(self):
        pass
Example #21
class KmeansFlow(FlowSpec):

    num_docs = Parameter('num-docs', help='Number of documents', default=1000)

    @resources(memory=1000)
    @step
    def start(self):
        import scale_data
        scale_data.load_yelp_reviews(self.num_docs)
        self.next(self.end)

    @step
    def end(self):
        pass
Example #22
class SFNTestFlow(FlowSpec):

    num = Parameter('num',
                    help="Give a number",
                    default=1)

    @step
    def start(self):
        print("The number defined as a parameter is", self.num)
        self.next(self.end)

    @step
    def end(self):
        print('done!')
Example #23
class ClassifierPredictFlow(FlowSpec):

    vector = Parameter('vector', type=JSONType, required=True)

    @step
    def start(self):
        run = Flow('ClassifierTrainFlow').latest_run
        self.train_run_id = run.pathspec
        self.model = run['end'].task.data.model
        print("Input vector", self.vector)
        self.next(self.end)

    @step
    def end(self):
        print("Predicted class", self.model.predict([self.vector])[0])
Example #24
class ParameterFlow(FlowSpec):

    animal = Parameter("creature",
                       help="Specify an animal",
                       required=True)

    count = Parameter("count",  
                      help="Number of animals",
                      default=1)

    ratio = Parameter("ratio",
                      help="Ratio between 0.0 and 1.0",
                      type=float)

    @step
    def start(self):
        print(self.animal, "is a string of", len(self.animal), "characters")
        print("Count is an integer: %s+1=%s" % (self.count, self.count + 1))
        print("Ratio is a", type(self.ratio), "whose value is", self.ratio)
        self.next(self.end)

    @step
    def end(self):
        print("done!")
Example #25
class S3BenchmarkFlow(FlowSpec):

    local_dir = Parameter('local_dir',
                          help='Read local files from this directory')

    num = Parameter('num_files',
                    help='maximum number of files to read',
                    default=50)

    @step
    def start(self):
        with S3() as s3:
            with profile('Loading and processing'):
                if self.local_dir:
                    files = [
                        os.path.join(self.local_dir, f)
                        for f in os.listdir(self.local_dir)
                    ][:self.num]
                else:
                    files = load_s3(s3, self.num)

                print("Reading %d objects" % len(files))
                stats = {}
                with profile('reading', stats_dict=stats):
                    size = sum(
                        parallel_map(lambda x: len(open(x, 'rb').read()),
                                     files)) / 1024**3

                read_gbps = (size * 8) / (stats['reading'] / 1000.)
                print("Read %2.fGB. Throughput: %2.1f Gb/s" %
                      (size, read_gbps))
        self.next(self.end)

    @step
    def end(self):
        pass
Example #26
class JSONParameterFlow(FlowSpec):

    mapping = Parameter('mapping',
                        help="Specify a mapping",
                        default='{"some": "default"}',
                        type=JSONType)

    @step
    def start(self):
        for key, value in self.mapping.items():
            print('key', key, 'value', value)
        self.next(self.end)

    @step
    def end(self):
        print("done!")
Example #27
class AuditDemo(FlowSpec):
    read_run_id = Parameter("read-run-id",
                            help="Pass a run_id for a VersionDemo flow",
                            required=True)

    @step
    def start(self):
        audit = Run(f"VersioningDemo/{self.read_run_id}").data.dolt
        with DoltDT(run=self, audit=audit) as dolt:
            df = dolt.read("bar")

        self.next(self.end)

    @step
    def end(self):
        print(json.dumps(self.dolt, indent=4))
Example #28
class CSVFileFlow(FlowSpec):

    data = IncludeFile('csv', help="CSV file to be parsed", is_text=True)

    delimiter = Parameter('delimiter', help="delimiter", default=',')

    @step
    def start(self):
        fileobj = StringIO(self.data)
        for i, row in enumerate(csv.reader(fileobj, delimiter=self.delimiter)):
            print("row %d: %s" % (i, row))
        self.next(self.end)

    @step
    def end(self):
        print('done!')
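IncludeFile parameters take a local path on the command line; Metaflow reads the file when the run starts and the text ends up in self.data, so a run looks roughly like this (the file name is assumed):

python csvfile_flow.py run --csv ./input.csv --delimiter ';'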
Example #29
class TaxiDataLoader(FlowSpec):

    table = Parameter('table', help='Table name', default='nyc_taxi')

    @conda(python='3.8.10', libraries={'pyarrow': '5.0.0'})
    @step
    def start(self):
        import pyarrow.parquet as pq

        def make_key(obj):
            key = '%s/month=%s/%s' % tuple([self.table] + obj.key.split('/'))
            return key, obj.path

        def hive_field(f):
            return f.name, TYPES.get(str(f.type), str(f.type))

        with S3() as s3down:
            with profile('Downloading data'):
                loaded = list(map(make_key, s3down.get_recursive([URL])))
            table = pq.read_table(loaded[0][1])
            self.schema = dict(map(hive_field, table.schema))
            with S3(run=self) as s3up:
                with profile('Uploading data'):
                    uploaded = s3up.put_files(loaded)
                key, url = uploaded[0]
                self.s3_prefix = url[:-(len(key) - len(self.table))]
        self.next(self.end)

    @conda(python='3.8.10', libraries={'awswrangler': '1.10.1'})
    @step
    def end(self):
        import awswrangler as wr
        try:
            wr.catalog.create_database(name=GLUE_DB)
        except:
            pass
        wr.athena.create_athena_bucket()
        with profile('Creating table'):
            wr.catalog.create_parquet_table(database=GLUE_DB,
                                            table=self.table,
                                            path=self.s3_prefix,
                                            columns_types=self.schema,
                                            partitions_types={'month': 'int'},
                                            mode='overwrite')
            wr.athena.repair_table(self.table, database=GLUE_DB)
Example #30
class ManyKmeansFlow(FlowSpec):

    num_docs = Parameter('num-docs',
                         help='Number of documents',
                         default=1000000)

    @resources(memory=4000)
    @step
    def start(self):
        import scale_data
        docs = scale_data.load_yelp_reviews(self.num_docs)
        self.mtx, self.cols = scale_data.make_matrix(docs)
        self.k_params = list(range(5, 55, 5))
        self.next(self.train_kmeans, foreach='k_params')

    @resources(cpu=4, memory=4000)
    @step
    def train_kmeans(self):
        from sklearn.cluster import KMeans
        self.k = self.input
        with profile('k-means'):
            kmeans = KMeans(n_clusters=self.k, verbose=1, n_init=1)
            kmeans.fit(self.mtx)
        self.clusters = kmeans.labels_
        self.next(self.analyze)

    @step
    def analyze(self):
        from analyze_kmeans import top_words
        self.top = top_words(self.k, self.clusters, self.mtx, self.cols)
        self.next(self.join)

    @step
    def join(self, inputs):
        self.top = {inp.k: inp.top for inp in inputs}
        self.next(self.end)

    @step
    def end(self):
        pass
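The k_params foreach launches one train_kmeans task per value of k; when running locally the concurrency can be capped with Metaflow's --max-workers option, e.g. (the file name is assumed):

python many_kmeans.py run --num-docs 100000 --max-workers 4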