class ForecastFlow(FlowSpec):

    appid = Parameter('appid', required=True)
    location = Parameter('location', default='36.1699,115.1398')

    @conda(python='3.8.10', libraries={'sktime': '0.6.1'})
    @step
    def start(self):
        from openweatherdata import get_historical_weather_data, series_to_list
        lat, lon = map(float, self.location.split(','))
        self.pd_past5days = get_historical_weather_data(self.appid, lat, lon)
        self.past5days = series_to_list(self.pd_past5days)
        self.next(self.plot)

    @conda(python='3.8.10', libraries={'sktime': '0.6.1', 'seaborn': '0.11.1'})
    @step
    def plot(self):
        from sktime.utils.plotting import plot_series
        from io import BytesIO
        buf = BytesIO()
        fig, _ = plot_series(self.pd_past5days, labels=['past5days'])
        fig.savefig(buf)
        self.plot = buf.getvalue()
        self.next(self.end)

    @conda(python='3.8.10')
    @step
    def end(self):
        pass
class CoocFlow(FlowSpec):

    algo = Parameter('algo', help='Co-oc Algorithm', default='plain')
    num_cpu = Parameter('num-cpu', help='Number of CPU cores', default=32)
    num_docs = Parameter('num-docs', help='Number of documents', default=1000)

    @resources(memory=4000)
    @step
    def start(self):
        import scale_data
        docs = scale_data.load_yelp_reviews(self.num_docs)
        self.mtx, self.cols = scale_data.make_matrix(docs, binary=True)
        print("matrix size: %dx%d" % self.mtx.shape)
        self.next(self.compute_cooc)

    @resources(cpu=32, memory=64000)
    @step
    def compute_cooc(self):
        module = import_module('cooc_%s' % self.algo)
        with profile('Computing co-occurrences with the %s algorithm' % self.algo):
            self.cooc = module.compute_cooc(self.mtx, self.num_cpu)
        self.next(self.end)

    @step
    def end(self):
        pass
class WandbExampleFlowDecoClass(FlowSpec):
    # Not obvious how to support metaflow.IncludeFile
    seed = Parameter("seed", default=1337)
    test_size = Parameter("test_size", default=0.2)
    raw_data = Parameter(
        "raw_data",
        default="https://gist.githubusercontent.com/tijptjik/9408623/raw/b237fa5848349a14a14e5d4107dc7897c21951f5/wine.csv",
        help="path to the raw data",
    )

    @step
    def start(self):
        self.raw_df = pd.read_csv(self.raw_data)
        self.next(self.split_data)

    @step
    def split_data(self):
        X = self.raw_df.drop("Wine", axis=1)
        y = self.raw_df[["Wine"]]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.seed)
        self.next(self.train)

    @step
    def train(self):
        self.clf = RandomForestClassifier(random_state=self.seed)
        self.clf.fit(self.X_train, self.y_train)
        self.next(self.end)

    @step
    def end(self):
        self.preds = self.clf.predict(self.X_test)
        self.accuracy = accuracy_score(self.y_test, self.preds)
class CSPageRankFinder(FlowSpec):
    '''
    Build the citation graph as JSON from the dataset, save it to S3,
    and use it later for calculating PageRank.
    '''
    tolerance = Parameter('tolerance',
                          default=1e-8,
                          help='Error tolerance for PageRank convergence')
    max_iter = Parameter('max_iter',
                         default=100,
                         help='Max iterations to run')

    @batch(cpu=16, memory=256000)
    @conda(python='3.7.2', libraries=CONDA_DEPS)
    @step
    def start(self):
        graph_json = self.load_graph()
        import networkx as nx
        import page_rank
        G = nx.from_dict_of_dicts(graph_json, create_using=nx.DiGraph)
        print(f"Size of the graph is {len(graph_json)}")
        del graph_json
        rank_dict, err_log = page_rank.pagerank(G,
                                                tol=self.tolerance,
                                                max_iter=self.max_iter)
        self.error_log = err_log
        self.rank_save_path = self.save_json(
            rank_dict, save_name=f'page-rank-{current.run_id}.json')
        print(f"Saved rank at {self.rank_save_path}")
        self.next(self.end)

    def save_json(self, data_json, tmp_pth='temp_save.json', save_name='data.json'):
        from metaflow import S3
        final_path = os.path.join(SAVE_PROCESSED_DATA_PATH,
                                  self.__class__.__name__)
        with S3(s3root=final_path) as s3:
            print("Saving data_json to S3")
            with open(tmp_pth, 'w') as f:
                json.dump(data_json, f)
            put_pth = s3.put_files([(save_name, tmp_pth)])[0][1]
            return put_pth

    def load_graph(self):
        from metaflow import S3
        import json
        with S3(s3root=PROCESSED_CS_PATH) as s3:
            s3_resp = s3.get('citation_network_graph.json')
            return json.loads(s3_resp.text)

    @step
    def end(self):
        print("Done computation")
class HospitalPriceStateMedians(FlowSpec):

    hospital_price_db = Parameter("hospital-price-db",
                                  help="Database of hospital procedure prices",
                                  required=True)
    hospital_price_db_branch = Parameter(
        "hospital-price-db-branch",
        help="Specify branch version",
        default="master",
    )
    historical_run_path = Parameter(
        "historical-run-path",
        help="Read the same data as a path to a previous run")
    hospital_price_analysis_db = Parameter(
        "hospital-price-analysis-db",
        help="Dolt database to write analysis to",
        required=True)
    hospital_price_analysis_db_branch = Parameter(
        "hospital-price-analysis-db-branch",
        help="Specify branch version",
        default="master",
    )

    @step
    def start(self):
        read_conf = DoltConfig(database=self.hospital_price_db,
                               branch=self.hospital_price_db_branch)
        with DoltDT(run=self.historical_run_path or self,
                    config=read_conf) as dolt:
            prices_sql = "SELECT npi_number, code, payer, price FROM prices"
            prices = dolt.sql(prices_sql, as_key='prices').set_index('npi_number')
            hospitals_sql = "SELECT state, npi_number FROM hospitals"
            hospitals = dolt.sql(hospitals_sql,
                                 as_key='hospitals').set_index('npi_number')

        prices_by_state = prices.join(hospitals, how='left', on='npi_number')
        median_price_by_state = prices_by_state.groupby(
            ['state', 'code']).median().reset_index()

        write_conf = DoltConfig(database=self.hospital_price_analysis_db,
                                branch=self.hospital_price_analysis_db_branch)
        with DoltDT(run=self, config=write_conf) as dolt:
            dolt.write(median_price_by_state, "state_procedure_medians",
                       ["state", "code"])

        self.next(self.end)

    @step
    def end(self):
        pass
class MovieRecsFlow(FlowSpec):

    num_recs = Parameter('num_recs',
                         help="Number of recommendations per user",
                         default=3)
    num_top_movies = Parameter('num_top',
                               help="Produce recs for num_top movies",
                               default=100)

    @resources(memory=10000)
    @step
    def start(self):
        from movie_recs_util import make_batches, top_movies
        run = Flow('MovieTrainFlow').latest_successful_run
        self.movie_names = run['start'].task['movie_names'].data
        self.model_run = run.pathspec
        print('Using model from', self.model_run)
        model_users_mtx = run['start'].task['model_users_mtx'].data
        self.top_movies = top_movies(model_users_mtx, self.num_top_movies)
        self.pairs = make_batches(combinations(self.top_movies, 2))
        self.next(self.batch_recommend, foreach='pairs')

    @resources(memory=10000)
    @step
    def batch_recommend(self):
        from movie_model import load_model, recommend
        run = Run(self.model_run)
        model_ann, model_users_mtx, model_movies_mtx = load_model(run)
        self.recs = list(
            recommend(self.input, model_movies_mtx, model_users_mtx,
                      model_ann, self.num_recs))
        self.next(self.join)

    @step
    def join(self, inputs):
        import movie_db
        self.model_run = inputs[0].model_run
        names = inputs[0].movie_names
        top = inputs[0].top_movies
        recs = chain.from_iterable(inp.recs for inp in inputs)
        name_data = [(movie_id, int(movie_id in top), name)
                     for movie_id, name in names.items()]
        self.db_version = movie_db.save(current.run_id, recs, name_data)
        self.next(self.end)

    @step
    def end(self):
        pass
class FlowSpec(MetaFlowSpec):

    auth_file = IncludeFile('auth_file',
                            is_text=True,
                            help='My input',
                            default=Configuration.auth["file"])
    auth_env = Parameter('auth_env', type=str, default=Configuration.auth["env"])

    def databases(self):
        for auth_type in [self.auth_file, self.auth_env]:
            try:
                return Databases(connections=json.loads(auth_type))
            except:
                continue
        else:
            raise AttributeError("No authentication provided")

    @classmethod
    def hacky_run(cls):
        cmd = [
            'python',
            inspect.getfile(cls),  # The name of the file to be run
            '--no-pylint',
            'run',
        ]
        result = subprocess.run(cmd, capture_output=False)
        return result.returncode == 0
class MultiFlowDemo2(FlowSpec):

    flow_dep = Parameter('flow-dep',
                         help="Specify the flow/run_id of the input version",
                         required=True)

    @step
    def start(self):
        flow, run = self.flow_dep.split("/")
        d = DoltRun(flow_name=flow, run_id=run)
        f_input = d.reads[0]
        f_output = d.writes[0]
        with DoltDT(run=self) as dolt:
            self.inp1 = dolt.read_table(f_input.table_name,
                                        commit=f_input.commit)
            self.inp2 = dolt.read_table(f_output.table_name,
                                        commit=f_output.commit)
        self.next(self.middle)

    @step
    def middle(self):
        with DoltDT(run=self) as dolt:
            df = self.inp1 + self.inp2
            dolt.write_table(table_name='baz', df=df, pks=['index'])
        self.next(self.end)

    @step
    def end(self):
        pass
class ParallelTest(FlowSpec):
    """
    Test flow to test @parallel.
    """
    num_parallel = Parameter("num_parallel",
                             help="Number of nodes in cluster",
                             default=3)

    @step
    def start(self):
        self.next(self.parallel_step, num_parallel=self.num_parallel)

    @parallel
    @step
    def parallel_step(self):
        self.node_index = current.parallel.node_index
        self.num_nodes = current.parallel.num_nodes
        print("parallel_step: node {} finishing.".format(self.node_index))
        self.next(self.multinode_end)

    @step
    def multinode_end(self, inputs):
        j = 0
        for input in inputs:
            assert input.node_index == j
            assert input.num_nodes == self.num_parallel
            j += 1
        assert j == self.num_parallel
        self.next(self.end)

    @step
    def end(self):
        pass
class VersioningDemo(FlowSpec):

    bar_version = Parameter('bar-version',
                            help="Specify the tag for the input version",
                            required=True)

    @step
    def start(self):
        with DoltDT(run=self, database='foo', branch="master") as dolt:
            self.df = dolt.read_table('bar', commit=self.bar_version)
        self.next(self.middle)

    @step
    def middle(self):
        with DoltDT(run=self, database='foo', branch="master") as dolt:
            df = self.df
            df["B"] = df["B"].map(lambda x: x * 2)
            dolt.write_table(table_name='baz', df=df, pks=['index'])
        self.next(self.end)

    @step
    def end(self):
        pass
class SucceedsSecondDemo(FlowSpec):

    bar_version = Parameter('bar-version',
                            help="Specify the tag for the input version",
                            required=True)

    @step
    def start(self):
        with DoltDT(run=self, database='foo', branch="master") as dolt:
            self.df = dolt.read_table('bar')

        first_run = Flow("SucceedsFirstDemo").latest_successful_run
        first_run_ts = datetime.datetime.strptime(first_run.finished_at,
                                                  "%Y-%m-%dT%H:%M:%SZ")
        one_minute_ago = datetime.datetime.now() + datetime.timedelta(
            hours=8) - datetime.timedelta(minutes=1)
        if first_run_ts < one_minute_ago:
            raise Exception(
                "Run `FirstDemo` within one minute of `SecondDemo`")
        self.next(self.middle)

    @step
    def middle(self):
        with DoltDT(run=self, database='foo', branch="master") as dolt:
            df = self.df
            df["B"] = df["B"].map(lambda x: x * 2)
            dolt.write_table(table_name='baz', df=df, pks=['index'])
        self.next(self.end)

    @step
    def end(self):
        pass
class KmeansFlow(FlowSpec):

    num_docs = Parameter('num-docs', help='Number of documents', default=1000000)

    @resources(memory=4000)
    @step
    def start(self):
        import scale_data
        docs = scale_data.load_yelp_reviews(self.num_docs)
        self.mtx, self.cols = scale_data.make_matrix(docs)
        print("matrix size: %dx%d" % self.mtx.shape)
        self.next(self.train_kmeans)

    @resources(cpu=16, memory=4000)
    @step
    def train_kmeans(self):
        from sklearn.cluster import KMeans
        with profile('k-means'):
            kmeans = KMeans(n_clusters=10, verbose=1, n_init=1)
            kmeans.fit(self.mtx)
        self.clusters = kmeans.labels_
        self.next(self.end)

    @step
    def end(self):
        pass
class MF_Common_Base(object):

    VALID_KEYRUNECODES = ["M21", "IKO", "THB", "ELD", "M20", "WAR"]
    keyruneCodes = Parameter("keyruneCodes",
                             default=",".join(
                                 keyrune for keyrune in VALID_KEYRUNECODES))

    VALID_CLEANUP = ["all", "models", "temp", "text_features"]

    file_json = f"{config.DATASET}/%s.json"
    file_parquet = f"{config.OUTPUT_DATASET}/%s_cards.parquet"

    def parse_keyruneCodes(self, ftype="json"):
        """
        Parse the keyruneCodes parameter, keeping only valid codes whose
        data file exists for the given file type.
        """
        self.list_keyruneCodes = list()
        result = list()
        keyruneCodes = self.keyruneCodes
        if "," not in keyruneCodes:
            keyruneCodes += ","
        fpattern = (MF_Common_Base.file_json
                    if ftype == "json" else MF_Common_Base.file_parquet)
        for code in keyruneCodes.split(","):
            if code in MF_Common_Base.VALID_KEYRUNECODES:
                check_file = fpattern % (code)
                if os.path.exists(check_file):
                    result.append(code)
        self.list_keyruneCodes = result
        return result

    def load_parquet_for_keyrune(self, code):
        """
        Load a Parquet file into a DataFrame.
        """
        parquet_file = f"{config.OUTPUT_DATASET}/{code}_cards.parquet"
        spark = preprocess_fn.spark_session()
        df = spark.read.parquet(parquet_file)
        return df

    def cleanUp_for_code(self, code, what="all"):
        tmp_dir = f"{config.TEMP}/{code}_cards.parquet"
        models_dir = f"{config.SPARK_MODELS}/{code}"
        dataset_dir = f"{config.OUTPUT_DATASET}/{code}_cards.parquet"
        text_features_dir = f"{config.OUTPUT_DATASET}/{code}_cards_text_.parquet"
        clean_dirs = [tmp_dir, models_dir, dataset_dir, text_features_dir]
        for clean_whichdir in clean_dirs:
            print(f"Will clean {clean_whichdir}...")
            shutil.rmtree(clean_whichdir, ignore_errors=True)

    def prepare_dirs_for_code(self, code):
        dir_spark_models = f"{config.SPARK_MODELS}/{code}/"
        if not os.path.exists(dir_spark_models):
            os.mkdir(f"{config.SPARK_MODELS}/{code}/")
class WandbForeachFlow(FlowSpec):

    seed = Parameter("seed", default=1337)
    test_size = Parameter("test_size", default=0.2)
    raw_data = Parameter(
        "raw_data",
        default="https://gist.githubusercontent.com/tijptjik/9408623/raw/b237fa5848349a14a14e5d4107dc7897c21951f5/wine.csv",
        help="path to the raw data",
    )

    @step
    def start(self):
        self.models = ["RandomForestClassifier", "GradientBoostingClassifier"]
        self.raw_df = pd.read_csv(self.raw_data)
        self.next(self.split_data)

    @wandb_log(datasets=True, models=True, others=True)
    @step
    def split_data(self):
        X = self.raw_df.drop("Wine", axis=1)
        y = self.raw_df[["Wine"]]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.seed)
        self.next(self.train, foreach="models")

    @step
    def train(self):
        self.model_name = self.input
        # self.clf = RandomForestClassifier(random_state=self.seed)
        self.clf = setup_model(self.model_name, random_state=self.seed)
        self.clf.fit(self.X_train, self.y_train)
        self.preds = self.clf.predict(self.X_test)
        self.accuracy = accuracy_score(self.y_test, self.preds)
        self.next(self.join_train)

    @step
    def join_train(self, inputs):
        self.results = [{
            "model_name": input.model_name,
            "preds": input.preds,
            "accuracy": input.accuracy,
        } for input in inputs]
        self.next(self.end)

    @step
    def end(self):
        pass
class TaxiPlotterFlow(FlowSpec):

    use_ctas = Parameter('use_ctas_data', help='Use CTAS data', default=False)

    @conda(python='3.8.10')
    @step
    def start(self):
        if self.use_ctas:
            self.paths = Flow('TaxiETLFlow').latest_run.data.paths
        else:
            with S3() as s3:
                objs = s3.list_recursive([URL])
                self.paths = [obj.url for obj in objs]
        #self.paths = athena_ctas()
        print("Processing %d Parquet files" % len(self.paths))
        n = round(len(self.paths) / NUM_SHARDS)
        self.shards = [
            self.paths[i * n:(i + 1) * n] for i in range(NUM_SHARDS - 1)
        ]
        self.shards.append(self.paths[(NUM_SHARDS - 1) * n:])
        self.next(self.preprocess_data, foreach='shards')

    @resources(memory=16000)
    @conda(python='3.8.10', libraries={'pyarrow': '5.0.0'})
    @step
    def preprocess_data(self):
        with S3() as s3:
            from pyarrow.parquet import ParquetDataset
            if self.input:
                objs = s3.get_many(self.input)
                orig_table = ParquetDataset([obj.path for obj in objs]).read()
                self.num_rows_before = orig_table.num_rows
                table = process_data(orig_table)
                self.num_rows_after = table.num_rows
                print('selected %d/%d rows'
                      % (self.num_rows_after, self.num_rows_before))
                self.lat = table['pickup_latitude'].to_numpy()
                self.lon = table['pickup_longitude'].to_numpy()
        self.next(self.join)

    @resources(memory=16000)
    @conda(python='3.8.10',
           libraries={
               'pyarrow': '5.0.0',
               'datashader': '0.13.0'
           })
    @step
    def join(self, inputs):
        import numpy
        lat = numpy.concatenate([inp.lat for inp in inputs])
        lon = numpy.concatenate([inp.lon for inp in inputs])
        print("Plotting %d locations" % len(lat))
        self.image = taxiviz.visualize(lat, lon)
        self.next(self.end)

    @conda(python='3.8.10')
    @step
    def end(self):
        pass
class DiscussionFlow(FlowSpec):
    """
    Parse the discussions scraped from Canvas
    """
    remove_index = Parameter('remove',
                             help="The index of the assignment description",
                             default=2)

    @step
    def start(self):
        """
        Load the discussions
        """
        # Load the discussions
        discussions = pd.read_csv('canvas.csv')

        # Drop unused columns
        discussions = discussions.drop(
            ['web-scraper-order', 'web-scraper-start-url'], axis=1)

        # Remove the discussion description
        self.discussions = discussions.drop(
            discussions.index[self.remove_index]).reset_index(drop=True)

        self.next(self.clean)

    @step
    def clean(self):
        """
        Clean the discussion posts
        """
        # Remove all the text after the 'Edited by' text
        def clean_string(text):
            return re.sub(r'(?=Edited).*$', ' ',
                          text.replace('\n', ' ')).strip()

        # Clean the text
        self.discussions['discussion'] = self.discussions[
            'discussion_subentries'].apply(clean_string)

        # Drop the discussion_subentries column
        self.discussions = self.discussions.drop(['discussion_subentries'],
                                                 axis=1)

        self.next(self.end)

    @step
    def end(self):
        """
        Store the dataframe
        """
        print(f'Parsed {len(self.discussions)} discussions')
        self.discussions.to_csv('canvas_parsed.csv', index=False)
class JsonCleaningFlow(FlowSpec):
    """
    A flow to clean JSONs and persist them in S3

    The flow performs the following steps:
    1) load the input files from S3
    2) clean each dataframe
    3) save the cleaned dataframes
    """
    inputFile = Parameter("inputFile", help="file uri to read with pandas")
    outputDir = Parameter("outputDir",
                          help="uri of a directory",
                          default="s3://...")

    @batch(cpu=1, memory=500)
    @step
    def start(self):
        """
        Load files from S3
        """
        self.prefixDataframeTuple = []
        self.next(self.clean_dataframe, foreach="prefixDataframeTuple")

    @batch(cpu=1, memory=500)
    @retry
    @step
    def clean_dataframe(self):
        """clean"""
        self.next(self.save_dataframe)

    @step
    def save_dataframe(self):
        """save"""
        self.next(self.join)

    @step
    def join(self, inputs):
        """ This does nothing but is unfortunately required by metaflow"""
        self.next(self.end)

    @step
    def end(self):
        """
        End the flow.
        """
        pass
class ForecastFlow(FlowSpec):

    appid = Parameter('appid', default='6e5da44abe65e3320be635abfb9b0aa5')
    location = Parameter('location', default='36.1699,115.1398')

    @conda(python='3.8.10', libraries={'sktime': '0.6.1'})
    @step
    def start(self):
        from openweatherdata import get_historical_weather_data, series_to_list
        lat, lon = map(float, self.location.split(','))
        self.pd_past5days = get_historical_weather_data(self.appid, lat, lon)
        self.past5days = series_to_list(self.pd_past5days)
        self.next(self.forecast)

    @conda(python='3.8.10', libraries={'sktime': '0.6.1'})
    @step
    def forecast(self):
        from openweatherdata import series_to_list
        from sktime.forecasting.theta import ThetaForecaster
        import numpy
        forecaster = ThetaForecaster(sp=48)
        forecaster.fit(self.pd_past5days)
        self.pd_predictions = forecaster.predict(numpy.arange(1, 48))
        self.predictions = series_to_list(self.pd_predictions)
        self.next(self.plot)

    @conda(python='3.8.10', libraries={'sktime': '0.6.1', 'seaborn': '0.11.1'})
    @step
    def plot(self):
        from sktime.utils.plotting import plot_series
        from io import BytesIO
        buf = BytesIO()
        fig, _ = plot_series(self.pd_past5days,
                             self.pd_predictions,
                             labels=['past5days', 'predictions'])
        fig.savefig(buf)
        self.plot = buf.getvalue()
        self.next(self.end)

    @conda(python='3.8.10')
    @step
    def end(self):
        pass
class ProphetFlow(FlowSpec):
    """
    ProphetFlow uses Facebook Prophet to predict future values of a time series.
    """
    data_file = IncludeFile('datafile',
                            is_text=True,
                            help='Time series data file - csv file format',
                            default='data/daily-min-temperatures.txt')
    columns_mapping = Parameter(
        'columns',
        default={
            'Date': 'ds',
            'Temp': 'y'
        },
        help="Rename columns according to Prophet standards")

    @step
    def start(self):
        """
        Raw data is loaded and prepared
        """
        # Load csv in pandas dataframe
        self.df = pd.read_csv(StringIO(self.data_file))

        # Rename columns to meet Prophet input dataframe standards
        self.df.rename(columns=self.columns_mapping, inplace=True)

        # Convert Date column to datetime64 dtype
        self.df['ds'] = pd.to_datetime(self.df['ds'],
                                       infer_datetime_format=True)

        self.next(self.train)

    @step
    def train(self):
        """
        A new Prophet model is fitted.
        """
        # Fit a new model using defaults
        self.m = Prophet()
        self.m.fit(self.df)
        self.next(self.end)

    @step
    def end(self):
        """
        Last step, process is finished
        """
        print("ProphetFlow is all done.")
class HospitalProcedurePriceVarianceByState(FlowSpec):

    hospital_price_analysis_db = Parameter(
        "hospital-price-analysis-db",
        help="Dolt database to write analysis to",
        required=True)
    hospital_price_analysis_db_branch = Parameter(
        "hospital-price-analysis-db-branch",
        help="Specify branch version",
        default="master",
    )
    historical_run_path = Parameter(
        "historical-run-path",
        help="Read the same data as a path to a previous run")

    @step
    def start(self):
        analysis_conf = DoltConfig(
            database=self.hospital_price_analysis_db,
            branch=self.hospital_price_analysis_db_branch)
        audit = Run(self.historical_run_path
                    ).data.dolt if self.historical_run_path else None

        with DoltDT(run=self, config=analysis_conf, audit=audit) as dolt:
            median_price_by_state = dolt.read("state_procedure_medians")
            variance_by_procedure = median_price_by_state.groupby(
                "code").var().reset_index()
            variance_by_procedure = variance_by_procedure[
                ~variance_by_procedure['code'].str.startswith('nan')]
            dolt.write(variance_by_procedure, "variance_by_procedure")

        self.next(self.end)

    @step
    def end(self):
        pass
class KmeansFlow(FlowSpec):

    num_docs = Parameter('num-docs', help='Number of documents', default=1000)

    @resources(memory=1000)
    @step
    def start(self):
        import scale_data
        scale_data.load_yelp_reviews(self.num_docs)
        self.next(self.end)

    @step
    def end(self):
        pass
class SFNTestFlow(FlowSpec):

    num = Parameter('num', help="Give a number", default=1)

    @step
    def start(self):
        print("The number defined as a parameter is", self.num)
        self.next(self.end)

    @step
    def end(self):
        print('done!')
class ClassifierPredictFlow(FlowSpec):

    vector = Parameter('vector', type=JSONType, required=True)

    @step
    def start(self):
        run = Flow('ClassifierTrainFlow').latest_run
        self.train_run_id = run.pathspec
        self.model = run['end'].task.data.model
        print("Input vector", self.vector)
        self.next(self.end)

    @step
    def end(self):
        print("Predicted class", self.model.predict([self.vector])[0])
class ParameterFlow(FlowSpec):

    animal = Parameter("creature", help="Specify an animal", required=True)
    count = Parameter("count", help="Number of animals", default=1)
    ratio = Parameter("ratio", help="Ratio between 0.0 and 1.0", type=float)

    @step
    def start(self):
        print(self.animal, "is a string of", len(self.animal), "characters")
        print("Count is an integer: %s+1=%s" % (self.count, self.count + 1))
        print("Ratio is a", type(self.ratio), "whose value is", self.ratio)
        self.next(self.end)

    @step
    def end(self):
        print("done!")
class S3BenchmarkFlow(FlowSpec):

    local_dir = Parameter('local_dir',
                          help='Read local files from this directory')
    num = Parameter('num_files',
                    help='maximum number of files to read',
                    default=50)

    @step
    def start(self):
        with S3() as s3:
            with profile('Loading and processing'):
                if self.local_dir:
                    files = [
                        os.path.join(self.local_dir, f)
                        for f in os.listdir(self.local_dir)
                    ][:self.num]
                else:
                    files = load_s3(s3, self.num)
                print("Reading %d objects" % len(files))
                stats = {}
                with profile('reading', stats_dict=stats):
                    size = sum(
                        parallel_map(lambda x: len(open(x, 'rb').read()),
                                     files)) / 1024**3
                read_gbps = (size * 8) / (stats['reading'] / 1000.)
                print("Read %2.fGB. Throughput: %2.1f Gb/s" %
                      (size, read_gbps))
        self.next(self.end)

    @step
    def end(self):
        pass
class JSONParameterFlow(FlowSpec):

    mapping = Parameter('mapping',
                        help="Specify a mapping",
                        default='{"some": "default"}',
                        type=JSONType)

    @step
    def start(self):
        for key, value in self.mapping.items():
            print('key', key, 'value', value)
        self.next(self.end)

    @step
    def end(self):
        print("done!")
class AuditDemo(FlowSpec):

    read_run_id = Parameter("read-run-id",
                            help="Pass a run_id for a VersioningDemo flow",
                            required=True)

    @step
    def start(self):
        audit = Run(f"VersioningDemo/{self.read_run_id}").data.dolt
        with DoltDT(run=self, audit=audit) as dolt:
            df = dolt.read("bar")
        self.next(self.end)

    @step
    def end(self):
        print(json.dumps(self.dolt, indent=4))
class CSVFileFlow(FlowSpec):

    data = IncludeFile('csv', help="CSV file to be parsed", is_text=True)
    delimiter = Parameter('delimiter', help="delimiter", default=',')

    @step
    def start(self):
        fileobj = StringIO(self.data)
        for i, row in enumerate(csv.reader(fileobj, delimiter=self.delimiter)):
            print("row %d: %s" % (i, row))
        self.next(self.end)

    @step
    def end(self):
        print('done!')
class TaxiDataLoader(FlowSpec):

    table = Parameter('table', help='Table name', default='nyc_taxi')

    @conda(python='3.8.10', libraries={'pyarrow': '5.0.0'})
    @step
    def start(self):
        import pyarrow.parquet as pq

        def make_key(obj):
            key = '%s/month=%s/%s' % tuple([self.table] + obj.key.split('/'))
            return key, obj.path

        def hive_field(f):
            return f.name, TYPES.get(str(f.type), str(f.type))

        with S3() as s3down:
            with profile('Downloading data'):
                loaded = list(map(make_key, s3down.get_recursive([URL])))
            table = pq.read_table(loaded[0][1])
            self.schema = dict(map(hive_field, table.schema))
            # Upload while the downloaded temp files still exist.
            with S3(run=self) as s3up:
                with profile('Uploading data'):
                    uploaded = s3up.put_files(loaded)
                key, url = uploaded[0]
                self.s3_prefix = url[:-(len(key) - len(self.table))]
        self.next(self.end)

    @conda(python='3.8.10', libraries={'awswrangler': '1.10.1'})
    @step
    def end(self):
        import awswrangler as wr
        try:
            wr.catalog.create_database(name=GLUE_DB)
        except:
            pass
        wr.athena.create_athena_bucket()
        with profile('Creating table'):
            wr.catalog.create_parquet_table(database=GLUE_DB,
                                            table=self.table,
                                            path=self.s3_prefix,
                                            columns_types=self.schema,
                                            partitions_types={'month': 'int'},
                                            mode='overwrite')
            wr.athena.repair_table(self.table, database=GLUE_DB)
class ManyKmeansFlow(FlowSpec):

    num_docs = Parameter('num-docs', help='Number of documents', default=1000000)

    @resources(memory=4000)
    @step
    def start(self):
        import scale_data
        docs = scale_data.load_yelp_reviews(self.num_docs)
        self.mtx, self.cols = scale_data.make_matrix(docs)
        self.k_params = list(range(5, 55, 5))
        self.next(self.train_kmeans, foreach='k_params')

    @resources(cpu=4, memory=4000)
    @step
    def train_kmeans(self):
        from sklearn.cluster import KMeans
        self.k = self.input
        with profile('k-means'):
            kmeans = KMeans(n_clusters=self.k, verbose=1, n_init=1)
            kmeans.fit(self.mtx)
        self.clusters = kmeans.labels_
        self.next(self.analyze)

    @step
    def analyze(self):
        from analyze_kmeans import top_words
        self.top = top_words(self.k, self.clusters, self.mtx, self.cols)
        self.next(self.join)

    @step
    def join(self, inputs):
        self.top = {inp.k: inp.top for inp in inputs}
        self.next(self.end)

    @step
    def end(self):
        pass