def create_dataframe(self, s3_paths):
    from metaflow import parallel_map, S3
    import pandas

    def form_df(pth):
        try:
            df = pandas.read_csv(pth.path)
            print(f"Retrieved Df for Key {pth.key}")
            return df
        except Exception:
            print(f"Couldn't Extract Dataframe For {pth.key}")
            return None

    with S3(s3root=PROCESSED_CS_PATH) as s3:
        s3_objs = s3.get_many(s3_paths)
        print("Got the Data From S3. Now Concatenating")
        # Alternative: final_dfs = parallel_map(form_df, s3_objs)
        final_dfs = []
        for x in s3_objs:
            add_df = form_df(x)
            if add_df is not None:
                final_dfs.append(add_df)

    final_dfs = pandas.concat(final_dfs)
    save_file_name = 'cs-concat-data.csv'
    final_dfs.to_csv(save_file_name)

    s3_save_path = os.path.join(SAVE_PROCESSED_DATA_PATH,
                                self.__class__.__name__)
    with S3(s3root=s3_save_path) as s3:
        print("Saving Concat DF")
        # put_files returns a list of (key, url) tuples for the uploaded files.
        df_save_path = s3.put_files([(save_file_name, save_file_name)])[0]
    return df_save_path
def extract_individual_chunk(self, s3_chunk_url):
    from metaflow import S3
    import io

    s = io.StringIO()
    csv_str = None
    with S3(s3root=S3_TAR_DATA_PATH) as s3:
        s3_obj = s3.get(s3_chunk_url)
        print(f"Extracted S3 Data {s3_obj.path}")
        ss_df = get_ctndf_from_gz(s3_obj.path)
        ss_df['num_out_ctn'] = ss_df['outCitations'].apply(lambda x: len(x))
        ss_df['num_in_ctn'] = ss_df['inCitations'].apply(lambda x: len(x))
        # Drop papers that have neither inbound nor outbound citations.
        useful_df = ss_df[~ss_df.apply(
            lambda row: row['num_in_ctn'] == 0 and row['num_out_ctn'] == 0,
            axis=1)]
        flat_in_ids = list(
            set([
                item for sublist in useful_df['inCitations'].values
                for item in sublist
            ]))
        flat_out_ids = list(
            set([
                item for sublist in useful_df['outCitations'].values
                for item in sublist
            ]))
        present_ids = list(set(useful_df['id']))
        useful_df.to_csv(s)
        csv_str = s.getvalue()
        print(f"Extracted Useful Information {s3_obj.path}")

    citation_meta_object = dict(citation_ids=present_ids,
                                in_citations=flat_in_ids,
                                out_citations=flat_out_ids)
    print("Now Starting Uploading Of Parsed Data")
    tar_file_name = s3_chunk_url.split('/')[-1].split('.gz')[0]
    s3_save_path = os.path.join(PROCESSED_DATA_PATH,
                                self.__class__.__name__, tar_file_name)
    with S3(s3root=s3_save_path) as s3:
        print("Saving Metadata")
        # Add the citation CSV.
        df_save_path = s3.put('usefull_citations.csv', csv_str)
        print("DF Saved")
        meta_save_path = s3.put('citation_info.json',
                                json.dumps(citation_meta_object))
        print(f"Saved Metadata {s3_obj.path}")

    return_object = dict(
        meta_save_path=meta_save_path,
        df_save_path=df_save_path,
        citation_meta_object=citation_meta_object,
    )
    return return_object
def process_features(self):
    from pyarrow.parquet import ParquetDataset
    with S3() as s3:
        objs = s3.get_many(self.input)
        table = ParquetDataset([obj.path for obj in objs]).read()
        self.shards = encoders.execute(table, self.sample)
    self.next(self.join_data)
def train_model(self):
    from zipfile import ZipFile
    from io import BytesIO
    import random
    import imagenet_pytorch
    import json

    with S3() as s3:
        zipped_dataset = s3.get(self.dataset_s3_path).blob

    # Create a uniquely named directory for unpacking the dataset.
    random_hex = str(hex(random.randint(0, 16777215)))
    self.dataset_final_path = script_path('dataset-' + random_hex)
    safe_mkdir(self.dataset_final_path)

    # Rebuild the zip archive in memory from the downloaded bytes.
    zipdata = BytesIO()
    zipdata.write(zipped_dataset)
    dataset_zip_file = ZipFile(zipdata)

    # Set the architecture here for parallel distributed training.
    self.arch = self.input

    # Extract the dataset.
    dataset_zip_file.extractall(self.dataset_final_path)
    print("Extracted Dataset. Now Training :", self.arch)
    self.dataset_final_path = os.path.join(self.dataset_final_path,
                                           self.zipped_dataset_name)

    results, model = imagenet_pytorch.start_training_session(self)
    model = model.to('cpu')
    # Save a CPU-based model state dict; saving the model object directly
    # causes pickle serialisation problems.
    self.model = model.state_dict()
    self.epoch_histories = json.loads(json.dumps(results))
    self.next(self.join)
def classify_ontology(self, df, from_pth):
    import cso_classifier.classifier.classifier as CSO
    import json
    from metaflow import S3
    import pandas

    partition_name = from_pth.split('/')[-2]
    # Index by paper id and rename the abstract column to what the CSO
    # classifier expects.
    id_indxd_df = df[['id', 'paperAbstract', 'title']].set_index('id')
    da_js = json.loads(
        id_indxd_df.rename(columns={
            'paperAbstract': 'abstract'
        }).to_json(orient='index'))
    ont_result = CSO.run_cso_classifier_batch_mode(
        da_js, workers=MAX_WORKERS, print_counter=self.print_counter)
    renamed_cols = {
        "syntactic": "ontology_syntactic",
        "semantic": "ontology_semantic",
        "union": "ontology_union",
        "enhanced": "ontology_enhanced",
        "explanation": "ontology_explanation",
    }
    ont_df = pandas.read_json(json.dumps(ont_result),
                              orient='index').rename(columns=renamed_cols)
    ont_df.index.name = 'id'
    df = pandas.concat((df.set_index('id'), ont_df), axis=1)

    final_path = os.path.join(SAVE_PROCESSED_DATA_PATH,
                              self.__class__.__name__, partition_name)
    with S3(s3root=final_path) as s3:
        rss = s3.put('ontology.json', json.dumps(ont_result))
        print(f"Ontology Saved At {rss}")
    return df
def start(self):
    with S3(run=self, s3root="s3://ish-metaflow-hackday") as s3:
        print("Singular scoped puts")
        message = json.dumps({"message": "hello world!"})
        s3.put("sample_obj_1", message)
        s3.put("sample_obj_2", message)
    self.next(self.singular_access)
def multiple_access(self):
    print("Multiple object access")
    many = {"first_key": "foo", "second_key": "bar"}
    with S3(s3root="s3://ish-metaflow-hackday") as s3:
        s3.put_many(many.items())
        objs = s3.get_many(["first_key", "second_key"])
        print(objs)
    self.next(self.recursive_listing)
def get_artifacts(cls, artifacts_to_prefetch):
    artifact_list = []
    from metaflow import S3
    with S3() as s3:
        for obj in s3.get_many(artifacts_to_prefetch):
            # Keys are content-addressed; the last path component is the sha.
            sha = obj.key.split('/')[-1]
            artifact_list.append((sha, cls.decode_gzip_data(obj.path)))
    return artifact_list
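# Hypothetical usage sketch (not from the original source). It assumes a
# class `ArtifactCache` exposing the `get_artifacts` classmethod above along
# with a matching `decode_gzip_data` helper; the S3 URLs are placeholders.
#
#   urls = [
#       "s3://example-datastore/MyFlow/data/ab/abcd1234",  # placeholder URL
#       "s3://example-datastore/MyFlow/data/cd/cdef5678",  # placeholder URL
#   ]
#   artifacts = dict(ArtifactCache.get_artifacts(urls))  # {sha: decoded blob}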
def start(self):
    import pyarrow.parquet as pq
    with S3() as s3:
        res = s3.get(URL)
        table = pq.read_table(res.path)
        # Move the downloaded temp file to a stable local name before the
        # S3 context cleans up its temporary directory.
        os.rename(res.path, 'taxi.parquet')
    table.to_pandas().to_csv('taxi.csv')
    self.stats = {}
    self.next(self.load_csv, self.load_parquet, self.load_pandas)
def start(self):
    self.features = list(FEATURES)
    print("Encoding features: %s" % ', '.join(FEATURES))
    with S3() as s3:
        self.shards = []
        for prefix in TEST + TRAIN:
            objs = s3.list_recursive([prefix])
            self.shards.append([obj.url for obj in objs])
    self.next(self.process_features, foreach='shards')
def singular_access(self):
    with S3(run=self, s3root="s3://ish-metaflow-hackday") as s3:
        print("Singular scoped gets")
        s3obj_1 = s3.get("sample_obj_1")
        print("Object found at", s3obj_1.url)
        print("Message:", json.loads(s3obj_1.text))
        s3obj_2 = s3.get("sample_obj_2")
        print("Object found at", s3obj_2.url)
        print("Message:", json.loads(s3obj_2.text))
    self.next(self.multiple_access)
def preprocessing(self):
    import re
    import string
    import pandas as pd
    import nltk
    from metaflow import S3
    from smart_open import smart_open

    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('words')

    # Read the raw input text from S3 and split each line into sentences.
    listed = []
    with smart_open('s3://inputbucket1221/input.txt', 'r') as s3_source:
        line = s3_source.readline()
        while line != '':
            for sentence in line.split("."):
                listed.append(sentence)
            line = s3_source.readline()

    # Keep only sentences longer than five characters.
    L = [sentence for sentence in listed if len(sentence) > 5]

    df = pd.DataFrame()
    df['Text'] = L
    print(df['Text'])

    def remove_punct(text):
        text = "".join(
            [char for char in text if char not in string.punctuation])
        text = re.sub('[0-9]+', '', text)
        return text

    df['Textclean'] = df['Text'].apply(lambda x: remove_punct(x))
    df = df.dropna()

    with open("processed.txt", "a") as f:
        f.write(df['Textclean'].to_string())

    self.cleantext = df['Textclean']
    with S3(s3root='s3://outputbucket1221/') as s3:
        s3.put_files([('processed', 'processed.txt')])
    self.next(self.labelling)
def labelling(self):
    import pandas as pd
    import boto3
    from metaflow import S3

    def create_sentiment_aws(row):
        """Uses AWS Comprehend to create sentiments on a DataFrame."""
        try:
            comprehend = boto3.client(service_name='comprehend',
                                      region_name="us-east-2")
            payload = comprehend.detect_sentiment(Text=row,
                                                  LanguageCode='en')
            sentiment = payload['Sentiment']
        except Exception:
            print("Size exceeded: Fail")
            return None
        return sentiment

    def apply_sentiment_aws(df, column="text"):
        """Uses pandas apply to create sentiment analysis."""
        df['Sentiment'] = df[column].apply(create_sentiment_aws)
        return df

    # Label each tweet with AWS Comprehend: 2 = positive, 1 = neutral, 0 = negative.
    L_aws = []
    for x in self.clean_df['tweet']:
        comprehend = boto3.client(service_name='comprehend',
                                  region_name="us-east-1")
        comp_str = comprehend.detect_sentiment(Text=x, LanguageCode='en')
        if comp_str['Sentiment'] == 'POSITIVE':
            L_aws.append([x, 2])
        elif comp_str['Sentiment'] == 'NEGATIVE':
            L_aws.append([x, 0])
        elif comp_str['Sentiment'] == 'NEUTRAL':
            L_aws.append([x, 1])

    f_sentiment = pd.DataFrame(L_aws, columns=['Tweet', 'Score'])
    final_sentiment = pd.DataFrame({
        'label': f_sentiment['Score'],
        'tweet': f_sentiment['Tweet'].replace(r'\n', ' ', regex=True)
    })
    final_sentiment.to_csv('labelledtweets.tsv',
                           sep='\t',
                           index=False,
                           header=False)
    with S3(s3root='s3://sentstorage/') as s3:
        s3.put_files([('labelledtweets.tsv', 'labelledtweets.tsv')])
    self.next(self.end)
def athena_ctas(self, sql):
    import awswrangler as wr
    table = 'mf_ctas_%s' % current.pathspec.replace('/', '_')
    self.ctas = "CREATE TABLE %s AS %s" % (table, sql)
    with profile('Running query'):
        query = wr.athena.start_query_execution(self.ctas, database=GLUE_DB)
        output = wr.athena.wait_query(query)
        loc = output['ResultConfiguration']['OutputLocation']
    with S3() as s3:
        return [obj.url for obj in s3.list_recursive([loc + '/'])]
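# Hypothetical usage sketch (not from the original source): a step could
# materialize a filtered table with the CTAS helper above and fan out over
# the resulting Parquet files. The SQL and step names are illustrative only.
#
#   @step
#   def start(self):
#       self.paths = self.athena_ctas(
#           "SELECT * FROM nyc_taxi WHERE fare_amount > 0")  # placeholder SQL
#       self.next(self.preprocess_data, foreach='paths')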
def start(self):
    import pyarrow.parquet as pq

    def make_key(obj):
        key = '%s/month=%s/%s' % tuple([self.table] + obj.key.split('/'))
        return key, obj.path

    def hive_field(f):
        return f.name, TYPES.get(str(f.type), str(f.type))

    with S3() as s3down:
        with profile('Downloading data'):
            loaded = list(map(make_key, s3down.get_recursive([URL])))
        table = pq.read_table(loaded[0][1])
        self.schema = dict(map(hive_field, table.schema))
        # Upload while the downloaded temp files still exist inside the
        # s3down context.
        with S3(run=self) as s3up:
            with profile('Uploading data'):
                uploaded = s3up.put_files(loaded)
            key, url = uploaded[0]
            self.s3_prefix = url[:-(len(key) - len(self.table))]
    self.next(self.end)
def list_folders(base_path, s3_root=data_path, with_full_path=False):
    if base_path == '/':
        base_path = ''
    sync_path = os.path.join(s3_root, base_path)
    pths = []
    with S3(s3root=sync_path) as s3:
        for resp in s3.list_paths():
            if with_full_path:
                pths.append(os.path.join(sync_path, resp.key))
            else:
                pths.append(resp.key)
    return pths
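# Hypothetical usage sketch (not from the original source): print the
# top-level "folders" under the configured data root. Assumes the
# module-level `data_path` prefix above points at an accessible S3 location
# and that AWS credentials are available to Metaflow's S3 client.
def print_available_partitions():
    for pth in list_folders('/', with_full_path=True):
        print(pth)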
def sync_folder_to_s3(root_path, base_path='', s3_root=s3_root):
    sync_path = os.path.join(s3_root, base_path)
    # Collect (key, local_path) pairs for every file under root_path.
    file_paths = [(os.path.normpath(os.path.join(r, file)),
                   os.path.join(r, file)) for r, d, f in os.walk(root_path)
                  for file in f]
    with S3(s3root=s3_root) as s3:
        file_paths = s3.put_files(file_paths)
    sync_path = os.path.join(sync_path, os.path.normpath(root_path))
    return sync_path, file_paths
def sync_folder_from_bucket(bucket_path, folder_path):
    safe_mkdir(folder_path)
    with S3(s3root=bucket_path) as s3:
        for resp in s3.get_all():
            dir_path = os.path.join(
                folder_path,
                os.path.dirname(resp.key),
            )
            file_path = os.path.join(folder_path, resp.key)
            safe_mkdir(dir_path)
            print("Writing File To : %s" % file_path)
            with open(file_path, 'wb+') as f:
                f.write(resp.blob)
    return folder_path
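# Hypothetical usage sketch (not from the original source): mirror an S3
# prefix into a local scratch directory before processing. The bucket URL and
# local directory below are illustrative placeholders.
def download_citation_data():
    local_dir = sync_folder_from_bucket(
        's3://my-example-bucket/processed-citations/',  # placeholder prefix
        './local_citation_data',
    )
    print("Synced bucket contents to", local_dir)
    return local_dir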
def preprocess_data(self):
    with S3() as s3:
        from pyarrow.parquet import ParquetDataset
        if self.input:
            objs = s3.get_many(self.input)
            orig_table = ParquetDataset([obj.path for obj in objs]).read()
            self.num_rows_before = orig_table.num_rows
            table = process_data(orig_table)
            self.num_rows_after = table.num_rows
            print('selected %d/%d rows'
                  % (self.num_rows_after, self.num_rows_before))
            self.lat = table['pickup_latitude'].to_numpy()
            self.lon = table['pickup_longitude'].to_numpy()
    self.next(self.join)
def train_model(self):
    """
    Train a regression model and use the S3 client from Metaflow
    to store the model tar file.
    """
    # this is the current learning rate in the fan-out
    self.learning_rate = self.input
    import numpy as np
    import tensorflow as tf
    from tensorflow.keras import layers
    import tarfile
    import wandb
    from wandb.keras import WandbCallback

    # this name comes in handy later, as a naming convention for building the card
    wandb_run_name = '{}:{}-{}'.format(current.flow_name, current.run_id,
                                       self.learning_rate)
    wandb.init(project=current.flow_name, name=wandb_run_name)

    # build the model
    x_train = np.array([[_[0]] for _ in self.train_dataset])
    y_train = np.array([_[1] for _ in self.train_dataset])
    x_test = np.array([[_[0]] for _ in self.test_dataset])
    y_test = np.array([_[1] for _ in self.test_dataset])
    x_model = tf.keras.Sequential([layers.Dense(input_shape=[1, ], units=1)])

    # print model summary to a string
    stringlist = []
    x_model.summary(print_fn=lambda x: stringlist.append(x))
    self.model_summary = "\n".join(stringlist)

    x_model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=self.learning_rate),
        loss='mean_absolute_error',
        metrics=[tf.keras.metrics.MeanSquaredError()])
    history = x_model.fit(x_train,
                          y_train,
                          epochs=50,
                          validation_split=0.2,
                          callbacks=[WandbCallback()])
    self.hist = history.history
    self.results = x_model.evaluate(x_test, y_test)

    model_name = "regression-model-{}/1".format(self.learning_rate)
    local_tar_name = 'model-{}.tar.gz'.format(self.learning_rate)
    x_model.save(filepath=model_name)
    # zip the keras folder into a single tar file
    with tarfile.open(local_tar_name, mode="w:gz") as _tar:
        _tar.add(model_name, recursive=True)
    with open(local_tar_name, "rb") as in_file:
        data = in_file.read()
        with S3(run=self) as s3:
            url = s3.put(local_tar_name, data)
            # save this path for downstream reference!
            self.s3_path = url
    # finally join with the other runs
    self.next(self.join_runs)
def start(self):
    if self.use_ctas:
        self.paths = Flow('TaxiETLFlow').latest_run.data.paths
    else:
        with S3() as s3:
            objs = s3.list_recursive(URLS)
            self.paths = [obj.url for obj in objs]
    print("Processing %d Parquet files" % len(self.paths))
    n = max(round(len(self.paths) / NUM_SHARDS), 1)
    self.shards = [
        self.paths[i * n:(i + 1) * n] for i in range(NUM_SHARDS - 1)
    ]
    self.shards.append(self.paths[(NUM_SHARDS - 1) * n:])
    self.next(self.preprocess_data, foreach='shards')
def preprocess_data(self):
    from table_utils import filter_outliers, sample
    self.shard = None
    with S3() as s3:
        from pyarrow.parquet import ParquetDataset
        if self.input:
            objs = s3.get_many(self.input)
            table = ParquetDataset([obj.path for obj in objs]).read()
            table = sample(filter_outliers(table, FIELDS), self.sample)
            self.shard = {
                field: table[field].to_numpy()
                for field in FIELDS
            }
    self.next(self.join)
def save_json(self, data_json, tmp_pth='temp_save.json',
              save_name='data.json'):
    from metaflow import S3
    final_path = os.path.join(SAVE_PROCESSED_DATA_PATH,
                              self.__class__.__name__)
    with S3(s3root=final_path) as s3:
        print("Saving data_json To S3")
        with open(tmp_pth, 'w') as f:
            json.dump(data_json, f)
        # put_files returns (key, url) tuples; return the uploaded URL.
        put_pth = s3.put_files([(save_name, tmp_pth)])[0][1]
    return put_pth
def upload_s3_folder(flow_spec: FlowSpec, s3_folder_name: str,
                     local_path: str):
    """
    Upload the files in a local path to a folder in the Flow's S3 storage.

    Parameters
    ----------
    flow_spec
        The running Flow
    s3_folder_name
        The S3 destination folder name
    local_path
        The local path to search for files
    """
    import os
    artifact_files = [(os.path.join(s3_folder_name, f),
                       os.path.join(local_path, f))
                      for f in os.listdir(local_path)]
    with S3(run=flow_spec) as s3:
        s3.put_files(iter(artifact_files))
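# Hypothetical usage sketch (not from the original source): inside a flow
# step, push everything written to a local output directory into the run's
# S3 storage. The step, folder name, and local path are illustrative only.
#
#   @step
#   def save_outputs(self):
#       upload_s3_folder(self, "model_artifacts", "./outputs")
#       self.next(self.end)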
def extract_individual_chunk(self, s3_chunk_url):
    from metaflow import S3

    tar_file_name = s3_chunk_url.split('/')[-1].split('.gz')[0]
    cat_csv_name = f'category_citations-{tar_file_name}.csv'
    with S3(s3root=S3_TAR_DATA_PATH) as s3:
        s3_obj = s3.get(s3_chunk_url)
        print(f"Extracted S3 Data {s3_obj.path}")
        ss_df = get_ctndf_from_gz(s3_obj.path,
                                  categories=self.selected_categories)
        ss_df['num_out_ctn'] = ss_df['outCitations'].apply(lambda x: len(x))
        ss_df['num_in_ctn'] = ss_df['inCitations'].apply(lambda x: len(x))
        save_df = self.extract_cat_df(ss_df)
        print(f"Extracted Useful Information {len(save_df)}")
        save_df.to_csv(cat_csv_name)

    print("Now Starting Uploading Of Parsed Data")
    s3_save_path = os.path.join(SAVE_PROCESSED_DATA_PATH,
                                self.__class__.__name__, tar_file_name)
    with S3(s3root=s3_save_path) as s3:
        print("Saving Metadata")
        # put_files returns (key, url) tuples for the uploaded files.
        df_save_paths = s3.put_files([('category_citations.csv',
                                       cat_csv_name)])[0]
        print(f"Saved Metadata {s3_save_path}")

    return_object = dict(df_save_path=df_save_paths[1])
    return return_object
def save_data_df(self, df, from_pth):
    from metaflow import S3
    partition_name = from_pth.split('/')[-2]
    final_path = os.path.join(SAVE_PROCESSED_DATA_PATH,
                              self.__class__.__name__, partition_name)
    with S3(s3root=final_path) as s3:
        print(f"Data Frame Saved To S3 With Mined Ontology And Rank {partition_name}")
        temp_save_pth = f'{partition_name}-temp.csv'
        df.to_csv(temp_save_pth)
        # put_files returns (key, url) tuples; keep the uploaded URL.
        s3_save_dest = s3.put_files([('ontology_processed.csv',
                                      temp_save_pth)])[0][1]
    os.remove(temp_save_pth)
    return s3_save_dest
def save_graph(self, graph_json, tmp_pth='temp_save_graph.json',
               save_name='citation_network_graph.json'):
    from metaflow import S3
    final_path = os.path.join(SAVE_PROCESSED_DATA_PATH,
                              self.__class__.__name__)
    print("ABOUT TO SAVE THE GRAPH !")
    with S3(s3root=final_path) as s3:
        print("Saving Graph To S3")
        with open(tmp_pth, 'w') as f:
            json.dump(graph_json, f)
        put_pth = s3.put_files([(save_name, tmp_pth)])[0][1]
    return put_pth
def load_main_csvs(self, s3_paths):
    from metaflow import S3
    import pandas

    def form_df(pth):
        try:
            df = pandas.read_csv(pth.path)
            print(f"Retrieved Df for Key {pth.key}")
            return df
        except Exception:
            print(f"Couldn't Extract Dataframe For {pth.key}")
            return None

    n = 0
    for pth in s3_paths:
        with S3(s3root=ONTOLOGY_CSV_PATH) as s3:
            s3_obj = s3.get(pth)
            df = form_df(s3_obj)
            if df is None:
                continue
            yield (df, pth)
            n += 1
def create_s3_folder(flow_spec: FlowSpec, folder_name: str) -> str:
    """
    Create an S3 folder within the Flow.

    Parameters
    ----------
    flow_spec
        The running Flow
    folder_name
        The folder name

    Returns
    -------
    Path
        S3 path
    """
    import os
    from metaflow import S3
    try:
        with S3(run=flow_spec) as s3:
            return os.path.split(s3.put(folder_name + "/.keep", "keep"))[0]
    except TypeError:
        return ""
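# Hypothetical usage sketch (not from the original source): reserve a
# run-scoped folder at the start of a flow and keep its S3 path as an
# artifact so later steps can write into it. Names are illustrative only.
#
#   @step
#   def start(self):
#       self.model_folder = create_s3_folder(self, "models")
#       self.next(self.train)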
def start(self):
    with S3() as s3:
        with profile('Loading and processing'):
            if self.local_dir:
                files = [
                    os.path.join(self.local_dir, f)
                    for f in os.listdir(self.local_dir)
                ][:self.num]
            else:
                files = load_s3(s3, self.num)
            print("Reading %d objects" % len(files))
            stats = {}
            with profile('reading', stats_dict=stats):
                size = sum(
                    parallel_map(lambda x: len(open(x, 'rb').read()),
                                 files)) / 1024**3
            read_gbps = (size * 8) / (stats['reading'] / 1000.)
            print("Read %2.fGB. Throughput: %2.1f Gb/s" % (size, read_gbps))
    self.next(self.end)