Example No. 1
    def create_dataframe(self, s3_paths):
        from metaflow import parallel_map, S3
        import pandas

        def form_df(pth):
            try:
                df = pandas.read_csv(pth.path)
                print(f"Retrieved Df for Key {pth.key}")
                return df
            except Exception:
                print(f"Couldn't Extract Dataframe For {pth.key}")
                return None

        with S3(s3root=PROCESSED_CS_PATH) as s3:
            s3_objs = s3.get_many(s3_paths)
            print("Got the Data From S3. Now Concatenating")
            # final_dfs = parallel_map(lambda x: form_df(x),s3_objs)
            final_dfs = []
            for x in s3_objs:
                add_df = form_df(x)
                if add_df is not None:
                    final_dfs.append(add_df)
            # concat_df = pandas.concat(list(filter(lambda x: x is not None,final_dfs)))
            final_dfs = pandas.concat(final_dfs)

        save_file_name = 'cs-concat-data.csv'
        final_dfs.to_csv(save_file_name)
        s3_save_path = os.path.join(SAVE_PROCESSED_DATA_PATH,
                                    self.__class__.__name__)
        with S3(s3root=s3_save_path) as s3:
            print("Saving Concat DF")
            df_save_path = s3.put_files([(save_file_name, save_file_name)])[0]
        return df_save_path
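The commented-out lines above hint at a `parallel_map` version of the same loop. A minimal sketch of that variant, reusing the `form_df` helper and the constants from the example (assumed unchanged), with failed reads filtered out before the concat:

        with S3(s3root=PROCESSED_CS_PATH) as s3:
            s3_objs = s3.get_many(s3_paths)
            print("Got the Data From S3. Now Concatenating")
            # parallel_map fans the CSV parsing out over local processes.
            dfs = parallel_map(form_df, s3_objs)
            final_dfs = pandas.concat([df for df in dfs if df is not None])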
Example No. 2
    def extract_individual_chunk(self, s3_chunk_url):
        from metaflow import S3
        import io
        s = io.StringIO()
        csv_str = None
        with S3(s3root=S3_TAR_DATA_PATH) as s3:
            s3_obj = s3.get(s3_chunk_url)
            print(f"Extracted S3 Data {s3_obj.path}")
            ss_df = get_ctndf_from_gz(s3_obj.path)
            ss_df['num_out_ctn'] = ss_df['outCitations'].apply(
                lambda x: len(x))
            ss_df['num_in_ctn'] = ss_df['inCitations'].apply(lambda x: len(x))
            useful_df = ss_df[~ss_df.apply(lambda row: row['num_in_ctn'] == 0
                                           and row['num_out_ctn'] == 0,
                                           axis=1)]
            flat_in_ids = list(
                set([
                    item for sublist in useful_df['inCitations'].values
                    for item in sublist
                ]))
            flat_out_ids = list(
                set([
                    item for sublist in useful_df['outCitations'].values
                    for item in sublist
                ]))
            present_ids = list(set(useful_df['id']))
            useful_df.to_csv(s)
            csv_str = s.getvalue()
            print(f"Extracted UseFul Information {s3_obj.path}")
            citation_meta_object = dict(citation_ids=present_ids,
                                        in_citations=flat_in_ids,
                                        out_citations=flat_out_ids)
        print("Now Starting Uploading Of Parsed Data")
        tar_file_name = s3_chunk_url.split('/')[-1].split('.gz')[0]
        s3_save_path = os.path.join(PROCESSED_DATA_PATH,
                                    self.__class__.__name__, tar_file_name)
        with S3(s3root=s3_save_path) as s3:
            print("Saving Metadata")
            # Add the Citation File.
            df_save_path = s3.put('usefull_citations.csv', csv_str)
            print("DF Saved")
            meta_save_path = s3.put('citation_info.json',
                                    json.dumps(citation_meta_object))
            print(f"Saved Metadata {s3_obj.path}")
            return_object = dict(
                meta_save_path=meta_save_path,
                df_save_path=df_save_path,
                citation_meta_object=citation_meta_object,
            )

        return return_object
Example No. 3
 def process_features(self):
     from pyarrow.parquet import ParquetDataset
     with S3() as s3:
         objs = s3.get_many(self.input)
         table = ParquetDataset([obj.path for obj in objs]).read()
     self.shards = encoders.execute(table, self.sample)
     self.next(self.join_data)
    def train_model(self):
        from zipfile import ZipFile
        from io import BytesIO
        import random 
        import imagenet_pytorch
        import json
        with S3() as s3:
            zipped_dataset = s3.get(self.dataset_s3_path).blob
        # Create a directory for unpacking the dataset.
        random_hex = hex(random.randint(0, 16777215))
        self.dataset_final_path = script_path('dataset-'+random_hex)
        safe_mkdir(self.dataset_final_path)

        # Populate the dataset directory from the zip provided as the input dataset.
        zipdata = BytesIO()
        zipdata.write(zipped_dataset)
        dataset_zip_file = ZipFile(zipdata)
        # Set Arch here for parallel distributed training.
        self.arch = self.input
        # Extract the dataset 
        dataset_zip_file.extractall(self.dataset_final_path)
        print("Extracted Dataset. Now Training :",self.arch)
        self.dataset_final_path = os.path.join(self.dataset_final_path,self.zipped_dataset_name)
        results,model = imagenet_pytorch.start_training_session(self)
        model = model.to('cpu') # Save CPU based model state dict. 
        self.model = model.state_dict()
        # Need to save like this otherwise there are Pickle Serialisation problems. 
        self.epoch_histories = json.loads(json.dumps(results))
        self.next(self.join)
Example No. 5
    def classify_ontology(self,df,from_pth):
        import cso_classifier.classifier.classifier as CSO
        import json
        from metaflow import parallel_map, S3
        from functools import reduce
        import pandas
        model_identity_dict = dict()
        partition_name = from_pth.split('/')[-2]
        # json_df_data = json.loads(df.to_json(orient='records'))
        id_indxd_df = df[['id','paperAbstract','title']].set_index('id')
        da_js = json.loads(id_indxd_df.rename(columns={'paperAbstract':'abstract'}).to_json(orient='index'))
        ont_result = CSO.run_cso_classifier_batch_mode(da_js,workers=MAX_WORKERS,print_counter=self.print_counter)
        renamed_cols={
            "syntactic": "ontology_syntactic",
            "semantic": "ontology_semantic",
            "union": "ontology_union",
            "enhanced": "ontology_enhanced",
            "explanation": 'ontology_explanation'
        }
        ont_df = pandas.read_json(json.dumps(ont_result),orient='index').rename(columns=renamed_cols)
        ont_df.index.name = 'id'
        df = pandas.concat((df.set_index('id'),ont_df),axis=1)
        final_path = os.path.join(
            SAVE_PROCESSED_DATA_PATH,self.__class__.__name__,partition_name
        )
        with S3(s3root=final_path) as s3:
            rss = s3.put('ontology.json',json.dumps(ont_result))
            print(f"Ontology Saved At {rss}")

        return df
 def start(self):
     with S3(run=self, s3root="s3://ish-metaflow-hackday") as s3:
         print("Singular scoped puts")
         message = json.dumps({"message": "hello world!"})
         s3.put("sample_obj_1", message)
         s3.put("sample_obj_2", message)
     self.next(self.singular_access)
 def multiple_access(self):
     print("Multiple object access")
     many = {"first_key": "foo", "second_key": "bar"}
     with S3(s3root="s3://ish-metaflow-hackday") as s3:
         s3.put_many(many.items())
         objs = s3.get_many(["first_key", "second_key"])
         print(objs)
     self.next(self.recursive_listing)
Example No. 8
 def get_artifacts(cls, artifacts_to_prefetch):
     artifact_list = []
     from metaflow import S3
     with S3() as s3:
         for obj in s3.get_many(artifacts_to_prefetch):
             sha = obj.key.split('/')[-1]
             artifact_list.append((sha, cls.decode_gzip_data(obj.path)))
     return artifact_list
Example No. 9
 def start(self):
     import pyarrow.parquet as pq
     with S3() as s3:
         res = s3.get(URL)
         table = pq.read_table(res.path)
         os.rename(res.path, 'taxi.parquet')
     table.to_pandas().to_csv('taxi.csv')
     self.stats = {}
     self.next(self.load_csv, self.load_parquet, self.load_pandas)
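Note that `res.path` points at a temporary local file that Metaflow's S3 client deletes when the `with` block exits, which is why the rename happens inside the block. A hedged alternative sketch that only carries in-memory data out of the block (same `URL` constant assumed):

 def start(self):
     import pyarrow.parquet as pq
     with S3() as s3:
         res = s3.get(URL)
         # Read while the downloaded temp file still exists.
         table = pq.read_table(res.path)
     table.to_pandas().to_csv('taxi.csv')
     # ... rest of the step unchanged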
Example No. 10
 def start(self):
     self.features = list(FEATURES)
     print("Encoding features: %s" % ', '.join(FEATURES))
     with S3() as s3:
         self.shards = []
         for prefix in TEST + TRAIN:
             objs = s3.list_recursive([prefix])
             self.shards.append([obj.url for obj in objs])
     self.next(self.process_features, foreach='shards')
 def singular_access(self):
     with S3(run=self, s3root="s3://ish-metaflow-hackday") as s3:
         print("Singular scoped gets")
         s3obj_1 = s3.get("sample_obj_1")
         print("Object found at", s3obj_1.url)
         print("Message:", json.loads(s3obj_1.text))
         s3obj_2 = s3.get("sample_obj_2")
         print("Object found at", s3obj_2.url)
         print("Message:", json.loads(s3obj_2.text))
     self.next(self.multiple_access)
Example No. 12
    def preprocessing(self):
        import boto3
        from metaflow import S3
        import re
        import pandas as pd
        from nltk import tokenize
        import string
        import nltk
        from nltk.corpus import stopwords
        from smart_open import smart_open

        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('words')

        listed = []
        with smart_open('s3://inputbucket1221/input.txt', 'r') as s3_source:
            Line = s3_source.readline()

            while Line != '':
                Line1 = Line.split(".")
                for Sentence in Line1:
                    listed.append(Sentence)
                Line = s3_source.readline()

        L = []
        for x in listed:
            if len(x) > 5:
                L.append(x)

        df = pd.DataFrame()

        df['Text'] = L
        print(df['Text'])

        def remove_punct(text):
            text = "".join(
                [char for char in text if char not in string.punctuation])
            text = re.sub('[0-9]+', '', text)
            return text

        df['Textclean'] = df['Text'].apply(lambda x: remove_punct(x))
        df = df.dropna()

        f = open("processed.txt", "a")
        f.write(df['Textclean'].to_string())
        f.close()

        self.cleantext = df['Textclean']

        with S3(s3root='s3://outputbucket1221/') as s3:
            s3.put_files([('processed', 'processed.txt')])

        self.next(self.labelling)
    def labelling(self):
        import pandas as pd
        import boto3
        from metaflow import S3

        def create_sentiment_aws(row):
            """Uses AWS Comprehend to Create Sentiments on a DataFrame"""

            try:
                comprehend = boto3.client(service_name='comprehend',
                                          region_name="us-east-2")
                payload = comprehend.detect_sentiment(Text=row,
                                                      LanguageCode='en')
                sentiment = payload['Sentiment']
            except Exception:
                print("Size exceeded:  Fail")
                return None
            return sentiment

        def apply_sentiment_aws(df, column="text"):
            """Uses Pandas Apply to Create Sentiment Analysis"""
            df['Sentiment'] = df[column].apply(create_sentiment_aws)
            return df

        L_aws = []

        # Reuse a single Comprehend client instead of re-creating it per row;
        # self.cleantext is set in the preprocessing step above.
        comprehend = boto3.client(service_name='comprehend',
                                  region_name="us-east-1")
        for x in self.cleantext:
            comp_str = comprehend.detect_sentiment(Text=x, LanguageCode='en')
            if comp_str['Sentiment'] == 'POSITIVE':
                L_aws.append([x, 2])
            elif comp_str['Sentiment'] == 'NEGATIVE':
                L_aws.append([x, 0])
            elif comp_str['Sentiment'] == 'NEUTRAL':
                L_aws.append([x, 1])

        f_sentiment = pd.DataFrame(L_aws, columns=['Tweet', 'Score'])

        final_sentiment = pd.DataFrame({
            'label': f_sentiment['Score'],
            'tweet': f_sentiment['Tweet'].replace(r'\n', ' ', regex=True)
        })

        final_sentiment.to_csv('labelledtweets.tsv',
                               sep='\t',
                               index=False,
                               header=False)

        with S3(s3root='s3://sentstorage/') as s3:
            s3.put_files([('labelledtweets.tsv', 'labelledtweets.tsv')])

        self.next(self.end)
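The `apply_sentiment_aws` helper defined above is never called in this step. A hedged sketch of how it could replace the manual loop inside the step (assuming `self.cleantext` from the preprocessing step holds the cleaned sentences):

        df = pd.DataFrame({'tweet': list(self.cleantext)})
        # Adds a 'Sentiment' column via AWS Comprehend.
        df = apply_sentiment_aws(df, column='tweet')
        df['label'] = df['Sentiment'].map({'NEGATIVE': 0, 'NEUTRAL': 1, 'POSITIVE': 2})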
Example No. 14
 def athena_ctas(self, sql):
     import awswrangler as wr
     table = 'mf_ctas_%s' % current.pathspec.replace('/', '_')
     self.ctas = "CREATE TABLE %s AS %s" % (table, sql)
     with profile('Running query'):
         query = wr.athena.start_query_execution(self.ctas,
                                                 database=GLUE_DB)
         output = wr.athena.wait_query(query)
         loc = output['ResultConfiguration']['OutputLocation']
         with S3() as s3:
             return [obj.url for obj in s3.list_recursive([loc + '/'])]
Example No. 15
    def start(self):
        import pyarrow.parquet as pq

        def make_key(obj):
            key = '%s/month=%s/%s' % tuple([self.table] + obj.key.split('/'))
            return key, obj.path

        def hive_field(f):
            return f.name, TYPES.get(str(f.type), str(f.type))

        with S3() as s3down:
            with profile('Downloading data'):
                loaded = list(map(make_key, s3down.get_recursive([URL])))
            table = pq.read_table(loaded[0][1])
            self.schema = dict(map(hive_field, table.schema))
            with S3(run=self) as s3up:
                with profile('Uploading data'):
                    uploaded = s3up.put_files(loaded)
                key, url = uploaded[0]
                self.s3_prefix = url[:-(len(key) - len(self.table))]
        self.next(self.end)
def list_folders(base_path, s3_root=data_path, with_full_path=False):
    if base_path == '/': base_path = ''
    sync_path = os.path.join(s3_root, base_path)
    pths = []
    with S3(s3root=sync_path) as s3:
        for resp in s3.list_paths():
            if with_full_path:
                pths.append(os.path.join(sync_path, resp.key))
            else:
                pths.append(resp.key)

    return pths
def sync_folder_to_s3(root_path, base_path='', s3_root=s3_root):
    sync_path = os.path.join(s3_root, base_path)

    file_paths = [(os.path.normpath(os.path.join(r,
                                                 file)), os.path.join(r, file))
                  for r, d, f in os.walk(root_path) for file in f]

    with S3(s3root=s3_root) as s3:
        file_paths = s3.put_files(file_paths)

    sync_path = os.path.join(sync_path, os.path.normpath(root_path))
    return sync_path, file_paths
def sync_folder_from_bucket(bucket_path, folder_path):
    safe_mkdir(folder_path)
    with S3(s3root=bucket_path) as s3:
        for resp in s3.get_all():
            dir_path = os.path.join(
                folder_path,
                os.path.dirname(resp.key),
            )
            file_path = os.path.join(folder_path, resp.key)
            safe_mkdir(dir_path)
            print("Writing File To : %s", file_path)
            with open(file_path, 'wb+') as f:
                f.write(resp.blob)
    return folder_path
Example No. 19
 def preprocess_data(self):
     with S3() as s3:
         from pyarrow.parquet import ParquetDataset
         if self.input:
             objs = s3.get_many(self.input)
             orig_table = ParquetDataset([obj.path for obj in objs]).read()
             self.num_rows_before = orig_table.num_rows
             table = process_data(orig_table)
             self.num_rows_after = table.num_rows
             print('selected %d/%d rows'\
                   % (self.num_rows_after, self.num_rows_before))
             self.lat = table['pickup_latitude'].to_numpy()
             self.lon = table['pickup_longitude'].to_numpy()
     self.next(self.join)
Example No. 20
 def train_model(self):
     """
     Train a regression model and use the S3 client from Metaflow to store the model tar file.
     """
     # this is the current learning rate in the fan-out
     self.learning_rate = self.input
     import numpy as np
     import tensorflow as tf
     from tensorflow.keras import layers
     import tarfile
     import wandb
     from wandb.keras import WandbCallback
     # this name comes in handy later, as a naming convention for building the card
     wandb_run_name = '{}:{}-{}'.format(current.flow_name, current.run_id, self.learning_rate)
     wandb.init(project=current.flow_name, name=wandb_run_name)
     # build the model
     x_train = np.array([[_[0]] for _ in self.train_dataset])
     y_train = np.array([_[1] for _ in self.train_dataset])
     x_test = np.array([[_[0]] for _ in self.test_dataset])
     y_test = np.array([_[1] for _ in self.test_dataset])
     x_model = tf.keras.Sequential([layers.Dense(input_shape=[1,], units=1)])
     # print model summary to a string
     stringlist = []
     x_model.summary(print_fn=lambda x: stringlist.append(x))
     self.model_summary = "\n".join(stringlist)
     x_model.compile(
         optimizer=tf.optimizers.Adam(learning_rate=self.learning_rate),
         loss='mean_absolute_error', metrics=[tf.keras.metrics.MeanSquaredError()])
     history = x_model.fit(
         x_train,
         y_train,
         epochs=50,
         validation_split=0.2,
         callbacks=[WandbCallback()])
     self.hist = history.history
     self.results = x_model.evaluate(x_test, y_test)
     model_name = "regression-model-{}/1".format(self.learning_rate)
     local_tar_name = 'model-{}.tar.gz'.format(self.learning_rate)
     x_model.save(filepath=model_name)
     # zip keras folder to a single tar file
     with tarfile.open(local_tar_name, mode="w:gz") as _tar:
         _tar.add(model_name, recursive=True)
     with open(local_tar_name, "rb") as in_file:
         data = in_file.read()
         with S3(run=self) as s3:
             url = s3.put(local_tar_name, data)
             # save this path for downstream reference!
             self.s3_path = url
     # finally join with the other runs
     self.next(self.join_runs)
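A downstream step could pull the tarball back through `self.s3_path`, which is the full S3 URL returned by `put` above. A minimal sketch; the step name and extraction directory are illustrative, not part of the original flow:

 def load_model(self):
     import tarfile
     from metaflow import S3
     with S3() as s3:
         obj = s3.get(self.s3_path)  # full s3:// URL stored above
         with tarfile.open(obj.path) as _tar:
             _tar.extractall('loaded_model')  # hypothetical local directory
     self.next(self.end)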
Example No. 21
 def start(self):
     if self.use_ctas:
         self.paths = Flow('TaxiETLFlow').latest_run.data.paths
     else:
         with S3() as s3:
             objs = s3.list_recursive(URLS)
             self.paths = [obj.url for obj in objs]
     print("Processing %d Parquet files" % len(self.paths))
     n = max(round(len(self.paths) / NUM_SHARDS), 1)
     self.shards = [
         self.paths[i * n:(i + 1) * n] for i in range(NUM_SHARDS - 1)
     ]
     self.shards.append(self.paths[(NUM_SHARDS - 1) * n:])
     self.next(self.preprocess_data, foreach='shards')
Example No. 22
 def preprocess_data(self):
     from table_utils import filter_outliers, sample
     self.shard = None
     with S3() as s3:
         from pyarrow.parquet import ParquetDataset
         if self.input:
             objs = s3.get_many(self.input)
             table = ParquetDataset([obj.path for obj in objs]).read()
             table = sample(filter_outliers(table, FIELDS), self.sample)
             self.shard = {
                 field: table[field].to_numpy()
                 for field in FIELDS
             }
     self.next(self.join)
Example No. 23
    def save_json(self,
                  data_json,
                  tmp_pth='temp_save.json',
                  save_name='data.json'):
        from metaflow import S3
        import shutil
        final_path = os.path.join(SAVE_PROCESSED_DATA_PATH,
                                  self.__class__.__name__)
        with S3(s3root=final_path) as s3:
            print(f"Saving data_json To S3")
            with open(tmp_pth, 'w') as f:
                json.dump(data_json, f)
            put_pth = s3.put_files([(save_name, tmp_pth)])[0][1]

        return put_pth
Example No. 24
def upload_s3_folder(flow_spec: FlowSpec, s3_folder_name: str, local_path: str):
    """
    Upload file in a local path to Flow S3 Folder
    Parameters
    ----------
    flow_spec The running Flow
    s3_folder_name The S3 destination folder name
    local_path The local path to search for files

    """
    import os

    artifact_files = [(os.path.join(s3_folder_name, f), os.path.join(local_path, f)) for f in os.listdir(local_path)]
    with S3(run=flow_spec) as s3:
        s3.put_files(iter(artifact_files))
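A hedged usage sketch, called from inside a step of the running flow; the step and folder names below are illustrative:

    @step
    def save_outputs(self):
        # "reports" and "./output_reports" are illustrative names.
        upload_s3_folder(self, "reports", "./output_reports")
        self.next(self.end)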
    def extract_individual_chunk(self,s3_chunk_url):
        from metaflow import S3
        ss_df = None
        tar_file_name = s3_chunk_url.split('/')[-1].split('.gz')[0]
        cat_csv_name = f'category_citations-{tar_file_name}.csv'
        with S3(s3root=S3_TAR_DATA_PATH) as s3:
            s3_obj = s3.get(s3_chunk_url)
            print(f"Extracted S3 Data {s3_obj.path}")
            ss_df = get_ctndf_from_gz(s3_obj.path,categories=self.selected_categories)
            ss_df['num_out_ctn'] = ss_df['outCitations'].apply(lambda x: len(x))
            ss_df['num_in_ctn'] = ss_df['inCitations'].apply(lambda x: len(x))
            save_df = self.extract_cat_df(ss_df)
            print(f"Extracted UseFul Information {len(save_df)}")
            save_df.to_csv(cat_csv_name)
            
        print("Now Starting Uploading Of Parsed Data")
        s3_save_path = os.path.join(
            SAVE_PROCESSED_DATA_PATH,self.__class__.__name__,tar_file_name
        )
        with S3(s3root=s3_save_path) as s3:
            print("Saving Metadata")
            df_save_paths = s3.put_files([('category_citations.csv',
                                           cat_csv_name)])[0]
            print(f"Saved Metadata {s3_save_path}")
            return_object = dict(
                df_save_path = df_save_paths[1]
            )

        return return_object
 def save_data_df(self,df,from_pth):
     from metaflow import S3
     import shutil
     partition_name = from_pth.split('/')[-2]
     final_path = os.path.join(
         SAVE_PROCESSED_DATA_PATH,self.__class__.__name__,partition_name
     )
     with S3(s3root=final_path) as s3:
         print(f"Data Frame Saved To S3 With Mined Ontology AND RANK {partition_name}")
         temp_save_pth = f'{partition_name}-temp.csv'
         df.to_csv(temp_save_pth)
         s3_save_dest = s3.put_files([(
             'ontology_processed.csv',temp_save_pth
         )])[0][1]
         os.remove(temp_save_pth)
     return s3_save_dest
Example No. 27
 def save_graph(self,graph_json,tmp_pth = 'temp_save_graph.json',save_name='citation_network_graph.json'):
     from metaflow import S3
     import shutil
     final_path = os.path.join(
         SAVE_PROCESSED_DATA_PATH,self.__class__.__name__
     )
     print("ABOUT TO SAVE THE GRAPH !")
     with S3(s3root=final_path) as s3:
         print(f"Saving Graph To S3")
         with open(tmp_pth,'w') as f:
             json.dump(graph_json,f)
         put_pth = s3.put_files(
             [(save_name,tmp_pth)]
         )[0][1]
         
     return put_pth
    def load_main_csvs(self,s3_paths):
        from metaflow import S3
        import pandas

        def form_df(pth):
            try:
                df = pandas.read_csv(pth.path)
                print(f"Retrieved Df for Key {pth.key}")
                return df
            except Exception:
                print(f"Couldn't Extract Dataframe For {pth.key}")
                return None
        n = 0
        for pth in s3_paths:
            with S3(s3root=ONTOLOGY_CSV_PATH) as s3:
                s3_obj = s3.get(pth)
                df = form_df(s3_obj)
                if df is None:
                    continue
                yield (df,pth)
            n+=1
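This generator is presumably consumed together with the `classify_ontology` and `save_data_df` methods shown earlier; a hedged, hypothetical driver sketch:

    def process_all(self, s3_paths):
        # Hypothetical driver combining the helpers above.
        for df, pth in self.load_main_csvs(s3_paths):
            df = self.classify_ontology(df, pth)
            self.save_data_df(df, pth)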
Example No. 29
def create_s3_folder(flow_spec: FlowSpec, folder_name: str) -> str:
    """
    Create an S3 folder within the Flow
    Parameters
    ----------
    flow_spec The running Flow
    folder_name The folder name

    Returns
    -------
    Path S3 path

    """
    import os

    from metaflow import S3

    try:
        with S3(run=flow_spec) as s3:
            return os.path.split(s3.put(folder_name + "/.keep", "keep"))[0]
    except TypeError:
        return ""
Example No. 30
    def start(self):
        with S3() as s3:
            with profile('Loading and processing'):
                if self.local_dir:
                    files = [
                        os.path.join(self.local_dir, f)
                        for f in os.listdir(self.local_dir)
                    ][:self.num]
                else:
                    files = load_s3(s3, self.num)

                print("Reading %d objects" % len(files))
                stats = {}
                with profile('reading', stats_dict=stats):
                    size = sum(
                        parallel_map(lambda x: len(open(x, 'rb').read()),
                                     files)) / 1024**3

                read_gbps = (size * 8) / (stats['reading'] / 1000.)
                print("Read %2.fGB. Throughput: %2.1f Gb/s" %
                      (size, read_gbps))
        self.next(self.end)