Example 1
 def encode(cls, table):
     from tensorflow.keras.layers import Hashing, IntegerLookup
     with profile('coordinates to grid'):
         grid = cls._coords_to_grid(table)
     hashing_trick = Hashing(NUM_HASH_BINS)
     multi_hot = IntegerLookup(vocabulary=list(range(NUM_HASH_BINS)),
                               output_mode='multi_hot',
                               sparse=True)
     with profile('creating tensor'):
         tensor = multi_hot(hashing_trick(grid))
     return {'tensor': tensor}
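All of the examples rely on a profile context manager that times a block of work. The helper itself is not shown here; a minimal sketch consistent with how the examples use it (it labels the timed block and, when a stats_dict is passed, records the elapsed milliseconds under that label, as Examples 4, 7, and 10 assume) could look like this:

from contextlib import contextmanager
import time

@contextmanager
def profile(label, stats_dict=None):
    # Hypothetical stand-in for the helper the examples import from elsewhere.
    start = time.time()
    yield
    elapsed_ms = int((time.time() - start) * 1000)
    if stats_dict is not None:
        # Examples 7 and 10 divide these values by 1000 to get seconds,
        # so milliseconds are assumed here.
        stats_dict[label] = elapsed_ms
    print("%s: %d ms" % (label, elapsed_ms))

The real helper may report results differently; the essential contract is the label-keyed timing dictionary used for the throughput calculations in Examples 7 and 10.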
Example 2
 def train_kmeans(self):
     from sklearn.cluster import KMeans
     with profile('k-means'):
         kmeans = KMeans(n_clusters=10, verbose=1, n_init=1)
         kmeans.fit(self.mtx)
     self.clusters = kmeans.labels_
     self.next(self.end)
Example 3
 def eval(self):
     with profile("Evaluating: %s" % self.model_name):
         mod = MODELS[self.model_name]
         data_run = Flow('TaxiRegressionDataFlow')[self.data_run_id]
         model = mod.load_model(self.model)
         self.mse = mod.mse(model, data_run.data.test_data)
     self.next(self.join)
Example 4
 def load_csv(self):
     with profile('load_csv', stats_dict=self.stats):
         import csv
         with open('taxi.csv') as csvfile:
             for row in csv.reader(csvfile):
                 pass
     self.next(self.join)
Example 5
 def train_kmeans(self):
     from sklearn.cluster import KMeans
     self.k = self.input
     with profile('k-means'):
         kmeans = KMeans(n_clusters=self.k, verbose=1, n_init=1)
         kmeans.fit(self.mtx)
     self.clusters = kmeans.labels_
     self.next(self.analyze)
Example 6
 def train(self):
     self.model_name = self.input
     with profile('Training model: %s' % self.model_name):
         mod = MODELS[self.model_name]
         data_run = Flow('TaxiRegressionDataFlow')[self.data_run_id]
         model = mod.fit(data_run.data.train_data)
         self.model = mod.save_model(model)
     self.next(self.eval)
Example 7
def load_s3(s3, num):
    files = list(s3.list_recursive([URL]))[:num]
    total_size = sum(f.size for f in files) / 1024**3
    stats = {}
    with profile('downloading', stats_dict=stats):
        loaded = s3.get_many([f.url for f in files])

    s3_gbps = (total_size * 8) / (stats['downloading'] / 1000.)
    print("S3->EC2 throughput: %2.1f Gb/s" % s3_gbps)
    return [obj.path for obj in loaded]
Example 8
 def athena_ctas(self, sql):
     import awswrangler as wr
     table = 'mf_ctas_%s' % current.pathspec.replace('/', '_')
     self.ctas = "CREATE TABLE %s AS %s" % (table, sql)
     with profile('Running query'):
         query = wr.athena.start_query_execution(self.ctas,
                                                 database=GLUE_DB)
         output = wr.athena.wait_query(query)
         loc = output['ResultConfiguration']['OutputLocation']
         with S3() as s3:
             return [obj.url for obj in s3.list_recursive([loc + '/'])]
Example 9
    def start(self):
        import pyarrow.parquet as pq

        def make_key(obj):
            key = '%s/month=%s/%s' % tuple([self.table] + obj.key.split('/'))
            return key, obj.path

        def hive_field(f):
            return f.name, TYPES.get(str(f.type), str(f.type))

        with S3() as s3down:
            with profile('Downloading data'):
                loaded = list(map(make_key, s3down.get_recursive([URL])))
            table = pq.read_table(loaded[0][1])
            self.schema = dict(map(hive_field, table.schema))
            with S3(run=self) as s3up:
                with profile('Uploading data'):
                    uploaded = s3up.put_files(loaded)
                key, url = uploaded[0]
                self.s3_prefix = url[:-(len(key) - len(self.table))]
        self.next(self.end)
Example 10
    def start(self):
        with S3() as s3:
            with profile('Loading and processing'):
                if self.local_dir:
                    files = [
                        os.path.join(self.local_dir, f)
                        for f in os.listdir(self.local_dir)
                    ][:self.num]
                else:
                    files = load_s3(s3, self.num)

                print("Reading %d objects" % len(files))
                stats = {}
                with profile('reading', stats_dict=stats):
                    size = sum(
                        parallel_map(lambda x: len(open(x, 'rb').read()),
                                     files)) / 1024**3

                read_gbps = (size * 8) / (stats['reading'] / 1000.)
                print("Read %2.fGB. Throughput: %2.1f Gb/s" %
                      (size, read_gbps))
        self.next(self.end)
Example 11
 def end(self):
     import awswrangler as wr
     try:
         wr.catalog.create_database(name=GLUE_DB)
     except Exception:
         # the database may already exist
         pass
     wr.athena.create_athena_bucket()
     with profile('Creating table'):
         wr.catalog.create_parquet_table(database=GLUE_DB,
                                         table=self.table,
                                         path=self.s3_prefix,
                                         columns_types=self.schema,
                                         partitions_types={'month': 'int'},
                                         mode='overwrite')
         wr.athena.repair_table(self.table, database=GLUE_DB)
Example 12
 def load_pandas(self):
     with profile('load_pandas', stats_dict=self.stats):
         import pandas as pd
         df = pd.read_parquet('taxi.parquet')
     self.next(self.join)
Example 13
 def load_parquet(self):
     with profile('load_parquet', stats_dict=self.stats):
         import pyarrow.parquet as pq
         table = pq.read_table('taxi.parquet')
     self.next(self.join)
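Examples 4, 12, and 13 time three alternative loaders and write the results into self.stats. In a Metaflow flow, these branches would typically meet in a join step that merges and reports the collected timings. That step is not shown above; a hypothetical sketch, assuming each branch stored elapsed milliseconds keyed by label, might be:

 def join(self, inputs):
     # Merge the label-keyed timing dicts produced by the branches.
     self.stats = {}
     for inp in inputs:
         self.stats.update(inp.stats)
     # Report the loaders from fastest to slowest (values assumed to be milliseconds).
     for label, ms in sorted(self.stats.items(), key=lambda kv: kv[1]):
         print('%s took %d ms' % (label, ms))
     self.next(self.end)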
Example 14
 def compute_cooc(self):
     from importlib import import_module
     module = import_module('cooc_%s' % self.algo)
     with profile('Computing co-occurrences with the %s algorithm' %
                  self.algo):
         self.cooc = module.compute_cooc(self.mtx, self.num_cpu)
     self.next(self.end)