def encode(cls, table):
    from tensorflow.keras.layers import Hashing, IntegerLookup
    with profile('coordinates to grid'):
        grid = cls._coords_to_grid(table)
    # hash each grid cell into a fixed number of bins, then multi-hot
    # encode the bin indices as a sparse tensor
    hashing_trick = Hashing(NUM_HASH_BINS)
    multi_hot = IntegerLookup(vocabulary=list(range(NUM_HASH_BINS)),
                              output_mode='multi_hot',
                              sparse=True)
    with profile('creating tensor'):
        tensor = multi_hot(hashing_trick(grid))
    return {'tensor': tensor}
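# Every listing in this section wraps its hot spots in a `profile` context
# manager. Its implementation isn't shown here; the sketch below is an
# assumption that matches how it is used in these listings (prints the elapsed
# time of a block and optionally records it, in milliseconds, under the given
# label in stats_dict) -- it is not the original code:
from contextlib import contextmanager
import time

@contextmanager
def profile(label, stats_dict=None):
    start = time.time()
    try:
        yield
    finally:
        elapsed_ms = int((time.time() - start) * 1000)
        if stats_dict is not None:
            stats_dict[label] = elapsed_ms
        print('%s took %d ms' % (label, elapsed_ms))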
def train_kmeans(self):
    from sklearn.cluster import KMeans
    with profile('k-means'):
        kmeans = KMeans(n_clusters=10, verbose=1, n_init=1)
        kmeans.fit(self.mtx)
        self.clusters = kmeans.labels_
    self.next(self.end)
def eval(self):
    with profile("Evaluating: %s" % self.model_name):
        mod = MODELS[self.model_name]
        data_run = Flow('TaxiRegressionDataFlow')[self.data_run_id]
        model = mod.load_model(self.model)
        self.mse = mod.mse(model, data_run.data.test_data)
    self.next(self.join)
def load_csv(self):
    with profile('load_csv', stats_dict=self.stats):
        import csv
        with open('taxi.csv') as csvfile:
            # iterate over all rows without storing them; we only want to
            # measure how long parsing the CSV takes
            for row in csv.reader(csvfile):
                pass
    self.next(self.join)
def train_kmeans(self):
    from sklearn.cluster import KMeans
    self.k = self.input
    with profile('k-means'):
        kmeans = KMeans(n_clusters=self.k, verbose=1, n_init=1)
        kmeans.fit(self.mtx)
        self.clusters = kmeans.labels_
    self.next(self.analyze)
def train(self):
    self.model_name = self.input
    with profile('Training model: %s' % self.model_name):
        mod = MODELS[self.model_name]
        data_run = Flow('TaxiRegressionDataFlow')[self.data_run_id]
        model = mod.fit(data_run.data.train_data)
        self.model = mod.save_model(model)
    self.next(self.eval)
def load_s3(s3, num):
    files = list(s3.list_recursive([URL]))[:num]
    total_size = sum(f.size for f in files) / 1024**3  # bytes -> gigabytes
    stats = {}
    with profile('downloading', stats_dict=stats):
        loaded = s3.get_many([f.url for f in files])
    # stats['downloading'] holds elapsed milliseconds, so this yields
    # gigabits per second
    s3_gbps = (total_size * 8) / (stats['downloading'] / 1000.)
    print("S3->EC2 throughput: %2.1f Gb/s" % s3_gbps)
    return [obj.path for obj in loaded]
def athena_ctas(self, sql):
    import awswrangler as wr
    # derive a unique table name from the current pathspec so that
    # concurrent runs don't collide
    table = 'mf_ctas_%s' % current.pathspec.replace('/', '_')
    self.ctas = "CREATE TABLE %s AS %s" % (table, sql)
    with profile('Running query'):
        query = wr.athena.start_query_execution(self.ctas, database=GLUE_DB)
        output = wr.athena.wait_query(query)
        loc = output['ResultConfiguration']['OutputLocation']
    with S3() as s3:
        return [obj.url for obj in s3.list_recursive([loc + '/'])]
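# Hypothetical call site for the helper above -- the query is illustrative
# only, not from the original flow:
#
#   self.paths = self.athena_ctas('SELECT * FROM %s LIMIT 1000' % self.table)
#
# Athena CTAS writes its results as Parquet by default, so the returned URLs
# point at Parquet files that downstream steps can fetch with metaflow.S3.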
def start(self):
    import pyarrow.parquet as pq

    def make_key(obj):
        # map each downloaded object to a Hive-style partitioned key,
        # i.e. table/month=M/filename
        key = '%s/month=%s/%s' % tuple([self.table] + obj.key.split('/'))
        return key, obj.path

    def hive_field(f):
        return f.name, TYPES.get(str(f.type), str(f.type))

    with S3() as s3down:
        with profile('Downloading data'):
            loaded = list(map(make_key, s3down.get_recursive([URL])))
        # derive the Hive schema from the first Parquet file
        table = pq.read_table(loaded[0][1])
        self.schema = dict(map(hive_field, table.schema))
    with S3(run=self) as s3up:
        with profile('Uploading data'):
            uploaded = s3up.put_files(loaded)
        key, url = uploaded[0]
        # strip everything after the table name to get the common S3 prefix
        self.s3_prefix = url[:-(len(key) - len(self.table))]
    self.next(self.end)
def start(self):
    with S3() as s3:
        with profile('Loading and processing'):
            if self.local_dir:
                files = [
                    os.path.join(self.local_dir, f)
                    for f in os.listdir(self.local_dir)
                ][:self.num]
            else:
                files = load_s3(s3, self.num)
            print("Reading %d objects" % len(files))
            stats = {}
            with profile('reading', stats_dict=stats):
                # read every file in parallel, keeping only its size in
                # bytes, and convert the total to gigabytes
                size = sum(
                    parallel_map(lambda x: len(open(x, 'rb').read()),
                                 files)) / 1024**3
            read_gbps = (size * 8) / (stats['reading'] / 1000.)
            print("Read %2.1fGB. Throughput: %2.1f Gb/s" % (size, read_gbps))
    self.next(self.end)
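# `parallel_map` above is assumed to be Metaflow's fork-based helper
# (from metaflow import parallel_map), which works with lambdas. If it is
# unavailable, a thread-based stand-in with the same call shape is enough for
# this I/O-bound benchmark:
from concurrent.futures import ThreadPoolExecutor

def parallel_map_fallback(func, iterable):
    # threads avoid pickling the mapped function, so lambdas work here too
    with ThreadPoolExecutor() as pool:
        return list(pool.map(func, iterable))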
def end(self):
    import awswrangler as wr
    try:
        wr.catalog.create_database(name=GLUE_DB)
    except Exception:
        # the database already exists
        pass
    wr.athena.create_athena_bucket()
    with profile('Creating table'):
        wr.catalog.create_parquet_table(database=GLUE_DB,
                                        table=self.table,
                                        path=self.s3_prefix,
                                        columns_types=self.schema,
                                        partitions_types={'month': 'int'},
                                        mode='overwrite')
        wr.athena.repair_table(self.table, database=GLUE_DB)
def load_pandas(self):
    with profile('load_pandas', stats_dict=self.stats):
        import pandas as pd
        # load the file only to time the operation; the dataframe is unused
        df = pd.read_parquet('taxi.parquet')
    self.next(self.join)
def load_parquet(self):
    with profile('load_parquet', stats_dict=self.stats):
        import pyarrow.parquet as pq
        # load the file only to time the operation; the table is unused
        table = pq.read_table('taxi.parquet')
    self.next(self.join)
def compute_cooc(self):
    # pick the co-occurrence implementation selected by the `algo` parameter
    module = import_module('cooc_%s' % self.algo)
    with profile('Computing co-occurrences with the %s algorithm' % self.algo):
        self.cooc = module.compute_cooc(self.mtx, self.num_cpu)
    self.next(self.end)
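# The cooc_* modules themselves aren't shown here. A minimal single-process
# sketch of the interface they need to expose, assuming `mtx` is a
# scipy.sparse matrix with one row per document and one column per token (the
# real implementations, and their use of num_cpu, may differ):
def compute_cooc(mtx, num_cpu):
    # count, for every token pair (i, j), the number of rows in which both
    # tokens occur; num_cpu is accepted for interface parity but unused in
    # this single-process sketch
    binary = (mtx > 0).astype('int64')
    return (binary.T @ binary).toarray()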