def main():
    config = experiments.JOB_FULL
    join_spec = join_utils.get_join_spec(config)
    prepare_utils.prepare(join_spec)

    # Load every IMDb table referenced by the join spec.
    loaded_tables = []
    for t in join_spec.join_tables:
        print('Loading', t)
        table = datasets.LoadImdb(t, use_cols=config["use_cols"])
        table.data.info()
        loaded_tables.append(table)

    # Build the factorized join sampler, then wrap it so columns (including
    # fanouts) are factorized into sub-columns of at most `word_size_bits` bits.
    t_start = time.time()
    join_iter_dataset = FactorizedSamplerIterDataset(
        loaded_tables,
        join_spec,
        sample_batch_size=1000 * 100,
        disambiguate_column_names=True)

    table = common.ConcatTables(loaded_tables,
                                join_spec.join_keys,
                                sample_from_join_dataset=join_iter_dataset)

    join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
        join_iter_dataset,
        base_table=table,
        factorize_blacklist=[],
        word_size_bits=10,
        factorize_fanouts=True)
    t_end = time.time()
    log.info(f"> Initialization took {t_end - t_start} seconds.")

    # Draw one batch to exercise the sampler.
    join_iter_dataset.join_iter_dataset._sample_batch()
    print('-' * 60)
    print("Done")
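# Hedged usage sketch, not part of the original file: assuming this script has
# no entry-point guard elsewhere, running it directly would simply call main()
# once, which builds the sampler and draws a single warm-up batch.
if __name__ == '__main__':
    main()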
def generate_title_movie_companies(p):
    table2alias = {'title': 't', 'movie_companies': 'mc', 'company_name': 'cn'}
    join_tables = ['title', 'movie_companies', 'company_name']
    join_keys = {
        'title': ['id'],
        'movie_companies': ['movie_id', 'company_id'],
        'company_name': ['id'],
    }
    join_clauses = {
        'title': 'title.id=movie_companies.movie_id',
        'company_name': 'company_name.id=movie_companies.company_id',
    }
    # all_cols = {
    #     'title': ['title', 'kind_id', 'production_year', 'id2', 'id'],
    #     'movie_companies': ['company_type_id', 'company_id', 'movie_id'],
    #     'company_name': ['name', 'country_code', 'id'],
    # }
    config = JOB_jintao
    p = p + ['movie_companies']
    key = '_'.join(sorted([table2alias[x] for x in p]))

    join_spec = join_utils.get_join_spec(config)
    prepare_utils.prepare(join_spec)

    # Load the IMDb tables referenced by the join spec.
    loaded_tables = []
    for t in join_spec.join_tables:
        print('Loading', t)
        table = datasets.LoadImdb(t, use_cols=config["use_cols"])
        table.data.info()
        loaded_tables.append(table)

    # Build the factorized join sampler.
    t_start = time.time()
    join_iter_dataset = FactorizedSamplerIterDataset(
        loaded_tables,
        join_spec,
        sample_batch_size=51000 * 100,
        disambiguate_column_names=True)

    table = common.ConcatTables(loaded_tables,
                                join_spec.join_keys,
                                sample_from_join_dataset=join_iter_dataset)

    join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
        join_iter_dataset,
        base_table=table,
        factorize_blacklist=[],
        word_size_bits=10,
        factorize_fanouts=True)
    t_end = time.time()
    log.info(f"> Initialization took {t_end - t_start} seconds.")
    print(join_iter_dataset.join_iter_dataset.combined_columns)

    # Draw samples one tuple at a time and dump them to CSV.
    samples = []
    for i in tqdm(range(5000000)):
        samples.append(next(join_iter_dataset.join_iter_dataset))
    df = pd.DataFrame(data=pd.concat(samples, axis=1)).T
    df.to_csv('/home/jintao/{}.csv'.format(key), index=False)
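# Hedged usage sketch, not in the original file: the argument is the list of
# tables to join besides movie_companies (which the function appends itself).
# For p = ['title', 'company_name'] the sorted-alias key works out to
# 'cn_mc_t', so the samples would land in /home/jintao/cn_mc_t.csv.
#
# generate_title_movie_companies(['title', 'company_name'])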
def MakeSamplerDatasetLoader(self, loaded_tables):
    assert self.sampler in ['fair_sampler',
                            'factorized_sampler'], self.sampler
    join_spec = join_utils.get_join_spec(self.__dict__)
    if self.sampler == 'fair_sampler':
        klass = fair_sampler.FairSamplerIterDataset
    else:
        klass = factorized_sampler.FactorizedSamplerIterDataset
    join_iter_dataset = klass(
        loaded_tables,
        join_spec,
        sample_batch_size=self.sampler_batch_size,
        disambiguate_column_names=True,
        # Only initialize the sampler if training.
        initialize_sampler=self.checkpoint_to_load is None,
        save_samples=self._save_samples,
        load_samples=self._load_samples)

    table = common.ConcatTables(loaded_tables,
                                self.join_keys,
                                sample_from_join_dataset=join_iter_dataset)
    if self.factorize:
        join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
            join_iter_dataset,
            base_table=table,
            factorize_blacklist=self.dmol_cols if self.num_dmol else
            self.factorize_blacklist if self.factorize_blacklist else [],
            word_size_bits=self.word_size_bits,
            factorize_fanouts=self.factorize_fanouts)

    loader = data.DataLoader(join_iter_dataset,
                             batch_size=self.bs,
                             num_workers=self.loader_workers,
                             worker_init_fn=lambda worker_id: np.random.seed(
                                 np.random.get_state()[1][0] + worker_id),
                             pin_memory=True)
    return join_spec, join_iter_dataset, loader, table
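# Hedged usage sketch, not in the original file: MakeSamplerDatasetLoader is a
# method, so `self` is assumed to expose the hyperparameters it reads
# (sampler, sampler_batch_size, checkpoint_to_load, factorize, word_size_bits,
# bs, loader_workers, ...). A training loop would consume the returned
# DataLoader roughly like this:
#
# join_spec, join_iter_dataset, loader, table = self.MakeSamplerDatasetLoader(
#     loaded_tables)
# for step, batch in enumerate(loader):
#     ...  # batch holds encoded join samples, one row per sampled tuple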
def MakeQueries(spark, cursor, num_queries, tables_in_templates, table_names,
                join_keys, rng):
    """Sample a tuple from actual join result then place filters."""
    spark.catalog.clearCache()

    # TODO: this assumes single equiv class.
    join_items = list(join_keys.items())
    lhs = join_items[0][1]
    join_clauses_list = []
    for rhs in join_items[1:]:
        rhs = rhs[1]
        join_clauses_list.append('{} = {}'.format(lhs, rhs))
        lhs = rhs
    join_clauses = '\n AND '.join(join_clauses_list)

    # Take only the content columns.
    content_cols = []
    categoricals = []
    numericals = []
    for table_name in table_names:
        categorical_cols = datasets.TPC_DS.CATEGORICAL_COLUMNS[table_name]
        for c in categorical_cols:
            disambiguated_name = common.JoinTableAndColumnNames(table_name,
                                                                c,
                                                                sep='.')
            content_cols.append(disambiguated_name)
            categoricals.append(disambiguated_name)

        range_cols = datasets.TPC_DS.RANGE_COLUMNS[table_name]
        for c in range_cols:
            disambiguated_name = common.JoinTableAndColumnNames(table_name,
                                                                c,
                                                                sep='.')
            content_cols.append(disambiguated_name)
            numericals.append(disambiguated_name)

    # Build a concat table representing the join result schema.
    join_keys_list = [join_keys[n] for n in table_names]
    join_spec = join_utils.get_join_spec({
        "join_tables": table_names,
        "join_keys": dict(
            zip(table_names, [[k.split(".")[1]] for k in join_keys_list])),
        "join_root": "item",
        "join_how": "inner",
    })
    ds = FactorizedSamplerIterDataset(tables_in_templates,
                                      join_spec,
                                      sample_batch_size=num_queries,
                                      disambiguate_column_names=False,
                                      add_full_join_indicators=False,
                                      add_full_join_fanouts=False)
    concat_table = common.ConcatTables(tables_in_templates,
                                       join_keys_list,
                                       sample_from_join_dataset=ds)

    template_for_execution = template.Template(
        textwrap.dedent("""
        SELECT COUNT(*)
        FROM ${', '.join(table_names)}
        WHERE ${join_clauses}
        AND ${filter_clauses};
        """).strip())

    true_inner_join_card = ds.sampler.join_card
    true_full_join_card = TDS_LIGHT_OUTER_CARDINALITY
    print('True inner join card', true_inner_join_card, 'true full',
          true_full_join_card)

    ncols = len(content_cols)
    queries = []
    filter_strings = []
    sql_queries = []  # To get true cardinalities.
    while len(queries) < num_queries:
        sampled_df = ds.sampler.run()[content_cols]
        for r in sampled_df.iterrows():
            tup = r[1]
            num_filters = rng.randint(FLAGS.min_filters,
                                      max(ncols // 2, FLAGS.max_filters))

            # Positions where the values are non-null.
            non_null_indices = np.argwhere(~pd.isnull(tup).values).reshape(-1,)
            if len(non_null_indices) < num_filters:
                continue
            print('{} filters out of {} content cols'.format(
                num_filters, ncols))

            # Place {'<=', '>=', '='} on numericals and '=' on categoricals.
            idxs = rng.choice(non_null_indices,
                              replace=False,
                              size=num_filters)
            vals = tup[idxs].values
            cols = np.take(content_cols, idxs)
            ops = rng.choice(['<=', '>=', '='], size=num_filters)
            sensible_to_do_range = [c in numericals for c in cols]
            ops = np.where(sensible_to_do_range, ops, '=')
            print('cols', cols, 'ops', ops, 'vals', vals)

            queries.append((cols, ops, vals))
            filter_strings.append(','.join([
                ','.join((c, o, str(v))) for c, o, v in zip(cols, ops, vals)
            ]))

            # Quote string literals & leave other literals alone.
            filter_clauses = '\n AND '.join([
                '{} {} {}'.format(col, op, val)
                if concat_table[col].data.dtype in [np.int64, np.float64] else
                '{} {} \'{}\''.format(col, op, val)
                for col, op, val in zip(cols, ops, vals)
            ])

            sql = template_for_execution.render(table_names=table_names,
                                                join_clauses=join_clauses,
                                                filter_clauses=filter_clauses)
            sql_queries.append(sql)

            if len(queries) >= num_queries:
                break

    true_cards = []
    for i, sql_query in enumerate(sql_queries):
        DropBufferCache()
        spark.catalog.clearCache()
        print(' Query', i, 'out of', len(sql_queries),
              '[{}]'.format(filter_strings[i]),
              end='')
        t1 = time.time()
        true_card = ExecuteSql(spark, sql_query)[0][0]
        # cursor.execute(sql_query)
        # result = cursor.fetchall()
        # true_card = result[0][0]
        dur = time.time() - t1
        true_cards.append(true_card)
        print(
            '...done: {} (inner join sel {}; full sel {}; inner join {}); dur {:.1f}s'
            .format(true_card, true_card / true_inner_join_card,
                    true_card / true_full_join_card, true_inner_join_card,
                    dur))
        # if i > 0 and i % 1 == 0:
        #     spark = StartSpark(spark)

    df = pd.DataFrame({
        'tables': [','.join(table_names)] * len(true_cards),
        'join_conds': [
            ','.join(map(lambda s: s.replace(' ', ''), join_clauses_list))
        ] * len(true_cards),
        'filters': filter_strings,
        'true_cards': true_cards,
    })
    df.to_csv(FLAGS.output_csv, sep='#', mode='a', index=False, header=False)
    print('Template done.')
    return queries, true_cards
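# Illustration only, not emitted by the original code: with the TPC-DS join
# root 'item' and, say, two sampled filters, the rendered Mako template
# produces SQL of roughly this shape (string literals quoted, numeric
# literals left bare):
#
#   SELECT COUNT(*)
#   FROM item, store_sales
#   WHERE item.i_item_sk = store_sales.ss_item_sk
#   AND item.i_color = 'red' AND store_sales.ss_quantity <= 42;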
def main():
    table2alias = {
        'title': 't',
        'cast_info': 'ci',
        'movie_companies': 'mc',
        'movie_info': 'mi',
        'movie_info_idx': 'mi_idx',
        'movie_keyword': 'mk'
    }
    join_tables = [
        'title', 'cast_info', 'movie_companies', 'movie_info',
        'movie_info_idx', 'movie_keyword'
    ]
    join_keys = {
        'title': ['id'],
        'cast_info': ['movie_id'],
        'movie_companies': ['movie_id'],
        'movie_info': ['movie_id'],
        'movie_info_idx': ['movie_id'],
        'movie_keyword': ['movie_id']
    }
    join_clauses = {
        'cast_info': 'title.id=cast_info.movie_id',
        'movie_companies': 'title.id=movie_companies.movie_id',
        'movie_info': 'title.id=movie_info.movie_id',
        'movie_info_idx': 'title.id=movie_info_idx.movie_id',
        'movie_keyword': 'title.id=movie_keyword.movie_id'
    }
    all_cols = {
        'title': [
            'kind_id', 'production_year', 'episode_nr', 'imdb_index',
            'phonetic_code', 'season_nr', 'series_years'
        ],
        'cast_info': ['nr_order', 'role_id'],
        'movie_companies': ['company_type_id'],
        'movie_info_idx': ['info_type_id'],
        'movie_info': ['info_type_id'],
        'movie_keyword': ['keyword_id']
    }
    tables = [
        'cast_info', 'movie_companies', 'movie_info', 'movie_info_idx',
        'movie_keyword'
    ]

    # Enumerate every non-empty subset of the non-title tables; each subset
    # plus `title` defines one join to sample from.
    for num in range(1, 6):
        for p in combinations(tables, num):
            config = JOB_MY
            config['join_clauses'] = []
            p = [x for x in p]
            for t in p:
                config['join_clauses'].append(join_clauses[t])
            p = p + ['title']
            key = '_'.join(sorted([table2alias[x] for x in p]))
            config['join_tables'] = p
            config['join_keys'] = {}
            for t in p:
                config['join_keys'][t] = join_keys[t]
            col_num = 0
            for t in p:
                col_num += len(all_cols[t])

            join_spec = join_utils.get_join_spec(config)
            prepare_utils.prepare(join_spec)

            # Load the IMDb tables referenced by the join spec.
            loaded_tables = []
            for t in join_spec.join_tables:
                print('Loading', t)
                table = datasets.LoadImdb(t, use_cols=config["use_cols"])
                table.data.info()
                loaded_tables.append(table)

            t_start = time.time()
            join_iter_dataset = FactorizedSamplerIterDataset(
                loaded_tables,
                join_spec,
                sample_batch_size=1000 * 100,
                disambiguate_column_names=True)

            table = common.ConcatTables(
                loaded_tables,
                join_spec.join_keys,
                sample_from_join_dataset=join_iter_dataset)

            join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
                join_iter_dataset,
                base_table=table,
                factorize_blacklist=[],
                word_size_bits=10,
                factorize_fanouts=True)
            t_end = time.time()
            log.info(f"> Initialization took {t_end - t_start} seconds.")
            print(join_iter_dataset.join_iter_dataset.combined_columns)

            # Draw samples and keep only this subset's content columns.
            samples = []
            for i in tqdm(range(1000000)):
                samples.append(next(join_iter_dataset.join_iter_dataset))
            df = pd.DataFrame(
                data=pd.concat(samples, axis=1)).T.iloc[:, :col_num]
            df.to_csv('../train-test-data/join_samples/{}.csv'.format(key),
                      index=False)
            # join_iter_dataset.join_iter_dataset._sample_batch()
            print('-' * 60)
            print("Done {}".format(key))
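# Illustration, derived from the aliases above rather than from running the
# script: the (subset, key) pairs the nested loop produces include
#   ('cast_info',)                    -> 'ci_t'    -> ci_t.csv
#   ('movie_companies',)              -> 'mc_t'    -> mc_t.csv
#   ('cast_info', 'movie_companies')  -> 'ci_mc_t' -> ci_mc_t.csv
# with each CSV holding the first col_num content columns of the drawn samples.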