# Imports reconstructed for the calls below. The project-local modules
# (common, datasets, experiments, fair_sampler, factorized_sampler,
# join_utils, prepare_utils) come from the surrounding NeuroCard-style
# codebase; `log` is assumed to be its glog logging handle, and JOB_jintao /
# JOB_MY are assumed to live in experiments alongside JOB_FULL.
import time
from itertools import combinations

import glog as log
import numpy as np
import pandas as pd
from torch.utils import data
from tqdm import tqdm

import common
import datasets
import experiments
import fair_sampler
import factorized_sampler
import join_utils
import prepare_utils
from experiments import JOB_jintao, JOB_MY
from factorized_sampler import FactorizedSamplerIterDataset


def main():
    config = experiments.JOB_FULL
    join_spec = join_utils.get_join_spec(config)
    prepare_utils.prepare(join_spec)

    # Load every IMDb table that participates in the join.
    loaded_tables = []
    for t in join_spec.join_tables:
        print('Loading', t)
        table = datasets.LoadImdb(t, use_cols=config["use_cols"])
        table.data.info()
        loaded_tables.append(table)

    # Build the factorized join sampler, then wrap it so sampled tuples are
    # emitted in factorized (sub-word) form.
    t_start = time.time()
    join_iter_dataset = FactorizedSamplerIterDataset(
        loaded_tables,
        join_spec,
        sample_batch_size=1000 * 100,
        disambiguate_column_names=True)
    table = common.ConcatTables(loaded_tables,
                                join_spec.join_keys,
                                sample_from_join_dataset=join_iter_dataset)
    join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
        join_iter_dataset,
        base_table=table,
        factorize_blacklist=[],
        word_size_bits=10,
        factorize_fanouts=True)
    t_end = time.time()
    log.info(f"> Initialization took {t_end - t_start} seconds.")

    # Smoke test: draw one batch from the underlying sampler.
    join_iter_dataset.join_iter_dataset._sample_batch()
    print('-' * 60)
    print("Done")
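# A minimal sketch (an addition, not part of the original file) of the other
# way the wrapped dataset gets used below: the generator functions pull one
# sampled join tuple per next() from the inner iterator.
def draw_tuples(join_iter_dataset, n=5):
    """Returns `n` tuples sampled from the wrapped join dataset."""
    return [next(join_iter_dataset.join_iter_dataset) for _ in range(n)]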
def generate_title_movie_companies(p):
    # Join definition for the title-movie_companies-company_name pattern.
    table2alias = {
        'title': 't',
        'movie_companies': 'mc',
        'company_name': 'cn',
    }
    join_tables = ['title', 'movie_companies', 'company_name']
    join_keys = {
        'title': ['id'],
        'movie_companies': ['movie_id', 'company_id'],
        'company_name': ['id'],
    }
    join_clauses = {
        'title': 'title.id=movie_companies.movie_id',
        'company_name': 'company_name.id=movie_companies.company_id',
    }
    # all_cols = {
    #     'title': ['title', 'kind_id', 'production_year', 'id2', 'id'],
    #     'movie_companies': ['company_type_id', 'company_id', 'movie_id'],
    #     'company_name': ['name', 'country_code', 'id'],
    # }
    config = JOB_jintao

    # Always include movie_companies, then key the output file by the sorted
    # aliases of the joined tables.
    p = p + ['movie_companies']
    key = '_'.join(sorted([table2alias[x] for x in p]))

    join_spec = join_utils.get_join_spec(config)
    prepare_utils.prepare(join_spec)

    loaded_tables = []
    for t in join_spec.join_tables:
        print('Loading', t)
        table = datasets.LoadImdb(t, use_cols=config["use_cols"])
        table.data.info()
        loaded_tables.append(table)

    t_start = time.time()
    join_iter_dataset = FactorizedSamplerIterDataset(
        loaded_tables,
        join_spec,
        sample_batch_size=51000 * 100,
        disambiguate_column_names=True)
    table = common.ConcatTables(loaded_tables,
                                join_spec.join_keys,
                                sample_from_join_dataset=join_iter_dataset)
    join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
        join_iter_dataset,
        base_table=table,
        factorize_blacklist=[],
        word_size_bits=10,
        factorize_fanouts=True)
    t_end = time.time()
    log.info(f"> Initialization took {t_end - t_start} seconds.")

    print(join_iter_dataset.join_iter_dataset.combined_columns)

    # Draw 5M tuples from the join sampler and dump them to CSV.
    samples = []
    for _ in tqdm(range(5000000)):
        samples.append(next(join_iter_dataset.join_iter_dataset))
    df = pd.DataFrame(data=pd.concat(samples, axis=1)).T
    df.to_csv('/home/jintao/{}.csv'.format(key), index=False)
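# Example driver (an addition; the argument list is illustrative): `p` names
# the tables to join besides movie_companies, which the function appends
# itself. Note the call draws 5M samples, so this is a long-running job.
def generate_example():
    generate_title_movie_companies(['title', 'company_name'])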
def MakeSamplerDatasetLoader(self, loaded_tables):
    assert self.sampler in ['fair_sampler',
                            'factorized_sampler'], self.sampler
    join_spec = join_utils.get_join_spec(self.__dict__)
    if self.sampler == 'fair_sampler':
        klass = fair_sampler.FairSamplerIterDataset
    else:
        klass = factorized_sampler.FactorizedSamplerIterDataset
    join_iter_dataset = klass(
        loaded_tables,
        join_spec,
        sample_batch_size=self.sampler_batch_size,
        disambiguate_column_names=True,
        # Only initialize the sampler if training.
        initialize_sampler=self.checkpoint_to_load is None,
        save_samples=self._save_samples,
        load_samples=self._load_samples)

    table = common.ConcatTables(loaded_tables,
                                self.join_keys,
                                sample_from_join_dataset=join_iter_dataset)
    if self.factorize:
        join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
            join_iter_dataset,
            base_table=table,
            factorize_blacklist=self.dmol_cols if self.num_dmol else
            self.factorize_blacklist if self.factorize_blacklist else [],
            word_size_bits=self.word_size_bits,
            factorize_fanouts=self.factorize_fanouts)

    loader = data.DataLoader(
        join_iter_dataset,
        batch_size=self.bs,
        num_workers=self.loader_workers,
        worker_init_fn=lambda worker_id: np.random.seed(
            np.random.get_state()[1][0] + worker_id),
        pin_memory=True)
    return join_spec, join_iter_dataset, loader, table
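# Standalone restatement (an addition) of the worker seeding pattern used in
# the DataLoader above: np.random.get_state()[1][0] is the first word of the
# parent process's RNG state, so adding worker_id gives every loader worker a
# distinct but reproducible seed.
def _worker_init_fn(worker_id):
    np.random.seed(np.random.get_state()[1][0] + worker_id)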
def main():
    table2alias = {
        'title': 't',
        'cast_info': 'ci',
        'movie_companies': 'mc',
        'movie_info': 'mi',
        'movie_info_idx': 'mi_idx',
        'movie_keyword': 'mk',
    }
    join_tables = [
        'title', 'cast_info', 'movie_companies', 'movie_info',
        'movie_info_idx', 'movie_keyword'
    ]
    join_keys = {
        'title': ['id'],
        'cast_info': ['movie_id'],
        'movie_companies': ['movie_id'],
        'movie_info': ['movie_id'],
        'movie_info_idx': ['movie_id'],
        'movie_keyword': ['movie_id'],
    }
    join_clauses = {
        'cast_info': 'title.id=cast_info.movie_id',
        'movie_companies': 'title.id=movie_companies.movie_id',
        'movie_info': 'title.id=movie_info.movie_id',
        'movie_info_idx': 'title.id=movie_info_idx.movie_id',
        'movie_keyword': 'title.id=movie_keyword.movie_id',
    }
    all_cols = {
        'title': [
            'kind_id', 'production_year', 'episode_nr', 'imdb_index',
            'phonetic_code', 'season_nr', 'series_years'
        ],
        'cast_info': ['nr_order', 'role_id'],
        'movie_companies': ['company_type_id'],
        'movie_info_idx': ['info_type_id'],
        'movie_info': ['info_type_id'],
        'movie_keyword': ['keyword_id'],
    }
    tables = [
        'cast_info', 'movie_companies', 'movie_info', 'movie_info_idx',
        'movie_keyword'
    ]

    # Enumerate every non-empty subset of the five fact tables; each subset
    # is joined with `title` and sampled into its own CSV.
    for num in range(1, 6):
        for p in combinations(tables, num):
            config = JOB_MY  # note: mutated in place on every iteration
            p = list(p)
            config['join_clauses'] = [join_clauses[t] for t in p]
            p = p + ['title']
            key = '_'.join(sorted([table2alias[x] for x in p]))
            config['join_tables'] = p
            config['join_keys'] = {t: join_keys[t] for t in p}
            col_num = sum(len(all_cols[t]) for t in p)

            join_spec = join_utils.get_join_spec(config)
            prepare_utils.prepare(join_spec)

            loaded_tables = []
            for t in join_spec.join_tables:
                print('Loading', t)
                table = datasets.LoadImdb(t, use_cols=config["use_cols"])
                table.data.info()
                loaded_tables.append(table)

            t_start = time.time()
            join_iter_dataset = FactorizedSamplerIterDataset(
                loaded_tables,
                join_spec,
                sample_batch_size=1000 * 100,
                disambiguate_column_names=True)
            table = common.ConcatTables(
                loaded_tables,
                join_spec.join_keys,
                sample_from_join_dataset=join_iter_dataset)
            join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
                join_iter_dataset,
                base_table=table,
                factorize_blacklist=[],
                word_size_bits=10,
                factorize_fanouts=True)
            t_end = time.time()
            log.info(f"> Initialization took {t_end - t_start} seconds.")

            print(join_iter_dataset.join_iter_dataset.combined_columns)

            # Draw 1M tuples and keep only the content columns (the first
            # `col_num`), dropping the appended fanout/virtual columns.
            samples = []
            for _ in tqdm(range(1000000)):
                samples.append(next(join_iter_dataset.join_iter_dataset))
            df = pd.DataFrame(
                data=pd.concat(samples, axis=1)).T.iloc[:, :col_num]
            df.to_csv('../train-test-data/join_samples/{}.csv'.format(key),
                      index=False)
            # join_iter_dataset.join_iter_dataset._sample_batch()
            print('-' * 60)
            print("Done {}".format(key))
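# Readback sketch (an addition; the path scheme mirrors the writer above):
# check that an emitted CSV kept exactly the subset's content columns.
def check_samples(key, expected_cols):
    df = pd.read_csv('../train-test-data/join_samples/{}.csv'.format(key))
    assert df.shape[1] == expected_cols, df.shape
    return df


if __name__ == '__main__':
    # Entry-point guard (an addition): run the subset enumeration above.
    main()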