def main():
    """Build the factorized join sampler for ``experiments.JOB_FULL``.

    Loads every table named in the join spec, wires up the sampler,
    the concatenated table and the factorizing wrapper, logs how long
    that initialization took, and finally calls ``_sample_batch()``
    once on the underlying sampler.
    """
    config = experiments.JOB_FULL
    join_spec = join_utils.get_join_spec(config)
    prepare_utils.prepare(join_spec)

    def _load(name):
        # Announce, load, and dump the frame summary for one table.
        print('Loading', name)
        loaded = datasets.LoadImdb(name, use_cols=config["use_cols"])
        loaded.data.info()
        return loaded

    loaded_tables = [_load(name) for name in join_spec.join_tables]

    started = time.time()
    sampler = FactorizedSamplerIterDataset(
        loaded_tables,
        join_spec,
        sample_batch_size=100_000,
        disambiguate_column_names=True,
    )
    concat_table = common.ConcatTables(
        loaded_tables,
        join_spec.join_keys,
        sample_from_join_dataset=sampler,
    )
    factorized = common.FactorizedSampleFromJoinIterDataset(
        sampler,
        base_table=concat_table,
        factorize_blacklist=[],
        word_size_bits=10,
        factorize_fanouts=True,
    )
    elapsed = time.time() - started
    log.info(f"> Initialization took {elapsed} seconds.")

    # Draw a single batch directly from the wrapped sampler.
    factorized.join_iter_dataset._sample_batch()
    print('-' * 60)
    print("Done")
Exemple #2
0
def generate_title_movie_companies(p, num_samples=5000000,
                                   output_dir='/home/jintao'):
    """Draw tuples from the ``JOB_jintao`` join sampler and save them as CSV.

    The join itself is defined entirely by the ``JOB_jintao`` config; ``p``
    only determines the name of the output file.

    Args:
        p: Iterable of table names (keys of the local alias map: 'title',
            'movie_companies', 'company_name'). 'movie_companies' is
            appended before the file name is derived.
        num_samples: Number of tuples to pull from the sampler.
        output_dir: Directory the ``<key>.csv`` file is written to, where
            ``key`` is the sorted table aliases joined with '_'.

    Raises:
        ValueError: If ``num_samples`` is not positive (there would be
            nothing to concatenate).
        KeyError: If ``p`` contains a table without a known alias.
    """
    if num_samples <= 0:
        # Fail before the expensive table loading / sampler setup.
        raise ValueError('num_samples must be positive, got {}'.format(num_samples))

    table2alias = {'title': 't', 'movie_companies': 'mc', 'company_name': 'cn'}

    config = JOB_jintao
    # list(p) so that tuples (e.g. from itertools.combinations) work too.
    tables_for_key = list(p) + ['movie_companies']
    key = '_'.join(sorted(table2alias[x] for x in tables_for_key))

    join_spec = join_utils.get_join_spec(config)
    prepare_utils.prepare(join_spec)
    loaded_tables = []
    for t in join_spec.join_tables:
        print('Loading', t)
        table = datasets.LoadImdb(t, use_cols=config["use_cols"])
        table.data.info()
        loaded_tables.append(table)

    t_start = time.time()
    join_iter_dataset = FactorizedSamplerIterDataset(
        loaded_tables,
        join_spec,
        sample_batch_size=51000 * 100,
        disambiguate_column_names=True)
    table = common.ConcatTables(loaded_tables,
                                join_spec.join_keys,
                                sample_from_join_dataset=join_iter_dataset)

    join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
        join_iter_dataset,
        base_table=table,
        factorize_blacklist=[],
        word_size_bits=10,
        factorize_fanouts=True)
    t_end = time.time()
    log.info(f"> Initialization took {t_end - t_start} seconds.")

    # Samples are taken from the wrapped (un-factorized) sampler.
    raw_sampler = join_iter_dataset.join_iter_dataset
    print(raw_sampler.combined_columns)
    samples = [next(raw_sampler) for _ in tqdm(range(num_samples))]

    # Each sample becomes one column of the concat; transpose to get
    # one row per sample.
    df = pd.DataFrame(data=pd.concat(samples, axis=1)).T
    df.to_csv('{}/{}.csv'.format(output_dir, key), index=False)
Exemple #3
0
    def MakeSamplerDatasetLoader(self, loaded_tables):
        """Create the join sampler dataset and a DataLoader over it.

        Args:
            loaded_tables: The tables handed to the sampler and to
                ``common.ConcatTables``.

        Returns:
            A tuple ``(join_spec, join_iter_dataset, loader, table)``.
            ``join_iter_dataset`` is the factorizing wrapper when
            ``self.factorize`` is set, otherwise the raw sampler dataset.
        """
        assert self.sampler in ('fair_sampler',
                                'factorized_sampler'), self.sampler
        join_spec = join_utils.get_join_spec(self.__dict__)

        klass = (fair_sampler.FairSamplerIterDataset
                 if self.sampler == 'fair_sampler' else
                 factorized_sampler.FactorizedSamplerIterDataset)

        sampler_kwargs = {
            'sample_batch_size': self.sampler_batch_size,
            'disambiguate_column_names': True,
            # The sampler is only initialized when no checkpoint is loaded,
            # i.e. when training.
            'initialize_sampler': self.checkpoint_to_load is None,
            'save_samples': self._save_samples,
            'load_samples': self._load_samples,
        }
        join_iter_dataset = klass(loaded_tables, join_spec, **sampler_kwargs)

        table = common.ConcatTables(
            loaded_tables,
            self.join_keys,
            sample_from_join_dataset=join_iter_dataset,
        )

        if self.factorize:
            # dmol columns take precedence; otherwise fall back to the
            # configured blacklist, or an empty list when it is unset/empty.
            if self.num_dmol:
                blacklist = self.dmol_cols
            else:
                blacklist = self.factorize_blacklist or []
            join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
                join_iter_dataset,
                base_table=table,
                factorize_blacklist=blacklist,
                word_size_bits=self.word_size_bits,
                factorize_fanouts=self.factorize_fanouts,
            )

        def _seed_worker(worker_id):
            # Offset the first word of the current NumPy RNG state by the
            # worker id and reseed with it.
            np.random.seed(np.random.get_state()[1][0] + worker_id)

        loader = data.DataLoader(
            join_iter_dataset,
            batch_size=self.bs,
            num_workers=self.loader_workers,
            worker_init_fn=_seed_worker,
            pin_memory=True,
        )
        return join_spec, join_iter_dataset, loader, table
Exemple #4
0
def main():
    """Sample joins of ``title`` with every subset of its satellite tables.

    For each non-empty combination of the five satellite tables
    (cast_info, movie_companies, movie_info, movie_info_idx,
    movie_keyword), a copy of ``JOB_MY`` is specialised with the matching
    join tables, keys and clauses; 1,000,000 tuples are drawn from the
    factorized join sampler; and the result is written to
    ``../train-test-data/join_samples/<key>.csv``, where ``key`` is the
    sorted table aliases joined with '_'.

    Only the first ``col_num`` columns of each sample are kept, where
    ``col_num`` is the number of columns listed in ``all_cols`` for the
    joined tables.
    NOTE(review): presumably this drops trailing sampler-added columns;
    confirm against ``combined_columns``.
    """
    # Local import: the file's top-level import block is not visible here.
    import os

    table2alias = {'title': 't', 'cast_info': 'ci', 'movie_companies': 'mc', 'movie_info': 'mi',
                   'movie_info_idx': 'mi_idx', 'movie_keyword': 'mk'}
    join_keys = {'title': ['id'], 'cast_info': ['movie_id'], 'movie_companies': ['movie_id'],
                 'movie_info': ['movie_id'], 'movie_info_idx': ['movie_id'], 'movie_keyword': ['movie_id']}
    join_clauses = {'cast_info': 'title.id=cast_info.movie_id', 'movie_companies': 'title.id=movie_companies.movie_id',
                    'movie_info': 'title.id=movie_info.movie_id', 'movie_info_idx': 'title.id=movie_info_idx.movie_id',
                    'movie_keyword': 'title.id=movie_keyword.movie_id'}
    all_cols = {
        'title': [
            'kind_id', 'production_year', 'episode_nr', 'imdb_index', 'phonetic_code', 'season_nr', 'series_years'
        ],
        'cast_info': [
            'nr_order', 'role_id'
        ],
        'movie_companies': [
            'company_type_id'
        ],
        'movie_info_idx': ['info_type_id'],
        'movie_info': ['info_type_id'],
        'movie_keyword': ['keyword_id']
    }

    tables = ['cast_info', 'movie_companies', 'movie_info', 'movie_info_idx', 'movie_keyword']

    # Create the output directory up front so a missing directory does not
    # surface only after the first (long) sampling run.
    output_dir = '../train-test-data/join_samples'
    os.makedirs(output_dir, exist_ok=True)

    for num in range(1, 6):
        for combo in combinations(tables, num):
            satellites = list(combo)
            # Work on a copy: assigning into JOB_MY directly would mutate
            # the shared module-level config.
            config = dict(JOB_MY)
            config['join_clauses'] = [join_clauses[t] for t in satellites]
            p = satellites + ['title']
            key = '_'.join(sorted(table2alias[x] for x in p))
            config['join_tables'] = p
            config['join_keys'] = {t: join_keys[t] for t in p}
            col_num = sum(len(all_cols[t]) for t in p)

            join_spec = join_utils.get_join_spec(config)
            prepare_utils.prepare(join_spec)
            loaded_tables = []
            for t in join_spec.join_tables:
                print('Loading', t)
                table = datasets.LoadImdb(t, use_cols=config["use_cols"])
                table.data.info()
                loaded_tables.append(table)

            t_start = time.time()
            join_iter_dataset = FactorizedSamplerIterDataset(
                loaded_tables,
                join_spec,
                sample_batch_size=1000 * 100,
                disambiguate_column_names=True)

            table = common.ConcatTables(loaded_tables,
                                        join_spec.join_keys,
                                        sample_from_join_dataset=join_iter_dataset)

            join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
                join_iter_dataset,
                base_table=table,
                factorize_blacklist=[],
                word_size_bits=10,
                factorize_fanouts=True)
            t_end = time.time()
            log.info(f"> Initialization took {t_end - t_start} seconds.")

            # Samples are taken from the wrapped (un-factorized) sampler.
            raw_sampler = join_iter_dataset.join_iter_dataset
            print(raw_sampler.combined_columns)
            samples = [next(raw_sampler) for _ in tqdm(range(1000000))]

            # Each sample becomes one column of the concat; transpose to get
            # one row per sample, then keep only the leading col_num columns.
            df = pd.DataFrame(data=pd.concat(samples, axis=1)).T.iloc[:, :col_num]
            df.to_csv('{}/{}.csv'.format(output_dir, key), index=False)
            print('-' * 60)
            print("Done {}".format(key))