Example #1
def main():
    config = experiments.JOB_FULL
    join_spec = join_utils.get_join_spec(config)
    prepare_utils.prepare(join_spec)
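    # Load every table referenced by the join spec into memory.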
    loaded_tables = []
    for t in join_spec.join_tables:
        print('Loading', t)
        table = datasets.LoadImdb(t, use_cols=config["use_cols"])
        table.data.info()
        loaded_tables.append(table)

    t_start = time.time()
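    # Build the factorized sampler that streams tuples from the full join result.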
    join_iter_dataset = FactorizedSamplerIterDataset(
        loaded_tables,
        join_spec,
        sample_batch_size=1000 * 100,
        disambiguate_column_names=True)

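    # Concatenate the per-table schemas into one virtual table representing the join result.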
    table = common.ConcatTables(loaded_tables,
                                join_spec.join_keys,
                                sample_from_join_dataset=join_iter_dataset)

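    # Factorize wide columns (and fanout columns) into sub-columns of at most 10 bits each.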
    join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
        join_iter_dataset,
        base_table=table,
        factorize_blacklist=[],
        word_size_bits=10,
        factorize_fanouts=True)
    t_end = time.time()
    log.info(f"> Initialization took {t_end - t_start} seconds.")

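    # Trigger a single sampling batch as a sanity check.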
    join_iter_dataset.join_iter_dataset._sample_batch()
    print('-' * 60)
    print("Done")
Example #2
def generate_title_movie_companies(p):
    table2alias = {'title': 't', 'movie_companies': 'mc', 'company_name': 'cn'}
    join_tables = ['title', 'movie_companies', 'company_name']
    join_keys = {'title': ['id'], 'movie_companies': ['movie_id', 'company_id'], 'company_name': ['id']}
    join_clauses = {'title': 'title.id=movie_companies.movie_id',
                    'company_name': 'company_name.id=movie_companies.company_id'}
    # all_cols = {
    #         'title': [
    #             'title','kind_id','production_year','id2', 'id'
    #         ],
    #         'movie_companies': [
    #             'company_type_id', 'company_id', 'movie_id'
    #         ],
    #         'company_name': ['name', 'country_code', 'id'],
    #     }

    config = JOB_jintao
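    # movie_companies is always part of the join; the sorted table aliases form the output file key.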
    p = p + ['movie_companies']
    key = '_'.join(sorted([table2alias[x] for x in p]))
    join_spec = join_utils.get_join_spec(config)
    prepare_utils.prepare(join_spec)
    loaded_tables = []
    for t in join_spec.join_tables:
        print('Loading', t)
        table = datasets.LoadImdb(t, use_cols=config["use_cols"])
        table.data.info()
        loaded_tables.append(table)
    t_start = time.time()
    join_iter_dataset = FactorizedSamplerIterDataset(
        loaded_tables,
        join_spec,
        sample_batch_size=51000 * 100,
        disambiguate_column_names=True)
    table = common.ConcatTables(loaded_tables,
                                join_spec.join_keys,
                                sample_from_join_dataset=join_iter_dataset)

    join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
        join_iter_dataset,
        base_table=table,
        factorize_blacklist=[],
        word_size_bits=10,
        factorize_fanouts=True)
    t_end = time.time()
    log.info(f"> Initialization took {t_end - t_start} seconds.")
    print(join_iter_dataset.join_iter_dataset.combined_columns)
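    # Draw 5,000,000 joined tuples one at a time and write them out as a CSV.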
    samples = []
    for i in tqdm(range(5000000)):
        samples.append(next(join_iter_dataset.join_iter_dataset))
    df = pd.DataFrame(data=pd.concat(samples, axis=1)).T
    df.to_csv('/home/jintao/{}.csv'.format(key), index=False)
Example #3
    def MakeSamplerDatasetLoader(self, loaded_tables):
        assert self.sampler in ['fair_sampler',
                                'factorized_sampler'], self.sampler
        join_spec = join_utils.get_join_spec(self.__dict__)
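        # Pick the sampler implementation requested by the config.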
        if self.sampler == 'fair_sampler':
            klass = fair_sampler.FairSamplerIterDataset
        else:
            klass = factorized_sampler.FactorizedSamplerIterDataset
        join_iter_dataset = klass(
            loaded_tables,
            join_spec,
            sample_batch_size=self.sampler_batch_size,
            disambiguate_column_names=True,
            # Only initialize the sampler if training.
            initialize_sampler=self.checkpoint_to_load is None,
            save_samples=self._save_samples,
            load_samples=self._load_samples)

        table = common.ConcatTables(loaded_tables,
                                    self.join_keys,
                                    sample_from_join_dataset=join_iter_dataset)

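        # Optionally factorize wide columns; DMoL-modeled columns (or an explicit blacklist) are left unfactorized.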
        if self.factorize:
            join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
                join_iter_dataset,
                base_table=table,
                factorize_blacklist=self.dmol_cols if self.num_dmol else
                self.factorize_blacklist if self.factorize_blacklist else [],
                word_size_bits=self.word_size_bits,
                factorize_fanouts=self.factorize_fanouts)

        loader = data.DataLoader(join_iter_dataset,
                                 batch_size=self.bs,
                                 num_workers=self.loader_workers,
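                                 # Seed each worker differently so parallel
                                 # workers do not draw identical samples.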
                                 worker_init_fn=lambda worker_id: np.random.seed(
                                     np.random.get_state()[1][0] + worker_id),
                                 pin_memory=True)
        return join_spec, join_iter_dataset, loader, table
Example #4
def MakeQueries(spark, cursor, num_queries, tables_in_templates, table_names,
                join_keys, rng):
    """Sample a tuple from actual join result then place filters."""
    spark.catalog.clearCache()

    # TODO: this assumes single equiv class.
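    # Chain the join keys pairwise into equality clauses: k1 = k2 AND k2 = k3 AND ...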
    join_items = list(join_keys.items())
    lhs = join_items[0][1]
    join_clauses_list = []
    for rhs in join_items[1:]:
        rhs = rhs[1]
        join_clauses_list.append('{} = {}'.format(lhs, rhs))
        lhs = rhs
    join_clauses = '\n AND '.join(join_clauses_list)

    # Take only the content columns.
    content_cols = []
    categoricals = []
    numericals = []
    for table_name in table_names:
        categorical_cols = datasets.TPC_DS.CATEGORICAL_COLUMNS[table_name]
        for c in categorical_cols:
            disambiguated_name = common.JoinTableAndColumnNames(table_name,
                                                                c,
                                                                sep='.')
            content_cols.append(disambiguated_name)
            categoricals.append(disambiguated_name)

        range_cols = datasets.TPC_DS.RANGE_COLUMNS[table_name]
        for c in range_cols:
            disambiguated_name = common.JoinTableAndColumnNames(table_name,
                                                                c,
                                                                sep='.')
            content_cols.append(disambiguated_name)
            numericals.append(disambiguated_name)

    # Build a concat table representing the join result schema.
    join_keys_list = [join_keys[n] for n in table_names]
    join_spec = join_utils.get_join_spec({
        "join_tables": table_names,
        "join_keys": dict(
            zip(table_names, [[k.split(".")[1]] for k in join_keys_list])),
        "join_root": "item",
        "join_how": "inner",
    })
    ds = FactorizedSamplerIterDataset(tables_in_templates,
                                      join_spec,
                                      sample_batch_size=num_queries,
                                      disambiguate_column_names=False,
                                      add_full_join_indicators=False,
                                      add_full_join_fanouts=False)
    concat_table = common.ConcatTables(tables_in_templates,
                                       join_keys_list,
                                       sample_from_join_dataset=ds)

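    # SQL COUNT(*) template (Mako-style substitution) used to obtain each query's true cardinality.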
    template_for_execution = template.Template(
        textwrap.dedent("""
        SELECT COUNT(*)
        FROM ${', '.join(table_names)}
        WHERE ${join_clauses}
        AND ${filter_clauses};
    """).strip())

    true_inner_join_card = ds.sampler.join_card
    true_full_join_card = TDS_LIGHT_OUTER_CARDINALITY
    print('True inner join card', true_inner_join_card, 'true full',
          true_full_join_card)

    ncols = len(content_cols)
    queries = []
    filter_strings = []
    sql_queries = []  # To get true cardinalities.

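    # Keep sampling join tuples and turning them into filter predicates until enough queries are collected.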
    while len(queries) < num_queries:
        sampled_df = ds.sampler.run()[content_cols]

        for r in sampled_df.iterrows():
            tup = r[1]
            num_filters = rng.randint(FLAGS.min_filters,
                                      max(ncols // 2, FLAGS.max_filters))

            # Positions where the values are non-null.
            non_null_indices = np.argwhere(~pd.isnull(tup).values).reshape(
                -1, )
            if len(non_null_indices) < num_filters:
                continue
            print('{} filters out of {} content cols'.format(
                num_filters, ncols))

            # Place {'<=', '>=', '='} on numericals and '=' on categoricals.
            idxs = rng.choice(non_null_indices,
                              replace=False,
                              size=num_filters)
            vals = tup[idxs].values
            cols = np.take(content_cols, idxs)
            ops = rng.choice(['<=', '>=', '='], size=num_filters)
            sensible_to_do_range = [c in numericals for c in cols]
            ops = np.where(sensible_to_do_range, ops, '=')

            print('cols', cols, 'ops', ops, 'vals', vals)

            queries.append((cols, ops, vals))
            filter_strings.append(','.join([
                ','.join((c, o, str(v))) for c, o, v in zip(cols, ops, vals)
            ]))

            # Quote string literals & leave other literals alone.
            filter_clauses = '\n AND '.join([
                '{} {} {}'.format(col, op, val)
                if concat_table[col].data.dtype in [np.int64, np.float64] else
                '{} {} \'{}\''.format(col, op, val)
                for col, op, val in zip(cols, ops, vals)
            ])

            sql = template_for_execution.render(table_names=table_names,
                                                join_clauses=join_clauses,
                                                filter_clauses=filter_clauses)
            sql_queries.append(sql)

            if len(queries) >= num_queries:
                break

    true_cards = []

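    # Execute each generated query to record its true cardinality.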
    for i, sql_query in enumerate(sql_queries):
        DropBufferCache()
        spark.catalog.clearCache()

        print('  Query',
              i,
              'out of',
              len(sql_queries),
              '[{}]'.format(filter_strings[i]),
              end='')

        t1 = time.time()

        true_card = ExecuteSql(spark, sql_query)[0][0]

        # cursor.execute(sql_query)
        # result = cursor.fetchall()
        # true_card = result[0][0]

        dur = time.time() - t1

        true_cards.append(true_card)
        print(
            '...done: {} (inner join sel {}; full sel {}; inner join {}); dur {:.1f}s'
            .format(true_card, true_card / true_inner_join_card,
                    true_card / true_full_join_card, true_inner_join_card,
                    dur))

        # if i > 0 and i % 1 == 0:
        #     spark = StartSpark(spark)

    df = pd.DataFrame({
        'tables': [','.join(table_names)] * len(true_cards),
        'join_conds':
            [','.join(c.replace(' ', '') for c in join_clauses_list)] *
            len(true_cards),
        'filters': filter_strings,
        'true_cards': true_cards,
    })
    df.to_csv(FLAGS.output_csv, sep='#', mode='a', index=False, header=False)
    print('Template done.')
    return queries, true_cards
Example #5
def main():
    table2alias = {'title': 't', 'cast_info': 'ci', 'movie_companies': 'mc', 'movie_info': 'mi',
                   'movie_info_idx': 'mi_idx', 'movie_keyword': 'mk'}
    join_tables = ['title', 'cast_info', 'movie_companies', 'movie_info', 'movie_info_idx', 'movie_keyword']
    join_keys = {'title': ['id'], 'cast_info': ['movie_id'], 'movie_companies': ['movie_id'],
                 'movie_info': ['movie_id'], 'movie_info_idx': ['movie_id'], 'movie_keyword': ['movie_id']}
    join_clauses = {'cast_info': 'title.id=cast_info.movie_id', 'movie_companies': 'title.id=movie_companies.movie_id',
                    'movie_info': 'title.id=movie_info.movie_id', 'movie_info_idx': 'title.id=movie_info_idx.movie_id',
                    'movie_keyword': 'title.id=movie_keyword.movie_id'}
    all_cols = {
        'title': [
            'kind_id', 'production_year', 'episode_nr', 'imdb_index', 'phonetic_code', 'season_nr', 'series_years'
        ],
        'cast_info': [
            'nr_order', 'role_id'
        ],
        'movie_companies': [
            'company_type_id'
        ],
        'movie_info_idx': ['info_type_id'],
        'movie_info': ['info_type_id'],
        'movie_keyword': ['keyword_id']
    }

    tables = ['cast_info', 'movie_companies', 'movie_info', 'movie_info_idx', 'movie_keyword']
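    # Enumerate every non-empty combination of these tables; title is always included as well.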
    for num in range(1, 6):
        for p in combinations(tables, num):
            config = JOB_MY
            config['join_clauses'] = []
            p = [x for x in p]
            for t in p:
                config['join_clauses'].append(join_clauses[t])
            p = p + ['title']
            key = '_'.join(sorted([table2alias[x] for x in p]))
            config['join_tables'] = p
            config['join_keys'] = {}
            for t in p:
                config['join_keys'][t] = join_keys[t]
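            # col_num counts the content columns; it is used below to trim the sampled frame.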
            col_num = 0
            for t in p:
                col_num += len(all_cols[t])
            join_spec = join_utils.get_join_spec(config)
            prepare_utils.prepare(join_spec)
            loaded_tables = []
            for t in join_spec.join_tables:
                print('Loading', t)
                table = datasets.LoadImdb(t, use_cols=config["use_cols"])
                table.data.info()
                loaded_tables.append(table)

            t_start = time.time()
            join_iter_dataset = FactorizedSamplerIterDataset(
                loaded_tables,
                join_spec,
                sample_batch_size=1000 * 100,
                disambiguate_column_names=True)

            table = common.ConcatTables(loaded_tables,
                                        join_spec.join_keys,
                                        sample_from_join_dataset=join_iter_dataset)

            join_iter_dataset = common.FactorizedSampleFromJoinIterDataset(
                join_iter_dataset,
                base_table=table,
                factorize_blacklist=[],
                word_size_bits=10,
                factorize_fanouts=True)
            t_end = time.time()
            log.info(f"> Initialization took {t_end - t_start} seconds.")
            print(join_iter_dataset.join_iter_dataset.combined_columns)
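            # Draw 1,000,000 joined tuples, keep only the content columns, and write them to CSV.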
            samples = []
            for i in tqdm(range(1000000)):
                samples.append(next(join_iter_dataset.join_iter_dataset))
            df = pd.DataFrame(data=pd.concat(samples, axis=1)).T.iloc[:, :col_num]
            df.to_csv('../train-test-data/join_samples/{}.csv'.format(key), index=False)
            # join_iter_dataset.join_iter_dataset._sample_batch()
            print('-' * 60)
            print("Done {}".format(key))