コード例 #1
0
def gen_flights_10M_schema(csv_path):
    """
    Flights schema with 1M tuples
    """

    schema = SchemaGraph()
    # YEAR_DATE,UNIQUE_CARRIER,ORIGIN,ORIGIN_STATE_ABR,DEST,DEST_STATE_ABR,DEP_DELAY,TAXI_OUT,TAXI_IN,ARR_DELAY,AIR_TIME,DISTANCE

    # tables
    # lineorder
    schema.add_table(
        Table('flights',
              attributes=[
                  'year_date', 'unique_carrier', 'origin', 'origin_state_abr',
                  'dest', 'dest_state_abr', 'dep_delay', 'taxi_out', 'taxi_in',
                  'arr_delay', 'air_time', 'distance'
              ],
              csv_file_location=csv_path.format('dataset_sampled'),
              table_size=10000000,
              primary_key=['f_flightno'],
              sample_rate=0.1,
              fd_list=[('origin', 'origin_state_abr'),
                       ('dest', 'dest_state_abr')]))

    return schema
コード例 #2
0
def gen_job_ranges_imdb_schema(csv_path, version):
    """
    Just like the full IMDB schema but without tables that are not used in the job-light benchmark.
    """
    schema = SchemaGraph()

    # tables
    tablename = version
    attributes = []

    tablenamecsv = version
    table_head = pd.read_csv(
        '../../../train-test-data/forest_power-data-sql/' + version + '.csv',
        sep=',',
        escapechar='\\',
        encoding='utf-8',
        low_memory=False,
        quotechar='"')
    schema.add_table(
        Table(
            tablename,
            attributes=list(table_head.columns),
            csv_file_location=csv_path.format(tablenamecsv),  # debug的思路要正确 范围
            table_size=table_head.shape[0]))

    return schema
コード例 #3
0
def gen_1t_tpc_ds_schema(csv_path):
    """
    TPCDS 1t schema
    """

    schema = SchemaGraph()
    schema.add_table(Table('store_sales',
                           attributes=['ss_sold_date_sk', 'ss_sold_time_sk', 'ss_item_sk', 'ss_customer_sk',
                                       'ss_cdemo_sk', 'ss_hdemo_sk', 'ss_addr_sk', 'ss_store_sk', 'ss_promo_sk',
                                       'ss_ticket_number', 'ss_quantity', 'ss_wholesale_cost', 'ss_list_price',
                                       'ss_sales_price', 'ss_ext_discount_amt', 'ss_ext_sales_price',
                                       'ss_ext_wholesale_cost', 'ss_ext_list_price', 'ss_ext_tax',
                                       'ss_coupon_amt', 'ss_net_paid', 'ss_net_paid_inc_tax', 'ss_net_profit'],
                           irrelevant_attributes=['ss_sold_time_sk', 'ss_item_sk', 'ss_customer_sk', 'ss_cdemo_sk',
                                                  'ss_hdemo_sk', 'ss_addr_sk', 'ss_promo_sk', 'ss_ticket_number',
                                                  'ss_quantity', 'ss_wholesale_cost', 'ss_list_price',
                                                  'ss_ext_discount_amt', 'ss_ext_sales_price', 'ss_ext_wholesale_cost',
                                                  'ss_ext_list_price', 'ss_ext_tax',
                                                  'ss_coupon_amt', 'ss_net_paid', 'ss_net_paid_inc_tax',
                                                  'ss_net_profit'],
                           no_compression=['ss_sold_date_sk', 'ss_store_sk', 'ss_sales_price'],
                           csv_file_location=csv_path.format('store_sales_sampled'),
                           table_size=2879987999, primary_key=['ss_item_sk', 'ss_ticket_number'],
                           sample_rate=10000000 / 2879987999
                           ))

    return schema
コード例 #4
0
ファイル: deepdb.py プロジェクト: sfu-db/AreCELearnedYet
def construct_schema(table):
    # construct a schema that has one table only
    csv_file = DATA_ROOT / table.dataset / f"{table.version}.csv"
    schema = SchemaGraph()
    schema.add_table(
        Table(
            f'"{table.name}"',  # use table name in postgres since deepdb deal with sql directly
            attributes=table.data.columns.values.tolist(),
            csv_file_location=csv_file,
            table_size=table.row_num))
    return schema
コード例 #5
0
ファイル: schema.py プロジェクト: LumingSun/deepdb-public
def gen_cover_schema(csv_path="./ssb-benchmark/cover.csv"):

    schema = SchemaGraph()
    schema.add_table(Table('cover',
                           attributes=['c0','c1','c2','c3','c4','c5','c6','c7','c8','c9'],
                           irrelevant_attributes=None,
                           no_compression=['c0','c1','c2','c3','c4','c5','c6','c7','c8','c9'],
                           csv_file_location=csv_path,
                           table_size=581012, primary_key=['c0'],
                           ))

    return schema
コード例 #6
0
def gen2(csv_path):
    """
    Just like the full IMDB schema but without tables that are not used in the job-light benchmark.
    """

    schema = SchemaGraph()

    # tables
    # id, title, imdb_index, kind_id, production_year, imdb_id,phonetic_code, episode_of_id, season_nr, episode_nr,series_years, md5sum
    # title
    schema.add_table(
        Table('auth_user',
              attributes=['id_copy', 'id', 'is_active'],
              irrelevant_attributes=['is_active'],
              csv_file_location=csv_path.format('auth_user'),
              table_size=91889))  # 无用列增加了title

    # movie_info
    schema.add_table(
        Table('student_courseenrollment',
              attributes=['id_copy', 'id', 'user_id', 'is_active'],
              irrelevant_attributes=['id_copy'],
              csv_file_location=csv_path.format('student_courseenrollment'),
              table_size=85637))  # 无用列增加了info

    # relationships

    schema.add_relationship('student_courseenrollment', 'user_id', 'auth_user',
                            'id')

    return schema
コード例 #7
0
def cols2(csv_path):
    """
    Just like the full IMDB schema but without tables that are not used in the job-light benchmark.
    """

    schema = SchemaGraph()

    # tables
    # id, title, imdb_index, kind_id, production_year, imdb_id,phonetic_code, episode_of_id, season_nr, episode_nr,series_years, md5sum
    # title
    schema.add_table(
        Table('title',
              attributes=[
                  'id', 'title', 'imdb_index', 'kind_id', 'production_year',
                  'imdb_id', 'phonetic_code', 'episode_of_id', 'season_nr',
                  'episode_nr', 'series_years', 'md5sum'
              ],
              irrelevant_attributes=[
                  'title', 'imdb_index', 'kind_id', 'imdb_id', 'episode_of_id',
                  'season_nr', 'episode_nr', 'series_years', 'md5sum'
              ],
              no_compression=['kind_id'],
              csv_file_location=csv_path.format('title'),
              table_size=3486660))  # 无用列增加了title

    # cast_info
    schema.add_table(
        Table('cast_info',
              attributes=[
                  'id', 'person_id', 'movie_id', 'person_role_id', 'note',
                  'nr_order', 'role_id'
              ],
              csv_file_location=csv_path.format('cast_info'),
              irrelevant_attributes=[
                  'person_id', 'person_role_id', 'note', 'nr_order'
              ],
              no_compression=['role_id'],
              table_size=63475800))

    # relationships

    schema.add_relationship('cast_info', 'movie_id', 'title', 'id')

    return schema
コード例 #8
0
def gen_imdb_schema(csv_path):
    """
    Specifies full imdb schema. Also tables not in the job-light benchmark.
    """
    schema = SchemaGraph()

    # tables

    # title
    schema.add_table(
        Table('title',
              attributes=[
                  'id', 'title', 'imdb_index', 'kind_id', 'production_year',
                  'imdb_id', 'phonetic_code', 'episode_of_id', 'season_nr',
                  'episode_nr', 'series_years', 'md5sum'
              ],
              irrelevant_attributes=['episode_of_id'],
              csv_file_location=csv_path.format('title'),
              table_size=3486660))

    # movie_info_idx
    schema.add_table(
        Table('movie_info_idx',
              attributes=['id', 'movie_id', 'info_type_id', 'info', 'note'],
              csv_file_location=csv_path.format('movie_info_idx'),
              table_size=3147110))

    # movie_info
    schema.add_table(
        Table('movie_info',
              attributes=['id', 'movie_id', 'info_type_id', 'info', 'note'],
              csv_file_location=csv_path.format('movie_info'),
              table_size=24988000))

    # info_type
    schema.add_table(
        Table('info_type',
              attributes=['id', 'info'],
              csv_file_location=csv_path.format('info_type'),
              table_size=113))

    # cast_info
    schema.add_table(
        Table('cast_info',
              attributes=[
                  'id', 'person_id', 'movie_id', 'person_role_id', 'note',
                  'nr_order', 'role_id'
              ],
              csv_file_location=csv_path.format('cast_info'),
              table_size=63475800))

    # char_name
    schema.add_table(
        Table('char_name',
              attributes=[
                  'id', 'name', 'imdb_index', 'imdb_id', 'name_pcode_nf',
                  'surname_pcode', 'md5sum'
              ],
              csv_file_location=csv_path.format('char_name'),
              table_size=4314870))

    # role_type
    schema.add_table(
        Table('role_type',
              attributes=['id', 'role'],
              csv_file_location=csv_path.format('role_type'),
              table_size=0))

    # complete_cast
    schema.add_table(
        Table('complete_cast',
              attributes=['id', 'movie_id', 'subject_id', 'status_id'],
              csv_file_location=csv_path.format('complete_cast'),
              table_size=135086))

    # comp_cast_type
    schema.add_table(
        Table('comp_cast_type',
              attributes=['id', 'kind'],
              csv_file_location=csv_path.format('comp_cast_type'),
              table_size=0))

    # name
    schema.add_table(
        Table('name',
              attributes=[
                  'id', 'name', 'imdb_index', 'imdb_id', 'gender',
                  'name_pcode_cf', 'name_pcode_nf', 'surname_pcode', 'md5sum'
              ],
              csv_file_location=csv_path.format('name'),
              table_size=6379740))

    # aka_name
    schema.add_table(
        Table('aka_name',
              attributes=[
                  'id', 'person_id', 'name', 'imdb_index', 'name_pcode_cf',
                  'name_pcode_nf', 'surname_pcode', 'md5sum'
              ],
              csv_file_location=csv_path.format('aka_name'),
              table_size=1312270))

    # movie_link, is empty
    # schema.add_table(Table('movie_link', attributes=['id', 'movie_id', 'linked_movie_id', 'link_type_id'],
    #                        csv_file_location=csv_path.format('movie_link')))

    # link_type, no relationships
    # schema.add_table(Table('link_type', attributes=['id', 'link'],
    #                        csv_file_location=csv_path.format('link_type')))

    # movie_keyword
    schema.add_table(
        Table('movie_keyword',
              attributes=['id', 'movie_id', 'keyword_id'],
              csv_file_location=csv_path.format('movie_keyword'),
              table_size=7522600))

    # keyword
    schema.add_table(
        Table('keyword',
              attributes=['id', 'keyword', 'phonetic_code'],
              csv_file_location=csv_path.format('keyword'),
              table_size=236627))

    # person_info
    schema.add_table(
        Table('person_info',
              attributes=['id', 'person_id', 'info_type_id', 'info', 'note'],
              csv_file_location=csv_path.format('person_info'),
              table_size=4130210))

    # movie_companies
    schema.add_table(
        Table('movie_companies',
              attributes=[
                  'id', 'movie_id', 'company_id', 'company_type_id', 'note'
              ],
              csv_file_location=csv_path.format('movie_companies'),
              table_size=4958300))

    # company_name
    schema.add_table(
        Table('company_name',
              attributes=[
                  'id', 'name', 'country_code', 'imdb_id', 'name_pcode_nf',
                  'name_pcode_sf', 'md5sum'
              ],
              csv_file_location=csv_path.format('company_name'),
              table_size=362131))

    # company_type
    schema.add_table(
        Table('company_type',
              attributes=['id', 'kind'],
              csv_file_location=csv_path.format('company_type'),
              table_size=0))

    # aka_title
    schema.add_table(
        Table('aka_title',
              attributes=[
                  'id', 'movie_id', 'title', 'imdb_index', 'kind_id',
                  'production_year', 'phonetic_code', 'episode_of_id',
                  'season_nr', 'episode_nr', 'note', 'md5sum'
              ],
              irrelevant_attributes=['episode_of_id'],
              csv_file_location=csv_path.format('aka_title'),
              table_size=528268))

    # kind_type
    schema.add_table(
        Table('kind_type',
              attributes=['id', 'kind'],
              csv_file_location=csv_path.format('kind_type'),
              table_size=0))

    # relationships

    # title
    # omit self-join for now
    # schema.add_relationship('title', 'episode_of_id', 'title', 'id')
    schema.add_relationship('title', 'kind_id', 'kind_type', 'id')

    # movie_info_idx
    schema.add_relationship('movie_info_idx', 'info_type_id', 'info_type',
                            'id')
    schema.add_relationship('movie_info_idx', 'movie_id', 'title', 'id')

    # movie_info
    schema.add_relationship('movie_info', 'info_type_id', 'info_type', 'id')
    schema.add_relationship('movie_info', 'movie_id', 'title', 'id')

    # info_type, no relationships

    # cast_info
    schema.add_relationship('cast_info', 'movie_id', 'title', 'id')
    schema.add_relationship('cast_info', 'person_id', 'name', 'id')
    schema.add_relationship('cast_info', 'person_role_id', 'char_name', 'id')
    schema.add_relationship('cast_info', 'role_id', 'role_type', 'id')

    # char_name, no relationships

    # role_type, no relationships

    # complete_cast
    schema.add_relationship('complete_cast', 'movie_id', 'title', 'id')
    schema.add_relationship('complete_cast', 'status_id', 'comp_cast_type',
                            'id')
    schema.add_relationship('complete_cast', 'subject_id', 'comp_cast_type',
                            'id')

    # comp_cast_type, no relationships

    # name, no relationships

    # aka_name
    schema.add_relationship('aka_name', 'person_id', 'name', 'id')

    # movie_link, is empty
    # schema.add_relationship('movie_link', 'link_type_id', 'link_type', 'id')
    # schema.add_relationship('movie_link', 'linked_movie_id', 'title', 'id')
    # schema.add_relationship('movie_link', 'movie_id', 'title', 'id')

    # link_type, no relationships

    # movie_keyword
    schema.add_relationship('movie_keyword', 'keyword_id', 'keyword', 'id')
    schema.add_relationship('movie_keyword', 'movie_id', 'title', 'id')

    # keyword, no relationships

    # person_info
    schema.add_relationship('person_info', 'info_type_id', 'info_type', 'id')
    schema.add_relationship('person_info', 'person_id', 'name', 'id')

    # movie_companies
    schema.add_relationship('movie_companies', 'company_id', 'company_name',
                            'id')
    schema.add_relationship('movie_companies', 'company_type_id',
                            'company_type', 'id')
    schema.add_relationship('movie_companies', 'movie_id', 'title', 'id')

    # company_name, no relationships

    # company_type, no relationships

    # aka_title
    schema.add_relationship('aka_title', 'movie_id', 'title', 'id')
    schema.add_relationship('aka_title', 'kind_id', 'kind_type', 'id')

    # kind_type, no relationships

    return schema
コード例 #9
0
def gen10(csv_path):
    """
    Just like the full IMDB schema but without tables that are not used in the job-light benchmark.
    """

    schema = SchemaGraph()

    # tables
    # id, title, imdb_index, kind_id, production_year, imdb_id,phonetic_code, episode_of_id, season_nr, episode_nr,series_years, md5sum
    # title
    schema.add_table(
        Table('auth_user',
              attributes=['id_copy', 'id', 'is_active'],
              csv_file_location=csv_path.format('auth_user'),
              table_size=91889))  # 无用列增加了title

    # movie_info
    schema.add_table(
        Table('student_courseenrollment',
              attributes=['id_copy', 'id', 'user_id', 'is_active'],
              csv_file_location=csv_path.format('student_courseenrollment'),
              table_size=85637))  # 无用列增加了info

    # cast_info
    schema.add_table(
        Table('organization_account_userorgprofile',
              attributes=[
                  'id', 'org_id', 'user_id', 'origin', 'role',
                  '_first_level_id'
              ],
              csv_file_location=csv_path.format(
                  'organization_account_userorgprofile'),
              table_size=99832))

    schema.add_table(
        Table('auth_userprofile',
              attributes=['id', 'user_id', 'level_of_education', 'host'],
              csv_file_location=csv_path.format('auth_userprofile'),
              table_size=99349))
    # relationships

    schema.add_relationship('student_courseenrollment', 'user_id', 'auth_user',
                            'id')
    schema.add_relationship('organization_account_userorgprofile', 'user_id',
                            'auth_user', 'id')
    schema.add_relationship('auth_userprofile', 'user_id', 'auth_user', 'id')

    return schema
コード例 #10
0
def gen_job_light_imdb_schema(csv_path):
    """
    Just like the full IMDB schema but without tables that are not used in the job-light benchmark.
    """

    schema = SchemaGraph()

    # tables

    # title
    schema.add_table(
        Table('title',
              attributes=[
                  'id', 'title', 'imdb_index', 'kind_id', 'production_year',
                  'imdb_id', 'phonetic_code', 'episode_of_id', 'season_nr',
                  'episode_nr', 'series_years', 'md5sum'
              ],
              irrelevant_attributes=[
                  'episode_of_id', 'title', 'imdb_index', 'phonetic_code',
                  'season_nr', 'imdb_id', 'episode_nr', 'series_years',
                  'md5sum'
              ],
              no_compression=['kind_id'],
              csv_file_location=csv_path.format('title'),
              table_size=3486660))

    # movie_info_idx
    schema.add_table(
        Table('movie_info_idx',
              attributes=['id', 'movie_id', 'info_type_id', 'info', 'note'],
              csv_file_location=csv_path.format('movie_info_idx'),
              irrelevant_attributes=['info', 'note'],
              no_compression=['info_type_id'],
              table_size=3147110))

    # movie_info
    schema.add_table(
        Table('movie_info',
              attributes=['id', 'movie_id', 'info_type_id', 'info', 'note'],
              csv_file_location=csv_path.format('movie_info'),
              irrelevant_attributes=['info', 'note'],
              no_compression=['info_type_id'],
              table_size=24988000))

    # cast_info
    schema.add_table(
        Table('cast_info',
              attributes=[
                  'id', 'person_id', 'movie_id', 'person_role_id', 'note',
                  'nr_order', 'role_id'
              ],
              csv_file_location=csv_path.format('cast_info'),
              irrelevant_attributes=[
                  'nr_order', 'note', 'person_id', 'person_role_id'
              ],
              no_compression=['role_id'],
              table_size=63475800))

    # movie_keyword
    schema.add_table(
        Table('movie_keyword',
              attributes=['id', 'movie_id', 'keyword_id'],
              csv_file_location=csv_path.format('movie_keyword'),
              no_compression=['keyword_id'],
              table_size=7522600))

    # movie_companies
    schema.add_table(
        Table('movie_companies',
              attributes=[
                  'id', 'movie_id', 'company_id', 'company_type_id', 'note'
              ],
              csv_file_location=csv_path.format('movie_companies'),
              irrelevant_attributes=['note'],
              no_compression=['company_id', 'company_type_id'],
              table_size=4958300))

    # relationships
    schema.add_relationship('movie_info_idx', 'movie_id', 'title', 'id')
    schema.add_relationship('movie_info', 'movie_id', 'title', 'id')
    schema.add_relationship('cast_info', 'movie_id', 'title', 'id')
    schema.add_relationship('movie_keyword', 'movie_id', 'title', 'id')
    schema.add_relationship('movie_companies', 'movie_id', 'title', 'id')

    return schema
コード例 #11
0
def gen_500gb_ssb_schema(csv_path):
    """
    SSB schema for SF=500.
    """

    schema = SchemaGraph()

    # tables
    # lineorder
    schema.add_table(Table('lineorder',
                           attributes=['lo_orderkey', 'lo_linenumber', 'lo_custkey', 'lo_partkey', 'lo_suppkey',
                                       'lo_orderdate', 'lo_orderpriority', 'lo_shippriority', 'lo_quantity',
                                       'lo_extendedprice', 'lo_ordertotalprice', 'lo_discount', 'lo_revenue',
                                       'lo_supplycost', 'lo_tax', 'lo_commitdate', 'lo_shipmode'],
                           csv_file_location=csv_path.format('lineorder_sampled'),
                           irrelevant_attributes=['lo_commitdate'],
                           table_size=3000028242, primary_key=['lo_orderkey', 'lo_linenumber'], sample_rate=0.003333))

    # dwdate
    # dwdate.d_dayofweek -> dwdate.d_daynuminweek
    # dwdate.d_dayofweek -> dwdate.d_lastdayinweekfl
    # dwdate.d_month -> dwdate.d_monthnuminyear
    # dwdate.d_monthnuminyear -> dwdate.d_sellingseason
    # dwdate.d_daynuminyear -> dwdate.d_weeknuminyear
    schema.add_table(
        Table('dwdate',
              attributes=['d_datekey', 'd_date', 'd_dayofweek', 'd_month', 'd_year', 'd_yearmonthnum', 'd_yearmonth',
                          'd_daynuminweek', 'd_daynuminmonth', 'd_daynuminyear', 'd_monthnuminyear', 'd_weeknuminyear',
                          'd_sellingseason', 'd_lastdayinweekfl', 'd_lastdayinmonthfl', 'd_holidayfl', 'd_weekdayfl'],
              csv_file_location=csv_path.format('date'),
              table_size=2556, primary_key=["d_datekey"],
              fd_list=[('d_dayofweek', 'd_daynuminweek'), ('d_dayofweek', 'd_lastdayinweekfl'),
                       ('d_month', 'd_monthnuminyear'), ('d_monthnuminyear', 'd_sellingseason'),
                       ('d_daynuminyear', 'd_weeknuminyear')]))

    # customer
    # customer.c_city -> customer.c_nation
    # customer.c_nation -> customer.c_region
    schema.add_table(
        Table('customer',
              attributes=['c_custkey', 'c_name', 'c_address', 'c_city', 'c_nation', 'c_region', 'c_phone',
                          'c_mktsegment'],
              csv_file_location=csv_path.format('customer'),
              table_size=15000000, primary_key=["c_custkey"],
              fd_list=[('c_city', 'c_nation'), ('c_nation', 'c_region')]))

    # part
    # part.p_brand1 -> part.p_category
    # part.p_category -> part.p_mfgr
    schema.add_table(
        Table('part',
              attributes=['p_partkey', 'p_name', 'p_mfgr', 'p_category', 'p_brand1', 'p_color', 'p_type', 'p_size',
                          'p_container'],
              csv_file_location=csv_path.format('part'),
              table_size=1800000, primary_key=["p_partkey"],
              fd_list=[('p_category', 'p_mfgr'), ('p_brand1', 'p_category')]))

    # supplier
    # supplier.s_city -> supplier.s_nation
    # supplier.s_nation -> supplier.s_region
    schema.add_table(
        Table('supplier', attributes=['s_suppkey', 's_name', 's_address', 's_city', 's_nation', 's_region', 's_phone'],
              csv_file_location=csv_path.format('supplier'),
              table_size=1000000, primary_key=["s_suppkey"],
              fd_list=[('s_city', 's_nation'), ('s_nation', 's_region')]))

    # relationships
    schema.add_relationship('lineorder', 'lo_custkey', 'customer', 'c_custkey')
    schema.add_relationship('lineorder', 'lo_partkey', 'part', 'p_partkey')
    schema.add_relationship('lineorder', 'lo_suppkey', 'supplier', 's_suppkey')
    schema.add_relationship('lineorder', 'lo_orderdate', 'dwdate', 'd_datekey')
    # schema.add_relationship('lineorder', 'lo_commitdate', 'dwdate', 'd_datekey')

    return schema