def analyze_tpch(database):
    db = dataset.connect(database)

    expression_ops = db.query("""select class, operator, count(*)
    from expr_ops_tpch group by class, operator order by class, operator;""")

    print_table([[x['class'], x['operator'], x['count']]
                 for x in expression_ops], ["class", "operator", "count"],
                'tpch')

    queries = list(db['tpchqueries'])

    print
    explicit_implicit_joins(queries)

    # counters for how often each per-query count occurs
    compressed_lengths = Counter()
    str_ops = Counter()
    distinct_str_ops = Counter()
    estimated = Counter()
    which_str_ops = Counter()
    # sets of tables that co-occur in queries; overlapping sets are merged
    # below into connected components
    table_clusters = []

    not_yet_seen_tables = []
    tables_seen = set()
    last = 0

    print "Find recurring subtrees in queries queries:"
    find_recurring(queries, 'tpch')
    find_recurring_subset(queries)

    for idx, q in enumerate(queries):
        last = idx
        plan = json.loads(q['plan'])
        tables = visit_operators(plan, visitor_tables)

        # keep only known tables; note this filter reuses SDSS_TABLES, as in
        # analyze_sdss below
        table_set = set([x.lower() for x in tables]) & set(SDSS_TABLES)
        if table_set:

            # indices of existing clusters that overlap this query's tables;
            # all of them are transitively connected through table_set
            equal = []
            for i, c in enumerate(table_clusters):
                if c.intersection(table_set):
                    equal.append(i)
                    table_clusters[i] = c | table_set
            equal.append(len(table_clusters))
            table_clusters.append(table_set)

            # merge the connected clusters into the first one and drop the
            # now-redundant entries
            if len(equal) > 1:
                first = equal[0]
                for i in equal[1:]:
                    table_clusters[first] = table_clusters[first] | table_clusters[i]
            table_clusters = [
                x for i, x in enumerate(table_clusters) if i not in equal[1:]
            ]

        # record queries that reference tables not seen before
        new_tables = set(tables) - tables_seen
        if new_tables:
            tables_seen.update(new_tables)
            not_yet_seen_tables.append([idx, len(new_tables)])

        query = q['query']
        # compressed length as a rough proxy for query complexity
        compressed_lengths[len(bz2.compress(query))] += 1

        estimated[q['estimated_cost']] += 1

        tokens = sqltokens.get_tokens(query)
        str_ops[len(tokens)] += 1
        distinct_str_ops[len(set(tokens))] += 1

        which_str_ops.update(tokens)

    print
    # closing row so the table extends to the last processed query
    not_yet_seen_tables.append([last, 0])
    print_table(not_yet_seen_tables,
                headers=['query_number', 'num_new_tables'],
                workload="tpch")

    print
    print_table(sorted(which_str_ops.iteritems(),
                       key=lambda t: t[1],
                       reverse=True),
                headers=["string_op", "count"],
                workload='tpch')

    # largest table clusters first
    print_table([[str(list(x))]
                 for x in sorted(table_clusters, key=len, reverse=True)],
                headers=["table_cluster"],
                workload='tpch')

    for name, values in zip([
            'compressed lengths', 'string ops', 'distinct string ops',
            'estimated'
    ], [compressed_lengths, str_ops, distinct_str_ops, estimated]):
        print_table(sorted(values.iteritems(), key=lambda t: t[0]),
                    headers=[name, "counts"],
                    workload='tpch')
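
# The cluster-merge loop in analyze_tpch above also appears in analyze_sdss
# below. A minimal standalone sketch of the same technique, merging table
# sets that share a member into connected clusters; merge_table_set is a
# hypothetical helper name, not used elsewhere in this module:
def merge_table_set(table_clusters, table_set):
    # indices of clusters that share at least one table with table_set
    overlapping = [i for i, c in enumerate(table_clusters) if c & table_set]
    merged = set(table_set)
    for i in overlapping:
        merged |= table_clusters[i]
    # keep the untouched clusters and append the merged one
    remaining = [c for i, c in enumerate(table_clusters)
                 if i not in overlapping]
    remaining.append(merged)
    return remaining
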
def analyze_sdss(database, analyze_recurring):
    db = dataset.connect(database)

    print "Limited to DR5"

    num_interesting_queries = list(
        db.query('SELECT COUNT(*) c FROM {} WHERE has_plan = 1'.format(
            EXPLAINED)))[0]['c']
    print "Distinct queries with query plan:", num_interesting_queries

    num_interesting_queries = list(
        db.query(
            'SELECT COUNT(*) c FROM (SELECT DISTINCT simple_plan FROM {} WHERE has_plan = 1)'
            .format(UNIQUE)))[0]['c']
    print "Distinct queries with constants replaced:", num_interesting_queries

    expl_queries = '''
        SELECT query, plan, time_start, estimated_cost
        FROM {}
        WHERE estimated_cost < 100
        ORDER BY time_start ASC
        '''.format(EXPLAINED)

    dist_queries = '''
        SELECT query, plan, estimated_cost
        FROM {}
        WHERE estimated_cost < 100
        ORDER BY time_start ASC'''.format(UNIQUE)

    all_queries = '''
        SELECT *
        FROM {}
        ORDER BY time_start ASC'''.format(EXPLAINED_ALL)

    if analyze_recurring:
        print
        print "Find recurring subtrees in distinct (query) queries:"
        queries = db.query(expl_queries)
        find_recurring(queries)

        # the csv stored by the previous call will be overwritten

        print
        print "Find recurring subtrees in distinct (template) queries:"
        queries = db.query(dist_queries)
        find_recurring(queries)

        print
        print "Find recurring subtrees in distinct (template) queries (using subset check):"
        queries = db.query(dist_queries)
        find_recurring_subset(queries)

    print
    queries = db.query(expl_queries)
    explicit_implicit_joins(queries)

    # counters for how often each per-query count occurs
    compressed_lengths = Counter()
    lengths = Counter()
    str_ops = Counter()
    distinct_str_ops = Counter()
    estimated = Counter()
    tables_seen = set()
    which_str_ops = Counter()

    # sets of tables that co-occur in queries; overlapping sets are merged
    # below into connected components
    table_clusters = []

    # count how many new tables we see
    not_yet_seen_tables = []
    last = 0

    # go over all queries (joined with explained)
    print "Go over all queries"
    for i, q in enumerate(db.query(all_queries)):
        last = i
        plan = json.loads(q['plan'])
        tables = visit_operators(plan, visitor_tables)
        new_tables = set(tables) - tables_seen
        if new_tables:
            tables_seen.update(new_tables)
            not_yet_seen_tables.append([i, len(new_tables)])
        if i % 100000 == 0:
            print "Went over", i

    print
    # closing row so the table extends to the last processed query
    not_yet_seen_tables.append([last, 0])
    print_table(not_yet_seen_tables,
                headers=['query_number', 'num_new_tables'])

    # go over distinct queries
    print "Go over distinct queries"
    for q in db.query(expl_queries):
        plan = json.loads(q['plan'])
        tables = visit_operators(plan, visitor_tables)

        # only valid sdss tables
        table_set = set([x.lower() for x in tables]) & set(SDSS_TABLES)
        if table_set:
            # indices of existing clusters that overlap this query's tables;
            # all of them are transitively connected through table_set
            equal = []
            for i, c in enumerate(table_clusters):
                if c.intersection(table_set):
                    equal.append(i)
                    table_clusters[i] = c | table_set
            equal.append(len(table_clusters))
            table_clusters.append(table_set)

            # merge the connected clusters into the first one and drop the
            # now-redundant entries
            if len(equal) > 1:
                first = equal[0]
                for i in equal[1:]:
                    table_clusters[first] = table_clusters[first] | table_clusters[i]
            table_clusters = [
                x for i, x in enumerate(table_clusters) if i not in equal[1:]
            ]

        query = q['query']
        lengths[len(query)] += 1

        compressed_lengths[len(bz2.compress(query))] += 1

        estimated[q['estimated_cost']] += 1

        # tokenization is horribly slow and does not work for sdss, so the
        # token statistics below are skipped and their counters stay empty
        continue

        tokens = sqltokens.get_tokens(query)
        str_ops[len(tokens)] += 1
        distinct_str_ops[len(set(tokens))] += 1

        which_str_ops.update(tokens)

    print
    print_table(sorted(which_str_ops.iteritems(),
                       key=lambda t: t[1],
                       reverse=True),
                headers=["string_op", "count"])

    # largest table clusters first
    print_table([[str(list(x))]
                 for x in sorted(table_clusters, key=len, reverse=True)],
                headers=["table_cluster"])

    for name, values in zip([
            'compressed lengths', 'lengths', 'string ops',
            'distinct string ops', 'estimated'
    ], [compressed_lengths, lengths, str_ops, distinct_str_ops, estimated]):
        print
        print_table(sorted(values.iteritems(), key=lambda t: t[0]),
                    headers=[name, "counts"])
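
# Example invocation, assuming dataset.connect receives an SQLAlchemy-style
# connection URL; the file names here are hypothetical:
#
#   analyze_tpch('sqlite:///tpch.sqlite')
#   analyze_sdss('sqlite:///sdss.sqlite', analyze_recurring=True)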