def analyze_tpch(database):
    """Print summary statistics for the TPC-H workload stored in ``database``.

    Emits, via ``print``/``print_table``: expression-operator frequencies,
    explicit vs. implicit join counts, recurring plan subtrees, the query
    indexes at which previously unseen tables first appear, token statistics,
    and histograms over compressed query length and estimated cost.
    """
    db = dataset.connect(database)
    expression_ops = db.query(
        """select class, operator, count(*) from expr_ops_tpch group by class, operator order by class, operator;""")
    print_table([[x['class'], x['operator'], x['count']]
                 for x in expression_ops],
                ["class", "operator", "count"], 'tpch')
    queries = list(db['tpchqueries'])
    print
    explicit_implicit_joins(queries)
    # counters for how often we have a certain count in a query
    compressed_lengths = Counter()
    str_ops = Counter()
    distinct_str_ops = Counter()
    estimated = Counter()
    which_str_ops = Counter()
    # groups of tables that co-occur in some query; overlapping groups are
    # merged as more queries are processed
    table_clusters = []
    # rows of [query index, number of tables not seen in any earlier query]
    not_yet_seen_tables = []
    tables_seen = set()
    last = 0
    print "Find recurring subtrees in queries queries:"
    find_recurring(queries, 'tpch')
    find_recurring_subset(queries)
    for idx, q in enumerate(queries):
        last = idx
        plan = json.loads(q['plan'])
        tables = visit_operators(plan, visitor_tables)
        # only valid sdss tables
        # NOTE(review): filters against SDSS_TABLES although this function
        # analyzes the TPC-H workload -- looks copied from analyze_sdss;
        # confirm this is intended.
        table_set = set([x.lower() for x in tables]) & set(SDSS_TABLES)
        if len(table_set):
            # merge this query's table set into every cluster it overlaps,
            # then collapse all touched clusters into the first one
            equal = []
            for i, c in enumerate(table_clusters):
                if c.intersection(table_set):
                    equal.append(i)
                    table_clusters[i] = c | table_set
            equal.append(len(table_clusters))
            table_clusters.append(table_set)
            if len(equal) > 1:
                first = equal[0]
                for i in equal[1:]:
                    table_clusters[
                        first] = table_clusters[first] | table_clusters[i]
                table_clusters = [
                    x for i, x in enumerate(table_clusters)
                    if i not in equal[1:]
                ]
        new_tables = set(tables) - tables_seen
        if new_tables:
            for t in new_tables:
                tables_seen.add(t)
            not_yet_seen_tables.append([idx, len(new_tables)])
        query = q['query']
        compressed_lengths[len(bz2.compress(query))] += 1
        estimated[q['estimated_cost']] += 1
        tokens = sqltokens.get_tokens(query)
        str_ops[len(tokens)] += 1
        distinct_str_ops[len(set(tokens))] += 1
        which_str_ops.update(tokens)
    print
    # final row: last query index with zero new tables
    not_yet_seen_tables.append([last, 0])
    print_table(not_yet_seen_tables,
                headers=['query_number', 'num_new_tables'],
                workload="tpch")
    print
    print_table(sorted(which_str_ops.iteritems(), key=lambda t: t[1],
                       reverse=True),
                headers=["string_op", "count"], workload='tpch')
    # NOTE(review): each row is a one-element list, so len(t) is always 1 and
    # this sort is effectively a no-op -- possibly meant to sort by cluster
    # size; confirm.
    print_table(sorted([[str(list(x))] for x in table_clusters],
                       key=lambda t: len(t), reverse=True),
                headers=["table_cluster"], workload='tpch')
    for name, values in zip([
            'compressed lengths', 'string ops', 'distinct string ops',
            'estimated'
    ], [compressed_lengths, str_ops, distinct_str_ops, estimated]):
        print_table(sorted(values.iteritems(), key=lambda t: t[0]),
                    headers=[name, "counts"], workload='tpch')
def analyze_tpch(database): db = dataset.connect(database) expression_ops = db.query("""select class, operator, count(*) from expr_ops_tpch group by class, operator order by class, operator;""") print_table([[x['class'], x['operator'], x['count']] for x in expression_ops], ["class", "operator", "count"], 'tpch') queries = list(db['tpchqueries']) print explicit_implicit_joins(queries) # counters for how often we have a certain count in a query compressed_lengths = Counter() str_ops = Counter() distinct_str_ops = Counter() estimated = Counter() which_str_ops = Counter() table_clusters = [] not_yet_seen_tables = [] tables_seen = set() last = 0 print "Find recurring subtrees in queries queries:" find_recurring(queries, 'tpch') find_recurring_subset(queries) for idx, q in enumerate(queries): last = idx plan = json.loads(q['plan']) tables = visit_operators(plan, visitor_tables) # only valid sdss tables table_set = set([x.lower() for x in tables]) & set(SDSS_TABLES) if len(table_set): equal = [] for i, c in enumerate(table_clusters): if c.intersection(table_set): equal.append(i) table_clusters[i] = c | table_set equal.append(len(table_clusters)) table_clusters.append(table_set) if len(equal) > 1: first = equal[0] for i in equal[1:]: table_clusters[first] = table_clusters[first] | table_clusters[i] table_clusters = [x for i, x in enumerate(table_clusters) if i not in equal[1:]] new_tables = set(tables) - tables_seen if new_tables: for t in new_tables: tables_seen.add(t) not_yet_seen_tables.append([idx, len(new_tables)]) query = q['query'] compressed_lengths[len(bz2.compress(query))] += 1 estimated[q['estimated_cost']] += 1 tokens = sqltokens.get_tokens(query) str_ops[len(tokens)] += 1 distinct_str_ops[len(set(tokens))] += 1 which_str_ops.update(tokens) print not_yet_seen_tables.append([last, 0]) print_table(not_yet_seen_tables, headers=['query_number', 'num_new_tables'], workload="tpch") print print_table(sorted( which_str_ops.iteritems(), key=lambda t: t[1], 
reverse=True), headers=["string_op", "count"], workload='tpch') print_table(sorted( [[str(list(x))] for x in table_clusters], key=lambda t: len(t), reverse=True), headers=["table_cluster"], workload='tpch') for name, values in zip( ['compressed lengths', 'string ops', 'distinct string ops', 'estimated'], [compressed_lengths, str_ops, distinct_str_ops, estimated]): print_table(sorted( values.iteritems(), key=lambda t: t[0]), headers=[name, "counts"], workload='tpch')
def analyze_sdss(database, analyze_recurring):
    """Print summary statistics for the SDSS (DR5) workload in ``database``.

    Makes two passes: one over all explained queries to record when new
    tables first appear, and one over the distinct explained queries to
    build table co-occurrence clusters and length/cost histograms.  When
    ``analyze_recurring`` is truthy, also reports recurring plan subtrees.
    """
    db = dataset.connect(database)
    print "Limited to DR5"
    num_interesting_queries = list(
        db.query('SELECT COUNT(*) c FROM {} where has_plan = 1'.format(
            EXPLAINED)))[0]['c']
    print "Distinct queries with query plan:", num_interesting_queries
    num_interesting_queries = list(
        db.query(
            'SELECT COUNT(*) c FROM (SELECT distinct simple_plan from {} where has_plan = 1)'
            .format(UNIQUE)))[0]['c']
    print "Distinct queries with constants replaced:", num_interesting_queries
    expl_queries = ''' SELECT query, plan, time_start, estimated_cost FROM {} WHERE estimated_cost < 100 ORDER BY time_start ASC '''.format(EXPLAINED)
    dist_queries = ''' SELECT query, plan, estimated_cost FROM {} WHERE estimated_cost < 100 ORDER BY time_start ASC'''.format(UNIQUE)
    all_queries = ''' SELECT * FROM {} ORDER BY time_start ASC'''.format(EXPLAINED_ALL)
    if analyze_recurring:
        print
        print "Find recurring subtrees in distinct (query) queries:"
        queries = db.query(expl_queries)
        find_recurring(queries)
        # stored csv from previous will be overwritten
        print
        print "Find recurring subtrees in distinct (template) queries:"
        queries = db.query(dist_queries)
        find_recurring(queries)
        print
        print "Find recurring subtrees in distinct (template) queries (using subset check):"
        queries = db.query(dist_queries)
        find_recurring_subset(queries)
    print
    queries = db.query(expl_queries)
    explicit_implicit_joins(queries)
    # counters for how often we have a certain count in a query
    compressed_lengths = Counter()
    lengths = Counter()
    str_ops = Counter()
    distinct_str_ops = Counter()
    estimated = Counter()
    tables_seen = set()
    which_str_ops = Counter()
    # groups of tables that co-occur in some query; overlapping groups are
    # merged as more queries are processed
    table_clusters = []
    # count how many new tables we see
    not_yet_seen_tables = []
    last = 0
    # go over all queries (joined with explained)
    print "Go over all queries"
    for i, q in enumerate(db.query(all_queries)):
        last = i
        plan = json.loads(q['plan'])
        tables = visit_operators(plan, visitor_tables)
        new_tables = set(tables) - tables_seen
        if new_tables:
            for t in new_tables:
                tables_seen.add(t)
            not_yet_seen_tables.append([i, len(new_tables)])
        # progress marker every 100k rows
        if not i % 100000:
            print "Went over", i
    print
    # final row: last query index with zero new tables
    not_yet_seen_tables.append([last, 0])
    print_table(not_yet_seen_tables,
                headers=['query_number', 'num_new_tables'])
    # go over distinct queries
    print "Go over distinct queries"
    for q in db.query(expl_queries):
        plan = json.loads(q['plan'])
        tables = visit_operators(plan, visitor_tables)
        # only valid sdss tables
        table_set = set([x.lower() for x in tables]) & set(SDSS_TABLES)
        if len(table_set):
            # merge this query's table set into every cluster it overlaps,
            # then collapse all touched clusters into the first one
            equal = []
            for i, c in enumerate(table_clusters):
                if c.intersection(table_set):
                    equal.append(i)
                    table_clusters[i] = c | table_set
            equal.append(len(table_clusters))
            table_clusters.append(table_set)
            if len(equal) > 1:
                first = equal[0]
                for i in equal[1:]:
                    table_clusters[
                        first] = table_clusters[first] | table_clusters[i]
                table_clusters = [
                    x for i, x in enumerate(table_clusters)
                    if i not in equal[1:]
                ]
        query = q['query']
        lengths[len(query)] += 1
        compressed_lengths[len(bz2.compress(query))] += 1
        estimated[q['estimated_cost']] += 1
        # tokenization is horribly slow and does not work for sdss
        # NOTE: the statements below the continue are deliberately
        # unreachable; str_ops / distinct_str_ops / which_str_ops stay empty.
        continue
        tokens = sqltokens.get_tokens(query)
        str_ops[len(tokens)] += 1
        distinct_str_ops[len(set(tokens))] += 1
        which_str_ops.update(tokens)
    print
    print_table(sorted(which_str_ops.iteritems(), key=lambda t: t[1],
                       reverse=True),
                headers=["string_op", "count"])
    # NOTE(review): each row is a one-element list, so len(t) is always 1 and
    # this sort is effectively a no-op -- possibly meant cluster size; confirm.
    print_table(sorted([[str(list(x))] for x in table_clusters],
                       key=lambda t: len(t), reverse=True),
                headers=["table_cluster"])
    for name, values in zip([
            'compressed lengths', 'lengths', 'string ops',
            'distinct string ops', 'estimated'
    ], [compressed_lengths, lengths, str_ops, distinct_str_ops, estimated]):
        print
        print_table(sorted(values.iteritems(), key=lambda t: t[0]),
                    headers=[name, "counts"])
def analyze_sdss(database, analyze_recurring): db = dataset.connect(database) print "Limited to DR5" num_interesting_queries = list(db.query('SELECT COUNT(*) c FROM {} where has_plan = 1'.format(EXPLAINED)))[0]['c'] print "Distinct queries with query plan:", num_interesting_queries num_interesting_queries = list(db.query('SELECT COUNT(*) c FROM (SELECT distinct simple_plan from {} where has_plan = 1)'.format(UNIQUE)))[0]['c'] print "Distinct queries with constants replaced:", num_interesting_queries expl_queries = ''' SELECT query, plan, time_start, estimated_cost FROM {} WHERE estimated_cost < 100 ORDER BY time_start ASC '''.format(EXPLAINED) dist_queries = ''' SELECT query, plan, estimated_cost FROM {} WHERE estimated_cost < 100 ORDER BY time_start ASC'''.format(UNIQUE) all_queries = ''' SELECT * FROM {} ORDER BY time_start ASC'''.format(EXPLAINED_ALL) if analyze_recurring: print print "Find recurring subtrees in distinct (query) queries:" queries = db.query(expl_queries) find_recurring(queries) # stored csv from previous will be overwritten print print "Find recurring subtrees in distinct (template) queries:" queries = db.query(dist_queries) find_recurring(queries) print print "Find recurring subtrees in distinct (template) queries (using subset check):" queries = db.query(dist_queries) find_recurring_subset(queries) print queries = db.query(expl_queries) explicit_implicit_joins(queries) # counters for how often we have a certain count in a query compressed_lengths = Counter() lengths = Counter() str_ops = Counter() distinct_str_ops = Counter() estimated = Counter() tables_seen = set() which_str_ops = Counter() table_clusters = [] # count how many new tables we see not_yet_seen_tables = [] last = 0 # go over all queries (joined with explained) print "Go over all queries" for i, q in enumerate(db.query(all_queries)): last = i plan = json.loads(q['plan']) tables = visit_operators(plan, visitor_tables) new_tables = set(tables) - tables_seen if new_tables: for t in 
new_tables: tables_seen.add(t) not_yet_seen_tables.append([i, len(new_tables)]) if not i % 100000: print "Went over", i print not_yet_seen_tables.append([last, 0]) print_table(not_yet_seen_tables, headers=['query_number', 'num_new_tables']) # go over distinct queries print "Go over distinct queries" for q in db.query(expl_queries): plan = json.loads(q['plan']) tables = visit_operators(plan, visitor_tables) # only valid sdss tables table_set = set([x.lower() for x in tables]) & set(SDSS_TABLES) if len(table_set): equal = [] for i, c in enumerate(table_clusters): if c.intersection(table_set): equal.append(i) table_clusters[i] = c | table_set equal.append(len(table_clusters)) table_clusters.append(table_set) if len(equal) > 1: first = equal[0] for i in equal[1:]: table_clusters[first] = table_clusters[first] | table_clusters[i] table_clusters = [x for i, x in enumerate(table_clusters) if i not in equal[1:]] query = q['query'] lengths[len(query)] += 1 compressed_lengths[len(bz2.compress(query))] += 1 estimated[q['estimated_cost']] += 1 # tokenization is horribly slow and does not work for sdss continue tokens = sqltokens.get_tokens(query) str_ops[len(tokens)] += 1 distinct_str_ops[len(set(tokens))] += 1 which_str_ops.update(tokens) print print_table(sorted( which_str_ops.iteritems(), key=lambda t: t[1], reverse=True), headers=["string_op", "count"]) print_table(sorted( [[str(list(x))] for x in table_clusters], key=lambda t: len(t), reverse=True), headers=["table_cluster"]) for name, values in zip( ['compressed lengths', 'lengths', 'string ops', 'distinct string ops', 'estimated'], [compressed_lengths, lengths, str_ops, distinct_str_ops, estimated]): print print_table(sorted( values.iteritems(), key=lambda t: t[0]), headers=[name, "counts"])