def empty_transaction(): stats = {'empty_transaction_count': {}, 'empty_pattern_count': {}} with open('transactions.pkl', 'rb') as pickle_file: transactions = pickle.load(pickle_file) for repo_name, project_type, transaction in transactions: queries = transaction.split('\n') if project_type not in stats['empty_pattern']: stats['empty_pattern'][project_type] = {} if len(queries) == 2: stats['empty_transaction_count'][project_type] = [ stats['empty_transaction_count'].get(project_type, [0])[0] + 1 ] if 'BEGIN' in queries[0].upper(): stats['empty_pattern_count'][project_type][ 'BEGIN'] = stats['empty_pattern_count'][ project_type].get('BEGIN', 0) + 1 elif 'AUTOCOMMIT' in queries[0].upper(): stats['empty_pattern_count'][project_type][ 'AUTOCOMMIT'] = stats['empty_pattern_count'][ project_type].get('AUTOCOMMIT', 0) + 1 print stats dump_all_stats('.', stats)
def query_stats(directory='.'):
    """Tally queries per project type by statement kind.

    Each query is classified as the first of SELECT / INSERT / UPDATE /
    DELETE whose keyword appears anywhere in its text, falling back to
    OTHER.  Per-project-type counters are written via dump_all_stats.
    """
    stats = {'query_type': {}}
    query_kinds = ['SELECT', 'INSERT', 'UPDATE', 'DELETE', 'OTHER']
    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue
        actions = Action.objects.filter(attempt=repo.latest_successful_attempt)
        if len(actions) == 0:
            continue
        type_name = repo.project_type.name
        counters = stats['query_type'].setdefault(type_name, {})
        for action in actions:
            for query in Query.objects.filter(action=action):
                # First matching kind wins; 'OTHER' is the catch-all.
                for kind in query_kinds:
                    if kind == 'OTHER' or kind in query.content:
                        counters[kind] = counters.get(kind, 0) + 1
                        break
    dump_all_stats(directory, stats)
def multiset_stats(directory='.'):
    """Count logical (AND/OR/NOT/XOR) and set (UNION/INTERSECT/EXCEPT)
    operator occurrences in query text, grouped by project type.

    Counting is a plain substring regex match, so e.g. 'NOT' inside other
    words also counts — matches the original crude measurement.
    """
    stats = {'logical_operator': {}, 'set_operator': {}}
    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue
        type_name = repo.project_type.name
        logical = stats['logical_operator'].setdefault(type_name, {})
        set_ops = stats['set_operator'].setdefault(type_name, {})
        for action in Action.objects.filter(
                attempt=repo.latest_successful_attempt):
            for query in Query.objects.filter(action=action):
                content = query.content
                for word in ['AND', 'OR', 'NOT', 'XOR']:
                    logical[word] = logical.get(word, 0) + len(
                        re.findall(word, content))
                for word in ['UNION', 'INTERSECT', 'EXCEPT']:
                    set_ops[word] = set_ops.get(word, 0) + len(
                        re.findall(word, content))
    dump_all_stats(directory, stats)
def nested_stats(directory='.'):
    """Measure nested-query usage per project type.

    'nested_count' collects, per query, the number of 'Nested' nodes seen
    across its EXPLAIN outputs (recorded only when positive).
    'nested_operator' counts textual occurrences of subquery keywords;
    because 'NOT EXISTS' also matches the 'EXISTS' pattern, that overlap
    is subtracted afterwards.
    """
    stats = {'nested_count': {}, 'nested_operator': {}}
    keywords = ['ALL', 'ANY', 'SOME', 'EXISTS', 'IN', 'NOT EXISTS']
    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue
        type_name = repo.project_type.name
        counts = stats['nested_count'].setdefault(type_name, [])
        operators = stats['nested_operator'].setdefault(type_name, {})
        for action in Action.objects.filter(
                attempt=repo.latest_successful_attempt):
            for query in Query.objects.filter(action=action):
                nested = 0
                for explain in Explain.objects.filter(query=query):
                    nested += len(re.findall('Nested', explain.output))
                if nested > 0:
                    counts.append(nested)
                for word in keywords:
                    operators[word] = operators.get(word, 0) + len(
                        re.findall(word, query.content))
                # 'EXISTS' matched inside every 'NOT EXISTS' too; undo that.
                operators['EXISTS'] -= len(
                    re.findall('NOT EXISTS', query.content))
    dump_all_stats(directory, stats)
def index_stats(directory=TABLES_DIRECTORY):
    """Tally index types per project type from the captured 'indexes'
    Information of each repository's latest successful attempt.

    The description is a stringified list of tuples; the index type is
    taken from field 13 of each tuple (assumed layout of the capture
    format — TODO confirm).
    """
    stats = {'index_type': {}}
    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue
        index_informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(name='indexes')
        if len(index_informations) > 0:
            index_information = index_informations[0]
            project_type_name = repo.project_type.name
            if project_type_name not in stats['index_type']:
                stats['index_type'][project_type_name] = {}
            # The tuple-closing delimiter differs between the two formats.
            if repo.latest_successful_attempt.database.name == 'PostgreSQL':
                regex = '(\(.*?\))[,\]]'
            elif repo.latest_successful_attempt.database.name == 'MySQL':
                regex = '(\(.*?\))[,\)]'
            else:
                # BUG FIX: previously 'regex' was left unbound here (or kept
                # a stale value from an earlier iteration) for any other
                # database name, causing a NameError or a silently wrong
                # parse.  Skip repositories with an unknown database.
                continue
            for column in re.findall(regex, index_information.description):
                cells = column.split(',')
                _type = cells[13].replace("'", "").strip()
                stats['index_type'][project_type_name][
                    _type] = stats['index_type'][project_type_name].get(
                        _type, 0) + 1
    dump_all_stats(directory, stats)
def repetitive(directory='.'):
    """Find queries issued twice in a row within a single action.

    Per project type, 'repetitive_count' tallies consecutive duplicate
    pairs and 'query_count' the total number of queries.  The distinct
    duplicated query strings are pickled as 'repetitive_queries'.
    Duplicates are also echoed to stdout as they are found.
    """
    stats = {'repetitive_count': {}, 'query_count': {}}
    repetitive_queries = set()
    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue
        project_type_name = repo.project_type.name
        for action in Action.objects.filter(
                attempt=repo.latest_successful_attempt):
            # Python 2 map() yields a list of stripped query strings,
            # preserving issue order.
            queries = map(lambda x: x.content.strip(),
                          Query.objects.filter(action=action))
            for i in xrange(1, len(queries)):
                # A query identical to its immediate predecessor counts as
                # one repetition.
                if queries[i] == queries[i - 1]:
                    repetitive_queries.add(queries[i])
                    print project_type_name
                    print queries[i]
                    print
                    stats['repetitive_count'][
                        project_type_name] = stats['repetitive_count'].get(
                            project_type_name, 0) + 1
            stats['query_count'][project_type_name] = stats['query_count'].get(
                project_type_name, 0) + len(queries)
    pickle_dump(directory, 'repetitive_queries', repetitive_queries)
    dump_all_stats(directory, stats)
def transaction_stats(directory = '.'):
    """Collect per-project-type transaction statistics.

    A transaction opens at a BEGIN / START TRANSACTION query and closes at
    a COMMIT.  For each completed transaction this records: the SELECT
    count ('transaction_read_count'), the INSERT/DELETE/UPDATE count
    ('transaction_write_count'), and the query count excluding the opener
    and the COMMIT ('transaction_query_count').  Per action, the number of
    completed transactions goes into 'transaction_count'.

    NOTE(review): this function is redefined later in this file by an
    extended version (AUTOCOMMIT detection plus transaction pickling);
    that later definition is the one that survives module import.
    """
    stats = {'transaction_count': {}, 'transaction_query_count': {}, 'transaction_read_count': {}, 'transaction_write_count': {}}
    for repo in Repository.objects.exclude(latest_successful_attempt = None):
        if filter_repository(repo):
            continue
        project_type_name = repo.project_type.name
        if project_type_name not in stats['transaction_count']:
            stats['transaction_count'][project_type_name] = []
        if project_type_name not in stats['transaction_query_count']:
            stats['transaction_query_count'][project_type_name] = []
        if project_type_name not in stats['transaction_read_count']:
            stats['transaction_read_count'][project_type_name] = []
        if project_type_name not in stats['transaction_write_count']:
            stats['transaction_write_count'][project_type_name] = []
        for action in Action.objects.filter(attempt = repo.latest_successful_attempt):
            transaction = ''
            query_count = 0
            transaction_count = 0
            for query in Query.objects.filter(action = action):
                if 'BEGIN' in query.content.upper() or 'START TRANSACTION' in query.content.upper():
                    # A new opener restarts the accumulator (any unfinished
                    # transaction text is dropped).
                    transaction = query.content + '\n'
                    query_count = 1
                elif transaction != '':
                    transaction += query.content + '\n'
                    query_count += 1
                    if 'COMMIT' in query.content.upper():
                        transaction = transaction.strip('\n')
                        # for each transaction, count the number of transactions
                        transaction_count += 1
                        # for each transaction, count the number of read/write
                        read_count = len(re.findall('SELECT', transaction.upper()))
                        stats['transaction_read_count'][project_type_name].append(read_count)
                        write_count = 0
                        for keyword in ['INSERT', 'DELETE', 'UPDATE']:
                            write_count += len(re.findall(keyword, transaction.upper()))
                        stats['transaction_write_count'][project_type_name].append(write_count)
                        # for each transaction, count the queries
                        # (minus 2 excludes the opener and the COMMIT)
                        query_count -= 2
                        stats['transaction_query_count'][project_type_name].append(query_count)
            if transaction_count > 0:
                stats['transaction_count'][project_type_name].append(transaction_count)
    dump_all_stats(directory, stats)
def count_transaction(): stats = {'transaction_count': {}} with open('transactions.pkl', 'rb') as pickle_file: transactions = pickle.load(pickle_file) for repo_name, project_type, transaction in transactions: stats['transaction_count'][project_type] = [ stats['transaction_count'].get(project_type, [0])[0] + 1 ] print stats dump_all_stats('.', stats)
def sort_stats(directory='.'):
    """Analyze ORDER BY usage from EXPLAIN output, per project type.

    'sort_key_count' collects the number of keys in each 'Sort Key:' line
    of the plan; 'sort_key_type' tallies the column types of those keys,
    resolved through a name -> type map parsed from the captured 'columns'
    Information.
    """
    stats = {'sort_key_count': {}, 'sort_key_type': {}}
    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue
        project_type_name = repo.project_type.name
        if project_type_name not in stats['sort_key_count']:
            stats['sort_key_count'][project_type_name] = []
        if project_type_name not in stats['sort_key_type']:
            stats['sort_key_type'][project_type_name] = {}
        informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(name='columns')
        column_map = {}
        if len(informations) > 0:
            information = informations[0]
            # Description is a stringified list of tuples; the closing
            # delimiter differs per database capture format.
            # NOTE(review): 'regex' stays unbound for any other database
            # name, which would raise NameError below — confirm only
            # PostgreSQL/MySQL attempts reach this point.
            if repo.latest_successful_attempt.database.name == 'PostgreSQL':
                regex = '(\(.*?\))[,\]]'
            elif repo.latest_successful_attempt.database.name == 'MySQL':
                regex = '(\(.*?\))[,\)]'
            for column in re.findall(regex, information.description):
                cells = column.split(',')
                # Fields 2/3/7 presumably hold table name, column name and
                # column type — TODO confirm against the capture format.
                table = str(cells[2]).replace("'", "").strip()
                name = str(cells[3]).replace("'", "").strip()
                _type = str(cells[7]).replace("'", "").strip()
                # Store both the qualified and the bare column name.
                column_map[table + '.' + name] = _type
                column_map[name] = _type
        for action in Action.objects.filter(
                attempt=repo.latest_successful_attempt):
            for query in Query.objects.filter(action=action):
                for explain in Explain.objects.filter(query=query):
                    for sort_key in re.findall('Sort Key: .*', explain.output):
                        # Keys are comma-separated: commas + 1 = key count.
                        sort_key_count = len(re.findall(',', sort_key)) + 1
                        stats['sort_key_count'][project_type_name].append(
                            sort_key_count)
                        # [10:] strips the 'Sort Key: ' prefix (10 chars).
                        sort_keys = map(lambda key: str(key).strip(),
                                        sort_key[10:].split(','))
                        for key in sort_keys:
                            if key in column_map:
                                _type = column_map[key]
                                stats['sort_key_type'][project_type_name][
                                    _type] = stats['sort_key_type'][
                                        project_type_name].get(_type, 0) + 1
    dump_all_stats(directory, stats)
def action_stats(directory = '.'):
    """Record, per project type, how many queries each action issued.

    Actions with zero queries are skipped so the distribution only covers
    actions that actually touched the database.
    """
    stats = {'action_query_count': {}}
    for repo in Repository.objects.exclude(latest_successful_attempt = None):
        if filter_repository(repo):
            continue
        type_name = repo.project_type.name
        per_action = stats['action_query_count'].setdefault(type_name, [])
        for action in Action.objects.filter(
                attempt = repo.latest_successful_attempt):
            count = len(Query.objects.filter(action = action))
            if count > 0:
                per_action.append(count)
    dump_all_stats(directory, stats)
# NOTE(review): this is a byte-for-byte duplicate of the action_stats
# definition immediately above; Python keeps this later binding.  One of
# the two should be deleted.
def action_stats(directory = '.'):
    """Record, per project type, how many queries each action issued
    (actions with zero queries are skipped)."""
    stats = {'action_query_count': {}}
    for repo in Repository.objects.exclude(latest_successful_attempt = None):
        if filter_repository(repo):
            continue
        project_type_name = repo.project_type.name
        if project_type_name not in stats['action_query_count']:
            stats['action_query_count'][project_type_name] = []
        for action in Action.objects.filter(attempt = repo.latest_successful_attempt):
            query_count = len(Query.objects.filter(action = action))
            if query_count > 0:
                stats['action_query_count'][project_type_name].append(query_count)
    dump_all_stats(directory, stats)
def aggregate_stats(directory='.'):
    """Count aggregate-function keyword occurrences (AVG, COUNT, MAX, MIN,
    SUM) in query text, grouped by project type."""
    stats = {'aggregate_operator': {}}
    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue
        type_name = repo.project_type.name
        counters = stats['aggregate_operator'].setdefault(type_name, {})
        for action in Action.objects.filter(
                attempt=repo.latest_successful_attempt):
            for query in Query.objects.filter(action=action):
                for word in ['AVG', 'COUNT', 'MAX', 'MIN', 'SUM']:
                    counters[word] = counters.get(word, 0) + len(
                        re.findall(word, query.content))
    dump_all_stats(directory, stats)
def table_stats(directory = '.'):
    """Collect every captured Statistic (except 'num_transactions') into
    per-description, per-project-type lists of counts."""
    stats = {}
    for repo in Repository.objects.exclude(latest_successful_attempt = None):
        if filter_repository(repo):
            continue
        statistics = Statistic.objects.filter(
            attempt = repo.latest_successful_attempt)
        if len(statistics) == 0:
            continue
        type_name = repo.project_type.name
        for statistic in statistics:
            # Transaction totals are covered by the transaction analyses.
            if statistic.description == 'num_transactions':
                continue
            by_type = stats.setdefault(statistic.description, {})
            by_type.setdefault(type_name, []).append(statistic.count)
    dump_all_stats(directory, stats)
def scan_stats(directory='.'):
    """Tally scan node types (e.g. 'Seq Scan', 'Index Scan') found in
    EXPLAIN output, grouped by project type."""
    stats = {'scan_type': {}}
    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue
        type_name = repo.project_type.name
        scan_counts = stats['scan_type'].setdefault(type_name, {})
        for action in Action.objects.filter(
                attempt=repo.latest_successful_attempt):
            for query in Query.objects.filter(action=action):
                for explain in Explain.objects.filter(query=query):
                    # Matches e.g. 'Seq Scan', 'Index Only Scan'.
                    for scan in re.findall('[A-Za-z][\sA-Za-z]*Scan',
                                           explain.output):
                        scan_counts[scan] = scan_counts.get(scan, 0) + 1
    dump_all_stats(directory, stats)
# NOTE(review): this is a byte-for-byte duplicate of the table_stats
# definition earlier in this file; Python keeps this later binding.  One
# of the two should be deleted.
def table_stats(directory = '.'):
    """Collect every captured Statistic (except 'num_transactions') into
    per-description, per-project-type lists of counts."""
    stats = {}
    for repo in Repository.objects.exclude(latest_successful_attempt = None):
        if filter_repository(repo):
            continue
        statistics = Statistic.objects.filter(attempt = repo.latest_successful_attempt)
        if len(statistics) == 0:
            continue
        for s in statistics:
            if s.description == 'num_transactions':
                continue
            if s.description not in stats:
                stats[s.description] = {}
            project_type_name = repo.project_type.name
            if project_type_name not in stats[s.description]:
                stats[s.description][project_type_name] = []
            stats[s.description][project_type_name].append(s.count)
    dump_all_stats(directory, stats)
def having_stats(directory='.'):
    """Record per-query HAVING and GROUP BY occurrence counts per project
    type; a query contributes only when the keyword actually appears."""
    stats = {'having_count': {}, 'group_count': {}}
    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue
        type_name = repo.project_type.name
        having_counts = stats['having_count'].setdefault(type_name, [])
        group_counts = stats['group_count'].setdefault(type_name, [])
        for action in Action.objects.filter(
                attempt=repo.latest_successful_attempt):
            for query in Query.objects.filter(action=action):
                n_having = len(re.findall('HAVING', query.content))
                if n_having > 0:
                    having_counts.append(n_having)
                n_group = len(re.findall('GROUP BY', query.content))
                if n_group > 0:
                    group_counts.append(n_group)
    dump_all_stats(directory, stats)
def coverage_stats(directory='.'):
    """Estimate schema coverage of the observed workload, per project type.

    For each repository: 'table_access' records per-query table-reference
    counts; 'table_coverage' / 'column_coverage' / 'index_coverage' record
    the percentage of tables / columns / indexes touched (each capped at
    100).  Column and index totals are estimated by substring matching of
    covered table names in the captured descriptions, so they are
    heuristic upper-bounded counts, not exact schema counts.
    """
    stats = {
        'table_coverage': {},
        'column_coverage': {},
        'index_coverage': {},
        'table_access': {}
    }
    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue
        actions = Action.objects.filter(attempt=repo.latest_successful_attempt)
        if len(actions) == 0:
            continue
        statistics = Statistic.objects.filter(
            attempt=repo.latest_successful_attempt).filter(
                description='num_tables')
        if len(statistics) == 0:
            continue
        table_count = statistics[0].count
        if table_count == 0:
            continue
        informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(name='tables')
        if len(informations) == 0:
            continue
        information = informations[0]
        tables = set()
        # Tuple-closing delimiter differs per database capture format.
        # NOTE(review): 'regex' stays unbound for any other database name
        # (NameError below) — confirm only PostgreSQL/MySQL reach here.
        if repo.latest_successful_attempt.database.name == 'PostgreSQL':
            regex = '(\(.*?\))[,\]]'
        elif repo.latest_successful_attempt.database.name == 'MySQL':
            regex = '(\(.*?\))[,\)]'
        for table in re.findall(regex, information.description):
            cells = table.split(',')
            # Field 2 presumably holds the table name — TODO confirm.
            table_name = str(cells[2]).replace("'", "").strip()
            tables.add(table_name)
        project_type_name = repo.project_type.name
        if project_type_name not in stats['table_coverage']:
            stats['table_coverage'][project_type_name] = []
        if project_type_name not in stats['column_coverage']:
            stats['column_coverage'][project_type_name] = []
        if project_type_name not in stats['index_coverage']:
            stats['index_coverage'][project_type_name] = []
        if project_type_name not in stats['table_access']:
            stats['table_access'][project_type_name] = []
        covered_tables = set()
        for action in actions:
            for query in Query.objects.filter(action=action):
                table_access_count = 0
                # Whitespace tokenization with quote characters stripped;
                # any token equal to a known table name counts as access.
                for token in query.content.split():
                    token = token.replace('"', '').replace('`', '')
                    if token in tables:
                        table_access_count += 1
                        covered_tables.add(token)
                stats['table_access'][project_type_name].append(
                    table_access_count)
        table_percentage = int(float(len(covered_tables) * 100) / table_count)
        table_percentage = min(table_percentage, 100)
        stats['table_coverage'][project_type_name].append(table_percentage)
        # --- column coverage ---
        informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(name='columns')
        if len(informations) > 0:
            information = informations[0]
            # Estimate the column total: occurrences of each covered table
            # name in the columns description, capped by the number of
            # column tuples actually present.
            column_count = 0
            for covered_table in covered_tables:
                column_count += len(
                    re.findall(covered_table.upper(),
                               information.description.upper()))
            if repo.latest_successful_attempt.database.name == 'PostgreSQL':
                column_count = min(
                    column_count,
                    len(re.findall('(\(.*?\))[,\]]', information.description)))
            elif repo.latest_successful_attempt.database.name == 'MySQL':
                column_count = min(
                    column_count,
                    len(re.findall('(\(.*?\))[,\)]', information.description)))
            if column_count > 0:
                covered_columns = set()
                for action in actions:
                    for query in Query.objects.filter(action=action):
                        # Top-level sqlparse Identifiers approximate the set
                        # of referenced columns.
                        parsed = sqlparse.parse(query.content)[0]
                        tokens = parsed.tokens
                        for token in tokens:
                            token_name = token.value.replace('`', '')
                            if isinstance(token, sqlparse.sql.Identifier):
                                covered_columns.add(token_name)
                column_percentage = int(
                    float(len(covered_columns) * 100) / column_count)
                column_percentage = min(column_percentage, 100)
                stats['column_coverage'][project_type_name].append(
                    column_percentage)
        # --- index coverage ---
        informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(name='indexes')
        if len(informations) > 0:
            information = informations[0]
            index_count = 0
            for covered_table in covered_tables:
                index_count += len(
                    re.findall(covered_table.upper(),
                               information.description.upper()))
            statistics = Statistic.objects.filter(
                attempt=repo.latest_successful_attempt).filter(
                    description='num_indexes')
            if len(statistics) == 0:
                # No index total recorded: skip the rest of this repo.
                continue
            if statistics[0].count > 0:
                index_count = min(index_count, statistics[0].count)
            if index_count > 0:
                covered_indexes = set()
                for action in actions:
                    for query in Query.objects.filter(action=action):
                        for explain in Explain.objects.filter(query=query):
                            # e.g. 'Index Scan using idx on table' — the
                            # last word is the scanned relation name.
                            for raw_index in re.findall(
                                    'Index.*?Scan.*?on \S+', explain.output):
                                index = raw_index.split()[-1]
                                covered_indexes.add(index)
                index_percentage = int(
                    float(len(covered_indexes) * 100) / index_count)
                index_percentage = min(index_percentage, 100)
                stats['index_coverage'][project_type_name].append(
                    index_percentage)
    dump_all_stats(directory, stats)
def join_stats(directory='.'):
    """Analyze JOIN usage per project type.

    'join_type' counts JOIN keywords seen in query text (a bare 'JOIN'
    is normalized to 'INNER JOIN'); 'join_key_type' counts the type pair
    of the two columns in each join comparison (resolved via the captured
    'columns' information); 'join_key_constraint' does the same for the
    columns' constraint types (from 'key_column_usage' + 'constraints').
    Pair labels are sorted so 'a-b' and 'b-a' collapse into one bucket.
    """
    stats = {'join_type': {}, 'join_key_type': {}, 'join_key_constraint': {}}
    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue
        project_type_name = repo.project_type.name
        if project_type_name not in stats['join_type']:
            stats['join_type'][project_type_name] = {}
        if project_type_name not in stats['join_key_type']:
            stats['join_key_type'][project_type_name] = {}
        if project_type_name not in stats['join_key_constraint']:
            stats['join_key_constraint'][project_type_name] = {}
        # Build column name -> type map from the 'columns' information.
        informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(name='columns')
        column_map = {}
        if len(informations) > 0:
            information = informations[0]
            # Tuple-closing delimiter differs per database capture format.
            # NOTE(review): 'regex' stays unbound for other database names.
            if repo.latest_successful_attempt.database.name == 'PostgreSQL':
                regex = '(\(.*?\))[,\]]'
            elif repo.latest_successful_attempt.database.name == 'MySQL':
                regex = '(\(.*?\))[,\)]'
            for column in re.findall(regex, information.description):
                cells = column.split(',')
                # Fields 2/3/7 presumably: table, column name, column type.
                table = str(cells[2]).replace("'", "").strip()
                name = str(cells[3]).replace("'", "").strip()
                _type = str(cells[7]).replace("'", "").strip()
                # Store both qualified and bare column names.
                column_map[table + '.' + name] = _type
                column_map[name] = _type
        # Build column name -> constraint-type map by joining
        # key_column_usage rows with constraint rows on
        # table name + constraint name.
        key_column_usage_informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(
                name='key_column_usage')
        constraint_informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(name='constraints')
        constraint_map = {}
        if len(key_column_usage_informations) > 0 and len(
                constraint_informations) > 0:
            if repo.latest_successful_attempt.database.name == 'PostgreSQL':
                regex = '(\(.*?\))[,\]]'
            elif repo.latest_successful_attempt.database.name == 'MySQL':
                regex = '(\(.*?\))[,\)]'
            # merge_map: 'table.constraint' -> [column names].
            merge_map = {}
            key_column_usage_information = key_column_usage_informations[0]
            for column in re.findall(regex,
                                     key_column_usage_information.description):
                cells = column.split(',')
                constraint_name = str(cells[2]).replace("'", "").strip()
                table_name = str(cells[5]).replace("'", "").strip()
                column_name = str(cells[6]).replace("'", "").strip()
                merge_map_key = table_name + '.' + constraint_name
                if merge_map_key in merge_map:
                    merge_map[merge_map_key].append(column_name)
                else:
                    merge_map[merge_map_key] = [column_name]
            constraint_information = constraint_informations[0]
            for column in re.findall(regex,
                                     constraint_information.description):
                cells = column.split(',')
                constraint_name = str(cells[2]).replace("'", "").strip()
                # Field layout differs between the two capture formats.
                if repo.latest_successful_attempt.database.name == 'PostgreSQL':
                    table_name = str(cells[5]).replace("'", "").strip()
                    constraint_type = str(cells[6]).replace("'", "").strip()
                elif repo.latest_successful_attempt.database.name == 'MySQL':
                    table_name = str(cells[4]).replace("'", "").strip()
                    constraint_type = str(cells[5])[:-1].replace("'", "").strip()
                merge_map_key = table_name + '.' + constraint_name
                if merge_map_key in merge_map:
                    for column_name in merge_map[merge_map_key]:
                        constraint_map[table_name + '.'
                                       + column_name] = constraint_type
                        constraint_map[column_name] = constraint_type
        cnt = 0  # NOTE(review): unused counter, kept as-is.
        for action in Action.objects.filter(
                attempt=repo.latest_successful_attempt):
            queries = Query.objects.filter(action=action)
            for query in queries:
                content = query.content
                if 'JOIN' in content:
                    parsed = sqlparse.parse(content)[0]
                    tokens = parsed.tokens

                    def process_join_key(token):
                        # Recursively walk the sqlparse tree; every
                        # Comparison node is treated as a join predicate.
                        if isinstance(token, sqlparse.sql.TokenList):
                            if isinstance(token, sqlparse.sql.Comparison):
                                left_key, right_key = str(token.left), str(
                                    token.right)
                                left_key = left_key.replace('"', '').replace(
                                    '`', '')
                                right_key = right_key.replace('"', '').replace(
                                    '`', '')
                                if left_key in column_map and right_key in column_map:
                                    left_type = column_map[left_key]
                                    right_type = column_map[right_key]
                                    # Sort so the pair label is canonical.
                                    if left_type > right_type:
                                        left_type, right_type = right_type, left_type
                                    stats['join_key_type'][project_type_name][
                                        left_type + '-' +
                                        right_type] = stats['join_key_type'][
                                            project_type_name].get(
                                                left_type + '-' + right_type,
                                                0) + 1
                                if left_key in constraint_map and right_key in constraint_map:
                                    left_constraint = constraint_map[left_key]
                                    right_constraint = constraint_map[
                                        right_key]
                                    if left_constraint > right_constraint:
                                        left_constraint, right_constraint = right_constraint, left_constraint
                                    stats['join_key_constraint'][
                                        project_type_name][
                                            left_constraint + '-' +
                                            right_constraint] = stats[
                                                'join_key_constraint'][
                                                    project_type_name].get(
                                                        left_constraint + '-'
                                                        + right_constraint,
                                                        0) + 1
                            for _token in token.tokens:
                                process_join_key(_token)

                    for index in xrange(0, len(tokens)):
                        if tokens[index].is_keyword:
                            if 'JOIN' in tokens[index].value:
                                join_type = tokens[index].value
                                # A bare 'JOIN' means INNER JOIN in SQL.
                                if 'OUTER' not in join_type and 'INNER' not in join_type:
                                    join_type = join_type.replace(
                                        'JOIN', 'INNER JOIN')
                                stats['join_type'][project_type_name][
                                    join_type] = stats[
                                        'join_type'][project_type_name].get(
                                            join_type, 0) + 1
                        else:
                            process_join_key(tokens[index])
    dump_all_stats(directory, stats)
def foreign_key_stats(directory='.'): stats = {'foreign_key_count': {}, 'foreign_key_type': {}} for repo in Repository.objects.exclude(latest_successful_attempt=None): if filter_repository(repo): continue project_type_name = repo.project_type.name if project_type_name not in stats['foreign_key_count']: stats['foreign_key_count'][project_type_name] = [] if project_type_name not in stats['foreign_key_type']: stats['foreign_key_type'][project_type_name] = {} if 0: if project_type_name not in stats['join_key_constraint']: stats['join_key_constraint'][project_type_name] = {} informations = Information.objects.filter( attempt=repo.latest_successful_attempt).filter(name='columns') column_map = {} if len(informations) > 0: information = informations[0] if repo.latest_successful_attempt.database.name == 'PostgreSQL': regex = '(\(.*?\))[,\]]' elif repo.latest_successful_attempt.database.name == 'MySQL': regex = '(\(.*?\))[,\)]' for column in re.findall(regex, information.description): cells = column.split(',') table = str(cells[2]).replace("'", "").strip() name = str(cells[3]).replace("'", "").strip() _type = str(cells[7]).replace("'", "").strip() column_map[table + '.' 
+ name] = _type column_map[name] = _type key_column_usage_informations = Information.objects.filter( attempt=repo.latest_successful_attempt).filter( name='key_column_usage') constraint_informations = Information.objects.filter( attempt=repo.latest_successful_attempt).filter(name='constraints') constraint_map = {} if len(key_column_usage_informations) > 0 and len( constraint_informations) > 0: if repo.latest_successful_attempt.database.name == 'PostgreSQL': regex = '(\(.*?\))[,\]]' elif repo.latest_successful_attempt.database.name == 'MySQL': regex = '(\(.*?\))[,\)]' merge_map = {} key_column_usage_information = key_column_usage_informations[0] for column in re.findall(regex, key_column_usage_information.description): cells = column.split(',') constraint_name = str(cells[2]).replace("'", "").strip() table_name = str(cells[5]).replace("'", "").strip() column_name = str(cells[6]).replace("'", "").strip() merge_map_key = table_name + '.' + constraint_name if merge_map_key in merge_map: merge_map[merge_map_key].append(column_name) else: merge_map[merge_map_key] = [column_name] constraint_information = constraint_informations[0] for column in re.findall(regex, constraint_information.description): cells = column.split(',') constraint_name = str(cells[2]).replace("'", "").strip() if repo.latest_successful_attempt.database.name == 'PostgreSQL': table_name = str(cells[5]).replace("'", "").strip() constraint_type = str(cells[6]).replace("'", "").strip() elif repo.latest_successful_attempt.database.name == 'MySQL': table_name = str(cells[4]).replace("'", "").strip() constraint_type = str(cells[5])[:-1].replace("'", "").strip() merge_map_key = table_name + '.' + constraint_name if merge_map_key in merge_map: for column_name in merge_map[merge_map_key]: constraint_map[table_name + '.' + column_name] = constraint_type constraint_map[column_name] = constraint_type if constraint_type == 'FOREIGN KEY': _type = column_map[table_name + '.' 
+ column_name] stats['foreign_key_type'][project_type_name][ _type] = stats['foreign_key_type'][ project_type_name].get(_type, 0) + 1 for action in Action.objects.filter( attempt=repo.latest_successful_attempt): queries = Query.objects.filter(action=action) foreign_key_count = 0 for query in queries: parsed = sqlparse.parse(query.content)[0] tokens = parsed.tokens for token in tokens: if isinstance(token, sqlparse.sql.Identifier): token_name = token.value.replace('"', '').replace( '`', '') if token_name in constraint_map: constraint = constraint_map[token_name] if constraint == 'FOREIGN KEY': foreign_key_count += 1 for explain in Explain.objects.filter(query=query): if 'FOREIGN' in explain.output: print explain.output stats['foreign_key_count'][project_type_name].append( foreign_key_count) dump_all_stats(directory, stats)
def transaction_stats(directory = '.'):
    """Collect per-project-type transaction statistics and pickle the raw
    transactions.

    Supersedes the earlier transaction_stats definition in this file:
    additionally treats 'SET AUTOCOMMIT=0' as a transaction opener, saves
    each completed transaction as (repo name, project type, text) in the
    'transactions' pickle, and resets the accumulator after every COMMIT.

    Per transaction: SELECT count ('transaction_read_count'),
    INSERT/DELETE/UPDATE count ('transaction_write_count'), and the query
    count excluding opener and COMMIT ('transaction_query_count').  Per
    action: the number of completed transactions ('transaction_count').
    """
    stats = {'transaction_count': {}, 'transaction_query_count': {}, 'transaction_read_count': {}, 'transaction_write_count': {}}
    transactions = []
    for repo in Repository.objects.exclude(latest_successful_attempt = None):
        if filter_repository(repo):
            continue
        project_type_name = repo.project_type.name
        if project_type_name not in stats['transaction_count']:
            stats['transaction_count'][project_type_name] = []
        if project_type_name not in stats['transaction_query_count']:
            stats['transaction_query_count'][project_type_name] = []
        if project_type_name not in stats['transaction_read_count']:
            stats['transaction_read_count'][project_type_name] = []
        if project_type_name not in stats['transaction_write_count']:
            stats['transaction_write_count'][project_type_name] = []
        for action in Action.objects.filter(attempt = repo.latest_successful_attempt):
            transaction = ''
            query_count = 0
            transaction_count = 0
            for query in Query.objects.filter(action = action):
                if 'BEGIN' in query.content.upper() or 'START TRANSACTION' in query.content.upper() or 'SET AUTOCOMMIT=0' in query.content.upper():
                    # A new opener restarts the accumulator (any unfinished
                    # transaction text is dropped).
                    transaction = query.content + '\n'
                    query_count = 1
                elif transaction != '':
                    transaction += query.content + '\n'
                    query_count += 1
                    if 'COMMIT' in query.content.upper():
                        transaction = transaction.strip('\n')
                        # for each transaction, count the number of transactions
                        transaction_count += 1
                        # for each transaction, count the number of read/write
                        read_count = len(re.findall('SELECT', transaction.upper()))
                        stats['transaction_read_count'][project_type_name].append(read_count)
                        write_count = 0
                        for keyword in ['INSERT', 'DELETE', 'UPDATE']:
                            write_count += len(re.findall(keyword, transaction.upper()))
                        stats['transaction_write_count'][project_type_name].append(write_count)
                        # for each transaction, count the queries
                        # (minus 2 excludes the opener and the COMMIT)
                        query_count -= 2
                        stats['transaction_query_count'][project_type_name].append(query_count)
                        try:
                            transactions.append((repo.name, repo.project_type.name, transaction))
                        except:
                            # NOTE(review): bare except — presumably guards
                            # against attribute/encoding errors here; confirm
                            # and narrow the exception type.
                            pass
                        transaction = ''
            if transaction_count > 0:
                stats['transaction_count'][project_type_name].append(transaction_count)
    pickle_dump(directory, 'transactions', transactions)
    dump_all_stats(directory, stats)
def blind_write():
    """Detect blind writes in the pickled transactions.

    A transaction is flagged when two write queries (INSERT/UPDATE) share
    at least one sqlparse identifier and no SELECT between them reads any
    of the later write's identifiers — i.e. the second write overwrites
    state without an intervening read.  Flagged cases are printed and
    tallied per project type, then dumped via dump_all_stats.
    """
    total = 0  # NOTE(review): unused, kept as-is.
    count = 0
    stats = {'blind_write_count': {}}

    def is_write(query):
        # 'UTC LOG: ' lines are server log noise, not real statements.
        return ('INSERT' in query or 'UPDATE' in query) and ('UTC LOG: ' not in query)

    def get_identifiers(parsed):
        # Top-level sqlparse Identifier values of the first statement.
        identifiers = []
        for token in parsed[0].tokens:
            if isinstance(token, sqlparse.sql.Identifier):
                identifiers.append(token.value)
        return set(identifiers)

    def is_read_by(identifier, query):
        # Truthy when a SELECT shares any identifier with the write.
        if 'SELECT' not in query:
            return False
        other_identifier = get_identifiers(sqlparse.parse(query))
        return identifier.intersection(other_identifier)

    with open('transactions.pkl', 'rb') as pickle_file:
        transactions = pickle.load(pickle_file)
    for repo_name, project_type, transaction in transactions:
        queries = transaction.split('\n')
        # Collect (position, text) of every write in the transaction.
        writes = []
        for i in xrange(len(queries)):
            if is_write(queries[i]):
                writes.append((i, queries[i]))
        is_blind_write = False
        index, other_index = -1, -1
        if len(writes) > 1:
            identifiers = [(i, get_identifiers(sqlparse.parse(query)))
                           for (i, query) in writes]
            # Check every ordered pair (earlier write j, later write i).
            for i in xrange(1, len(identifiers)):
                if is_blind_write:
                    break
                for j in xrange(i):
                    if is_blind_write:
                        break
                    index, identifier = identifiers[i]
                    other_index, other_identifier = identifiers[j]
                    if identifier.intersection(other_identifier):
                        is_blind_write = True
                        # Any read of the shared identifiers in between
                        # disqualifies the pair.
                        for k in xrange(other_index + 1, index):
                            if is_read_by(identifier, queries[k]):
                                is_blind_write = False
                                break
                        if is_blind_write:
                            count += 1
                            stats['blind_write_count'][project_type] = [
                                stats['blind_write_count'].get(
                                    project_type, [0])[0] + 1
                            ]
                            # for k in xrange(other_index + 1, index):
                            #     print 1, queries[k]
                            # print
                            # raw_input()
        if is_blind_write:
            print repo_name, project_type
            print queries[index]
            print queries[other_index]
            print '+' * 10
            print transaction.encode('utf-8')
            print '-' * 20
    print stats
    print 'Total # of Blind Writes:', count
    dump_all_stats('.', stats)
def column_stats(directory='.'):
    """Aggregate per-table column statistics for project_type=1 repositories.

    For each such repository with a successful attempt, parses the 'columns'
    and 'constraints' Information blobs (stringified row tuples) and records,
    per project type, per-table distributions of column nullability, data
    type, extras/constraints, and column counts. Results are written with
    dump_all_stats(directory, stats).
    """
    stats = {
        'column_nullable': {},
        'column_type': {},
        'column_extra': {},
        'column_num': {}
    }
    for repo in Repository.objects.exclude(
            latest_successful_attempt=None).filter(project_type=1):
        if filter_repository(repo):
            continue
        attempt = repo.latest_successful_attempt
        column_informations = Information.objects.filter(
            attempt=attempt).filter(name='columns')
        constraint_informations = Information.objects.filter(
            attempt=attempt).filter(name='constraints')
        num_table_statistics = Statistic.objects.filter(
            attempt=attempt).filter(description='num_tables')
        # All three data sources must exist for this attempt. The
        # num_tables statistic is only an existence gate; its value was
        # read into an unused local before, so that dead read was dropped.
        if not (column_informations and constraint_informations
                and num_table_statistics):
            continue
        column_information = column_informations[0]
        constraint_information = constraint_informations[0]
        project_type_name = repo.project_type.name
        stats['column_nullable'].setdefault(project_type_name, {})
        stats['column_type'].setdefault(project_type_name, {})
        stats['column_extra'].setdefault(project_type_name, {})
        stats['column_num'].setdefault(project_type_name, [])
        db_name = attempt.database.name
        # Row tuples are terminated differently per backend.
        # BUGFIX: patterns are now raw strings (previously '\(' relied on
        # unknown-escape passthrough), and unknown backends are skipped
        # instead of falling through with `regex` undefined (NameError on
        # the first such repo) or stale from the previous repository.
        if db_name == 'PostgreSQL':
            regex = r'(\(.*?\))[,\]]'
        elif db_name == 'MySQL':
            regex = r'(\(.*?\))[,\)]'
        else:
            continue
        table_stats = {
            'column_nullable': {},
            'column_type': {},
            'column_extra': {},
            'column_num': {}
        }
        for column in re.findall(regex, column_information.description):
            cells = column.split(',')
            table = str(cells[2]).replace("'", "").strip()
            # Nullability flag (cell 6) tally.
            nullable = str(cells[6]).replace("'", "").strip()
            bucket = table_stats['column_nullable'].setdefault(table, {})
            bucket[nullable] = bucket.get(nullable, 0) + 1
            # Data type (cell 7) tally.
            _type = str(cells[7]).replace("'", "").strip()
            bucket = table_stats['column_type'].setdefault(table, {})
            bucket[_type] = bucket.get(_type, 0) + 1
            # Extra attribute (cell 16), counted only when non-empty.
            extra = str(cells[16]).replace("'", "").strip()
            if extra:
                bucket = table_stats['column_extra'].setdefault(table, {})
                bucket[extra] = bucket.get(extra, 0) + 1
            # Per-table column count.
            table_stats['column_num'][table] = \
                table_stats['column_num'].get(table, 0) + 1
        for column in re.findall(regex, constraint_information.description):
            cells = column.split(',')
            # Constraint rows have backend-specific column layouts.
            if db_name == 'PostgreSQL':
                constraint_type = str(cells[6]).replace("'", "").strip()
                table = str(cells[5]).replace("'", "").strip()
            else:  # MySQL — the only other backend that reaches this loop
                constraint_type = str(cells[5])[:-1].replace("'", "").strip()
                table = str(cells[4]).replace("'", "").strip()
            bucket = table_stats['column_extra'].setdefault(table, {})
            bucket[constraint_type] = bucket.get(constraint_type, 0) + 1
        # Fold this repo's per-table tallies into the per-project stats:
        # dict-valued tallies append each count under its category key;
        # scalar tallies (column_num) append directly.
        for stats_type in table_stats:
            for table in table_stats[stats_type]:
                value = table_stats[stats_type][table]
                if isinstance(value, dict):
                    for second_type in value:
                        stats[stats_type][project_type_name].setdefault(
                            second_type, []).append(value[second_type])
                else:
                    stats[stats_type][project_type_name].append(value)
    dump_all_stats(directory, stats)
def column_stats(directory='.'):
    """Aggregate per-table column statistics across all repositories.

    NOTE(review): this is the second definition of column_stats in this file
    and shadows an earlier one that restricted to project_type=1.

    For each repository with a successful attempt, parses the 'columns' and
    'constraints' Information blobs (stringified row tuples) and records, per
    project type, per-table distributions of column nullability, data type,
    extras/constraints, and column counts, then writes everything out with
    dump_all_stats(directory, stats).
    """
    stats = {
        'column_nullable': {},
        'column_type': {},
        'column_extra': {},
        'column_num': {}
    }
    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue
        attempt = repo.latest_successful_attempt
        columns_qs = Information.objects.filter(
            attempt=attempt).filter(name='columns')
        constraints_qs = Information.objects.filter(
            attempt=attempt).filter(name='constraints')
        num_tables_qs = Statistic.objects.filter(
            attempt=attempt).filter(description='num_tables')
        # Require all three data sources. The num_tables statistic is only
        # an existence gate — its value was read into an unused local in the
        # original, so that dead (and query-triggering) read was removed.
        if not (columns_qs and constraints_qs and num_tables_qs):
            continue
        column_information = columns_qs[0]
        constraint_information = constraints_qs[0]
        project_type_name = repo.project_type.name
        for key in ('column_nullable', 'column_type', 'column_extra'):
            stats[key].setdefault(project_type_name, {})
        stats['column_num'].setdefault(project_type_name, [])
        db_name = attempt.database.name
        # Row tuples are terminated differently per backend.
        # BUGFIX: patterns are now raw strings (previously '\(' relied on
        # unknown-escape passthrough), and unknown backends are skipped
        # instead of falling through with `regex` undefined (NameError on
        # the first such repo) or stale from the previous repository.
        if db_name == 'PostgreSQL':
            regex = r'(\(.*?\))[,\]]'
        elif db_name == 'MySQL':
            regex = r'(\(.*?\))[,\)]'
        else:
            continue
        table_stats = {
            'column_nullable': {},
            'column_type': {},
            'column_extra': {},
            'column_num': {}
        }
        for row in re.findall(regex, column_information.description):
            cells = row.split(',')
            table = str(cells[2]).replace("'", "").strip()
            # Nullability flag (cell 6) tally.
            nullable = str(cells[6]).replace("'", "").strip()
            tally = table_stats['column_nullable'].setdefault(table, {})
            tally[nullable] = tally.get(nullable, 0) + 1
            # Data type (cell 7) tally.
            _type = str(cells[7]).replace("'", "").strip()
            tally = table_stats['column_type'].setdefault(table, {})
            tally[_type] = tally.get(_type, 0) + 1
            # Extra attribute (cell 16), counted only when non-empty.
            extra = str(cells[16]).replace("'", "").strip()
            if extra:
                tally = table_stats['column_extra'].setdefault(table, {})
                tally[extra] = tally.get(extra, 0) + 1
            # Per-table column count.
            table_stats['column_num'][table] = \
                table_stats['column_num'].get(table, 0) + 1
        for row in re.findall(regex, constraint_information.description):
            cells = row.split(',')
            # Constraint rows have backend-specific column layouts.
            if db_name == 'PostgreSQL':
                constraint_type = str(cells[6]).replace("'", "").strip()
                table = str(cells[5]).replace("'", "").strip()
            else:  # MySQL — the only other backend that reaches this loop
                constraint_type = str(cells[5])[:-1].replace("'", "").strip()
                table = str(cells[4]).replace("'", "").strip()
            tally = table_stats['column_extra'].setdefault(table, {})
            tally[constraint_type] = tally.get(constraint_type, 0) + 1
        # Fold this repo's per-table tallies into the per-project stats:
        # dict-valued tallies append each count under its category key;
        # scalar tallies (column_num) append directly.
        for stats_type in table_stats:
            for table in table_stats[stats_type]:
                value = table_stats[stats_type][table]
                if isinstance(value, dict):
                    for second_type in value:
                        stats[stats_type][project_type_name].setdefault(
                            second_type, []).append(value[second_type])
                else:
                    stats[stats_type][project_type_name].append(value)
    dump_all_stats(directory, stats)