Beispiel #1
0
def empty_transaction():
    stats = {'empty_transaction_count': {}, 'empty_pattern_count': {}}

    with open('transactions.pkl', 'rb') as pickle_file:
        transactions = pickle.load(pickle_file)

        for repo_name, project_type, transaction in transactions:
            queries = transaction.split('\n')

            if project_type not in stats['empty_pattern']:
                stats['empty_pattern'][project_type] = {}

            if len(queries) == 2:
                stats['empty_transaction_count'][project_type] = [
                    stats['empty_transaction_count'].get(project_type, [0])[0]
                    + 1
                ]

                if 'BEGIN' in queries[0].upper():
                    stats['empty_pattern_count'][project_type][
                        'BEGIN'] = stats['empty_pattern_count'][
                            project_type].get('BEGIN', 0) + 1
                elif 'AUTOCOMMIT' in queries[0].upper():
                    stats['empty_pattern_count'][project_type][
                        'AUTOCOMMIT'] = stats['empty_pattern_count'][
                            project_type].get('AUTOCOMMIT', 0) + 1

    print stats

    dump_all_stats('.', stats)
Beispiel #2
0
def query_stats(directory='.'):
    """Tally query statement kinds per project type.

    Each query is classified as the first of SELECT/INSERT/UPDATE/DELETE
    whose keyword appears in its text, falling through to OTHER.
    """
    query_kinds = ('SELECT', 'INSERT', 'UPDATE', 'DELETE', 'OTHER')
    stats = {'query_type': {}}
    per_type = stats['query_type']

    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue

        actions = Action.objects.filter(attempt=repo.latest_successful_attempt)
        if len(actions) == 0:
            continue

        counters = per_type.setdefault(repo.project_type.name, {})

        for action in actions:
            for query in Query.objects.filter(action=action):
                for kind in query_kinds:
                    # 'OTHER' is last and always matches, so every query
                    # lands in exactly one bucket.
                    if kind == 'OTHER' or kind in query.content:
                        counters[kind] = counters.get(kind, 0) + 1
                        break

    dump_all_stats(directory, stats)
Beispiel #3
0
def multiset_stats(directory='.'):
    """Count logical operators (AND/OR/NOT/XOR) and set operators
    (UNION/INTERSECT/EXCEPT) appearing in query text, per project type."""
    stats = {'logical_operator': {}, 'set_operator': {}}

    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue

        type_name = repo.project_type.name
        logical = stats['logical_operator'].setdefault(type_name, {})
        set_ops = stats['set_operator'].setdefault(type_name, {})

        for action in Action.objects.filter(
                attempt=repo.latest_successful_attempt):
            for query in Query.objects.filter(action=action):
                content = query.content
                for word in ('AND', 'OR', 'NOT', 'XOR'):
                    hits = len(re.findall(word, content))
                    logical[word] = logical.get(word, 0) + hits
                for word in ('UNION', 'INTERSECT', 'EXCEPT'):
                    hits = len(re.findall(word, content))
                    set_ops[word] = set_ops.get(word, 0) + hits

    dump_all_stats(directory, stats)
Beispiel #4
0
def nested_stats(directory='.'):
    """Gather nested-query statistics per project type.

    For each query: count 'Nested' plan nodes across its EXPLAIN outputs
    (recorded per query when non-zero), and tally occurrences of the
    nesting operators ALL/ANY/SOME/EXISTS/IN/NOT EXISTS in its text.
    """
    stats = {'nested_count': {}, 'nested_operator': {}}

    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue

        type_name = repo.project_type.name
        counts = stats['nested_count'].setdefault(type_name, [])
        operators = stats['nested_operator'].setdefault(type_name, {})

        for action in Action.objects.filter(
                attempt=repo.latest_successful_attempt):
            for query in Query.objects.filter(action=action):
                nested = sum(
                    len(re.findall('Nested', explain.output))
                    for explain in Explain.objects.filter(query=query))
                if nested > 0:
                    counts.append(nested)

                for word in ('ALL', 'ANY', 'SOME', 'EXISTS', 'IN',
                             'NOT EXISTS'):
                    operators[word] = operators.get(word, 0) + len(
                        re.findall(word, query.content))
                # Every 'NOT EXISTS' also matched the bare 'EXISTS'
                # pattern; subtract so the two buckets stay disjoint.
                operators['EXISTS'] -= len(
                    re.findall('NOT EXISTS', query.content))

    dump_all_stats(directory, stats)
Beispiel #5
0
def index_stats(directory=TABLES_DIRECTORY):
    """Tally index types per project type.

    Parses the stored 'indexes' Information row of each repository's
    latest successful attempt and counts the index-type cell per project
    type; results are written via dump_all_stats.
    """
    stats = {'index_type': {}}

    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue

        index_informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(name='indexes')
        if len(index_informations) > 0:
            index_information = index_informations[0]

            project_type_name = repo.project_type.name
            if project_type_name not in stats['index_type']:
                stats['index_type'][project_type_name] = {}

            # The dumped tuple lists are delimited slightly differently
            # per backend.
            if repo.latest_successful_attempt.database.name == 'PostgreSQL':
                regex = '(\(.*?\))[,\]]'
            elif repo.latest_successful_attempt.database.name == 'MySQL':
                regex = '(\(.*?\))[,\)]'
            else:
                # BUG FIX: an unrecognized database previously raised
                # NameError on the first repo (regex unbound) or silently
                # reused the regex from an earlier iteration.  Skip it.
                continue

            for column in re.findall(regex, index_information.description):
                cells = column.split(',')

                # Cell 13 of the indexes dump is treated as the index
                # type (assumed layout — TODO confirm against dumper).
                _type = cells[13].replace("'", "").strip()
                stats['index_type'][project_type_name][
                    _type] = stats['index_type'][project_type_name].get(
                        _type, 0) + 1

    dump_all_stats(directory, stats)
Beispiel #6
0
def repetitive(directory='.'):
    stats = {'repetitive_count': {}, 'query_count': {}}

    repetitive_queries = set()

    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue

        project_type_name = repo.project_type.name

        for action in Action.objects.filter(
                attempt=repo.latest_successful_attempt):
            queries = map(lambda x: x.content.strip(),
                          Query.objects.filter(action=action))
            for i in xrange(1, len(queries)):
                if queries[i] == queries[i - 1]:
                    repetitive_queries.add(queries[i])
                    print project_type_name
                    print queries[i]
                    print
                    stats['repetitive_count'][
                        project_type_name] = stats['repetitive_count'].get(
                            project_type_name, 0) + 1
            stats['query_count'][project_type_name] = stats['query_count'].get(
                project_type_name, 0) + len(queries)

    pickle_dump(directory, 'repetitive_queries', repetitive_queries)

    dump_all_stats(directory, stats)
Beispiel #7
0
def transaction_stats(directory = '.'):
    """Collect per-transaction statistics for every project type.

    A transaction is the query span from a BEGIN/START TRANSACTION up to
    a query containing COMMIT.  Per transaction we record its read count
    (SELECT matches), write count (INSERT/DELETE/UPDATE matches) and
    query count; per action we record how many transactions completed.
    Results are written via dump_all_stats.
    """
    stats = {'transaction_count': {}, 'transaction_query_count': {}, 'transaction_read_count': {}, 'transaction_write_count': {}}

    for repo in Repository.objects.exclude(latest_successful_attempt = None):
        if filter_repository(repo):
            continue

        # Ensure every per-type bucket exists before accumulating.
        project_type_name = repo.project_type.name
        if project_type_name not in stats['transaction_count']:
            stats['transaction_count'][project_type_name] = []
        if project_type_name not in stats['transaction_query_count']:
            stats['transaction_query_count'][project_type_name] = []
        if project_type_name not in stats['transaction_read_count']:
            stats['transaction_read_count'][project_type_name] = []
        if project_type_name not in stats['transaction_write_count']:
            stats['transaction_write_count'][project_type_name] = []


        for action in Action.objects.filter(attempt = repo.latest_successful_attempt):
            transaction = ''      # accumulated text of the open transaction
            query_count = 0       # queries seen in the open transaction
            transaction_count = 0 # completed transactions in this action

            for query in Query.objects.filter(action = action):
                if 'BEGIN' in query.content.upper() or 'START TRANSACTION' in query.content.upper():
                    # A new transaction opens (any previous unfinished one
                    # is discarded by overwriting the accumulator).
                    transaction = query.content + '\n'
                    query_count = 1
                elif transaction != '':
                    transaction += query.content + '\n'
                    query_count += 1
                    if 'COMMIT' in query.content.upper():
                        transaction = transaction.strip('\n')

                        # for each transaction, count the number of transactions
                        transaction_count += 1

                        # for each transaction, count the number of read/write
                        read_count = len(re.findall('SELECT', transaction.upper()))
                        stats['transaction_read_count'][project_type_name].append(read_count)
                        write_count = 0
                        for keyword in ['INSERT', 'DELETE', 'UPDATE']:
                            write_count += len(re.findall(keyword, transaction.upper()))
                        stats['transaction_write_count'][project_type_name].append(write_count)

                        # for each transaction, count the queries
                        # (minus 2 excludes the BEGIN and COMMIT lines)
                        # NOTE(review): `transaction` is not reset to ''
                        # here, so queries after a COMMIT keep appending
                        # until the next BEGIN — confirm this is intended.
                        query_count -= 2
                        stats['transaction_query_count'][project_type_name].append(query_count)

            if transaction_count > 0:
                stats['transaction_count'][project_type_name].append(transaction_count)


    dump_all_stats(directory, stats)
Beispiel #8
0
def count_transaction():
    stats = {'transaction_count': {}}

    with open('transactions.pkl', 'rb') as pickle_file:
        transactions = pickle.load(pickle_file)

        for repo_name, project_type, transaction in transactions:
            stats['transaction_count'][project_type] = [
                stats['transaction_count'].get(project_type, [0])[0] + 1
            ]

    print stats

    dump_all_stats('.', stats)
Beispiel #9
0
def sort_stats(directory='.'):
    """Collect sort-key statistics per project type.

    From each query's EXPLAIN output, every 'Sort Key: ...' line yields
    the number of sort keys (comma count + 1) and, via a column-name ->
    column-type map parsed from the stored 'columns' Information row,
    a tally of the column types being sorted on.
    """
    stats = {'sort_key_count': {}, 'sort_key_type': {}}

    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue

        project_type_name = repo.project_type.name
        if project_type_name not in stats['sort_key_count']:
            stats['sort_key_count'][project_type_name] = []
        if project_type_name not in stats['sort_key_type']:
            stats['sort_key_type'][project_type_name] = {}

        # Build column_map: both 'table.column' and bare 'column' map to
        # the column's type string.
        informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(name='columns')
        column_map = {}
        if len(informations) > 0:
            information = informations[0]
            # Dump delimiters differ per backend.
            # NOTE(review): `regex` stays unbound if the database is
            # neither PostgreSQL nor MySQL — NameError below; confirm
            # only these two backends occur.
            if repo.latest_successful_attempt.database.name == 'PostgreSQL':
                regex = '(\(.*?\))[,\]]'
            elif repo.latest_successful_attempt.database.name == 'MySQL':
                regex = '(\(.*?\))[,\)]'

            for column in re.findall(regex, information.description):
                cells = column.split(',')
                # Cells 2/3/7 are taken as table, name, type (assumed
                # dump layout — TODO confirm against the dumper).
                table = str(cells[2]).replace("'", "").strip()
                name = str(cells[3]).replace("'", "").strip()
                _type = str(cells[7]).replace("'", "").strip()
                column_map[table + '.' + name] = _type
                column_map[name] = _type

        for action in Action.objects.filter(
                attempt=repo.latest_successful_attempt):
            for query in Query.objects.filter(action=action):
                for explain in Explain.objects.filter(query=query):
                    for sort_key in re.findall('Sort Key: .*', explain.output):
                        # Keys are comma-separated, so commas + 1 = count.
                        sort_key_count = len(re.findall(',', sort_key)) + 1
                        stats['sort_key_count'][project_type_name].append(
                            sort_key_count)

                        # Strip the 10-char 'Sort Key: ' prefix, then
                        # split into individual key names.
                        sort_keys = map(lambda key: str(key).strip(),
                                        sort_key[10:].split(','))
                        for key in sort_keys:
                            if key in column_map:
                                _type = column_map[key]
                                stats['sort_key_type'][project_type_name][
                                    _type] = stats['sort_key_type'][
                                        project_type_name].get(_type, 0) + 1

    dump_all_stats(directory, stats)
Beispiel #10
0
def action_stats(directory = '.'):
    """Record, per project type, the number of queries issued by each
    action; actions with zero queries are skipped."""
    stats = {'action_query_count': {}}

    for repo in Repository.objects.exclude(latest_successful_attempt = None):
        if filter_repository(repo):
            continue

        counts = stats['action_query_count'].setdefault(
            repo.project_type.name, [])

        for action in Action.objects.filter(
                attempt = repo.latest_successful_attempt):
            n = len(Query.objects.filter(action = action))
            if n > 0:
                counts.append(n)

    dump_all_stats(directory, stats)
Beispiel #11
0
def action_stats(directory = '.'):
    """Per project type, collect the query count of every action that
    issued at least one query."""
    stats = {'action_query_count': {}}
    per_type = stats['action_query_count']

    for repo in Repository.objects.exclude(latest_successful_attempt = None):
        if filter_repository(repo):
            continue

        name = repo.project_type.name
        if name not in per_type:
            per_type[name] = []

        attempt = repo.latest_successful_attempt
        for action in Action.objects.filter(attempt = attempt):
            query_total = len(Query.objects.filter(action = action))
            if query_total:
                per_type[name].append(query_total)

    dump_all_stats(directory, stats)
Beispiel #12
0
def aggregate_stats(directory='.'):
    """Count aggregate-function keywords (AVG/COUNT/MAX/MIN/SUM) found
    in query text, per project type."""
    aggregate_words = ('AVG', 'COUNT', 'MAX', 'MIN', 'SUM')
    stats = {'aggregate_operator': {}}

    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue

        tallies = stats['aggregate_operator'].setdefault(
            repo.project_type.name, {})
        for action in Action.objects.filter(
                attempt=repo.latest_successful_attempt):
            for query in Query.objects.filter(action=action):
                for word in aggregate_words:
                    tallies[word] = tallies.get(word, 0) + len(
                        re.findall(word, query.content))

    dump_all_stats(directory, stats)
Beispiel #13
0
def table_stats(directory = '.'):
    """Collect every per-repository Statistic (except the aggregate
    'num_transactions'), grouped by statistic description and then by
    project type."""
    stats = {}

    for repo in Repository.objects.exclude(latest_successful_attempt = None):
        if filter_repository(repo):
            continue

        statistics = Statistic.objects.filter(
            attempt = repo.latest_successful_attempt)
        if len(statistics) == 0:
            continue

        type_name = repo.project_type.name
        for statistic in statistics:
            if statistic.description == 'num_transactions':
                continue
            by_type = stats.setdefault(statistic.description, {})
            by_type.setdefault(type_name, []).append(statistic.count)

    dump_all_stats(directory, stats)
Beispiel #14
0
def scan_stats(directory='.'):
    """Count plan-node scan labels (e.g. 'Seq Scan', 'Index Scan')
    appearing in EXPLAIN output, per project type."""
    scan_pattern = '[A-Za-z][\sA-Za-z]*Scan'
    stats = {'scan_type': {}}

    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue

        tallies = stats['scan_type'].setdefault(repo.project_type.name, {})

        for action in Action.objects.filter(
                attempt=repo.latest_successful_attempt):
            for query in Query.objects.filter(action=action):
                for explain in Explain.objects.filter(query=query):
                    for scan in re.findall(scan_pattern, explain.output):
                        tallies[scan] = tallies.get(scan, 0) + 1

    dump_all_stats(directory, stats)
Beispiel #15
0
def table_stats(directory = '.'):
    """Gather Statistic counts per description and project type,
    skipping the 'num_transactions' entry."""
    stats = {}

    for repo in Repository.objects.exclude(latest_successful_attempt = None):
        if filter_repository(repo):
            continue

        repo_statistics = Statistic.objects.filter(
            attempt = repo.latest_successful_attempt)
        if len(repo_statistics) == 0:
            continue

        for entry in repo_statistics:
            description = entry.description
            if description == 'num_transactions':
                continue
            if description not in stats:
                stats[description] = {}
            bucket = stats[description]
            project_type_name = repo.project_type.name
            if project_type_name not in bucket:
                bucket[project_type_name] = []
            bucket[project_type_name].append(entry.count)

    dump_all_stats(directory, stats)
Beispiel #16
0
def having_stats(directory='.'):
    """Record per-query HAVING and GROUP BY occurrence counts per
    project type; only queries actually using the clause contribute."""
    stats = {'having_count': {}, 'group_count': {}}

    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue

        type_name = repo.project_type.name
        having_list = stats['having_count'].setdefault(type_name, [])
        group_list = stats['group_count'].setdefault(type_name, [])

        for action in Action.objects.filter(
                attempt=repo.latest_successful_attempt):
            for query in Query.objects.filter(action=action):
                n_having = len(re.findall('HAVING', query.content))
                if n_having > 0:
                    having_list.append(n_having)
                n_group = len(re.findall('GROUP BY', query.content))
                if n_group > 0:
                    group_list.append(n_group)

    dump_all_stats(directory, stats)
Beispiel #17
0
def coverage_stats(directory='.'):
    """Compute schema-coverage percentages per project type.

    For each repository: the percentage of tables, columns and indexes
    that its recorded queries actually touch, plus per-query table-access
    counts.  Table names come from the 'tables' Information dump; column
    identifiers are extracted with sqlparse; index usage is read from
    'Index ... Scan ... on <name>' lines in EXPLAIN output.  Results are
    written via dump_all_stats.
    """
    stats = {
        'table_coverage': {},
        'column_coverage': {},
        'index_coverage': {},
        'table_access': {}
    }

    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue

        # Skip repos missing any prerequisite: actions, a table count,
        # or the dumped table list.
        actions = Action.objects.filter(attempt=repo.latest_successful_attempt)
        if len(actions) == 0:
            continue
        statistics = Statistic.objects.filter(
            attempt=repo.latest_successful_attempt).filter(
                description='num_tables')
        if len(statistics) == 0:
            continue
        table_count = statistics[0].count
        if table_count == 0:
            continue
        informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(name='tables')
        if len(informations) == 0:
            continue

        # Parse the set of table names from the dumped tuple list.
        # Dump delimiters differ per backend.
        # NOTE(review): `regex` stays unbound if the database is neither
        # PostgreSQL nor MySQL — NameError below; confirm only these two
        # backends occur.
        information = informations[0]
        tables = set()
        if repo.latest_successful_attempt.database.name == 'PostgreSQL':
            regex = '(\(.*?\))[,\]]'
        elif repo.latest_successful_attempt.database.name == 'MySQL':
            regex = '(\(.*?\))[,\)]'

        for table in re.findall(regex, information.description):
            cells = table.split(',')
            # Cell 2 is taken as the table name (assumed dump layout —
            # TODO confirm against the dumper).
            table_name = str(cells[2]).replace("'", "").strip()
            tables.add(table_name)

        project_type_name = repo.project_type.name
        if project_type_name not in stats['table_coverage']:
            stats['table_coverage'][project_type_name] = []
        if project_type_name not in stats['column_coverage']:
            stats['column_coverage'][project_type_name] = []
        if project_type_name not in stats['index_coverage']:
            stats['index_coverage'][project_type_name] = []
        if project_type_name not in stats['table_access']:
            stats['table_access'][project_type_name] = []

        # Table coverage: which known tables appear as tokens in queries.
        covered_tables = set()
        for action in actions:
            for query in Query.objects.filter(action=action):
                table_access_count = 0
                for token in query.content.split():
                    token = token.replace('"', '').replace('`', '')
                    if token in tables:
                        table_access_count += 1
                        covered_tables.add(token)
                stats['table_access'][project_type_name].append(
                    table_access_count)

        table_percentage = int(float(len(covered_tables) * 100) / table_count)
        table_percentage = min(table_percentage, 100)

        stats['table_coverage'][project_type_name].append(table_percentage)

        # Column coverage: estimate the covered tables' column count from
        # the 'columns' dump, then count distinct identifiers sqlparse
        # finds in the queries.
        informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(name='columns')
        if len(informations) > 0:
            information = informations[0]
            column_count = 0
            for covered_table in covered_tables:
                column_count += len(
                    re.findall(covered_table.upper(),
                               information.description.upper()))
            # Cap the estimate at the total number of dumped columns.
            if repo.latest_successful_attempt.database.name == 'PostgreSQL':
                column_count = min(
                    column_count,
                    len(re.findall('(\(.*?\))[,\]]', information.description)))
            elif repo.latest_successful_attempt.database.name == 'MySQL':
                column_count = min(
                    column_count,
                    len(re.findall('(\(.*?\))[,\)]', information.description)))

            if column_count > 0:
                covered_columns = set()
                for action in actions:
                    for query in Query.objects.filter(action=action):
                        parsed = sqlparse.parse(query.content)[0]
                        tokens = parsed.tokens
                        for token in tokens:
                            token_name = token.value.replace('`', '')
                            if isinstance(token, sqlparse.sql.Identifier):
                                covered_columns.add(token_name)

                column_percentage = int(
                    float(len(covered_columns) * 100) / column_count)
                column_percentage = min(column_percentage, 100)

                stats['column_coverage'][project_type_name].append(
                    column_percentage)

        # Index coverage: indexes on covered tables vs indexes actually
        # used in EXPLAIN output.
        informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(name='indexes')
        if len(informations) > 0:
            information = informations[0]
            index_count = 0
            for covered_table in covered_tables:
                index_count += len(
                    re.findall(covered_table.upper(),
                               information.description.upper()))
            statistics = Statistic.objects.filter(
                attempt=repo.latest_successful_attempt).filter(
                    description='num_indexes')
            if len(statistics) == 0:
                continue
            if statistics[0].count > 0:
                index_count = min(index_count, statistics[0].count)

            if index_count > 0:
                covered_indexes = set()
                for action in actions:
                    for query in Query.objects.filter(action=action):
                        for explain in Explain.objects.filter(query=query):
                            for raw_index in re.findall(
                                    'Index.*?Scan.*?on \S+', explain.output):
                                # The index name is the last token of the
                                # matched plan line.
                                index = raw_index.split()[-1]
                                covered_indexes.add(index)

                index_percentage = int(
                    float(len(covered_indexes) * 100) / index_count)
                index_percentage = min(index_percentage, 100)

                stats['index_coverage'][project_type_name].append(
                    index_percentage)

    dump_all_stats(directory, stats)
Beispiel #18
0
def join_stats(directory='.'):
    """Collect join statistics per project type.

    For every query containing 'JOIN': tally the join keyword used
    (bare JOIN is normalized to INNER JOIN), and for each comparison in
    the parsed statement record the pair of column types and the pair of
    column constraints being joined (both pairs sorted so A-B == B-A).
    Type and constraint lookups come from the repository's dumped
    'columns', 'key_column_usage' and 'constraints' Information rows.
    """
    stats = {'join_type': {}, 'join_key_type': {}, 'join_key_constraint': {}}

    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue

        project_type_name = repo.project_type.name
        if project_type_name not in stats['join_type']:
            stats['join_type'][project_type_name] = {}
        if project_type_name not in stats['join_key_type']:
            stats['join_key_type'][project_type_name] = {}
        if project_type_name not in stats['join_key_constraint']:
            stats['join_key_constraint'][project_type_name] = {}

        # Build column_map: both 'table.column' and bare 'column' map to
        # the column's type string.
        # NOTE(review): `regex` stays unbound for databases other than
        # PostgreSQL/MySQL — NameError below; confirm only these two
        # backends occur.
        informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(name='columns')
        column_map = {}
        if len(informations) > 0:
            information = informations[0]
            if repo.latest_successful_attempt.database.name == 'PostgreSQL':
                regex = '(\(.*?\))[,\]]'
            elif repo.latest_successful_attempt.database.name == 'MySQL':
                regex = '(\(.*?\))[,\)]'

            for column in re.findall(regex, information.description):
                cells = column.split(',')
                # Cells 2/3/7 are taken as table, name, type (assumed
                # dump layout — TODO confirm against the dumper).
                table = str(cells[2]).replace("'", "").strip()
                name = str(cells[3]).replace("'", "").strip()
                _type = str(cells[7]).replace("'", "").strip()
                column_map[table + '.' + name] = _type
                column_map[name] = _type

        # Build constraint_map: join key -> constraint type, by merging
        # key_column_usage (constraint -> columns) with constraints
        # (constraint -> type) through a shared 'table.constraint' key.
        key_column_usage_informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(
                name='key_column_usage')
        constraint_informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(name='constraints')
        constraint_map = {}
        if len(key_column_usage_informations) > 0 and len(
                constraint_informations) > 0:
            if repo.latest_successful_attempt.database.name == 'PostgreSQL':
                regex = '(\(.*?\))[,\]]'
            elif repo.latest_successful_attempt.database.name == 'MySQL':
                regex = '(\(.*?\))[,\)]'

            merge_map = {}
            key_column_usage_information = key_column_usage_informations[0]
            for column in re.findall(regex,
                                     key_column_usage_information.description):
                cells = column.split(',')
                constraint_name = str(cells[2]).replace("'", "").strip()
                table_name = str(cells[5]).replace("'", "").strip()
                column_name = str(cells[6]).replace("'", "").strip()
                merge_map_key = table_name + '.' + constraint_name
                if merge_map_key in merge_map:
                    merge_map[merge_map_key].append(column_name)
                else:
                    merge_map[merge_map_key] = [column_name]

            constraint_information = constraint_informations[0]
            for column in re.findall(regex,
                                     constraint_information.description):
                cells = column.split(',')
                constraint_name = str(cells[2]).replace("'", "").strip()
                # The two backends dump the constraints table with
                # different cell layouts.
                if repo.latest_successful_attempt.database.name == 'PostgreSQL':
                    table_name = str(cells[5]).replace("'", "").strip()
                    constraint_type = str(cells[6]).replace("'", "").strip()
                elif repo.latest_successful_attempt.database.name == 'MySQL':
                    table_name = str(cells[4]).replace("'", "").strip()
                    constraint_type = str(cells[5])[:-1].replace("'",
                                                                 "").strip()
                merge_map_key = table_name + '.' + constraint_name
                if merge_map_key in merge_map:
                    for column_name in merge_map[merge_map_key]:
                        constraint_map[table_name + '.' +
                                       column_name] = constraint_type
                        constraint_map[column_name] = constraint_type

        cnt = 0
        for action in Action.objects.filter(
                attempt=repo.latest_successful_attempt):
            queries = Query.objects.filter(action=action)
            for query in queries:
                content = query.content
                if 'JOIN' in content:
                    parsed = sqlparse.parse(content)[0]
                    tokens = parsed.tokens

                    # Recursively walk the token tree; every Comparison
                    # node is treated as a potential join predicate.
                    def process_join_key(token):
                        if isinstance(token, sqlparse.sql.TokenList):
                            if isinstance(token, sqlparse.sql.Comparison):
                                left_key, right_key = str(token.left), str(
                                    token.right)
                                left_key = left_key.replace('"', '').replace(
                                    '`', '')
                                right_key = right_key.replace('"', '').replace(
                                    '`', '')
                                if left_key in column_map and right_key in column_map:
                                    left_type = column_map[left_key]
                                    right_type = column_map[right_key]
                                    # Order the pair so 'a-b' and 'b-a'
                                    # count in the same bucket.
                                    if left_type > right_type:
                                        left_type, right_type = right_type, left_type
                                    stats['join_key_type'][project_type_name][
                                        left_type + '-' +
                                        right_type] = stats['join_key_type'][
                                            project_type_name].get(
                                                left_type + '-' + right_type,
                                                0) + 1
                                if left_key in constraint_map and right_key in constraint_map:
                                    left_constraint = constraint_map[left_key]
                                    right_constraint = constraint_map[
                                        right_key]
                                    # Same canonical ordering for the
                                    # constraint pair.
                                    if left_constraint > right_constraint:
                                        left_constraint, right_constraint = right_constraint, left_constraint
                                    stats['join_key_constraint'][
                                        project_type_name][
                                            left_constraint + '-' +
                                            right_constraint] = stats[
                                                'join_key_constraint'][
                                                    project_type_name].get(
                                                        left_constraint + '-' +
                                                        right_constraint,
                                                        0) + 1

                            for _token in token.tokens:
                                process_join_key(_token)

                    for index in xrange(0, len(tokens)):
                        if tokens[index].is_keyword:
                            if 'JOIN' in tokens[index].value:
                                join_type = tokens[index].value
                                # A bare 'JOIN' (no OUTER/INNER qualifier)
                                # is counted as INNER JOIN.
                                if 'OUTER' not in join_type and 'INNER' not in join_type:
                                    join_type = join_type.replace(
                                        'JOIN', 'INNER JOIN')
                                stats['join_type'][project_type_name][
                                    join_type] = stats[
                                        'join_type'][project_type_name].get(
                                            join_type, 0) + 1
                        else:
                            process_join_key(tokens[index])

    dump_all_stats(directory, stats)
Beispiel #19
0
def foreign_key_stats(directory='.'):
    stats = {'foreign_key_count': {}, 'foreign_key_type': {}}

    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue

        project_type_name = repo.project_type.name
        if project_type_name not in stats['foreign_key_count']:
            stats['foreign_key_count'][project_type_name] = []
        if project_type_name not in stats['foreign_key_type']:
            stats['foreign_key_type'][project_type_name] = {}
        if 0:
            if project_type_name not in stats['join_key_constraint']:
                stats['join_key_constraint'][project_type_name] = {}

        informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(name='columns')
        column_map = {}
        if len(informations) > 0:
            information = informations[0]
            if repo.latest_successful_attempt.database.name == 'PostgreSQL':
                regex = '(\(.*?\))[,\]]'
            elif repo.latest_successful_attempt.database.name == 'MySQL':
                regex = '(\(.*?\))[,\)]'

            for column in re.findall(regex, information.description):
                cells = column.split(',')
                table = str(cells[2]).replace("'", "").strip()
                name = str(cells[3]).replace("'", "").strip()
                _type = str(cells[7]).replace("'", "").strip()
                column_map[table + '.' + name] = _type
                column_map[name] = _type

        key_column_usage_informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(
                name='key_column_usage')
        constraint_informations = Information.objects.filter(
            attempt=repo.latest_successful_attempt).filter(name='constraints')
        constraint_map = {}
        if len(key_column_usage_informations) > 0 and len(
                constraint_informations) > 0:
            if repo.latest_successful_attempt.database.name == 'PostgreSQL':
                regex = '(\(.*?\))[,\]]'
            elif repo.latest_successful_attempt.database.name == 'MySQL':
                regex = '(\(.*?\))[,\)]'

            merge_map = {}
            key_column_usage_information = key_column_usage_informations[0]
            for column in re.findall(regex,
                                     key_column_usage_information.description):
                cells = column.split(',')
                constraint_name = str(cells[2]).replace("'", "").strip()
                table_name = str(cells[5]).replace("'", "").strip()
                column_name = str(cells[6]).replace("'", "").strip()
                merge_map_key = table_name + '.' + constraint_name
                if merge_map_key in merge_map:
                    merge_map[merge_map_key].append(column_name)
                else:
                    merge_map[merge_map_key] = [column_name]

            constraint_information = constraint_informations[0]
            for column in re.findall(regex,
                                     constraint_information.description):
                cells = column.split(',')
                constraint_name = str(cells[2]).replace("'", "").strip()
                if repo.latest_successful_attempt.database.name == 'PostgreSQL':
                    table_name = str(cells[5]).replace("'", "").strip()
                    constraint_type = str(cells[6]).replace("'", "").strip()
                elif repo.latest_successful_attempt.database.name == 'MySQL':
                    table_name = str(cells[4]).replace("'", "").strip()
                    constraint_type = str(cells[5])[:-1].replace("'",
                                                                 "").strip()
                merge_map_key = table_name + '.' + constraint_name
                if merge_map_key in merge_map:
                    for column_name in merge_map[merge_map_key]:
                        constraint_map[table_name + '.' +
                                       column_name] = constraint_type
                        constraint_map[column_name] = constraint_type

                        if constraint_type == 'FOREIGN KEY':
                            _type = column_map[table_name + '.' + column_name]
                            stats['foreign_key_type'][project_type_name][
                                _type] = stats['foreign_key_type'][
                                    project_type_name].get(_type, 0) + 1

            for action in Action.objects.filter(
                    attempt=repo.latest_successful_attempt):
                queries = Query.objects.filter(action=action)
                foreign_key_count = 0

                for query in queries:
                    parsed = sqlparse.parse(query.content)[0]
                    tokens = parsed.tokens

                    for token in tokens:
                        if isinstance(token, sqlparse.sql.Identifier):
                            token_name = token.value.replace('"', '').replace(
                                '`', '')
                            if token_name in constraint_map:
                                constraint = constraint_map[token_name]
                                if constraint == 'FOREIGN KEY':
                                    foreign_key_count += 1

                    for explain in Explain.objects.filter(query=query):
                        if 'FOREIGN' in explain.output:
                            print explain.output

                stats['foreign_key_count'][project_type_name].append(
                    foreign_key_count)

    dump_all_stats(directory, stats)
Beispiel #20
0
def transaction_stats(directory='.'):
    """Collect per-project-type transaction statistics.

    A transaction is the run of queries between a BEGIN / START
    TRANSACTION / SET AUTOCOMMIT=0 marker and the next COMMIT.  For each
    completed transaction this records its read count (SELECT), write
    count (INSERT/DELETE/UPDATE) and query count; per action it records
    how many transactions occurred.  The raw transactions are pickled via
    pickle_dump() for later analysis (e.g. blind_write) and the stats are
    written via dump_all_stats().
    """
    stats = {
        'transaction_count': {},
        'transaction_query_count': {},
        'transaction_read_count': {},
        'transaction_write_count': {}
    }

    transactions = []

    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue

        project_type_name = repo.project_type.name
        for key in stats:
            if project_type_name not in stats[key]:
                stats[key][project_type_name] = []

        for action in Action.objects.filter(
                attempt=repo.latest_successful_attempt):
            transaction = ''
            query_count = 0
            transaction_count = 0

            for query in Query.objects.filter(action=action):
                # Hoisted: previously recomputed up to four times per query.
                content_upper = query.content.upper()
                if ('BEGIN' in content_upper
                        or 'START TRANSACTION' in content_upper
                        or 'SET AUTOCOMMIT=0' in content_upper):
                    # A new transaction starts; any open one is discarded.
                    transaction = query.content + '\n'
                    query_count = 1
                elif transaction != '':
                    transaction += query.content + '\n'
                    query_count += 1
                    if 'COMMIT' in content_upper:
                        transaction = transaction.strip('\n')
                        transaction_count += 1

                        transaction_upper = transaction.upper()
                        read_count = len(
                            re.findall('SELECT', transaction_upper))
                        stats['transaction_read_count'][
                            project_type_name].append(read_count)

                        write_count = 0
                        for keyword in ['INSERT', 'DELETE', 'UPDATE']:
                            write_count += len(
                                re.findall(keyword, transaction_upper))
                        stats['transaction_write_count'][
                            project_type_name].append(write_count)

                        # Exclude the BEGIN and COMMIT markers themselves.
                        query_count -= 2
                        stats['transaction_query_count'][
                            project_type_name].append(query_count)

                        try:
                            transactions.append(
                                (repo.name, repo.project_type.name,
                                 transaction))
                        except:  # deliberate best-effort: never abort the scan
                            pass

                        transaction = ''

            if transaction_count > 0:
                stats['transaction_count'][project_type_name].append(
                    transaction_count)

    pickle_dump(directory, 'transactions', transactions)

    dump_all_stats(directory, stats)
Beispiel #21
0
def blind_write():
    total = 0
    count = 0
    stats = {'blind_write_count': {}}

    def is_write(query):
        return ('INSERT' in query or 'UPDATE' in query) and ('UTC LOG:  '
                                                             not in query)

    def get_identifiers(parsed):
        identifiers = []
        for token in parsed[0].tokens:
            if isinstance(token, sqlparse.sql.Identifier):
                identifiers.append(token.value)

        return set(identifiers)

    def is_read_by(identifier, query):
        if 'SELECT' not in query:
            return False
        other_identifier = get_identifiers(sqlparse.parse(query))
        return identifier.intersection(other_identifier)

    with open('transactions.pkl', 'rb') as pickle_file:
        transactions = pickle.load(pickle_file)

        for repo_name, project_type, transaction in transactions:
            queries = transaction.split('\n')
            writes = []

            for i in xrange(len(queries)):
                if is_write(queries[i]):
                    writes.append((i, queries[i]))

            is_blind_write = False
            index, other_index = -1, -1
            if len(writes) > 1:
                identifiers = [(i, get_identifiers(sqlparse.parse(query)))
                               for (i, query) in writes]
                for i in xrange(1, len(identifiers)):
                    if is_blind_write:
                        break

                    for j in xrange(i):
                        if is_blind_write:
                            break

                        index, identifier = identifiers[i]
                        other_index, other_identifier = identifiers[j]
                        if identifier.intersection(other_identifier):
                            is_blind_write = True
                            for k in xrange(other_index + 1, index):
                                if is_read_by(identifier, queries[k]):
                                    is_blind_write = False
                                    break

                            if is_blind_write:
                                count += 1
                                stats['blind_write_count'][project_type] = [
                                    stats['blind_write_count'].get(
                                        project_type, [0])[0] + 1
                                ]
                                # for k in xrange(other_index + 1, index):
                                #    print 1, queries[k]
                                # print
                                # raw_input()

            if is_blind_write:
                print repo_name, project_type
                print queries[index]
                print queries[other_index]
                print '+' * 10
                print transaction.encode('utf-8')
                print '-' * 20

    print stats

    print 'Total # of Blind Writes:', count

    dump_all_stats('.', stats)
Beispiel #22
0
def column_stats(directory='.'):
    """Collect per-project-type column statistics (project_type 1 only).

    Parses the 'columns' and 'constraints' information dumps of each
    repository's latest successful attempt and records, per table:
    nullability counts, column-type counts, extra/constraint counts and
    the number of columns.  Results go to dump_all_stats(directory, stats).
    """
    stats = {'column_nullable': {}, 'column_type': {}, 'column_extra': {}, 'column_num': {}}

    for repo in Repository.objects.exclude(
            latest_successful_attempt=None).filter(project_type=1):
        if filter_repository(repo):
            continue

        attempt = repo.latest_successful_attempt
        column_informations = Information.objects.filter(
            attempt=attempt).filter(name='columns')
        constraint_informations = Information.objects.filter(
            attempt=attempt).filter(name='constraints')
        # num_tables itself was never used; the statistic's presence still
        # gates which repositories are counted, so keep the length check.
        num_table_statistics = Statistic.objects.filter(
            attempt=attempt).filter(description='num_tables')

        if not (len(column_informations) > 0
                and len(constraint_informations) > 0
                and len(num_table_statistics) > 0):
            continue

        column_information = column_informations[0]
        constraint_information = constraint_informations[0]

        project_type_name = repo.project_type.name
        if project_type_name not in stats['column_nullable']:
            stats['column_nullable'][project_type_name] = {}
        if project_type_name not in stats['column_type']:
            stats['column_type'][project_type_name] = {}
        if project_type_name not in stats['column_extra']:
            stats['column_extra'][project_type_name] = {}
        if project_type_name not in stats['column_num']:
            stats['column_num'][project_type_name] = []

        # The dumps differ only in their closing delimiter between the two
        # supported backends.  Previously an unknown backend left `regex`
        # unbound (NameError / stale value); skip such repositories.
        database_name = attempt.database.name
        if database_name == 'PostgreSQL':
            regex = r'(\(.*?\))[,\]]'
        elif database_name == 'MySQL':
            regex = r'(\(.*?\))[,\)]'
        else:
            continue

        table_stats = {'column_nullable': {}, 'column_type': {}, 'column_extra': {}, 'column_num': {}}
        for column in re.findall(regex, column_information.description):
            cells = column.split(',')

            table = str(cells[2]).replace("'", "").strip()

            nullable = str(cells[6]).replace("'", "").strip()
            table_stats['column_nullable'].setdefault(table, {})
            table_stats['column_nullable'][table][nullable] = \
                table_stats['column_nullable'][table].get(nullable, 0) + 1

            _type = str(cells[7]).replace("'", "").strip()
            table_stats['column_type'].setdefault(table, {})
            table_stats['column_type'][table][_type] = \
                table_stats['column_type'][table].get(_type, 0) + 1

            extra = str(cells[16]).replace("'", "").strip()
            if extra:
                table_stats['column_extra'].setdefault(table, {})
                table_stats['column_extra'][table][extra] = \
                    table_stats['column_extra'][table].get(extra, 0) + 1

            table_stats['column_num'][table] = \
                table_stats['column_num'].get(table, 0) + 1

        for column in re.findall(regex, constraint_information.description):
            cells = column.split(',')
            # Cell layout differs between the backends; the two duplicated
            # if/elif chains are merged into one.
            if database_name == 'PostgreSQL':
                constraint_type = str(cells[6]).replace("'", "").strip()
                table = str(cells[5]).replace("'", "").strip()
            else:  # MySQL (other backends were skipped above)
                constraint_type = str(cells[5])[:-1].replace("'", "").strip()
                table = str(cells[4]).replace("'", "").strip()
            table_stats['column_extra'].setdefault(table, {})
            table_stats['column_extra'][table][constraint_type] = \
                table_stats['column_extra'][table].get(constraint_type, 0) + 1

        # Fold the per-table stats into the global per-project-type stats.
        # Dict-valued entries become per-key lists of per-table counts;
        # 'column_num' entries are plain per-table counts.
        for stats_type in table_stats:
            for table in table_stats[stats_type]:
                value = table_stats[stats_type][table]
                if isinstance(value, dict):
                    for second_type in value:
                        if second_type not in stats[stats_type][project_type_name]:
                            stats[stats_type][project_type_name][second_type] = []
                        stats[stats_type][project_type_name][second_type].append(value[second_type])
                else:
                    stats[stats_type][project_type_name].append(value)

    dump_all_stats(directory, stats)
Beispiel #23
0
def column_stats(directory='.'):
    """Collect per-project-type column statistics for all repositories.

    Parses the 'columns' and 'constraints' information dumps of each
    repository's latest successful attempt and records, per table:
    nullability counts, column-type counts, extra/constraint counts and
    the number of columns.  Results go to dump_all_stats(directory, stats).
    """
    stats = {
        'column_nullable': {},
        'column_type': {},
        'column_extra': {},
        'column_num': {}
    }

    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue

        attempt = repo.latest_successful_attempt
        column_informations = Information.objects.filter(
            attempt=attempt).filter(name='columns')
        constraint_informations = Information.objects.filter(
            attempt=attempt).filter(name='constraints')
        # num_tables itself was never used; the statistic's presence still
        # gates which repositories are counted, so keep the length check.
        num_table_statistics = Statistic.objects.filter(
            attempt=attempt).filter(description='num_tables')

        if not (len(column_informations) > 0
                and len(constraint_informations) > 0
                and len(num_table_statistics) > 0):
            continue

        column_information = column_informations[0]
        constraint_information = constraint_informations[0]

        project_type_name = repo.project_type.name
        if project_type_name not in stats['column_nullable']:
            stats['column_nullable'][project_type_name] = {}
        if project_type_name not in stats['column_type']:
            stats['column_type'][project_type_name] = {}
        if project_type_name not in stats['column_extra']:
            stats['column_extra'][project_type_name] = {}
        if project_type_name not in stats['column_num']:
            stats['column_num'][project_type_name] = []

        # The dumps differ only in their closing delimiter between the two
        # supported backends.  Previously an unknown backend left `regex`
        # unbound (NameError / stale value); skip such repositories.
        database_name = attempt.database.name
        if database_name == 'PostgreSQL':
            regex = r'(\(.*?\))[,\]]'
        elif database_name == 'MySQL':
            regex = r'(\(.*?\))[,\)]'
        else:
            continue

        table_stats = {
            'column_nullable': {},
            'column_type': {},
            'column_extra': {},
            'column_num': {}
        }
        for column in re.findall(regex, column_information.description):
            cells = column.split(',')

            table = str(cells[2]).replace("'", "").strip()

            nullable = str(cells[6]).replace("'", "").strip()
            table_stats['column_nullable'].setdefault(table, {})
            table_stats['column_nullable'][table][nullable] = \
                table_stats['column_nullable'][table].get(nullable, 0) + 1

            _type = str(cells[7]).replace("'", "").strip()
            table_stats['column_type'].setdefault(table, {})
            table_stats['column_type'][table][_type] = \
                table_stats['column_type'][table].get(_type, 0) + 1

            extra = str(cells[16]).replace("'", "").strip()
            if extra:
                table_stats['column_extra'].setdefault(table, {})
                table_stats['column_extra'][table][extra] = \
                    table_stats['column_extra'][table].get(extra, 0) + 1

            table_stats['column_num'][table] = \
                table_stats['column_num'].get(table, 0) + 1

        for column in re.findall(regex, constraint_information.description):
            cells = column.split(',')
            # Cell layout differs between the backends; the two duplicated
            # if/elif chains are merged into one.
            if database_name == 'PostgreSQL':
                constraint_type = str(cells[6]).replace("'", "").strip()
                table = str(cells[5]).replace("'", "").strip()
            else:  # MySQL (other backends were skipped above)
                constraint_type = str(cells[5])[:-1].replace("'", "").strip()
                table = str(cells[4]).replace("'", "").strip()
            table_stats['column_extra'].setdefault(table, {})
            table_stats['column_extra'][table][constraint_type] = \
                table_stats['column_extra'][table].get(constraint_type, 0) + 1

        # Fold the per-table stats into the global per-project-type stats.
        # Dict-valued entries become per-key lists of per-table counts;
        # 'column_num' entries are plain per-table counts.
        for stats_type in table_stats:
            for table in table_stats[stats_type]:
                value = table_stats[stats_type][table]
                if isinstance(value, dict):
                    for second_type in value:
                        if second_type not in stats[stats_type][
                                project_type_name]:
                            stats[stats_type][project_type_name][
                                second_type] = []
                        stats[stats_type][project_type_name][
                            second_type].append(value[second_type])
                else:
                    stats[stats_type][project_type_name].append(value)

    dump_all_stats(directory, stats)