def has_to_clone_repository(clone_url):
    """Return True when no repositories row exists yet for *clone_url*."""
    conn = PSQLConnection.get_connection()
    cur = conn.cursor()
    cur.execute("select count(*) from repositories where clone_url = %s", (clone_url,))
    existing_rows = cur.fetchone()[0]
    conn.close()
    return existing_rows == 0
def remove_license_comments(comments_to_keep):
    """Remove license-header comment ids from *comments_to_keep* (in place).

    A comment counts as a license header when its end_line is above the
    first class declaration line of its file AND it does not match the
    configured exception-words regex.  Returns the (mutated) list.
    """
    before = timeit.default_timer()
    print(len(comments_to_keep))

    # Guard: "where id in ()" is invalid SQL, so bail out on an empty list.
    if not comments_to_keep:
        return comments_to_keep

    exception_words_regex = HeuristicHandlerConfig.get_parameter(
        'exception_words_to_remove_license_comments_regex')

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute(
        "select id, comment_text, end_line, class_declaration_lines from raw_comments where id in %s",
        [tuple(comments_to_keep), ])
    raw_comment_results = cursor.fetchall()
    connection.close()

    for raw_comment_id, comment_text, end_line, declaration_lines in raw_comment_results:
        # class_declaration_lines is a comma-separated string; only the
        # first declaration line is relevant for the "before class" test.
        first_class_line = int(declaration_lines.split(',')[0])

        if end_line < first_class_line:
            if re.search(exception_words_regex, comment_text) is None:
                comments_to_keep.remove(raw_comment_id)

    print(len(comments_to_keep))
    after = timeit.default_timer()
    print(after - before)

    return comments_to_keep
def remove_commented_source_code(comments_to_keep):
    """Remove ids of comments that look like commented-out source code.

    Mutates *comments_to_keep* in place and also returns it.
    """
    before = timeit.default_timer()

    print(len(comments_to_keep))

    # Guard: "where id in ()" is invalid SQL, so bail out on an empty list.
    if not comments_to_keep:
        return comments_to_keep

    commented_source_code_regex = HeuristicHandlerConfig.get_parameter(
        'commented_source_code_regex')

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute("select id, comment_text from raw_comments where id in %s",
                   [tuple(comments_to_keep), ])
    raw_comment_results = cursor.fetchall()
    connection.close()

    for raw_comment_id, comment_text in raw_comment_results:
        # A regex hit marks the comment as disabled source code.
        if re.search(commented_source_code_regex, comment_text) is not None:
            comments_to_keep.remove(raw_comment_id)

    print(len(comments_to_keep))
    after = timeit.default_timer()
    print(after - before)

    return comments_to_keep
def remove_javadoc_comments(repository_id):
    """Return the ids of raw comments worth keeping for *repository_id*.

    Javadoc comments are kept only when they contain one of the configured
    exception words; every non-javadoc comment is kept unconditionally.
    """
    before = timeit.default_timer()
    exception_words_regex = HeuristicHandlerConfig.get_parameter(
        'exception_words_to_remove_javadoc_comments_regex')
    comments_to_keep = []

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    cursor.execute(
        "select id, comment_text, comment_type, comment_format from raw_comments where repository_id = %s",
        (repository_id, ))
    raw_comment_results = cursor.fetchall()
    connection.close()

    print(len(raw_comment_results))
    for raw_comment_id, comment_text, _comment_type, comment_format in raw_comment_results:
        # Equality with 'javadoc' already excludes None, so the previous
        # separate "is not None" test was redundant.
        if comment_format == 'javadoc':
            if re.search(exception_words_regex, comment_text) is not None:
                comments_to_keep.append(raw_comment_id)
        else:
            comments_to_keep.append(raw_comment_id)

    after = timeit.default_timer()
    print(len(comments_to_keep))
    print(after - before)
    return comments_to_keep
def treat_comment_text(repository_id):
    """Normalise comment_text into treated_comment_text for every processed
    comment of *repository_id* that has not been treated yet."""
    before = timeit.default_timer()

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute("select comment_text, id from processed_comments where treated_comment_text is null and repository_id = %s", (repository_id, ))
    pending_comments = cursor.fetchall()

    # Tokens are stripped in this exact order — it matters ('\n' is removed
    # before '\r\n' could match, '//' before the javadoc markers, etc.).
    removal_tokens = ('\n', '\r\n', '\r', '\t', '//', '/**', '*/', '/*',
                      '*', ',', ':', '...', ';')

    treated = []
    for comment_text, comment_id in pending_comments:
        text = comment_text.lower()
        for token in removal_tokens:
            text = text.replace(token, '')
        # Collapse all remaining whitespace runs to single spaces.
        treated.append((" ".join(text.split()), comment_id))

    total_comments = len(treated)

    for position, (formatted_comment, comment_id) in enumerate(treated, start=1):
        cursor.execute("update processed_comments set treated_comment_text = %s where id = %s", (formatted_comment, comment_id))
        connection.commit()
        print(position, "out of: ", total_comments)

    connection.close()
    after = timeit.default_timer()
    print (after - before)
# Exemple #6
# 0
def remove_commented_source_code(comments_to_keep):
    """Remove ids of comments that look like commented-out source code.

    Mutates *comments_to_keep* in place and also returns it.
    """
    before = timeit.default_timer()

    print(len(comments_to_keep))

    # Guard: "where id in ()" is invalid SQL, so bail out on an empty list.
    if not comments_to_keep:
        return comments_to_keep

    commented_source_code_regex = HeuristicHandlerConfig.get_parameter(
        'commented_source_code_regex')

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute("select id, comment_text from raw_comments where id in %s",
                   [
                       tuple(comments_to_keep),
                   ])
    raw_comment_results = cursor.fetchall()
    connection.close()

    for raw_comment_id, comment_text in raw_comment_results:
        # A regex hit marks the comment as disabled source code.
        if re.search(commented_source_code_regex, comment_text) is not None:
            comments_to_keep.remove(raw_comment_id)

    print(len(comments_to_keep))
    after = timeit.default_timer()
    print(after - before)

    return comments_to_keep
def parse_files_using_srcml(repository_id, repository_name):
    """Run srcml over every unparsed version of every file of the
    repository, then mark each version as parsed."""
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    # These depend only on the repository, so resolve them once instead of
    # once per file (the original recomputed them inside the loop).
    file_versions_directory = DiretoryConfig.get_parameter('file_versions_directory') + repository_name
    parsed_files_directory = DiretoryConfig.get_parameter('parsed_files_directory') + repository_name
    create_directory(parsed_files_directory)

    cursor.execute("select id from files where repository_id = %s", (repository_id, ))
    files_results = cursor.fetchall()

    for file_line in files_results:
        file_id = file_line[0]
        cursor.execute('select id, commit_hash, version_path from file_versions where file_id = %s and has_parsed_file is false order by author_date', (file_id, ))
        file_versions_result = cursor.fetchall()

        for file_versions_id, commit_hash, version_path in file_versions_result:
            file_extension = version_path.split('.')[-1]

            # Both paths share the same <file>_<version>_<hash>.<ext> name.
            base_name = "/" + str(file_id) + "_" + str(file_versions_id) + "_" + commit_hash + "." + file_extension
            local_file_copy = file_versions_directory + base_name
            parsed_file_output = parsed_files_directory + base_name
            # srcml invoked with an argument list, no shell involved.
            subprocess.call(["srcml", local_file_copy, "-o", parsed_file_output])

            cursor.execute("update file_versions set has_parsed_file = true where id = %s", (file_versions_id, ))
            connection.commit()
    connection.close()
# Exemple #8
# 0
def remove_license_comments(comments_to_keep):
    """Remove license-header comment ids from *comments_to_keep* (in place).

    A comment counts as a license header when its end_line is above the
    first class declaration line of its file AND it does not match the
    configured exception-words regex.  Returns the (mutated) list.
    """
    before = timeit.default_timer()
    print(len(comments_to_keep))

    # Guard: "where id in ()" is invalid SQL, so bail out on an empty list.
    if not comments_to_keep:
        return comments_to_keep

    exception_words_regex = HeuristicHandlerConfig.get_parameter(
        'exception_words_to_remove_license_comments_regex')

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute(
        "select id, comment_text, end_line, class_declaration_lines from raw_comments where id in %s",
        [
            tuple(comments_to_keep),
        ])
    raw_comment_results = cursor.fetchall()
    connection.close()

    for raw_comment_id, comment_text, end_line, declaration_lines in raw_comment_results:
        # class_declaration_lines is a comma-separated string; only the
        # first declaration line is relevant for the "before class" test.
        first_class_line = int(declaration_lines.split(',')[0])

        if end_line < first_class_line:
            if re.search(exception_words_regex, comment_text) is None:
                comments_to_keep.remove(raw_comment_id)

    print(len(comments_to_keep))
    after = timeit.default_timer()
    print(after - before)

    return comments_to_keep
# Exemple #9
# 0
def insert_cloned_repo_info(repository_name, master_branch, clone_url=None):
    """Insert a repositories row for a freshly cloned repository.

    Bug fix: the original body referenced ``clone_url`` which was never a
    parameter nor defined in this function, raising NameError at call
    time.  It is now a trailing parameter with a default so existing
    two-argument callers keep working.
    """
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute(
        "insert into repositories (name, clone_url, master_branch) values (%s, %s, %s)",
        (repository_name, clone_url, master_branch))
    connection.commit()
    connection.close()
# Exemple #10
# 0
def has_to_clone_repository(clone_url):
    """Return True when no repositories row exists yet for *clone_url*."""
    conn = PSQLConnection.get_connection()
    cur = conn.cursor()
    cur.execute("select count(*) from repositories where clone_url = %s",
                (clone_url, ))
    existing_rows = cur.fetchone()[0]
    conn.close()
    return existing_rows == 0
# Exemple #11
# 0
def merge_line_comments(repository_id):
    """Merge runs of single-line comments on consecutive source lines into
    one 'multiline' processed_comments row per run, deleting absorbed rows.
    """
    before = timeit.default_timer()

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute(
        "select distinct(file_versions_id) from processed_comments where repository_id = %s ",
        (repository_id, ))
    file_versions = cursor.fetchall()

    for file_version in file_versions:
        file_versions_id = file_version[0]
        print("file version:", file_versions_id)

        # Sorting by end_line puts comments on consecutive lines next to
        # each other, so one linear pass finds every run.
        cursor.execute(
            "select id, comment_text,  end_line from processed_comments where file_versions_id = %s and comment_type = 'line' order by end_line",
            (file_versions_id, ))
        sorted_comments = cursor.fetchall()

        iterator = iter(sorted_comments)
        comment = next(iterator, None)
        while comment is not None:
            # print(comment[2])

            next_comment = next(iterator, None)
            if next_comment is None:
                break
            # print(next_comment[2])

            comment_id = comment[0]
            comment_message = comment[1]

            # end_line difference of -1 means the two comments sit on
            # consecutive lines and belong to one run.  The run's text is
            # accumulated into comment_message and written back onto the
            # FIRST row of the run (comment_id does not change inside this
            # loop); each absorbed row is deleted.
            while comment[2] - next_comment[2] == -1:
                print(comment_id)
                # print (comment_message)

                comment_message = comment_message + " " + next_comment[1]
                new_end_line = next_comment[2]

                print("new end line:", new_end_line)
                print("new commit message:", comment_message)

                cursor.execute(
                    "update processed_comments set end_line = %s, comment_text= %s, comment_format = 'multiline' where id = %s",
                    (new_end_line, comment_message, comment_id))
                cursor.execute("delete from processed_comments where id = %s",
                               (next_comment[0], ))
                connection.commit()

                comment = next_comment
                next_comment = next(iterator, None)
                if next_comment is None:
                    break
            else:
                # while/else: reached only when the run ended because the
                # next comment is not adjacent (not via the break above);
                # restart scanning from that comment.
                comment = next_comment

    after = timeit.default_timer()
    print(after - before)
    # NOTE(review): the connection is never closed in this function —
    # confirm whether that is intentional.
# Exemple #12
# 0
def insert_snapshot_version_info(repository_id, name, version_date,
                                 version_order):
    """Record one repository tag/snapshot in the tags table."""
    conn = PSQLConnection.get_connection()
    conn.cursor().execute(
        "insert into tags (repository_id, name, version_date, version_order) values (%s, %s, to_timestamp(%s, 'YYYY-MM-DD HH24:MI:SS'), %s)",
        (repository_id, name, version_date, version_order))
    conn.commit()
    conn.close()
def fetch_repositories(repo_list = tuple([])):
    """Return repository rows — every repository when *repo_list* is empty,
    otherwise only those whose name is in *repo_list*."""
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    if repo_list:
        cursor.execute("select id, name, master_branch, clone_url, cloned_date from repositories where name in %s", [tuple(repo_list),])
    else:
        cursor.execute("select id, name, master_branch, clone_url, cloned_date from repositories order by 1")
    rows = cursor.fetchall()
    connection.close()
    return rows
# Exemple #14
# 0
def fetch_repositories(repo_list = tuple([])):
    """Return repository rows — every repository when *repo_list* is empty,
    otherwise only those whose name is in *repo_list*."""
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    if repo_list:
        cursor.execute("select id, name, master_branch, clone_url, cloned_date from repositories where name in %s", [tuple(repo_list),])
    else:
        cursor.execute("select id, name, master_branch, clone_url, cloned_date from repositories order by 1 ")
    rows = cursor.fetchall()
    connection.close()
    return rows
# Exemple #15
# 0
def generate_training_dataset():
    """Write the manually classified comments of the configured
    classification types to the training-dataset file."""
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    default_nlp_path = DiretoryConfig.get_parameter('nlp_directory')

    training_dataset_path = default_nlp_path + NLPHandlerConfig.get_parameter('training_dataset_name')
    classification_types = NLPHandlerConfig.get_parameter('classification_types')

    cursor.execute("select classification, treated_comment_text from manually_classified_comments where classification in %s", [tuple(classification_types),])
    write_formated_file(training_dataset_path, cursor.fetchall())
    # Fix: the connection was previously leaked (never closed); every other
    # helper in this module closes its connection.
    connection.close()
def insert_processed_comments(comments_to_keep):
    """Copy the kept raw_comments rows into processed_comments."""
    before = timeit.default_timer()

    # Guard: "where id in ()" is invalid SQL and there is nothing to copy.
    if not comments_to_keep:
        return

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute("insert into processed_comments(id, repository_id,  file_id, file_versions_id, commit_hash, comment_text, comment_type, comment_format, start_line, end_line, has_class_declaration, has_interface_declaration, has_enum_declaration, has_annotation_declaration, class_declaration_lines) select id, repository_id,  file_id, file_versions_id, commit_hash, comment_text, comment_type, comment_format, start_line, end_line, has_class_declaration, has_interface_declaration, has_enum_declaration, has_annotation_declaration, class_declaration_lines from raw_comments where id in %s", [tuple(comments_to_keep),])
    connection.commit()
    connection.close()

    after = timeit.default_timer()
    print(after - before)
# Exemple #17
# 0
def classify_comments(repository_id):
    """Classify the untreated comments of every file version of
    *repository_id* with the Stanford classifier and store the result in
    td_classification.
    """
    default_nlp_path = DiretoryConfig.get_parameter('nlp_directory')
    test_dataset_path = default_nlp_path + NLPHandlerConfig.get_parameter('test_dataset_name')

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute("select distinct(file_versions_id) from processed_comments where repository_id = %s", (repository_id, ))
    file_versions = cursor.fetchall()

    for file_version in file_versions:
        before = timeit.default_timer()
        file_versions_id = file_version[0]
        print("file version:", file_versions_id)

        cursor.execute("select 'WITHOUT_CLASSIFICATION' as classification, treated_comment_text, id from processed_comments where file_versions_id = %s and td_classification is null order by end_line", (file_versions_id, ))
        all_comments_from_file = cursor.fetchall()
        write_formated_file(test_dataset_path, all_comments_from_file)

        nlp_classifier_memory_use = NLPHandlerConfig.get_parameter('nlp_classifier_memory_use')
        command = 'java ' + nlp_classifier_memory_use + ' -jar stanford-classifier.jar -prop ./dataset.prop -1.useSplitWords -1.splitWordsRegexp "\s"'
        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, cwd=default_nlp_path).communicate()
        subprocess.call("rm " + test_dataset_path, shell=True)

        output = process[0].strip().decode("utf-8").split('\n')

        output_regex = NLPHandlerConfig.get_parameter('output_regex')
        comment_text_exact_regex = NLPHandlerConfig.get_parameter('comment_text_exact_regex')

        for comment in all_comments_from_file:
            treated_comment_text = comment[1]
            comment_id = comment[2]

            for line in output:
                comment_text_exact_matcher = re.match(comment_text_exact_regex, line)
                # Fix: re.match returns None for non-matching lines; the
                # original called .group(1) unconditionally and crashed
                # with AttributeError on any such line.
                if comment_text_exact_matcher is None:
                    continue
                comment_text_from_output = comment_text_exact_matcher.group(1)

                if treated_comment_text == comment_text_from_output:
                    output_matcher = re.findall(output_regex, line)

                    # Fix: findall returns a list, never None, so the old
                    # "is not None" check was always true; guard the [1]
                    # access (index 0 is the golden answer, 1 the tool's
                    # classification) instead of risking IndexError.
                    if len(output_matcher) >= 2:
                        nlp_tool_classification = output_matcher[1].replace('\'', '')

                        cursor.execute("update processed_comments set td_classification = %s where id = %s ", (nlp_tool_classification, comment_id))
                        connection.commit()
                        break

        after = timeit.default_timer()
        print(after - before)
    # Fix: close the connection once all file versions are processed.
    connection.close()
def merge_line_comments(repository_id):
    """Merge runs of single-line comments on consecutive source lines into
    one 'multiline' processed_comments row per run, deleting absorbed rows.
    """
    before = timeit.default_timer()

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor() 
    cursor.execute("select distinct(file_versions_id) from processed_comments where repository_id = %s ", (repository_id, ))
    file_versions = cursor.fetchall()
    
    for file_version in file_versions:
        file_versions_id = file_version[0]
        print("file version:", file_versions_id)
        
        # Sorting by end_line puts comments on consecutive lines next to
        # each other, so one linear pass finds every run.
        cursor.execute("select id, comment_text,  end_line from processed_comments where file_versions_id = %s and comment_type = 'line' order by end_line", (file_versions_id, ))
        sorted_comments = cursor.fetchall()

        iterator = iter(sorted_comments)
        comment = next(iterator, None)        
        while comment is not None:  
            # print(comment[2])

            next_comment = next(iterator, None)
            if next_comment is None:
                break
            # print(next_comment[2])

            comment_id = comment[0]
            comment_message = comment[1]

            # end_line difference of -1 means the two comments sit on
            # consecutive lines and belong to one run.  The run's text is
            # accumulated into comment_message and written back onto the
            # FIRST row of the run (comment_id does not change inside this
            # loop); each absorbed row is deleted.
            while comment[2] - next_comment[2] == -1:
                print (comment_id)
                # print (comment_message)
                
                comment_message = comment_message + " " + next_comment[1]
                new_end_line = next_comment[2] 
                
                print ("new end line:", new_end_line)
                print ("new commit message:", comment_message)
                
                cursor.execute("update processed_comments set end_line = %s, comment_text= %s, comment_format = 'multiline' where id = %s", (new_end_line, comment_message, comment_id))
                cursor.execute("delete from processed_comments where id = %s" , (next_comment[0], ))
                connection.commit()

                comment = next_comment
                next_comment = next(iterator, None)
                if next_comment is None:
                    break
            else:
                # while/else: reached only when the run ended because the
                # next comment is not adjacent (not via the break above);
                # restart scanning from that comment.
                comment = next_comment

    after = timeit.default_timer()
    print (after - before)
    # NOTE(review): the connection is never closed in this function —
    # confirm whether that is intentional.
# Exemple #19
# 0
def search_deleted_files(repository_id, repository_name, master_branch):
    """Scan ``git log --diff-filter=D --summary`` for deleted files missing
    from the database and register them plus their version history.

    Note: *master_branch* is accepted but not used in this function.
    """
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    repository_directory = DiretoryConfig.get_parameter('repository_directory') + repository_name
    git_deleted_log_file_regex = FileHandlerConfig.get_parameter('git_deleted_log_file_regex')
    file_regex = FileHandlerConfig.get_parameter('parseable_files_regex')

    # --diff-filter=D limits the log to commits that deleted files.
    command = "git log --diff-filter=D --summary"
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr = subprocess.PIPE, stdin = subprocess.PIPE, shell=True, cwd=repository_directory)
    git_log_output = process.communicate()[0].strip().decode("utf-8").split('\n')

    # State carried across lines: a header line fills commit/author fields,
    # later lines of the same commit reuse the last value seen.  Only
    # commit_hash and version_path are consumed below; the author fields
    # are captured but unused here.
    commit_hash     = ''
    author_name     = ''
    author_email    = ''
    author_date     = ''
    version_path    = ''

    for git_log_output_line in git_log_output:
            # removes non ascii characters
            stripped = (c for c in git_log_output_line if 0 < ord(c) < 127)
            stripped_line = ''.join(stripped)
            
            git_log_file_matcher = re.match(git_deleted_log_file_regex, stripped_line)
            if git_log_file_matcher is not None:
                # Every group is optional; a field is only overwritten when
                # the current line actually provided it.
                if git_log_file_matcher.group(1):         
                    commit_hash  = git_log_file_matcher.group(1)
                    # print (commit_hash)
                if git_log_file_matcher.group(2):
                    author_name  = git_log_file_matcher.group(2)
                    # print (author_name)
                if git_log_file_matcher.group(3):
                    author_email = git_log_file_matcher.group(3) 
                    # print (author_email)
                if git_log_file_matcher.group(4):
                    author_date  = git_log_file_matcher.group(4)
                    # print (author_date)
                if git_log_file_matcher.group(5):
                    version_path = git_log_file_matcher.group(5)
                    # Only register files with a parseable extension.
                    file_regex_matcher = re.match(file_regex, version_path)
                    if file_regex_matcher is not None:
                        # print (version_path)
                        cursor.execute("select count(*) from file_versions where older_version_path = %s and commit_hash = %s", (version_path, commit_hash))
                        found_in_database = cursor.fetchone()[0]
                        if found_in_database == 0:
                            print(found_in_database, version_path, commit_hash)
                            file_name = version_path.split('/')[-1]
                            file_id = insert_file(repository_id, file_name, version_path, commit_hash)
                            if file_id is not None:
                                # Walk history from the PARENT of the deleting
                                # commit (hence the trailing "^").
                                execute_git_log_to_get_versions("git log "+commit_hash+"^ --follow --stat=350 --stat-graph-width=2 -- ", file_id, version_path, repository_directory)
    # NOTE(review): the connection is never closed in this function —
    # confirm whether that is intentional.
# Exemple #20
# 0
def insert_processed_comments(comments_to_keep):
    """Copy the kept raw_comments rows into processed_comments."""
    before = timeit.default_timer()

    # Guard: "where id in ()" is invalid SQL and there is nothing to copy.
    if not comments_to_keep:
        return

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute(
        "insert into processed_comments(id, repository_id,  file_id, file_versions_id, commit_hash, comment_text, comment_type, comment_format, start_line, end_line, has_class_declaration, has_interface_declaration, has_enum_declaration, has_annotation_declaration, class_declaration_lines) select id, repository_id,  file_id, file_versions_id, commit_hash, comment_text, comment_type, comment_format, start_line, end_line, has_class_declaration, has_interface_declaration, has_enum_declaration, has_annotation_declaration, class_declaration_lines from raw_comments where id in %s",
        [
            tuple(comments_to_keep),
        ])
    connection.commit()
    connection.close()

    after = timeit.default_timer()
    print(after - before)
# Exemple #21
# 0
def insert_file(repository_id, name, absolute_path, deletion_commit_hash = None):
    """Insert a files row unless one with the same name and path exists.

    Returns the new row id, or None when the file was already present.
    """
    inserted_id = None

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    cursor.execute("select count(*) from files where name = %s and file_path = %s ", (name, absolute_path))
    already_present = cursor.fetchone()[0] > 0

    if not already_present:
        cursor.execute("insert into files (repository_id, name, file_path, deletion_commit_hash) values (%s,%s,%s,%s) returning id", (repository_id, name, absolute_path, deletion_commit_hash))
        inserted_id = cursor.fetchone()[0]

    connection.commit()
    connection.close()
    return inserted_id
# Exemple #22
# 0
def extract_file_versions(repository_id, repository_name):
    """Run git log for each file of the repository to collect its versions."""
    repository_path = DiretoryConfig.get_parameter('repository_directory') + repository_name
    git_log_file_regex = FileHandlerConfig.get_parameter('git_log_file_regex')

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute('select id, file_path from files where repository_id = %s', (repository_id, ))
    files_results = cursor.fetchall()
    connection.close()

    for file_id, file_path in files_results:
        execute_git_log_to_get_versions("git log --follow --stat=350 --stat-graph-width=2 -- ", file_id, file_path, repository_path)
# Exemple #23
# 0
def parse_files_using_srcml(repository_id, repository_name):
    """Run srcml over every unparsed version of every file of the
    repository, then mark each version as parsed."""
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    # These depend only on the repository, so resolve them once instead of
    # once per file (the original recomputed them inside the loop).
    file_versions_directory = DiretoryConfig.get_parameter(
        'file_versions_directory') + repository_name
    parsed_files_directory = DiretoryConfig.get_parameter(
        'parsed_files_directory') + repository_name
    create_directory(parsed_files_directory)

    cursor.execute("select id from files where repository_id = %s",
                   (repository_id, ))
    files_results = cursor.fetchall()

    for file_line in files_results:
        file_id = file_line[0]
        cursor.execute(
            'select id, commit_hash, version_path from file_versions where file_id = %s and has_parsed_file is false order by author_date',
            (file_id, ))
        file_versions_result = cursor.fetchall()

        for file_versions_id, commit_hash, version_path in file_versions_result:
            file_extension = version_path.split('.')[-1]

            # Both paths share the same <file>_<version>_<hash>.<ext> name.
            base_name = "/" + str(file_id) + "_" + str(
                file_versions_id) + "_" + commit_hash + "." + file_extension
            local_file_copy = file_versions_directory + base_name
            parsed_file_output = parsed_files_directory + base_name
            # srcml invoked with an argument list, no shell involved.
            subprocess.call(
                ["srcml", local_file_copy, "-o", parsed_file_output])

            cursor.execute(
                "update file_versions set has_parsed_file = true where id = %s",
                (file_versions_id, ))
            connection.commit()
    connection.close()
# Exemple #24
# 0
def treat_comment_text(repository_id):
    """Normalise comment_text into treated_comment_text for every processed
    comment of *repository_id* that has not been treated yet."""
    before = timeit.default_timer()

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute(
        "select comment_text, id from processed_comments where treated_comment_text is null and repository_id = %s",
        (repository_id, ))
    pending_comments = cursor.fetchall()

    # Tokens are stripped in this exact order — it matters ('\n' is removed
    # before '\r\n' could match, '//' before the javadoc markers, etc.).
    removal_tokens = ('\n', '\r\n', '\r', '\t', '//', '/**', '*/', '/*',
                      '*', ',', ':', '...', ';')

    treated = []
    for comment_text, comment_id in pending_comments:
        text = comment_text.lower()
        for token in removal_tokens:
            text = text.replace(token, '')
        # Collapse all remaining whitespace runs to single spaces.
        treated.append((" ".join(text.split()), comment_id))

    total_comments = len(treated)

    for position, (formatted_comment, comment_id) in enumerate(treated,
                                                               start=1):
        cursor.execute(
            "update processed_comments set treated_comment_text = %s where id = %s",
            (formatted_comment, comment_id))
        connection.commit()
        print(position, "out of: ", total_comments)

    connection.close()
    after = timeit.default_timer()
    print(after - before)
# Exemple #25
# 0
def checkout_file_versions(repository_id, repository_name, master_branch):
    """Check out every not-yet-copied version of every file of the
    repository and keep a local copy per version, marking each as copied."""
    repository_directory = DiretoryConfig.get_parameter('repository_directory') + repository_name

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    cursor.execute("select id from files where repository_id = %s", (repository_id, ))
    files_results = cursor.fetchall()

    for file_line in files_results:
        file_id = file_line[0]
        
        # Reset to the branch tip before walking this file's versions.
        checkout_to_latest_version(repository_name, master_branch)
        file_versions_directory = DiretoryConfig.get_parameter('file_versions_directory') + repository_name
        create_directory(file_versions_directory)

        cursor.execute('select id, commit_hash, version_path from file_versions where file_id = %s and has_local_file is false order by author_date', (file_id, ))
        file_vesions_result = cursor.fetchall()

        for file_versions_line in file_vesions_result:
            file_versions_id = file_versions_line[0]
            commit_hash = file_versions_line[1]
            version_path = file_versions_line[2]
            file_extension = version_path.split('.')[-1]

            # Copy target: <dir>/<file_id>_<version_id>_<hash>.<ext>
            git_checkout = "git checkout " + commit_hash
            cp_file = "cp " + version_path + " ../" + file_versions_directory +"/"+ str(file_id)+ "_" + str(file_versions_id) + "_" + commit_hash +"."+  file_extension  

            print (cp_file)

            # NOTE(review): shell=True with interpolated paths — acceptable
            # for trusted local repos, unsafe for untrusted input.
            command = git_checkout + ";" + cp_file
            process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr = subprocess.PIPE, stdin = subprocess.PIPE, shell=True, cwd=repository_directory)
            git_log_output = process.communicate()[0].strip().decode("utf-8").split('\n')

            cursor.execute("update file_versions set has_local_file = true where id = %s", (file_versions_id, ))
            connection.commit()

    connection.close()
# Exemple #26
# 0
def remove_javadoc_comments(repository_id):
    """Return the ids of raw comments worth keeping for *repository_id*.

    Javadoc comments are kept only when they contain one of the configured
    exception words; every non-javadoc comment is kept unconditionally.
    """
    before = timeit.default_timer()
    exception_words_regex = HeuristicHandlerConfig.get_parameter(
        'exception_words_to_remove_javadoc_comments_regex')
    comments_to_keep = []

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    cursor.execute(
        "select id, comment_text, comment_type, comment_format from raw_comments where repository_id = %s",
        (repository_id, ))
    raw_comment_results = cursor.fetchall()
    connection.close()

    print(len(raw_comment_results))
    for raw_comment_id, comment_text, _comment_type, comment_format in raw_comment_results:
        # Equality with 'javadoc' already excludes None, so the previous
        # separate "is not None" test was redundant.
        if comment_format == 'javadoc':
            if re.search(exception_words_regex, comment_text) is not None:
                comments_to_keep.append(raw_comment_id)
        else:
            comments_to_keep.append(raw_comment_id)

    after = timeit.default_timer()
    print(len(comments_to_keep))
    print(after - before)
    return comments_to_keep
def insert_cloned_repo_info(repository_name, master_branch, clone_url=''):
    """Insert a cloned repository's metadata into the repositories table.

    BUG FIX: the original body referenced an undefined name ``clone_url``
    and would raise NameError on every call. It is now an explicit,
    backward-compatible trailing parameter.

    Args:
        repository_name: repository name to store.
        master_branch: name of the repository's default branch.
        clone_url: URL the repository was cloned from (default: empty).
    """
    connection = PSQLConnection.get_connection()
    try:
        cursor = connection.cursor()
        cursor.execute(
            "insert into repositories (name, clone_url, master_branch) values (%s, %s, %s)",
            (repository_name, clone_url, master_branch))
        connection.commit()
    finally:
        # Release the connection even if the insert fails.
        connection.close()
def insert_snapshot_version_info(repository_id, name, version_date, version_order):
    """Store one snapshot (tag) record for the given repository.

    Args:
        repository_id: owning repository's primary key.
        name: tag name.
        version_date: timestamp string in 'YYYY-MM-DD HH24:MI:SS' format.
        version_order: ordinal position of this tag in the version history.
    """
    insert_sql = (
        "insert into tags (repository_id, name, version_date, version_order) "
        "values (%s, %s, to_timestamp(%s, 'YYYY-MM-DD HH24:MI:SS'), %s)"
    )
    db = PSQLConnection.get_connection()
    db.cursor().execute(insert_sql, (repository_id, name, version_date, version_order))
    db.commit()
    db.close()
Example #29
0
def extract_comments(repository_id, repository_name):
    """Parse the srcML output of every parsed file version of a repository
    and insert each comment found into the raw_comments table.

    For each file version it records which kinds of type declarations the
    file contains (class/interface/enum/annotation) plus their line numbers,
    so later heuristics can tell header comments from body comments.

    Args:
        repository_id: primary key of the repository.
        repository_name: directory name of the repository's parsed files.
    """
    srcml_ns = "{http://www.srcML.org/srcML/src}"
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    cursor.execute("select id from files where repository_id = %s ",
                   (repository_id, ))
    files_results = cursor.fetchall()

    for file_line in files_results:
        file_id = file_line[0]
        parsed_files_directory = DiretoryConfig.get_parameter(
            'parsed_files_directory') + repository_name
        cursor.execute(
            'select id, commit_hash, version_path from file_versions where file_id = %s and has_parsed_file is true order by author_date',
            (file_id, ))
        file_vesions_result = cursor.fetchall()

        for file_versions_line in file_vesions_result:
            file_versions_id = file_versions_line[0]
            commit_hash = file_versions_line[1]
            version_path = file_versions_line[2]
            file_extension = version_path.split('.')[-1]

            parsed_file_output = (parsed_files_directory + "/" + str(file_id)
                                  + "_" + str(file_versions_id) + "_"
                                  + commit_hash + "." + file_extension)
            print(parsed_file_output)
            try:
                tree = etree.parse(parsed_file_output)
                root = tree.getroot()
            except Exception as e:
                # BUG FIX: the original printed the error but fell through
                # and reused the previous iteration's `root` (or raised
                # NameError on the first iteration). Skip this version.
                print(e)
                continue

            class_declaration_lines = []
            # srcML element tag -> whether at least one such declaration
            # exists in this file version. Insertion order matches the
            # original scan order (class, interface, enum, annotation).
            declaration_flags = {
                'class': False,
                'interface': False,
                'enum': False,
                'annotation_defn': False,
            }
            for tag in declaration_flags:
                for element in root.iter(srcml_ns + tag):
                    # lxml sourcelines are 1-based; store 0-based lines.
                    class_declaration_lines.append(str(element.sourceline - 1))
                    declaration_flags[tag] = True

            for element in root.iter(srcml_ns + "comment"):
                start_line = element.sourceline - 1
                comment_text = element.text
                comment_type = element.get("type")
                comment_format = element.get("format")

                if comment_type == 'line':
                    end_line = start_line
                else:
                    # Block comment: it ends just above the next sibling
                    # element, or on its own start line if it is the last
                    # element in the document.
                    next_element = element.getnext()
                    if next_element is not None:
                        end_line = next_element.sourceline - 2
                    else:
                        end_line = start_line

                cursor.execute(
                    "insert into raw_comments (repository_id,file_id, file_versions_id, commit_hash, comment_text, comment_type, comment_format, start_line, end_line, has_class_declaration, has_interface_declaration, has_enum_declaration, has_annotation_declaration, class_declaration_lines) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                    (repository_id, file_id, file_versions_id, commit_hash,
                     comment_text, comment_type, comment_format, start_line,
                     end_line, declaration_flags['class'],
                     declaration_flags['interface'],
                     declaration_flags['enum'],
                     declaration_flags['annotation_defn'],
                     ','.join(class_declaration_lines)))
                connection.commit()

    connection.close()
Example #30
0
def execute_git_log_to_get_versions(git_log_command, file_id, file_path, repository_path):
    """Run ``git log`` for one file and insert a file_versions row per commit.

    Accumulates fields (hash, author, date, path) across the log's output
    lines and flushes a row whenever a new commit header appears, plus one
    final flush for the last commit. Rename lines using git's ``=>``
    notation are split into ``older_version_path`` / ``version_path``.

    Args:
        git_log_command: git log command prefix (regex must match its output).
        file_id: primary key of the file in the files table.
        file_path: path appended to the git command.
        repository_path: working directory for the git subprocess.
    """
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    git_log_file_regex = FileHandlerConfig.get_parameter('git_log_file_regex')
    insert_sql = "insert into file_versions (file_id, commit_hash, author_name, author_email, author_date, version_path, older_version_path) values ( %s, %s, %s, %s, to_timestamp(%s, 'Dy Mon DD HH24:MI:SS YYYY +-####'), %s, %s)"

    commit_hash = ''
    author_name = ''
    author_email = ''
    author_date = ''
    version_path = ''
    older_version_path = ''

    command = git_log_command + file_path
    process = subprocess.Popen(command, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE, stdin=subprocess.PIPE,
                               shell=True, cwd=repository_path)
    git_log_output = process.communicate()[0].strip().decode("utf-8").split('\n')

    for git_log_output_line in git_log_output:
        # Drop non-ascii / non-printable characters before matching.
        stripped_line = ''.join(c for c in git_log_output_line
                                if 0 < ord(c) < 127)

        git_log_file_matcher = re.match(git_log_file_regex, stripped_line)
        if git_log_file_matcher is None:
            continue

        if git_log_file_matcher.group(1):
            # New commit header: flush the previously accumulated version.
            # BUG FIX: the original used `commit_hash is not ''` (identity,
            # not equality) which is a SyntaxWarning and fragile.
            if commit_hash != '':
                cursor.execute(insert_sql, (file_id, commit_hash, author_name,
                                            author_email, author_date,
                                            version_path, older_version_path))
                connection.commit()
            commit_hash = git_log_file_matcher.group(1)

        if git_log_file_matcher.group(2):
            author_name = git_log_file_matcher.group(2)
        if git_log_file_matcher.group(3):
            author_email = git_log_file_matcher.group(3)
        if git_log_file_matcher.group(4):
            author_date = git_log_file_matcher.group(4)
        if git_log_file_matcher.group(5):
            full_path = git_log_file_matcher.group(5).strip()
            version_path = full_path
            older_version_path = ''
            if '=>' in version_path:
                print(version_path)
                if '{' in version_path:
                    # Rename in '{old => new}' brace notation: substitute the
                    # braced segment with the old/new halves respectively.
                    sub_string = version_path[version_path.find('{'):
                                              version_path.find('}') + 1]
                    difference_list = sub_string.split('=>')
                    old_part = difference_list[0].strip().replace('{', '').replace('}', '')
                    new_part = difference_list[1].strip().replace('{', '').replace('}', '')
                    if difference_list[0].replace('{', '') == ' ':
                        # '{ => new}': a directory level was added; the old
                        # path must also drop the trailing slash.
                        older_version_path = full_path.replace(sub_string + "/", old_part)
                        version_path = full_path.replace(sub_string, new_part)
                    elif difference_list[1].replace('}', '') == ' ':
                        # '{old => }': a directory level was removed.
                        older_version_path = full_path.replace(sub_string, old_part)
                        version_path = full_path.replace(sub_string + "/", new_part)
                    else:
                        older_version_path = full_path.replace(sub_string, old_part)
                        version_path = full_path.replace(sub_string, new_part)
                else:
                    # Whole-path rename: 'old/path => new/path'.
                    older_version_path = full_path.split('=>')[0].strip()
                    version_path = full_path.split('=>')[1].strip()

    # Flush the last accumulated commit (no header line follows it).
    # FIX: guard against inserting an all-empty row when git log matched
    # nothing at all.
    if commit_hash != '':
        cursor.execute(insert_sql, (file_id, commit_hash, author_name,
                                    author_email, author_date, version_path,
                                    older_version_path))
        connection.commit()
    # FIX: the original never closed the connection.
    connection.close()
def extract_comments(repository_id, repository_name):
    """Parse srcML output for each parsed file version of a repository and
    insert every comment into raw_comments.

    Also records which declaration kinds (class/interface/enum/annotation)
    the file contains and their (0-based) line numbers.

    Args:
        repository_id: primary key of the repository.
        repository_name: directory name under the parsed-files directory.
    """
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    cursor.execute("select id from files where repository_id = %s ", (repository_id, ))
    files_results = cursor.fetchall()

    for file_line in files_results:
        file_id = file_line[0]
        parsed_files_directory = DiretoryConfig.get_parameter('parsed_files_directory') + repository_name
        cursor.execute('select id, commit_hash, version_path from file_versions where file_id = %s and has_parsed_file is true order by author_date', (file_id, ))
        file_vesions_result = cursor.fetchall()

        for file_versions_line in file_vesions_result:

            class_declaration_lines = []
            has_class_declaration = False
            has_interface_declaration = False
            has_enum_declaration = False
            has_annotation_declaration = False

            file_versions_id = file_versions_line[0]
            commit_hash = file_versions_line[1]
            version_path = file_versions_line[2]
            file_extension = version_path.split('.')[-1]

            parsed_file_output = parsed_files_directory + "/" + str(file_id) + "_" + str(file_versions_id) + "_" + commit_hash + "." + file_extension
            print(parsed_file_output)
            try:
                tree = etree.parse(parsed_file_output)
                root = tree.getroot()
            except Exception as e:
                # BUG FIX: originally execution fell through after a failed
                # parse and reused the previous iteration's `root` (NameError
                # on the first iteration). Skip this file version instead.
                print(e)
                continue

            # lxml sourcelines are 1-based; all line numbers stored 0-based.
            for element in root.iter("{http://www.srcML.org/srcML/src}class"):
                class_declaration_lines.append(str(element.sourceline - 1))
                has_class_declaration = True

            for element in root.iter("{http://www.srcML.org/srcML/src}interface"):
                class_declaration_lines.append(str(element.sourceline - 1))
                has_interface_declaration = True

            for element in root.iter("{http://www.srcML.org/srcML/src}enum"):
                class_declaration_lines.append(str(element.sourceline - 1))
                has_enum_declaration = True

            for element in root.iter("{http://www.srcML.org/srcML/src}annotation_defn"):
                class_declaration_lines.append(str(element.sourceline - 1))
                has_annotation_declaration = True

            for element in root.iter("{http://www.srcML.org/srcML/src}comment"):
                start_line = element.sourceline - 1
                comment_text = element.text
                comment_type = element.get("type")
                comment_format = element.get("format")

                if comment_type == 'line':
                    end_line = start_line
                else:
                    # Block comment ends just above the next sibling element,
                    # or on its own start line if it is the last element.
                    next_element = element.getnext()
                    if next_element is not None:
                        end_line = next_element.sourceline - 2
                    else:
                        end_line = start_line

                cursor.execute("insert into raw_comments (repository_id,file_id, file_versions_id, commit_hash, comment_text, comment_type, comment_format, start_line, end_line, has_class_declaration, has_interface_declaration, has_enum_declaration, has_annotation_declaration, class_declaration_lines) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", (repository_id, file_id, file_versions_id, commit_hash, comment_text, comment_type, comment_format, start_line, end_line, has_class_declaration, has_interface_declaration, has_enum_declaration, has_annotation_declaration, ','.join(class_declaration_lines)))
                connection.commit()

    connection.close()
Example #32
0
def search_authors(repository_id, repository_name):
    before = timeit.default_timer()
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor() 
    cursor.execute("select file_id, treated_comment_text from processed_comments where repository_id = %s and td_classification != 'WITHOUT_CLASSIFICATION' group by 1,2 order by 1   ", (repository_id, ))
    files = cursor.fetchall()
    
    for file in files:        
        file_id = file[0]
        treated_comment_text = file[1]
        print("file id:", file_id)
        print("treated_comment_text:", treated_comment_text)        

        iteration_counter = 0
        has_removed_version = False
        is_introduced_version = False
        removed_version_commit_hash = ''
        introduced_version_commit_hash = ''
        introduced_version_processed_comment_id = ''

        cursor.execute("select a.id, b.author_date, b.commit_hash, b.author_name from processed_comments a, file_versions b where a.file_versions_id = b.id and a.file_id = %s and a.treated_comment_text = %s order by 1", (file_id, treated_comment_text))
        all_file_versions = cursor.fetchall()

        for file_version_line in all_file_versions:
            iteration_counter = iteration_counter + 1
            processed_comment_id = file_version_line[0]
            author_date = file_version_line[1]
            commit_hash = file_version_line[2]
            author_name = file_version_line[3]

            if introduced_version_commit_hash == '':
                is_introduced_version = True
                introduced_version_commit_hash = commit_hash
                introduced_version_processed_comment_id = processed_comment_id
            else:
                is_introduced_version = False
    
            cursor.execute("update processed_comments set introduced_version_commit_hash = %s, is_introduced_version = %s, introduced_version_author = %s, introduced_version_date = %s where id = %s", (introduced_version_commit_hash, is_introduced_version, author_name, author_date, processed_comment_id))
            connection.commit()
 
            if iteration_counter == len(all_file_versions):
                cursor.execute ("select id, commit_hash, author_name, author_date from file_versions where file_id = %s and author_date > %s order by author_date", (file_id, author_date))
                remaining_file_versions = cursor.fetchall()

                if len(remaining_file_versions) > 0:
                    removed_version_commit_hash = remaining_file_versions[0][1]
                    removed_version_author = remaining_file_versions[0][2]
                    removed_version_date = remaining_file_versions[0][3]
                    has_removed_version = True

                    cursor.execute("update processed_comments set removed_version_commit_hash = %s, has_removed_version = %s, removed_version_author = %s, removed_version_date = %s where id = %s", (removed_version_commit_hash, has_removed_version, removed_version_author, removed_version_date, introduced_version_processed_comment_id))
                    connection.commit()
                else:
                    cursor.execute("select deletion_commit_hash from files where id = %s", (file_id,))
                    file_commit_hash_result = cursor.fetchone()

                    if file_commit_hash_result[0] is not None:
                        repository_directory = DiretoryConfig.get_parameter('repository_directory') + repository_name
                        git_log_file_regex = TDAuthorsHandlerConfig.get_parameter('git_log_file_regex')

                        removed_version_commit_hash = file_commit_hash_result[0]
                        has_removed_version = True
                
                        git_log = "git log -1 " + removed_version_commit_hash
                        process = subprocess.Popen(git_log, stdout=subprocess.PIPE, shell=True, cwd= repository_directory)
                        proc_stdout = process.communicate()[0].strip().decode('utf-8').split('\n')
                        
                        for proc_stdout_line in proc_stdout:   
                            git_log_file_matcher =  re.match(git_log_file_regex, proc_stdout_line)    
                            if git_log_file_matcher is not None:
                                if git_log_file_matcher.group(2):
                                    git_commit_author = git_log_file_matcher.group(2)
                                if git_log_file_matcher.group(4):
                                    git_commit_date = git_log_file_matcher.group(4)
                            
                        cursor.execute("update processed_comments set removed_version_commit_hash = %s, has_removed_version = %s, removed_version_author = %s, removed_version_date = to_timestamp(%s, 'Dy Mon DD HH24:MI:SS YYYY +-####') where id = %s", (removed_version_commit_hash, has_removed_version, git_commit_author, git_commit_date, introduced_version_processed_comment_id))
                        connection.commit()

                    else:
                        cursor.execute("update processed_comments set has_removed_version = %s where id = %s", (has_removed_version, introduced_version_processed_comment_id))
                        connection.commit()