def has_to_clone_repository(clone_url):
    """Return True when *clone_url* is not yet recorded in the repositories table."""
    db = PSQLConnection.get_connection()
    db_cursor = db.cursor()
    db_cursor.execute("select count(*) from repositories where clone_url = %s",
                      (clone_url,))
    existing_rows = db_cursor.fetchone()[0]
    db.close()
    # zero matching rows means the repository still has to be cloned
    return existing_rows == 0
def remove_license_comments(comments_to_keep):
    """Drop license-header comments from *comments_to_keep*.

    A comment is considered a license header when it ends before the first
    class declaration line of its file and contains none of the configured
    exception words.  The list is filtered in place (callers holding a
    reference see the update) and also returned, preserving the original
    contract.

    Fix: the original called ``list.remove`` once per dropped id, which is
    O(n) per call (O(n^2) overall); ids are now collected in a set and the
    list rebuilt in one pass.
    """
    before = timeit.default_timer()
    print(len(comments_to_keep))
    exception_words_to_remove_license_comments_regex = \
        HeuristicHandlerConfig.get_parameter(
            'exception_words_to_remove_license_comments_regex')
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute(
        "select id, comment_text, end_line, class_declaration_lines from raw_comments where id in %s",
        [tuple(comments_to_keep), ])
    raw_comment_results = cursor.fetchall()
    connection.close()
    ids_to_drop = set()
    for raw_comment_id, comment_text, end_line, declaration_lines in raw_comment_results:
        # class_declaration_lines is a comma-joined string; only the first
        # declaration line matters here
        class_declaration_line = [int(i) for i in declaration_lines.split(',')][0]
        if end_line < class_declaration_line:
            # keep the comment only when it mentions an exception word
            if re.search(exception_words_to_remove_license_comments_regex,
                         comment_text) is None:
                ids_to_drop.add(raw_comment_id)
    # slice assignment keeps the same list object mutated in place
    comments_to_keep[:] = [c for c in comments_to_keep if c not in ids_to_drop]
    print(len(comments_to_keep))
    after = timeit.default_timer()
    print(after - before)
    return comments_to_keep
def remove_commented_source_code(comments_to_keep):
    """Drop comments that match the commented-source-code regex.

    Filters *comments_to_keep* in place and returns it (same contract as
    before).  Fix: replaces the per-match ``list.remove`` (O(n) each,
    O(n^2) overall) with a set of ids to drop and a single rebuild pass.
    """
    before = timeit.default_timer()
    print(len(comments_to_keep))
    commented_source_code_regex = HeuristicHandlerConfig.get_parameter(
        'commented_source_code_regex')
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute("select id, comment_text from raw_comments where id in %s",
                   [tuple(comments_to_keep), ])
    raw_comment_results = cursor.fetchall()
    connection.close()
    ids_to_drop = set()
    for raw_comment_id, comment_text in raw_comment_results:
        if re.search(commented_source_code_regex, comment_text) is not None:
            ids_to_drop.add(raw_comment_id)
    # in-place slice assignment: callers holding a reference see the update
    comments_to_keep[:] = [c for c in comments_to_keep if c not in ids_to_drop]
    print(len(comments_to_keep))
    after = timeit.default_timer()
    print(after - before)
    return comments_to_keep
def remove_javadoc_comments(repository_id):
    """Return the ids of raw comments worth keeping for *repository_id*.

    Every non-javadoc comment is kept; a javadoc comment is kept only when
    it contains at least one configured exception word.

    Fixes: removed the unused ``comment_type`` local and the redundant
    ``comment_format is not None`` check (equality with a string literal
    already excludes None).
    """
    before = timeit.default_timer()
    exception_words_regex = HeuristicHandlerConfig.get_parameter(
        'exception_words_to_remove_javadoc_comments_regex')
    comments_to_keep = []
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute(
        "select id, comment_text, comment_type, comment_format from raw_comments where repository_id = %s",
        (repository_id, ))
    raw_comment_results = cursor.fetchall()
    print(len(raw_comment_results))
    for raw_comment_line in raw_comment_results:
        raw_comment_id = raw_comment_line[0]
        comment_text = raw_comment_line[1]
        comment_format = raw_comment_line[3]
        if comment_format == 'javadoc':
            # javadoc blocks survive only when an exception word appears
            if re.search(exception_words_regex, comment_text) is not None:
                comments_to_keep.append(raw_comment_id)
        else:
            comments_to_keep.append(raw_comment_id)
    connection.close()
    after = timeit.default_timer()
    print(len(comments_to_keep))
    print(after - before)
    return comments_to_keep
def treat_comment_text(repository_id):
    """Normalize untreated comment texts and persist them.

    Lowercases each pending comment, strips comment markers and listed
    punctuation, collapses whitespace, and writes the result into
    ``treated_comment_text``.  Commits per row (as before) so progress
    survives an interruption.

    Fixes: ``for x in range(0, total)`` with a hand-maintained counter is
    replaced by ``enumerate``; the 14 chained ``.replace`` calls become a
    loop over the same tokens in the same order (behavior unchanged).
    """
    before = timeit.default_timer()
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute(
        "select comment_text, id from processed_comments where treated_comment_text is null and repository_id = %s",
        (repository_id, ))
    processed_comment_list = cursor.fetchall()
    # stripped in this exact order to reproduce the original replace chain
    tokens_to_strip = ('\n', '\r\n', '\r', '\t', '//', '/**', '*/', '/*',
                       '*', ',', ':', '...', ';')
    formatted_pairs = []
    for comment_text, comment_id in processed_comment_list:
        text = comment_text.lower()
        for token in tokens_to_strip:
            text = text.replace(token, '')
        formatted_pairs.append((" ".join(text.split()), comment_id))
    total_comments = len(formatted_pairs)
    for progress_counter, (treated_text, comment_id) in enumerate(
            formatted_pairs, start=1):
        cursor.execute(
            "update processed_comments set treated_comment_text = %s where id = %s",
            (treated_text, comment_id))
        connection.commit()
        print(progress_counter, "out of: ", total_comments)
    connection.close()
    after = timeit.default_timer()
    print(after - before)
def remove_commented_source_code(comments_to_keep):
    """Filter out comments that look like commented-out source code.

    Mutates *comments_to_keep* in place and returns it, as the original
    did.  Fix: the per-match ``list.remove`` (quadratic overall) is
    replaced by one set lookup per id.
    """
    before = timeit.default_timer()
    print(len(comments_to_keep))
    commented_source_code_regex = HeuristicHandlerConfig.get_parameter(
        'commented_source_code_regex')
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute("select id, comment_text from raw_comments where id in %s", [
        tuple(comments_to_keep),
    ])
    raw_comment_results = cursor.fetchall()
    connection.close()
    matching_ids = {
        row[0]
        for row in raw_comment_results
        if re.search(commented_source_code_regex, row[1]) is not None
    }
    # rebuild through slice assignment so the caller's list object is updated
    comments_to_keep[:] = [
        comment_id for comment_id in comments_to_keep
        if comment_id not in matching_ids
    ]
    print(len(comments_to_keep))
    after = timeit.default_timer()
    print(after - before)
    return comments_to_keep
def parse_files_using_srcml(repository_id, repository_name):
    """Run srcml over every local file version that is not parsed yet.

    For each unparsed version the XML output is written next to the local
    copy (same name, under the parsed-files directory) and the row is
    flagged ``has_parsed_file``.

    Fix: the two config lookups and ``create_directory`` were executed on
    every iteration of the files loop although they do not depend on the
    file; they are now hoisted out of the loop.
    """
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute("select id from files where repository_id = %s",
                   (repository_id, ))
    files_results = cursor.fetchall()
    # loop-invariant: directories depend only on the repository name
    file_versions_directory = DiretoryConfig.get_parameter(
        'file_versions_directory') + repository_name
    parsed_files_directory = DiretoryConfig.get_parameter(
        'parsed_files_directory') + repository_name
    create_directory(parsed_files_directory)
    for file_line in files_results:
        file_id = file_line[0]
        cursor.execute(
            'select id, commit_hash, version_path from file_versions where file_id = %s and has_parsed_file is false order by author_date',
            (file_id, ))
        for file_versions_id, commit_hash, version_path in cursor.fetchall():
            file_extension = version_path.split('.')[-1]
            # local copies are named <file>_<version>_<commit>.<ext>
            base_name = str(file_id) + "_" + str(file_versions_id) + "_" + \
                commit_hash + "." + file_extension
            local_file_copy = file_versions_directory + "/" + base_name
            parsed_file_output = parsed_files_directory + "/" + base_name
            subprocess.call(["srcml", local_file_copy, "-o", parsed_file_output])
            cursor.execute(
                "update file_versions set has_parsed_file = true where id = %s",
                (file_versions_id, ))
            connection.commit()
    connection.close()
def remove_license_comments(comments_to_keep):
    """Remove license-header comments from *comments_to_keep*.

    A license header is a comment ending before the file's first class
    declaration line that matches none of the configured exception words.
    The list is mutated in place and returned.

    Fix: ids to drop are gathered into a set and the list rebuilt once,
    instead of the original quadratic ``list.remove`` per match.
    """
    before = timeit.default_timer()
    print(len(comments_to_keep))
    exception_words_to_remove_license_comments_regex = HeuristicHandlerConfig.get_parameter(
        'exception_words_to_remove_license_comments_regex')
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute(
        "select id, comment_text, end_line, class_declaration_lines from raw_comments where id in %s",
        [
            tuple(comments_to_keep),
        ])
    raw_comment_results = cursor.fetchall()
    connection.close()
    license_ids = set()
    for row in raw_comment_results:
        comment_id, comment_text, end_line, declaration_csv = row
        # only the first declaration line of the comma-joined list matters
        first_declaration_line = int(declaration_csv.split(',')[0])
        if end_line < first_declaration_line:
            if re.search(exception_words_to_remove_license_comments_regex,
                         comment_text) is None:
                license_ids.add(comment_id)
    # keep the same list object so callers see the in-place filtering
    comments_to_keep[:] = [
        c for c in comments_to_keep if c not in license_ids
    ]
    print(len(comments_to_keep))
    after = timeit.default_timer()
    print(after - before)
    return comments_to_keep
def insert_cloned_repo_info(repository_name, master_branch, clone_url=None):
    """Insert a row for a freshly cloned repository.

    Bug fix: the original body referenced ``clone_url`` although it was
    neither a parameter nor a local, raising NameError at call time
    (unless a same-named global happened to exist).  It is now an explicit
    optional parameter, so existing two-argument callers keep working.
    """
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute(
        "insert into repositories (name, clone_url, master_branch) values (%s, %s, %s)",
        (repository_name, clone_url, master_branch))
    connection.commit()
    connection.close()
def has_to_clone_repository(clone_url):
    """Tell whether *clone_url* still needs cloning (no repositories row yet)."""
    conn = PSQLConnection.get_connection()
    count_cursor = conn.cursor()
    count_cursor.execute(
        "select count(*) from repositories where clone_url = %s",
        (clone_url, ))
    row_count = count_cursor.fetchone()[0]
    conn.close()
    if row_count == 0:
        return True
    return False
def merge_line_comments(repository_id):
    """Merge runs of consecutive single-line comments into one comment.

    For every file version of *repository_id*, line comments are read in
    end_line order; whenever two successive comments sit on adjacent lines
    (end_line difference of exactly -1), the second's text is appended to
    the first, the first row is updated (end_line, text, format set to
    'multiline') and the second row is deleted.
    """
    before = timeit.default_timer()
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute(
        "select distinct(file_versions_id) from processed_comments where repository_id = %s ",
        (repository_id, ))
    file_versions = cursor.fetchall()
    for file_version in file_versions:
        file_versions_id = file_version[0]
        print("file version:", file_versions_id)
        # end_line ordering makes adjacent comments neighbors in the cursor
        cursor.execute(
            "select id, comment_text, end_line from processed_comments where file_versions_id = %s and comment_type = 'line' order by end_line",
            (file_versions_id, ))
        sorted_comments = cursor.fetchall()
        iterator = iter(sorted_comments)
        comment = next(iterator, None)
        while comment is not None:
            # print(comment[2])
            next_comment = next(iterator, None)
            if next_comment is None:
                break
            # print(next_comment[2])
            comment_id = comment[0]
            comment_message = comment[1]
            # adjacent when end lines differ by exactly one; keep extending
            # the chain while that holds
            while comment[2] - next_comment[2] == -1:
                print(comment_id)
                # print (comment_message)
                comment_message = comment_message + " " + next_comment[1]
                new_end_line = next_comment[2]
                print("new end line:", new_end_line)
                print("new commit message:", comment_message)
                # all merged text is accumulated onto the FIRST comment row
                cursor.execute(
                    "update processed_comments set end_line = %s, comment_text= %s, comment_format = 'multiline' where id = %s",
                    (new_end_line, comment_message, comment_id))
                cursor.execute("delete from processed_comments where id = %s",
                               (next_comment[0], ))
                connection.commit()
                comment = next_comment
                next_comment = next(iterator, None)
                if next_comment is None:
                    break
            else:
                # while-else: chain ended without the iterator running dry;
                # advance the outer scan to the comment that broke the chain
                comment = next_comment
    after = timeit.default_timer()
    print(after - before)
def insert_snapshot_version_info(repository_id, name, version_date,
                                 version_order):
    """Persist one tag/snapshot row for the given repository."""
    conn = PSQLConnection.get_connection()
    tag_cursor = conn.cursor()
    insert_sql = "insert into tags (repository_id, name, version_date, version_order) values (%s, %s, to_timestamp(%s, 'YYYY-MM-DD HH24:MI:SS'), %s)"
    tag_cursor.execute(insert_sql,
                       (repository_id, name, version_date, version_order))
    conn.commit()
    conn.close()
def fetch_repositories(repo_list = tuple([])):
    """Fetch repository rows: all of them when *repo_list* is empty,
    otherwise only those whose name appears in *repo_list*."""
    conn = PSQLConnection.get_connection()
    repo_cursor = conn.cursor()
    if repo_list:
        repo_cursor.execute(
            "select id, name, master_branch, clone_url, cloned_date from repositories where name in %s",
            [tuple(repo_list), ])
    else:
        repo_cursor.execute(
            "select id, name, master_branch, clone_url, cloned_date from repositories order by 1")
    rows = repo_cursor.fetchall()
    conn.close()
    return rows
def fetch_repositories(repo_list = tuple([])):
    """Return repository rows, optionally restricted to names in *repo_list*."""
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    # choose the statement first, execute once below
    if not repo_list:
        query = "select id, name, master_branch, clone_url, cloned_date from repositories order by 1 "
        cursor.execute(query)
    else:
        query = "select id, name, master_branch, clone_url, cloned_date from repositories where name in %s"
        cursor.execute(query, [tuple(repo_list), ])
    fetched_rows = cursor.fetchall()
    connection.close()
    return fetched_rows
def generate_training_dataset():
    """Export manually classified comments into the NLP training dataset file.

    Reads the configured classification types, selects the matching rows
    from ``manually_classified_comments`` and writes them with
    ``write_formated_file``.

    Fix: the database connection was never closed (resource leak); rows
    are now fetched first and the connection closed before writing.
    """
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    default_nlp_path = DiretoryConfig.get_parameter('nlp_directory')
    training_dataset_path = default_nlp_path + NLPHandlerConfig.get_parameter(
        'training_dataset_name')
    classification_types = NLPHandlerConfig.get_parameter(
        'classification_types')
    cursor.execute(
        "select classification, treated_comment_text from manually_classified_comments where classification in %s",
        [tuple(classification_types), ])
    rows = cursor.fetchall()
    connection.close()
    write_formated_file(training_dataset_path, rows)
def insert_processed_comments(comments_to_keep):
    """Copy the surviving raw comments (ids in *comments_to_keep*) into
    processed_comments via a single INSERT ... SELECT."""
    start_time = timeit.default_timer()
    conn = PSQLConnection.get_connection()
    copy_cursor = conn.cursor()
    copy_sql = "insert into processed_comments(id, repository_id, file_id, file_versions_id, commit_hash, comment_text, comment_type, comment_format, start_line, end_line, has_class_declaration, has_interface_declaration, has_enum_declaration, has_annotation_declaration, class_declaration_lines) select id, repository_id, file_id, file_versions_id, commit_hash, comment_text, comment_type, comment_format, start_line, end_line, has_class_declaration, has_interface_declaration, has_enum_declaration, has_annotation_declaration, class_declaration_lines from raw_comments where id in %s"
    copy_cursor.execute(copy_sql, [tuple(comments_to_keep), ])
    conn.commit()
    conn.close()
    end_time = timeit.default_timer()
    print(end_time - start_time)
def classify_comments(repository_id):
    """Classify pending comments with the Stanford NLP classifier.

    For each file version: dump unclassified comments to the test dataset
    file, run the Java classifier over it, then match each comment's
    treated text against the classifier output lines and store the
    predicted classification in ``td_classification``.

    Fixes: ``re.findall`` never returns None, so the original
    ``is not None`` check was always true and an empty result raised
    IndexError — it now tests list length; ``.group(1)`` was called on a
    possibly-None match object — non-matching lines are now skipped; the
    unused ``output_without_comment`` and ``golden_anwser`` locals were
    removed.
    """
    default_nlp_path = DiretoryConfig.get_parameter('nlp_directory')
    test_dataset_path = default_nlp_path + NLPHandlerConfig.get_parameter(
        'test_dataset_name')
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute(
        "select distinct(file_versions_id) from processed_comments where repository_id = %s",
        (repository_id, ))
    file_versions = cursor.fetchall()
    for file_version in file_versions:
        before = timeit.default_timer()
        file_versions_id = file_version[0]
        print("file version:", file_versions_id)
        cursor.execute(
            "select 'WITHOUT_CLASSIFICATION' as classification, treated_comment_text, id from processed_comments where file_versions_id = %s and td_classification is null order by end_line",
            (file_versions_id, ))
        all_comments_from_file = cursor.fetchall()
        write_formated_file(test_dataset_path, all_comments_from_file)
        nlp_classifier_memory_use = NLPHandlerConfig.get_parameter(
            'nlp_classifier_memory_use')
        command = 'java ' + nlp_classifier_memory_use + ' -jar stanford-classifier.jar -prop ./dataset.prop -1.useSplitWords -1.splitWordsRegexp "\s"'
        process = subprocess.Popen(command, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE, shell=True,
                                   cwd=default_nlp_path).communicate()
        subprocess.call("rm " + test_dataset_path, shell=True)
        output = process[0].strip().decode("utf-8").split('\n')
        # results = process[1].strip().decode("utf-8").split('\n')
        output_regex = NLPHandlerConfig.get_parameter('output_regex')
        comment_text_exact_regex = NLPHandlerConfig.get_parameter(
            'comment_text_exact_regex')
        for comment in all_comments_from_file:
            treated_comment_text = comment[1]
            comment_id = comment[2]
            for line in output:
                comment_text_exact_matcher = re.match(
                    comment_text_exact_regex, line)
                if comment_text_exact_matcher is None:
                    # line does not carry a comment text; skip it
                    continue
                if treated_comment_text == comment_text_exact_matcher.group(1):
                    output_matcher = re.findall(output_regex, line)
                    # need at least two captures: gold answer and prediction
                    if len(output_matcher) >= 2:
                        nlp_tool_classification = output_matcher[1].replace(
                            '\'', '')
                        cursor.execute(
                            "update processed_comments set td_classification = %s where id = %s ",
                            (nlp_tool_classification, comment_id))
                        connection.commit()
                        break
        after = timeit.default_timer()
        print(after - before)
def merge_line_comments(repository_id):
    """Collapse runs of adjacent single-line comments into one multiline row.

    Line comments of each file version are scanned in end_line order; when
    two successive comments are on consecutive lines, the later text is
    appended to the earlier row (its comment_format becomes 'multiline')
    and the later row is deleted.
    """
    before = timeit.default_timer()
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute(
        "select distinct(file_versions_id) from processed_comments where repository_id = %s ",
        (repository_id, ))
    file_versions = cursor.fetchall()
    for file_version in file_versions:
        file_versions_id = file_version[0]
        print("file version:", file_versions_id)
        # ordering by end_line puts candidates for merging next to each other
        cursor.execute(
            "select id, comment_text, end_line from processed_comments where file_versions_id = %s and comment_type = 'line' order by end_line",
            (file_versions_id, ))
        sorted_comments = cursor.fetchall()
        iterator = iter(sorted_comments)
        comment = next(iterator, None)
        while comment is not None:
            # print(comment[2])
            next_comment = next(iterator, None)
            if next_comment is None:
                break
            # print(next_comment[2])
            comment_id = comment[0]
            comment_message = comment[1]
            # difference of -1 means the next comment starts on the very
            # next line; keep folding it into the current chain
            while comment[2] - next_comment[2] == -1:
                print (comment_id)
                # print (comment_message)
                comment_message = comment_message + " " + next_comment[1]
                new_end_line = next_comment[2]
                print ("new end line:", new_end_line)
                print ("new commit message:", comment_message)
                # the FIRST row of the chain keeps accumulating the text
                cursor.execute(
                    "update processed_comments set end_line = %s, comment_text= %s, comment_format = 'multiline' where id = %s",
                    (new_end_line, comment_message, comment_id))
                cursor.execute("delete from processed_comments where id = %s" ,
                               (next_comment[0], ))
                connection.commit()
                comment = next_comment
                next_comment = next(iterator, None)
                if next_comment is None:
                    break
            else:
                # while-else: chain ended on a gap (no break); resume the
                # outer scan from the comment that ended the chain
                comment = next_comment
    after = timeit.default_timer()
    print (after - before)
def search_deleted_files(repository_id, repository_name, master_branch):
    """Find files deleted from the repository and register their histories.

    Parses ``git log --diff-filter=D --summary`` line by line with the
    configured regex, accumulating commit/author fields across lines.  For
    each deleted path matching the parseable-files regex that is not yet
    in file_versions, a files row is inserted and its pre-deletion history
    extracted via execute_git_log_to_get_versions.

    NOTE(review): *master_branch* is accepted but never used here, and the
    connection is never closed — worth confirming with the callers.
    """
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    repository_directory = DiretoryConfig.get_parameter('repository_directory') + repository_name
    git_deleted_log_file_regex = FileHandlerConfig.get_parameter('git_deleted_log_file_regex')
    file_regex = FileHandlerConfig.get_parameter('parseable_files_regex')
    command = "git log --diff-filter=D --summary"
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr = subprocess.PIPE, stdin = subprocess.PIPE, shell=True, cwd=repository_directory)
    git_log_output = process.communicate()[0].strip().decode("utf-8").split('\n')
    # parser state: fields persist across lines until overwritten by the
    # next commit header
    commit_hash = ''
    author_name = ''
    author_email = ''
    author_date = ''
    version_path = ''
    for git_log_output_line in git_log_output:
        # removes non ascii characters
        stripped = (c for c in git_log_output_line if 0 < ord(c) < 127)
        stripped_line = ''.join(stripped)
        git_log_file_matcher = re.match(git_deleted_log_file_regex, stripped_line)
        if git_log_file_matcher is not None:
            # each regex group may or may not be present on a given line
            if git_log_file_matcher.group(1):
                commit_hash = git_log_file_matcher.group(1)
                # print (commit_hash)
            if git_log_file_matcher.group(2):
                author_name = git_log_file_matcher.group(2)
                # print (author_name)
            if git_log_file_matcher.group(3):
                author_email = git_log_file_matcher.group(3)
                # print (author_email)
            if git_log_file_matcher.group(4):
                author_date = git_log_file_matcher.group(4)
                # print (author_date)
            if git_log_file_matcher.group(5):
                # group(5) is the deleted file's path
                version_path = git_log_file_matcher.group(5)
                file_regex_matcher = re.match(file_regex, version_path)
                if file_regex_matcher is not None:
                    # print (version_path)
                    cursor.execute(
                        "select count(*) from file_versions where older_version_path = %s and commit_hash = %s",
                        (version_path, commit_hash))
                    found_in_database = cursor.fetchone()[0]
                    if found_in_database == 0:
                        print(found_in_database, version_path, commit_hash)
                        file_name = version_path.split('/')[-1]
                        file_id = insert_file(repository_id, file_name,
                                              version_path, commit_hash)
                        if file_id is not None:
                            # "<commit>^" starts the history just BEFORE the
                            # deleting commit
                            execute_git_log_to_get_versions(
                                "git log " + commit_hash + "^ --follow --stat=350 --stat-graph-width=2 -- ",
                                file_id, version_path, repository_directory)
def insert_processed_comments(comments_to_keep):
    """Promote the raw comments whose ids are in *comments_to_keep* to the
    processed_comments table in one INSERT ... SELECT statement."""
    started = timeit.default_timer()
    db = PSQLConnection.get_connection()
    db_cursor = db.cursor()
    db_cursor.execute(
        "insert into processed_comments(id, repository_id, file_id, file_versions_id, commit_hash, comment_text, comment_type, comment_format, start_line, end_line, has_class_declaration, has_interface_declaration, has_enum_declaration, has_annotation_declaration, class_declaration_lines) select id, repository_id, file_id, file_versions_id, commit_hash, comment_text, comment_type, comment_format, start_line, end_line, has_class_declaration, has_interface_declaration, has_enum_declaration, has_annotation_declaration, class_declaration_lines from raw_comments where id in %s",
        [
            tuple(comments_to_keep),
        ])
    db.commit()
    db.close()
    finished = timeit.default_timer()
    print(finished - started)
def insert_file(repository_id, name, absolute_path, deletion_commit_hash = None):
    """Insert a files row unless one with the same name and path already
    exists; return the new row id, or None when nothing was inserted."""
    new_file_id = None
    conn = PSQLConnection.get_connection()
    db_cursor = conn.cursor()
    db_cursor.execute(
        "select count(*) from files where name = %s and file_path = %s ",
        (name, absolute_path))
    already_present = db_cursor.fetchone()[0] > 0
    if not already_present:
        db_cursor.execute(
            "insert into files (repository_id, name, file_path, deletion_commit_hash) values (%s,%s,%s,%s) returning id",
            (repository_id, name, absolute_path, deletion_commit_hash))
        new_file_id = db_cursor.fetchone()[0]
        conn.commit()
    conn.close()
    return new_file_id
def extract_file_versions(repository_id, repository_name):
    """Record every git version of every file of the repository.

    Looks up all files of *repository_id* and delegates the per-file log
    parsing to ``execute_git_log_to_get_versions``.

    Fix: removed the unused ``git_log_file_regex`` config lookup (the
    regex is fetched again inside the helper that actually uses it).
    """
    repository_path = DiretoryConfig.get_parameter(
        'repository_directory') + repository_name
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute('select id, file_path from files where repository_id = %s',
                   (repository_id, ))
    files_results = cursor.fetchall()
    connection.close()
    for file_id, file_path in files_results:
        execute_git_log_to_get_versions(
            "git log --follow --stat=350 --stat-graph-width=2 -- ",
            file_id, file_path, repository_path)
def parse_files_using_srcml(repository_id, repository_name):
    """Produce srcML XML for every unparsed local file version.

    Fix: the directory config lookups and ``create_directory`` call are
    invariant across the files loop (they depend only on the repository
    name) but were re-evaluated every iteration; they are hoisted out.
    """
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute("select id from files where repository_id = %s",
                   (repository_id, ))
    files_results = cursor.fetchall()
    # hoisted: repository-level directories, computed once
    file_versions_directory = DiretoryConfig.get_parameter(
        'file_versions_directory') + repository_name
    parsed_files_directory = DiretoryConfig.get_parameter(
        'parsed_files_directory') + repository_name
    create_directory(parsed_files_directory)
    for file_line in files_results:
        file_id = file_line[0]
        cursor.execute(
            'select id, commit_hash, version_path from file_versions where file_id = %s and has_parsed_file is false order by author_date',
            (file_id, ))
        file_vesions_result = cursor.fetchall()
        for file_versions_line in file_vesions_result:
            file_versions_id = file_versions_line[0]
            commit_hash = file_versions_line[1]
            version_path = file_versions_line[2]
            file_extension = version_path.split('.')[-1]
            # local copy and XML output share the naming scheme
            # <file>_<version>_<commit>.<ext>
            version_file_name = str(file_id) + "_" + str(
                file_versions_id) + "_" + commit_hash + "." + file_extension
            local_file_copy = file_versions_directory + "/" + version_file_name
            parsed_file_output = parsed_files_directory + "/" + version_file_name
            subprocess.call(
                ["srcml", local_file_copy, "-o", parsed_file_output])
            cursor.execute(
                "update file_versions set has_parsed_file = true where id = %s",
                (file_versions_id, ))
            connection.commit()
    connection.close()
def treat_comment_text(repository_id):
    """Fill ``treated_comment_text`` for every pending processed comment.

    Treatment = lowercase, strip comment markers and listed punctuation,
    collapse runs of whitespace to single spaces.  One commit per updated
    row, as before, so interrupted runs keep their progress.

    Fixes: ``enumerate`` replaces the ``range(0, total)`` loop plus a
    hand-kept counter; the long ``.replace`` chain becomes a loop over the
    same tokens in the same order (identical result).
    """
    before = timeit.default_timer()
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute(
        "select comment_text, id from processed_comments where treated_comment_text is null and repository_id = %s",
        (repository_id, ))
    processed_comment_list = cursor.fetchall()
    # same substrings, same order as the original replace chain
    removable = ('\n', '\r\n', '\r', '\t', '//', '/**', '*/', '/*', '*',
                 ',', ':', '...', ';')
    treated_rows = []
    for raw_text, row_id in processed_comment_list:
        cleaned = raw_text.lower()
        for fragment in removable:
            cleaned = cleaned.replace(fragment, '')
        treated_rows.append((" ".join(cleaned.split()), row_id))
    total_comments = len(treated_rows)
    for progress_counter, (treated_text, row_id) in enumerate(treated_rows,
                                                              start=1):
        cursor.execute(
            "update processed_comments set treated_comment_text = %s where id = %s",
            (treated_text, row_id))
        connection.commit()
        print(progress_counter, "out of: ", total_comments)
    connection.close()
    after = timeit.default_timer()
    print(after - before)
def checkout_file_versions(repository_id, repository_name, master_branch):
    """Materialize a local copy of every not-yet-copied file version.

    For each file of the repository: reset the working tree to the latest
    version, then for every version without a local file run
    ``git checkout <commit>`` followed by ``cp`` into the file-versions
    directory, naming the copy <file>_<version>_<commit>.<ext> and
    flagging the row ``has_local_file``.

    NOTE(review): the checkout/cp command is a shell string built from
    database paths — paths containing spaces or shell metacharacters would
    break or be interpreted by the shell; confirm inputs are controlled.
    """
    repository_directory = DiretoryConfig.get_parameter('repository_directory') + repository_name
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute("select id from files where repository_id = %s", (repository_id, ))
    files_results = cursor.fetchall()
    for file_line in files_results:
        file_id = file_line[0]
        # start each file from the tip so the subsequent checkouts are clean
        checkout_to_latest_version(repository_name, master_branch)
        file_versions_directory = DiretoryConfig.get_parameter('file_versions_directory') + repository_name
        create_directory(file_versions_directory)
        cursor.execute('select id, commit_hash, version_path from file_versions where file_id = %s and has_local_file is false order by author_date', (file_id, ))
        file_vesions_result = cursor.fetchall()
        for file_versions_line in file_vesions_result:
            file_versions_id = file_versions_line[0]
            commit_hash = file_versions_line[1]
            version_path = file_versions_line[2]
            file_extension = version_path.split('.')[-1]
            git_checkout = "git checkout " + commit_hash
            # destination is relative to the repository working directory
            cp_file = "cp " + version_path + " ../" + file_versions_directory + "/" + str(file_id) + "_" + str(file_versions_id) + "_" + commit_hash + "." + file_extension
            print (cp_file)
            command = git_checkout + ";" + cp_file
            process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr = subprocess.PIPE, stdin = subprocess.PIPE, shell=True, cwd=repository_directory)
            # communicate() also waits for the checkout+copy to finish
            git_log_output = process.communicate()[0].strip().decode("utf-8").split('\n')
            cursor.execute("update file_versions set has_local_file = true where id = %s", (file_versions_id, ))
            connection.commit()
    connection.close()
def remove_javadoc_comments(repository_id):
    """Select the raw-comment ids of *repository_id* that should be kept.

    Non-javadoc comments are always kept; javadoc comments only when they
    contain a configured exception word.

    Fixes: dropped the unused ``comment_type`` local and the redundant
    None check before comparing ``comment_format`` to 'javadoc'.
    """
    before = timeit.default_timer()
    exception_regex = HeuristicHandlerConfig.get_parameter(
        'exception_words_to_remove_javadoc_comments_regex')
    comments_to_keep = []
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute(
        "select id, comment_text, comment_type, comment_format from raw_comments where repository_id = %s",
        (repository_id, ))
    raw_comment_results = cursor.fetchall()
    print(len(raw_comment_results))
    for row in raw_comment_results:
        raw_comment_id, comment_text = row[0], row[1]
        comment_format = row[3]
        # equality with the literal already excludes None
        if comment_format == 'javadoc':
            if re.search(exception_regex, comment_text) is not None:
                comments_to_keep.append(raw_comment_id)
        else:
            comments_to_keep.append(raw_comment_id)
    connection.close()
    after = timeit.default_timer()
    print(len(comments_to_keep))
    print(after - before)
    return comments_to_keep
def insert_cloned_repo_info(repository_name, master_branch, clone_url=None):
    """Record a cloned repository in the repositories table.

    Bug fix: ``clone_url`` was used in the INSERT without being a
    parameter or local — a NameError unless a like-named global existed.
    It is now an explicit optional third parameter, keeping the original
    two-argument call signature valid.
    """
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute(
        "insert into repositories (name, clone_url, master_branch) values (%s, %s, %s)",
        (repository_name, clone_url, master_branch))
    connection.commit()
    connection.close()
def insert_snapshot_version_info(repository_id, name, version_date, version_order):
    """Store one snapshot/tag row (date parsed by Postgres to_timestamp)."""
    db = PSQLConnection.get_connection()
    db.cursor().execute(
        "insert into tags (repository_id, name, version_date, version_order) values (%s, %s, to_timestamp(%s, 'YYYY-MM-DD HH24:MI:SS'), %s)",
        (repository_id, name, version_date, version_order))
    db.commit()
    db.close()
def extract_comments(repository_id, repository_name):
    """Insert one raw_comments row per comment found in each srcML file.

    Walks every parsed file version of the repository, reads the srcML
    XML, records class/interface/enum/annotation declaration lines, and
    stores each comment element with its computed start/end lines.

    Bug fix: when parsing the XML failed, the original only printed the
    exception and fell through, iterating a stale ``root`` from a previous
    file (or raising NameError on the first file); it now skips that
    version with ``continue``.
    """
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute("select id from files where repository_id = %s ",
                   (repository_id, ))
    files_results = cursor.fetchall()
    for file_line in files_results:
        file_id = file_line[0]
        parsed_files_directory = DiretoryConfig.get_parameter(
            'parsed_files_directory') + repository_name
        cursor.execute(
            'select id, commit_hash, version_path from file_versions where file_id = %s and has_parsed_file is true order by author_date',
            (file_id, ))
        file_vesions_result = cursor.fetchall()
        for file_versions_line in file_vesions_result:
            class_declaration_lines = []
            has_class_declaration = False
            has_interface_declaration = False
            has_enum_declaration = False
            has_annotation_declaration = False
            file_versions_id = file_versions_line[0]
            commit_hash = file_versions_line[1]
            version_path = file_versions_line[2]
            file_extension = version_path.split('.')[-1]
            parsed_file_output = parsed_files_directory + "/" + str(
                file_id) + "_" + str(
                    file_versions_id) + "_" + commit_hash + "." + file_extension
            print(parsed_file_output)
            try:
                tree = etree.parse(parsed_file_output)
                root = tree.getroot()
            except Exception as e:
                print(e)
                # skip this version instead of reusing a stale root
                continue
            # sourceline - 1: presumably aligns srcML's reported line with
            # the original source file — TODO confirm against srcML output
            for element in root.iter("{http://www.srcML.org/srcML/src}class"):
                class_declaration_line = element.sourceline - 1
                class_declaration_lines.append(str(class_declaration_line))
                has_class_declaration = True
            for element in root.iter(
                    "{http://www.srcML.org/srcML/src}interface"):
                class_declaration_line = element.sourceline - 1
                class_declaration_lines.append(str(class_declaration_line))
                has_interface_declaration = True
            for element in root.iter("{http://www.srcML.org/srcML/src}enum"):
                class_declaration_line = element.sourceline - 1
                class_declaration_lines.append(str(class_declaration_line))
                has_enum_declaration = True
            for element in root.iter(
                    "{http://www.srcML.org/srcML/src}annotation_defn"):
                class_declaration_line = element.sourceline - 1
                class_declaration_lines.append(str(class_declaration_line))
                has_annotation_declaration = True
            for element in root.iter(
                    "{http://www.srcML.org/srcML/src}comment"):
                start_line = element.sourceline - 1
                comment_text = element.text
                comment_type = element.get("type")
                comment_format = element.get("format")
                if comment_type == 'line':
                    end_line = start_line
                else:
                    # block comment: the line before the next sibling marks
                    # its end; fall back to start_line when it is the last
                    # element
                    next_element = element.getnext()
                    if next_element is not None:
                        end_line = next_element.sourceline - 2
                    else:
                        end_line = start_line
                cursor.execute(
                    "insert into raw_comments (repository_id,file_id, file_versions_id, commit_hash, comment_text, comment_type, comment_format, start_line, end_line, has_class_declaration, has_interface_declaration, has_enum_declaration, has_annotation_declaration, class_declaration_lines) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                    (repository_id, file_id, file_versions_id, commit_hash,
                     comment_text, comment_type, comment_format, start_line,
                     end_line, has_class_declaration,
                     has_interface_declaration, has_enum_declaration,
                     has_annotation_declaration,
                     ','.join(class_declaration_lines)))
    connection.commit()
    connection.close()
def _split_renamed_path(raw_path):
    """Resolve a git rename marker in *raw_path*.

    `git log` prints renames either in the braced form
    `a/{old => new}/b.java` or in the plain form `old.java => new.java`.
    Returns a tuple `(older_version_path, version_path)`;
    `older_version_path` is '' when the path contains no rename marker.
    """
    version_path = raw_path.strip()
    older_version_path = ''
    if '=>' not in version_path:
        return older_version_path, version_path
    print (version_path)
    if '{' not in version_path:
        # Plain "old => new" form: the whole path changed.
        older_version_path = raw_path.split('=>')[0].strip()
        version_path = raw_path.split('=>')[1].strip()
        return older_version_path, version_path
    # Braced form: only the part between '{' and '}' differs.
    sub_string = version_path[version_path.find('{'): version_path.find('}') + 1]
    difference_list = sub_string.split('=>')
    old_part = sub_string.split('=>')[0].strip().replace('{', '').replace('}', '')
    new_part = sub_string.split('=>')[1].strip().replace('{', '').replace('}', '')
    stripped = raw_path.strip()
    if difference_list[0].replace('{', '') == ' ':
        # "{ => new}": the old side is empty, so the marker and its trailing
        # slash must both be dropped from the old path.
        older_version_path = stripped.replace(sub_string + "/", old_part)
        version_path = stripped.replace(sub_string, new_part)
    elif difference_list[1].replace('}', '') == ' ':
        # "{old => }": the new side is empty.
        older_version_path = stripped.replace(sub_string, old_part)
        version_path = stripped.replace(sub_string + "/", new_part)
    else:
        # "{old => new}": both sides present, plain substitution.
        older_version_path = stripped.replace(sub_string, old_part)
        version_path = stripped.replace(sub_string, new_part)
    return older_version_path, version_path


def execute_git_log_to_get_versions(git_log_command, file_id, file_path, repository_path):
    """Run `git log` for one file and persist every version found.

    Parses the command output line by line with the configured
    `git_log_file_regex` and inserts one row into `file_versions` per
    commit, resolving `{old => new}` rename markers in the path.

    Parameters:
        git_log_command: git log command prefix; `file_path` is appended to it.
        file_id: primary key of the file in the `files` table.
        file_path: repository-relative path of the file.
        repository_path: working directory in which to run git.
    """
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    git_log_file_regex = FileHandlerConfig.get_parameter('git_log_file_regex')
    commit_hash = ''
    author_name = ''
    author_email = ''
    author_date = ''
    version_path = ''
    older_version_path = ''
    command = git_log_command + file_path
    # NOTE(review): shell=True with a concatenated path is shell-injection
    # prone if file_path is ever attacker-controlled -- confirm inputs.
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                               stdin=subprocess.PIPE, shell=True, cwd=repository_path)
    git_log_output = process.communicate()[0].strip().decode("utf-8").split('\n')
    for git_log_output_line in git_log_output:
        # Remove non-ASCII characters before matching.
        stripped_line = ''.join(c for c in git_log_output_line if 0 < ord(c) < 127)
        git_log_file_matcher = re.match(git_log_file_regex, stripped_line)
        if git_log_file_matcher is None:
            continue
        if git_log_file_matcher.group(1):
            # A new commit header: flush the previously accumulated version.
            # Bug fix: the original compared with `commit_hash is not ''`
            # (identity, not equality) -- use `!=`.
            if commit_hash != '':
                cursor.execute("insert into file_versions (file_id, commit_hash, author_name, author_email, author_date, version_path, older_version_path) values ( %s, %s, %s, %s, to_timestamp(%s, 'Dy Mon DD HH24:MI:SS YYYY +-####'), %s, %s)", (file_id, commit_hash, author_name, author_email, author_date, version_path, older_version_path))
                connection.commit()
            commit_hash = git_log_file_matcher.group(1)
        if git_log_file_matcher.group(2):
            author_name = git_log_file_matcher.group(2)
        if git_log_file_matcher.group(3):
            author_email = git_log_file_matcher.group(3)
        if git_log_file_matcher.group(4):
            author_date = git_log_file_matcher.group(4)
        if git_log_file_matcher.group(5):
            older_version_path, version_path = _split_renamed_path(git_log_file_matcher.group(5))
    # Flush the last accumulated version (the loop only flushes when the
    # *next* commit header is seen).
    cursor.execute("insert into file_versions (file_id, commit_hash, author_name, author_email, author_date, version_path, older_version_path) values ( %s, %s, %s, %s, to_timestamp(%s, 'Dy Mon DD HH24:MI:SS YYYY +-####'), %s, %s)", (file_id, commit_hash, author_name, author_email, author_date, version_path, older_version_path))
    connection.commit()
    # Bug fix: the original leaked the DB connection.
    connection.close()
def extract_comments(repository_id, repository_name):
    """Extract comments from the srcML-parsed versions of a repository's files
    and store them in `raw_comments`.

    For each file version that has a parsed srcML file on disk, records every
    comment (with computed start/end lines) together with flags and line
    numbers of the type declarations (class/interface/enum/annotation) found
    in that version.

    Parameters:
        repository_id: primary key of the repository in the DB.
        repository_name: directory name of the repository on disk.
    """
    srcml_ns = "{http://www.srcML.org/srcML/src}"
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute("select id from files where repository_id = %s ", (repository_id, ))
    files_results = cursor.fetchall()
    for file_line in files_results:
        file_id = file_line[0]
        parsed_files_directory = DiretoryConfig.get_parameter('parsed_files_directory') + repository_name
        cursor.execute('select id, commit_hash, version_path from file_versions where file_id = %s and has_parsed_file is true order by author_date', (file_id, ))
        file_vesions_result = cursor.fetchall()
        for file_versions_line in file_vesions_result:
            file_versions_id = file_versions_line[0]
            commit_hash = file_versions_line[1]
            version_path = file_versions_line[2]
            file_extension = version_path.split('.')[-1]
            parsed_file_output = (parsed_files_directory + "/" + str(file_id) + "_" +
                                  str(file_versions_id) + "_" + commit_hash + "." + file_extension)
            print(parsed_file_output)
            try:
                tree = etree.parse(parsed_file_output)
                root = tree.getroot()
            except Exception as e:
                # Bug fix: the original only printed the error and fell
                # through, reusing `root` from the previous iteration (or
                # raising NameError on the first failure). Skip this version.
                print(e)
                continue
            class_declaration_lines = []
            # Scan type declarations. Order (class, interface, enum,
            # annotation_defn) is preserved from the original code because
            # downstream heuristics use the first collected line number.
            declaration_flags = {}
            for tag in ('class', 'interface', 'enum', 'annotation_defn'):
                found = False
                for element in root.iter(srcml_ns + tag):
                    # sourceline is 1-based on the srcML wrapper; -1 maps it
                    # back to the original source file line.
                    class_declaration_lines.append(str(element.sourceline - 1))
                    found = True
                declaration_flags[tag] = found
            for element in root.iter(srcml_ns + "comment"):
                start_line = element.sourceline - 1
                comment_text = element.text
                comment_type = element.get("type")
                comment_format = element.get("format")
                if comment_type == 'line':
                    # Single-line comment: starts and ends on the same line.
                    end_line = start_line
                else:
                    # Block comment: assumed to end just before the next
                    # sibling element; falls back to the start line when the
                    # comment is the last element.
                    next_element = element.getnext()
                    if next_element is not None:
                        end_line = next_element.sourceline - 2
                    else:
                        end_line = start_line
                cursor.execute(
                    "insert into raw_comments (repository_id,file_id, file_versions_id, commit_hash, comment_text, comment_type, comment_format, start_line, end_line, has_class_declaration, has_interface_declaration, has_enum_declaration, has_annotation_declaration, class_declaration_lines) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                    (repository_id, file_id, file_versions_id, commit_hash, comment_text,
                     comment_type, comment_format, start_line, end_line,
                     declaration_flags['class'], declaration_flags['interface'],
                     declaration_flags['enum'], declaration_flags['annotation_defn'],
                     ','.join(class_declaration_lines)))
    connection.commit()
    connection.close()
def search_authors(repository_id, repository_name):
    # Purpose: for every distinct (file, treated comment text) classified as
    # technical debt, record in processed_comments which version introduced
    # the comment and -- when determinable -- which version removed it and who
    # authored that removal.
    before = timeit.default_timer()
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    # Distinct debt comments per file (grouped on file_id, treated_comment_text).
    cursor.execute("select file_id, treated_comment_text from processed_comments where repository_id = %s and td_classification != 'WITHOUT_CLASSIFICATION' group by 1,2 order by 1 ", (repository_id, ))
    files = cursor.fetchall()
    for file in files:
        file_id = file[0]
        treated_comment_text = file[1]
        print("file id:", file_id)
        print("treated_comment_text:", treated_comment_text)
        iteration_counter = 0
        has_removed_version = False
        is_introduced_version = False
        removed_version_commit_hash = ''
        introduced_version_commit_hash = ''
        introduced_version_processed_comment_id = ''
        # All versions in which this exact comment text appears, ordered by
        # processed_comments id (presumably ids follow insertion/author-date
        # order -- TODO confirm).
        cursor.execute("select a.id, b.author_date, b.commit_hash, b.author_name from processed_comments a, file_versions b where a.file_versions_id = b.id and a.file_id = %s and a.treated_comment_text = %s order by 1", (file_id, treated_comment_text))
        all_file_versions = cursor.fetchall()
        for file_version_line in all_file_versions:
            iteration_counter = iteration_counter + 1
            processed_comment_id = file_version_line[0]
            author_date = file_version_line[1]
            commit_hash = file_version_line[2]
            author_name = file_version_line[3]
            # First occurrence = introducing version; subsequent occurrences
            # keep pointing at the same introducing commit hash.
            if introduced_version_commit_hash == '':
                is_introduced_version = True
                introduced_version_commit_hash = commit_hash
                introduced_version_processed_comment_id = processed_comment_id
            else:
                is_introduced_version = False
            cursor.execute("update processed_comments set introduced_version_commit_hash = %s, is_introduced_version = %s, introduced_version_author = %s, introduced_version_date = %s where id = %s", (introduced_version_commit_hash, is_introduced_version, author_name, author_date, processed_comment_id))
            connection.commit()
            # On the last occurrence of the comment, decide whether it was
            # removed afterwards.
            if iteration_counter == len(all_file_versions):
                cursor.execute ("select id, commit_hash, author_name, author_date from file_versions where file_id = %s and author_date > %s order by author_date", (file_id, author_date))
                remaining_file_versions = cursor.fetchall()
                if len(remaining_file_versions) > 0:
                    # The file has later versions without this comment: the
                    # earliest of them is taken as the removing version.
                    removed_version_commit_hash = remaining_file_versions[0][1]
                    removed_version_author = remaining_file_versions[0][2]
                    removed_version_date = remaining_file_versions[0][3]
                    has_removed_version = True
                    cursor.execute("update processed_comments set removed_version_commit_hash = %s, has_removed_version = %s, removed_version_author = %s, removed_version_date = %s where id = %s", (removed_version_commit_hash, has_removed_version, removed_version_author, removed_version_date, introduced_version_processed_comment_id))
                    connection.commit()
                else:
                    # No later version of the file: the comment disappeared
                    # only if the whole file was deleted.
                    cursor.execute("select deletion_commit_hash from files where id = %s", (file_id,))
                    file_commit_hash_result = cursor.fetchone()
                    if file_commit_hash_result[0] is not None:
                        repository_directory = DiretoryConfig.get_parameter('repository_directory') + repository_name
                        git_log_file_regex = TDAuthorsHandlerConfig.get_parameter('git_log_file_regex')
                        removed_version_commit_hash = file_commit_hash_result[0]
                        has_removed_version = True
                        # Ask git for the author and date of the deleting commit.
                        git_log = "git log -1 " + removed_version_commit_hash
                        process = subprocess.Popen(git_log, stdout=subprocess.PIPE, shell=True, cwd= repository_directory)
                        proc_stdout = process.communicate()[0].strip().decode('utf-8').split('\n')
                        for proc_stdout_line in proc_stdout:
                            git_log_file_matcher = re.match(git_log_file_regex, proc_stdout_line)
                            if git_log_file_matcher is not None:
                                # group(2) = author name, group(4) = author date
                                # (per the configured regex -- TODO confirm).
                                if git_log_file_matcher.group(2):
                                    git_commit_author = git_log_file_matcher.group(2)
                                if git_log_file_matcher.group(4):
                                    git_commit_date = git_log_file_matcher.group(4)
                        # NOTE(review): git_commit_author/git_commit_date are
                        # unbound if the regex never matched any output line --
                        # confirm the regex always hits for `git log -1`.
                        cursor.execute("update processed_comments set removed_version_commit_hash = %s, has_removed_version = %s, removed_version_author = %s, removed_version_date = to_timestamp(%s, 'Dy Mon DD HH24:MI:SS YYYY +-####') where id = %s", (removed_version_commit_hash, has_removed_version, git_commit_author, git_commit_date, introduced_version_processed_comment_id))
                        connection.commit()
                    else:
                        # File still exists and has no later versions: the
                        # comment is considered still present (not removed).
                        cursor.execute("update processed_comments set has_removed_version = %s where id = %s", (has_removed_version, introduced_version_processed_comment_id))
                        connection.commit()