def search_deleted_files(repository_id, repository_name, master_branch):
    """Register deleted parseable files and import their version history.

    Runs ``git log --diff-filter=D --summary`` inside the repository
    directory and matches every output line against the configured
    ``git_deleted_log_file_regex``.  Group 1 of that regex is taken as the
    commit hash and group 5 as the path of a file reported by the log.  For
    each such path that matches ``parseable_files_regex`` and has no row in
    ``file_versions`` with the same ``older_version_path`` and
    ``commit_hash``, the file is inserted through ``insert_file`` and, when
    an id is returned, its history is loaded with
    ``execute_git_log_to_get_versions`` starting at ``<commit_hash>^``.

    Args:
        repository_id: Forwarded to ``insert_file``.
        repository_name: Appended to the configured ``repository_directory``
            to build the working directory for git.
        master_branch: Not used by this function; kept so existing callers
            keep working.
    """
    repository_directory = DiretoryConfig.get_parameter('repository_directory') + repository_name
    git_deleted_log_file_regex = FileHandlerConfig.get_parameter('git_deleted_log_file_regex')
    file_regex = FileHandlerConfig.get_parameter('parseable_files_regex')

    # The command is constant, so it is passed as an argument list and no
    # shell is involved.
    process = subprocess.Popen(
        ['git', 'log', '--diff-filter=D', '--summary'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        stdin=subprocess.PIPE,
        cwd=repository_directory,
    )
    # Undecodable bytes become U+FFFD, which the ASCII filter below removes
    # together with every other non-ASCII character, so a stray byte in the
    # log no longer aborts the whole scan.
    git_log_output = process.communicate()[0].strip().decode("utf-8", errors="replace").split('\n')

    connection = PSQLConnection.get_connection()
    try:
        cursor = connection.cursor()
        commit_hash = ''

        for git_log_output_line in git_log_output:
            # Keep only characters with code points 1..126.
            stripped_line = ''.join(c for c in git_log_output_line if 0 < ord(c) < 127)

            git_log_file_matcher = re.match(git_deleted_log_file_regex, stripped_line)
            if git_log_file_matcher is None:
                continue

            # The hash is remembered across lines: the file lines that follow
            # a commit header belong to that commit.
            if git_log_file_matcher.group(1):
                commit_hash = git_log_file_matcher.group(1)

            version_path = git_log_file_matcher.group(5)
            if not version_path:
                continue
            if re.match(file_regex, version_path) is None:
                continue

            cursor.execute(
                "select count(*) from file_versions where older_version_path = %s and commit_hash = %s",
                (version_path, commit_hash),
            )
            found_in_database = cursor.fetchone()[0]
            if found_in_database != 0:
                continue

            print(found_in_database, version_path, commit_hash)
            file_name = version_path.split('/')[-1]
            file_id = insert_file(repository_id, file_name, version_path, commit_hash)
            if file_id is not None:
                # "<hash>^" is the parent of the commit reported by the log.
                execute_git_log_to_get_versions(
                    "git log " + commit_hash + "^ --follow --stat=350 --stat-graph-width=2 -- ",
                    file_id,
                    version_path,
                    repository_directory,
                )
    finally:
        # Mirrors extract_file_versions, which also closes the connection it
        # obtained from PSQLConnection.
        connection.close()
def process_parseable_files(repository_id, repository_name):
    """Walk the repository directory and register every parseable file.

    A file is considered parseable when its bare name matches the configured
    ``parseable_files_regex``.  Each match is passed to ``insert_file``
    together with its path with the ``<repository_path>/`` text removed, and
    that path is printed.

    Args:
        repository_id: Forwarded to ``insert_file``.
        repository_name: Appended to the configured ``repository_directory``
            to build the directory that is walked.
    """
    repository_path = DiretoryConfig.get_parameter('repository_directory') + repository_name
    file_regex = FileHandlerConfig.get_parameter('parseable_files_regex')
    path_prefix = repository_path + '/'

    for current_dir, _sub_dirs, file_names in os.walk(repository_path):
        for file_name in file_names:
            if re.match(file_regex, file_name) is None:
                continue
            full_path = os.path.join(current_dir, file_name)
            # Plain text replacement of the repository prefix, not a path
            # computation.
            relative_path = full_path.replace(path_prefix, '')
            insert_file(repository_id, file_name, relative_path)
            print(relative_path)
def extract_file_versions(repository_id, repository_name):
    """Load the git history of every file registered for a repository.

    Reads ``(id, file_path)`` for all rows of ``files`` that belong to
    ``repository_id``, closes the connection, and then calls
    ``execute_git_log_to_get_versions`` once per row with a
    ``git log --follow`` command.

    Args:
        repository_id: Value matched against ``files.repository_id``.
        repository_name: Appended to the configured ``repository_directory``
            to build the working directory handed to git.
    """
    repository_path = DiretoryConfig.get_parameter('repository_directory') + repository_name

    connection = PSQLConnection.get_connection()
    try:
        cursor = connection.cursor()
        cursor.execute('select id, file_path from files where repository_id = %s', (repository_id, ))
        files_results = cursor.fetchall()
    finally:
        # Release the connection even when the query fails; it is not needed
        # while the per-file git commands run.
        connection.close()

    for file_id, file_path in files_results:
        execute_git_log_to_get_versions(
            "git log --follow --stat=350 --stat-graph-width=2 -- ",
            file_id,
            file_path,
            repository_path,
        )
def _split_renamed_path(raw_path):
    """Split a ``git log --stat`` path into ``(version_path, older_version_path)``.

    Handles the three shapes the caller distinguishes:

    * no ``=>``: the path is returned unchanged with an empty older path;
    * ``old => new`` without braces: both sides are stripped and returned;
    * ``prefix/{old => new}/suffix``: the braced section is substituted by
      each side in turn.  When one side is empty (``{ => new}`` or
      ``{old => }``) the ``/`` following the braces is removed as well so
      the resulting path has no doubled separator.
    """
    if '=>' not in raw_path:
        return raw_path, ''

    if '{' not in raw_path:
        sides = raw_path.split('=>')
        return sides[1].strip(), sides[0].strip()

    braced = raw_path[raw_path.find('{'): raw_path.find('}') + 1]
    sides = braced.split('=>')
    old_part = sides[0].strip().replace('{', '').replace('}', '')
    new_part = sides[1].strip().replace('{', '').replace('}', '')

    if sides[0].replace('{', '') == ' ':
        # "{ => new}": the directory did not exist in the older path.
        older_version_path = raw_path.replace(braced + "/", old_part)
        version_path = raw_path.replace(braced, new_part)
    elif sides[1].replace('}', '') == ' ':
        # "{old => }": the directory no longer exists in the newer path.
        older_version_path = raw_path.replace(braced, old_part)
        version_path = raw_path.replace(braced + "/", new_part)
    else:
        older_version_path = raw_path.replace(braced, old_part)
        version_path = raw_path.replace(braced, new_part)
    return version_path, older_version_path


def execute_git_log_to_get_versions(git_log_command, file_id, file_path, repository_path):
    """Run a git log command for one file and store each commit in ``file_versions``.

    The output of ``git_log_command + file_path`` is matched line by line
    against the configured ``git_log_file_regex``.  Groups 1-5 of that regex
    are taken as commit hash, author name, author e-mail, author date and
    the file path of the commit.  A row is written when the next commit
    hash is seen and once more after the last line, so every parsed commit
    produces exactly one insert.

    Args:
        git_log_command: Shell command prefix ending where the path belongs
            (callers in this file pass a string ending in ``"-- "``).
        file_id: Stored in ``file_versions.file_id``.
        file_path: Path appended to the command; shell-quoted before use.
        repository_path: Working directory for the git process.
    """
    # Local import so this block does not depend on the file's import list.
    import shlex

    # NOTE(review): the connection is not closed here.  search_deleted_files
    # calls this function while still using its own cursor; confirm whether
    # PSQLConnection.get_connection() returns a shared connection before
    # adding a close().
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    line_pattern = re.compile(FileHandlerConfig.get_parameter('git_log_file_regex'))

    insert_sql = (
        "insert into file_versions (file_id, commit_hash, author_name, author_email, "
        "author_date, version_path, older_version_path) values ( %s, %s, %s, %s, "
        "to_timestamp(%s, 'Dy Mon DD HH24:MI:SS YYYY +-####'), %s, %s)"
    )

    commit_hash = ''
    author_name = ''
    author_email = ''
    author_date = ''
    version_path = ''
    older_version_path = ''

    # The command goes through a shell, so the path is quoted: file names
    # with spaces or shell metacharacters would otherwise split or inject.
    command = git_log_command + shlex.quote(file_path)
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        stdin=subprocess.PIPE,
        shell=True,
        cwd=repository_path,
    )
    # Undecodable bytes become U+FFFD, which the ASCII filter below drops.
    git_log_output = process.communicate()[0].strip().decode("utf-8", errors="replace").split('\n')

    for git_log_output_line in git_log_output:
        # Keep only characters with code points 1..126.
        stripped_line = ''.join(c for c in git_log_output_line if 0 < ord(c) < 127)

        matcher = line_pattern.match(stripped_line)
        if matcher is None:
            continue

        if matcher.group(1):
            # A new commit header: flush the commit collected so far.
            # Equality/truthiness instead of ``is not ''`` (identity check
            # against a literal).
            if commit_hash:
                cursor.execute(
                    insert_sql,
                    (file_id, commit_hash, author_name, author_email,
                     author_date, version_path, older_version_path),
                )
                connection.commit()
            commit_hash = matcher.group(1)

        if matcher.group(2):
            author_name = matcher.group(2)

        if matcher.group(3):
            author_email = matcher.group(3)

        if matcher.group(4):
            author_date = matcher.group(4)

        if matcher.group(5):
            raw_path = matcher.group(5).strip()
            if '=>' in raw_path:
                print(raw_path)
            version_path, older_version_path = _split_renamed_path(raw_path)

    # Flush the last commit.  Skipped when nothing was parsed (for example
    # empty git output) so no row with an empty hash and date is written.
    if commit_hash:
        cursor.execute(
            insert_sql,
            (file_id, commit_hash, author_name, author_email,
             author_date, version_path, older_version_path),
        )
        connection.commit()