def get_timestamp_for_tag(tag, git_repo):
    '''
    Retrieve the timestamp of the commit that a tag resolves to.

    The tag is resolved with git_repo.get_commit_id_for_tag and the
    timestamp is read from the corresponding Commit.

    Input:
        tag (str): the tag
        git_repo (git_explorer.core.Git): the repository where the tag can be found

    Return:
        int: timestamp (use datetime.fromtimestamp(timestamp) for datetime)

    Raises:
        TypeError: if git_repo is not a Git instance or tag is not a str
        ValueError: if tag is not listed by git_repo.get_tags()
    '''
    # isinstance instead of an exact type comparison, so that subclasses of
    # Git and str are accepted as well
    if not isinstance(git_repo, Git):
        raise TypeError('git-repo should be of type git_explorer.core.Git, not {}'.format(type(git_repo)))
    if not isinstance(tag, str):
        raise TypeError('tag must be str, not {}'.format(type(tag)))
    if tag not in git_repo.get_tags():
        raise ValueError('tag {} not found in git_repo'.format(tag))

    commit_id = git_repo.get_commit_id_for_tag(tag)
    commit = Commit(git_repo, commit_id)
    return int(commit.get_timestamp())
def get_commits_between_interval_tags(intervals_tags, git_repo=None, repo_url=None):
    '''
    Finds the commits between intervals tags

    For every (t1, t2) pair, git_repo.get_commits is asked for the commits
    that are ancestors of t2's commit, not ancestors of t1's commit, and
    made since the timestamp of t1's commit.

    Input:
        intervals_tags (list): tags for version intervals, each item a pair (t1, t2)
        git_repo (git_explorer.core.Git): the repository to draw the commits from;
            when None, a repository is created from repo_url and cloned
        repo_url (str): the URL of the repository to draw the commits from
            (only used when git_repo is None)

    Returns:
        list: a list with IDs for commits in the intervals, without
            duplicates, in the order in which they were first found

    Raises:
        TypeError: if git_repo is None and setting up a repository from
            repo_url fails
    '''
    candidate_commits = list()

    # obtain candidate commits with git-explorer
    if git_repo is None:
        try:
            git_repo = Git(repo_url, cache_path=GIT_CACHE)
            git_repo.clone(skip_existing=True)
        except Exception as err:
            # Only ordinary errors are translated; KeyboardInterrupt and
            # SystemExit propagate untouched. The cause is chained so the
            # real failure is still visible in the traceback.
            raise TypeError('git-repo should be of type git_explorer.core.Git, not {}, or repo_url should be a valid github repository url.'.format(type(git_repo))) from err

    for interval_tags in intervals_tags:
        t1, t2 = interval_tags

        #@TODO: one tag before this one
        cid_1 = git_repo.get_commit_id_for_tag(t1)
        c1 = Commit(git_repo, cid_1)
        time_1 = c1.get_timestamp()

        cid_2 = git_repo.get_commit_id_for_tag(t2)
        # NOTE(review): c2 is never used below; kept in case constructing a
        # Commit validates the id -- confirm and remove.
        c2 = Commit(git_repo, cid_2)

        candidates = git_repo.get_commits(since=time_1, ancestors_of=cid_2, exclude_ancestors_of=cid_1, filter_files='*')
        candidate_commits += candidates

    # dict.fromkeys drops duplicates while keeping the first-seen order
    return list(dict.fromkeys(candidate_commits))  #only unique ids
def _select_existing_commit_ids(cursor, commit_ids, repository_url, batch_size=500):
    '''
    Return the ids from commit_ids that are already stored for repository_url.

    The ids and the URL are bound as query parameters, so values containing
    quotes cannot break (or inject into) the statement. The ids are looked
    up in batches to keep the number of bound parameters per statement
    below SQLite's per-statement variable limit.

    Input:
        cursor: a DB-API cursor on the database holding the commits table
        commit_ids (list): the commit ids to look up
        repository_url (str): the repository the commits belong to
        batch_size (int): maximum number of ids bound per statement

    Returns:
        list: the ids found in the commits table
    '''
    existing_ids = []
    for start in range(0, len(commit_ids), batch_size):
        batch = list(commit_ids[start:start + batch_size])
        placeholders = ', '.join('?' * len(batch))
        cursor.execute(
            'SELECT id FROM commits WHERE id IN ({}) AND repository_url = ?'.format(placeholders),
            batch + [repository_url])
        # row[0] works for plain tuples as well as for sqlite3.Row
        existing_ids.extend(row[0] for row in cursor.fetchall())
    return existing_ids


def add_commits_to_database(connection, commit_ids, git_repo=None, repository_url=None, driver=None, with_message_references_content=False, verbose=True):
    '''
    Add commits to the database

    Commits whose id is already stored for the repository are skipped. For
    every remaining commit the message, timestamp, diff, changed files and
    hunks are extracted and inserted into the commits table together with
    their preprocessed forms. A commit that fails to be processed or
    inserted is reported on stdout and skipped; the others are still added.

    Input:
        connection (sqlite3.connection): the connection to the database
        commit_ids (list): a list of commit_ids (a single str is treated as one id)
        git_repo (git_explorer.core.Git): to use for extracting the content
        repository_url (str): if git_repo is not provided, a repository url is needed to initialize the git_repo
        driver: a webdriver can be provided to avoid javascript required pages
        with_message_references_content (bool): to add commits references (requires additional time)
        verbose (bool): whether to print progress messages; the messages about
            missing commit ids and about failed commits are printed regardless

    Raises:
        ValueError: if neither git_repo nor repository_url is provided
    '''
    if git_repo is None and repository_url is None:
        raise ValueError('Provide a git_repo or a repository_url')

    if git_repo is None:
        git_repo = Git(repository_url, cache_path=GIT_CACHE)
        git_repo.clone(skip_existing=True)

    if repository_url is None:
        repository_url = git_repo.get_url()
    # normalise the URL: drop one trailing '.git' or one trailing '/'
    repository_url = re.sub(r'\.git$|/$', '', repository_url)

    if isinstance(commit_ids, str):
        commit_ids = [commit_ids]
    # len() rather than truthiness, so array-like inputs keep working
    if len(commit_ids) == 0:
        print('No commit IDs were provided')
        return

    # to not add duplicates
    commit_ids = list(dict.fromkeys(commit_ids))  # to get only unique ids

    cursor = connection.cursor()
    try:
        commits_already_in_the_db = _select_existing_commit_ids(cursor, commit_ids, repository_url)
        already_stored = set(commits_already_in_the_db)
        commits_to_add = [
            commit_id for commit_id in commit_ids
            if commit_id not in already_stored
        ]
        if len(commits_to_add) == 0:
            return

        if verbose:
            print(' {} / {} are already in the database, now adding the rest.'.format(len(commits_already_in_the_db), len(commit_ids)))

        for commit_id in tqdm(commits_to_add):
            try:
                # initialize commit object
                commit = Commit(git_repo, commit_id)

                # message execution is combined with timestamp execution to speed up to process:
                # the format asks for the raw body (%B) followed by the committer
                # timestamp (%ct), so the timestamp is the last element of the output
                message = commit._exec.run(
                    ['git', 'log', '--format=%B%n%ct', '-n1', commit._id])
                timestamp = message.pop(-1)

                diff = commit._exec.run([
                    'git', 'diff', '--unified=1',
                    commit._id + "^.." + commit._id
                ])
                changed_files = get_changed_files_from_diff(diff)
                hunks = get_hunks_from_diff(diff)

                preprocessed_message = rank.simpler_filter_text(message)
                preprocessed_diff = rank.simpler_filter_text(
                    re.sub(
                        '[^A-Za-z0-9]+', ' ', ' '.join(
                            rank.extract_relevant_lines_from_commit_diff(diff))))
                preprocessed_changed_files = rank.simpler_filter_text(
                    changed_files)

                if with_message_references_content:
                    commit_message_reference_content = extract_commit_message_reference_content(
                        message, repository_url, driver)
                    preprocessed_commit_message_reference_content = rank.extract_n_most_occurring_words(
                        commit_message_reference_content, n=20)
                else:
                    commit_message_reference_content, preprocessed_commit_message_reference_content = None, None

                # add to database; a sqlite3 connection used as a context manager
                # commits on success and rolls back when the block raises
                with connection:
                    cursor.execute(
                        "INSERT INTO commits VALUES (:repository_url, :id, :timestamp, :message, :changed_files, :diff, :hunks, :commit_message_reference_content, :preprocessed_message, :preprocessed_diff, :preprocessed_changed_files, :preprocessed_commit_message_reference_content)",
                        {
                            'repository_url': repository_url,
                            'id': commit_id,
                            'timestamp': str(timestamp),
                            'message': str(message),
                            'changed_files': str(changed_files),
                            'diff': str(diff),
                            'hunks': str(hunks),
                            'commit_message_reference_content': commit_message_reference_content,
                            'preprocessed_message': preprocessed_message,
                            'preprocessed_diff': preprocessed_diff,
                            'preprocessed_changed_files': preprocessed_changed_files,
                            'preprocessed_commit_message_reference_content': preprocessed_commit_message_reference_content
                        })
            except Exception:
                # best effort: report the commit and continue with the next one,
                # but let KeyboardInterrupt / SystemExit stop the run
                print(' Failed to add commit {}'.format(commit_id))

        if verbose:
            print(' All commits have been added to the database.')
    finally:
        # release the cursor on every exit path, including errors in the lookup
        cursor.close()
    return
def advisory_record_to_output(advisory_record, model, prospector_cursor, k=20):
    '''
    Build the plain-text Prospector report for an advisory record.

    The report consists of a fixed introduction, the model coefficients per
    ranking-vector column, the advisory record fields, and one entry for
    each of the first k ranked candidate commits.

    Input:
        advisory_record: the record to report on; its id, description,
            published_timestamp, repo_url, relevant_tags, git_repo,
            candidate_commits, ranked_candidate_commits and ranking_vectors
            attributes are read
        model: object whose coef_[0] holds one weight per ranking-vector column
        prospector_cursor: cursor on the database with the commits table;
            fetched rows are accessed by column name
        k (int): number of candidates to show, capped at the number of
            candidate commits

    Returns:
        str: the report text
    '''
    # never announce or list more candidates than there are
    k = min(k, len(advisory_record.candidate_commits))

    record_id = advisory_record.id

    # write as txt output as well
    report = [
        'PROSPECTOR\nA search engine for fix-commits for security vulnerabilities in Open-Source Software\nBy SAP - Antonino SABETTA & Daan HOMMERSOM\n\n',
        f'This file shows the result for the search for fix-commits for vulnerability {record_id}\n',
        'Firstly, an advisory record is created containing information on the vulnerability.\n',
        'This advisory record is used to select candidate commits. For these candidate commits,\n',
        'ranking vectors are computed. These ranking vectors consist of several components that\n',
        'can be used to predict whether a candidate commit is the fix commit we are looking for.\n',
        f'These candidates are then ranked on this probability score, and the first {k} are shown\n',
        'in this file. In 77.68% of the cases, the fix is in the top 5. In 84.03% in the top 10, \n',
        'and in 88.59% in the top 20.',
        '\n\nFEATURES:\n',
        'The message_score, git_diff_score, changed_files_score reflect the lexical similarity with \n',
        'the vulnerability description. The time_distance_before and time_distance_after reflect how \n',
        'much time was between the vulnerability release date and the commit timestamp. The \n',
        'reachability_score reflects whether a commit is reachable from one of the tags mentioned\n',
        'in the vulnerability_description.',
    ]

    # one weight per ranking-vector column, largest first
    weights_table = pd.DataFrame({
        'feature': advisory_record.ranking_vectors.columns,
        'importance': model.coef_[0]
    })
    weights_table = weights_table.set_index('feature')
    weights_table = weights_table.sort_values('importance', ascending=False)
    weights = weights_table.transpose().loc['importance']
    report.append(f'\n\nWEIGHTS (Logistic Regression Coefficients):\n{weights}')

    report.append(f'\n\nADVISORY RECORD - {record_id}')
    report.append(f'\n - Vulnerability description: {advisory_record.description}')
    report.append(f'\n - Published timestamp: {advisory_record.published_timestamp}')
    report.append(f'\n - Repository: {advisory_record.repo_url}')
    report.append(f'\n - Relevant tags: {advisory_record.relevant_tags}')

    report.append(f'\n\nPROSPECTOR RESULTS - {record_id}')

    select_commit = "SELECT message, changed_files, preprocessed_message FROM commits WHERE id = :commit_id AND repository_url = :repo_url"
    for position in range(k):
        # indexed by position (not sliced) so a too-short ranking still raises
        candidate_id = advisory_record.ranked_candidate_commits[position]
        report.append(f'\n\nCandidate {position}: {advisory_record.repo_url}/commit/{candidate_id}')

        tags = Commit(advisory_record.git_repo, candidate_id).get_tags()
        report.append(f'\n - Tag(s): {tags}')

        ranking_vector = advisory_record.ranking_vectors.loc[candidate_id]

        query_parameters = {
            'commit_id': candidate_id,
            'repo_url': advisory_record.repo_url
        }
        row = prospector_cursor.execute(select_commit, query_parameters).fetchone()

        # the stored message is parsed back as a Python literal (an iterable
        # of strings) and joined into a single line
        message_parts = ast.literal_eval(str(row['message']))
        commit_message = ' '.join(message_parts)
        report.append(f"\n - Commit message: {commit_message!r}")
        report.append(f"\n - Ranking vector: \n{ranking_vector}")

    return ''.join(report)
def test_get_changed_files_from_diff(example_vulnerability_git_repo):
    '''
    Check database.get_changed_files_from_diff on the diff of a fixed commit.

    The diff is passed once as returned by the commit's git runner and once
    converted with str(); both must yield ['pom.xml'].
    '''
    sha = 'e4c9304553f2868f67556644f5831eba60cf2c34'
    commit = Commit(example_vulnerability_git_repo, sha)

    # diff of the commit against its first parent, with one line of context
    revision_range = sha + "^.." + sha
    diff_output = commit._exec.run(['git', 'diff', '--unified=1', revision_range])

    expected_files = ['pom.xml']
    assert database.get_changed_files_from_diff(diff_output) == expected_files
    assert database.get_changed_files_from_diff(str(diff_output)) == expected_files