Example #1
0
def get_timestamp_for_tag(tag, git_repo):
    '''
    Retrieve the timestamp of the commit that a tag resolves to.

    Input:
        tag (str): the tag
        git_repo (git_explorer.core.Git): the repository where the tag can be found

    Return:
        int: timestamp (use datetime.fromtimestamp(timestamp) for datetime)

    Raises:
        TypeError: if git_repo is not a Git instance or tag is not a str
        ValueError: if tag is not among git_repo.get_tags()
    '''
    # isinstance (rather than an exact type comparison) also accepts subclasses.
    if not isinstance(git_repo, Git):
        raise TypeError('git-repo should be of type git_explorer.core.Git, not {}'.format(type(git_repo)))
    if not isinstance(tag, str):
        raise TypeError('tag must be str, not {}'.format(type(tag)))
    if tag not in git_repo.get_tags():
        raise ValueError('tag {} not found in git_repo'.format(tag))

    # The tag is first resolved to a commit id; the timestamp is read from that commit.
    commit_id = git_repo.get_commit_id_for_tag(tag)
    commit = Commit(git_repo, commit_id)
    return int(commit.get_timestamp())
Example #2
0
def get_commits_between_interval_tags(intervals_tags, git_repo=None, repo_url=None):
    '''
    Finds the commits between intervals tags

    Input:
        intervals_tags (list): tags for version intervals, as (first_tag, second_tag) pairs
        git_repo (git_explorer.core.Git): the repository to draw the commits from;
            when omitted, one is created from repo_url and cloned
        repo_url (str): the URL of the repository, only used when git_repo is not provided

    Returns:
        list: a list with IDs for commits in the intervals, without duplicates and
            in order of first appearance

    Raises:
        TypeError: if git_repo is not provided and creating or cloning a
            repository from repo_url fails
    '''
    # obtain candidate commits with git-explorer
    if git_repo is None:
        try:
            git_repo = Git(repo_url, cache_path=GIT_CACHE)
            git_repo.clone(skip_existing=True)
        except Exception as error:
            # TypeError is kept as the raised type for existing callers; the
            # original failure stays reachable through exception chaining.
            raise TypeError('git-repo should be of type git_explorer.core.Git, not {}, or repo_url should be a valid github repository url.'.format(type(git_repo))) from error

    candidate_commits = []
    for first_tag, second_tag in intervals_tags:
        #@TODO: one tag before this one
        first_commit_id = git_repo.get_commit_id_for_tag(first_tag)
        since_timestamp = Commit(git_repo, first_commit_id).get_timestamp()

        second_commit_id = git_repo.get_commit_id_for_tag(second_tag)

        # Ancestors of the second tag that are not ancestors of the first one,
        # additionally bounded by the timestamp of the first tag's commit.
        candidate_commits += git_repo.get_commits(
            since=since_timestamp,
            ancestors_of=second_commit_id,
            exclude_ancestors_of=first_commit_id,
            filter_files='*')

    # dict.fromkeys drops duplicates while keeping the first-seen order
    return list(dict.fromkeys(candidate_commits))
def add_commits_to_database(connection,
                            commit_ids,
                            git_repo=None,
                            repository_url=None,
                            driver=None,
                            with_message_references_content=False,
                            verbose=True):
    '''
    Add commits to the database

    Commits whose id is already stored for the repository are skipped. A commit
    that fails to be extracted or inserted is reported on stdout and skipped;
    the remaining commits are still processed.

    Input:
        connection (sqlite3.connection): the connection to the database
        commit_ids (list): a list of commit_ids (a single str is treated as one id)
        git_repo (git_explorer.core.Git): to use for extracting the content
        repository_url (str): if git_repo is not provided, a repository url is needed to initialize the git_repo
        driver: a webdriver can be provided to avoid javascript required pages
        with_message_references_content (bool): to add commits references (requires additional time)
        verbose (bool): whether to print progress messages

    Raises:
        ValueError: if neither git_repo nor repository_url is provided
    '''
    if git_repo is None and repository_url is None:
        raise ValueError('Provide a git_repo or a repository_url')

    if git_repo is None:
        git_repo = Git(repository_url, cache_path=GIT_CACHE)
        git_repo.clone(skip_existing=True)

    if repository_url is None:
        repository_url = git_repo.get_url()
    # normalise the url: strip a trailing ".git" or a trailing "/"
    repository_url = re.sub(r'\.git$|/$', '', repository_url)

    if isinstance(commit_ids, str):
        commit_ids = [commit_ids]
    if len(commit_ids) == 0:
        print('No commit IDs were provided')
        return

    # to not add duplicates
    commit_ids = list(dict.fromkeys(commit_ids))  # to get only unique ids

    # Look up which ids are already stored, using bound parameters instead of
    # formatting values into the SQL text. The lookup is done in batches
    # because SQLite limits the number of bound variables per statement.
    max_ids_per_query = 500
    commits_already_in_the_db = set()
    for start in range(0, len(commit_ids), max_ids_per_query):
        batch = commit_ids[start:start + max_ids_per_query]
        placeholders = ', '.join('?' * len(batch))
        found = pd.read_sql(
            'SELECT id FROM commits WHERE id IN ({}) AND repository_url = ?'.
            format(placeholders),
            connection,
            params=batch + [repository_url])
        commits_already_in_the_db.update(found['id'])

    commits_to_add = [
        commit_id for commit_id in commit_ids
        if commit_id not in commits_already_in_the_db
    ]

    if len(commits_to_add) == 0:
        return

    if verbose:
        print('    {} / {} are already in the database, now adding the rest.'.
              format(len(commits_already_in_the_db), len(commit_ids)))

    cursor = connection.cursor()
    try:
        for commit_id in tqdm(commits_to_add):
            try:
                # initialize commit object
                commit = Commit(git_repo, commit_id)

                # message execution is combined with timestamp execution to speed up to process:
                # the last output line is the timestamp requested through %ct
                message = commit._exec.run(
                    ['git', 'log', '--format=%B%n%ct', '-n1', commit._id])
                timestamp = message.pop(-1)

                # diff of the commit against its first parent
                diff = commit._exec.run([
                    'git', 'diff', '--unified=1',
                    commit._id + "^.." + commit._id
                ])
                changed_files = get_changed_files_from_diff(diff)
                hunks = get_hunks_from_diff(diff)

                preprocessed_message = rank.simpler_filter_text(message)
                preprocessed_diff = rank.simpler_filter_text(
                    re.sub(
                        '[^A-Za-z0-9]+', ' ', ' '.join(
                            rank.extract_relevant_lines_from_commit_diff(
                                diff))))
                preprocessed_changed_files = rank.simpler_filter_text(
                    changed_files)

                if with_message_references_content:
                    commit_message_reference_content = extract_commit_message_reference_content(
                        message, repository_url, driver)
                    preprocessed_commit_message_reference_content = rank.extract_n_most_occurring_words(
                        commit_message_reference_content, n=20)
                else:
                    commit_message_reference_content, preprocessed_commit_message_reference_content = None, None

                # add to database
                with connection:
                    cursor.execute(
                        "INSERT INTO commits VALUES (:repository_url, :id, :timestamp, :message, :changed_files, :diff, :hunks, :commit_message_reference_content, :preprocessed_message, :preprocessed_diff, :preprocessed_changed_files, :preprocessed_commit_message_reference_content)",
                        {
                            'repository_url':
                            repository_url,
                            'id':
                            commit_id,
                            'timestamp':
                            str(timestamp),
                            'message':
                            str(message),
                            'changed_files':
                            str(changed_files),
                            'diff':
                            str(diff),
                            'hunks':
                            str(hunks),
                            'commit_message_reference_content':
                            commit_message_reference_content,
                            'preprocessed_message':
                            preprocessed_message,
                            'preprocessed_diff':
                            preprocessed_diff,
                            'preprocessed_changed_files':
                            preprocessed_changed_files,
                            'preprocessed_commit_message_reference_content':
                            preprocessed_commit_message_reference_content
                        })
            except Exception as error:
                # best effort: report the failing commit and continue with the
                # next one; KeyboardInterrupt/SystemExit are no longer swallowed
                print('    Failed to add commit {}: {}'.format(
                    commit_id, error))
    finally:
        # release the cursor even when the loop is interrupted
        cursor.close()

    if verbose: print('    All commits have been added to the database.')
    return
Example #4
0
def advisory_record_to_output(advisory_record, model, prospector_cursor, k=20):
    '''
    Render the search result for an advisory record as a plain-text report.

    Input:
        advisory_record: object providing id, description, published_timestamp,
            repo_url, relevant_tags, git_repo, candidate_commits,
            ranked_candidate_commits and ranking_vectors
        model: object whose coef_[0] holds one weight per ranking vector column
        prospector_cursor: database cursor used to look up the stored commits
        k (int): number of ranked candidates to show; capped at the number of
            candidate commits

    Returns:
        str: the report text
    '''
    k = min(k, len(advisory_record.candidate_commits))

    # the report is collected piece by piece and joined once at the end
    parts = []

    # introduction
    parts.append(
        'PROSPECTOR\nA search engine for fix-commits for security vulnerabilities in Open-Source Software\nBy SAP - Antonino SABETTA & Daan HOMMERSOM\n\n'
    )
    parts.append(
        'This file shows the result for the search for fix-commits for vulnerability {}\n'
        .format(advisory_record.id))
    parts.append(
        'Firstly, an advisory record is created containing information on the vulnerability.\n'
    )
    parts.append(
        'This advisory record is used to select candidate commits. For these candidate commits,\n'
    )
    parts.append(
        'ranking vectors are computed. These ranking vectors consist of several components that\n'
    )
    parts.append(
        'can be used to predict whether a candidate commit is the fix commit we are looking for.\n'
    )
    parts.append(
        'These candidates are then ranked on this probability score, and the first {} are shown\n'
        .format(k))
    parts.append(
        'in this file. In 77.68% of the cases, the fix is in the top 5. In 84.03% in the top 10, \n'
    )
    parts.append('and in 88.59% in the top 20.')

    # explanation of the features
    parts.append('\n\nFEATURES:\n')
    parts.append(
        'The message_score, git_diff_score, changed_files_score reflect the lexical similarity with \n'
    )
    parts.append(
        'the vulnerability description. The time_distance_before and time_distance_after reflect how \n'
    )
    parts.append(
        'much time was between the vulnerability release date and the commit timestamp. The \n'
    )
    parts.append(
        'reachability_score reflects whether a commit is reachable from one of the tags mentioned\n'
    )
    parts.append('in the vulnerability_description.')

    # model weights per feature, largest first
    weights_table = pd.DataFrame({
        'feature': advisory_record.ranking_vectors.columns,
        'importance': model.coef_[0]
    })
    sorted_weights = weights_table.set_index('feature').sort_values(
        'importance', ascending=False)
    parts.append('\n\nWEIGHTS (Logistic Regression Coefficients):\n{}'.format(
        sorted_weights.transpose().loc['importance']))

    # the advisory record itself
    parts.append('\n\nADVISORY RECORD - {}'.format(advisory_record.id))
    parts.append('\n - Vulnerability description: {}'.format(
        advisory_record.description))
    parts.append('\n - Published timestamp: {}'.format(
        advisory_record.published_timestamp))
    parts.append('\n - Repository: {}'.format(advisory_record.repo_url))
    parts.append('\n - Relevant tags: {}'.format(
        advisory_record.relevant_tags))

    # the first k ranked candidates
    parts.append('\n\nPROSPECTOR RESULTS - {}'.format(advisory_record.id))
    for position in range(k):
        candidate_id = advisory_record.ranked_candidate_commits[position]

        parts.append('\n\nCandidate {}: {}/commit/{}'.format(
            position, advisory_record.repo_url, candidate_id))

        candidate_tags = Commit(advisory_record.git_repo,
                                candidate_id).get_tags()
        parts.append('\n - Tag(s): {}'.format(candidate_tags))

        candidate_vector = advisory_record.ranking_vectors.loc[candidate_id]

        row = prospector_cursor.execute(
            "SELECT message, changed_files, preprocessed_message FROM commits WHERE id = :commit_id AND repository_url = :repo_url",
            {
                'commit_id': candidate_id,
                'repo_url': advisory_record.repo_url
            }).fetchone()

        # the stored message is the text form of a list of lines; parse it back
        # and join the lines with spaces
        message_lines = ast.literal_eval(str(row['message']))
        commit_message = ' '.join(message_lines)

        parts.append("\n - Commit message: {}".format(repr(commit_message)))
        parts.append("\n - Changed files: {}".format(row['changed_files']))
        parts.append("\n - Ranking vector: \n{}".format(candidate_vector))

    return ''.join(parts)
Example #5
0
def test_get_changed_files_from_diff(example_vulnerability_git_repo):
    '''
    get_changed_files_from_diff yields the same file list for the diff as
    returned by the git executor and for its str() form.
    '''
    commit_id = 'e4c9304553f2868f67556644f5831eba60cf2c34'
    diff_command = [
        'git', 'diff', '--unified=1', commit_id + "^.." + commit_id
    ]
    commit = Commit(example_vulnerability_git_repo, commit_id)
    diff_output = commit._exec.run(diff_command)

    expected_files = ['pom.xml']
    # the diff exactly as returned by the executor
    assert database.get_changed_files_from_diff(diff_output) == expected_files
    # the same diff converted to a single string
    assert database.get_changed_files_from_diff(
        str(diff_output)) == expected_files