Ejemplo n.º 1
0
def test_add_commits_to_database(example_vulnerability,
                                 example_vulnerability_git_repo):
    """Check that commits can be added to a fresh in-memory commits database.

    Two batches of ten commit ids each are inserted (both with
    ``with_message_references=False``). After each batch the row count of
    the ``commits`` table is checked, and the first stored row is compared
    field by field against the expected values.
    """
    connection, cursor = database.connect_with_database(':memory:')
    commit_ids_to_add = database.get_commit_ids_between_timestamp(
        since=1457823600,
        until=1529532000,
        git_repo=None,
        repository_url=example_vulnerability['repo_url'])

    # first batch: ten commits
    database.add_commits_to_database(
        connection,
        commit_ids_to_add[:10],
        git_repo=example_vulnerability_git_repo,
        repository_url=example_vulnerability['repo_url'],
        with_message_references=False)
    cursor.execute('SELECT COUNT(id) FROM commits')
    assert cursor.fetchone()['COUNT(id)'] == 10

    # verify entries are correct
    cursor.execute('SELECT * FROM commits')
    row = cursor.fetchone()

    assert row[
        'repository_url'] == 'https://github.com/jenkinsci/promoted-builds-plugin'
    assert row['id'] == 'e4c9304553f2868f67556644f5831eba60cf2c34'
    assert row['timestamp'] == '1528139978'
    assert row[
        'message'] == "['[maven-release-plugin] prepare for next development iteration']"
    assert row['changed_files'] == "['pom.xml']"
    assert row[
        'diff'] == "['diff --git a/pom.xml b/pom.xml', 'index 3afe9c3..51b568a 100644', '--- a/pom.xml', '+++ b/pom.xml', '@@ -10,3 +10,3 @@', '   <artifactId>promoted-builds</artifactId>', '-  <version>3.2</version>', '+  <version>3.3-SNAPSHOT</version>', '   <packaging>hpi</packaging>', '@@ -41,3 +41,3 @@', '     <url>https://github.com/jenkinsci/${project.artifactId}-plugin</url>', '-    <tag>promoted-builds-3.2</tag>', '+    <tag>HEAD</tag>', '   </scm>']"
    assert row['hunks'] == "[(6, 8), (11, 13)]"
    # No message references were requested, so these columns must be NULL.
    # Identity comparison is the correct check for the None singleton.
    assert row['commit_message_reference_content'] is None
    assert row[
        'preprocessed_message'] == "maven release plugin prepare development iteration"
    # assert row['preprocessed_diff'] == "artifactid artifact id promote build artifactid artifact id version version version snapshot version packaging hpi packaging url https github com jenkinsci project artifactid artifact id plugin url tag promote build tag tag head tag scm"
    assert row['preprocessed_changed_files'] == "pom.xml pom xml"
    assert row['preprocessed_commit_message_reference_content'] is None

    # second batch, again without reference content to keep the test fast
    database.add_commits_to_database(
        connection,
        commit_ids_to_add[10:20],
        git_repo=example_vulnerability_git_repo,
        repository_url=example_vulnerability['repo_url'],
        with_message_references=False)
    cursor.execute('SELECT COUNT(id) FROM commits')
    assert cursor.fetchone()['COUNT(id)'] == 20

    connection.close()
def load_vulnerabilities():
    """Load the vulnerability, reference, tag and fix tables as DataFrames.

    Connects to the commits database at the module-level
    ``commits_db_path`` and to the vulnerabilities database at
    ``vulnerabilities_db_path``, reads the needed tables with pandas and
    closes both connections again before returning.

    Returns:
        A 6-tuple ``(vulnerabilities_df, db_references_df,
        advisory_references_df, tags_df, repository_url_df, fixes_df)``.
        ``vulnerabilities_df`` is indexed by ``vulnerability_id``.
        ``repository_url_df`` has one row per distinct ``repo_url`` with a
        derived ``project_name`` column.
    """
    prospector_connection, _ = database.connect_with_database(commits_db_path)
    try:
        vulnerabilities_connection, _ = database.connect_with_vulnerabilities_database(
            vulnerabilities_db_path)
        try:
            print("Reading vulnerabilities")
            vulnerabilities_df = pd.read_sql(
                "SELECT * FROM vulnerabilities",
                vulnerabilities_connection).set_index("vulnerability_id")
            db_references_df = pd.read_sql(
                "SELECT vulnerability_id, url, preprocessed_content FROM vulnerability_references",
                vulnerabilities_connection)
            advisory_references_df = pd.read_sql(
                "SELECT vulnerability_id, url FROM advisory_references",
                vulnerabilities_connection)
            fixes_df = pd.read_sql("SELECT * FROM fix_commits",
                                   vulnerabilities_connection)
        finally:
            # the DataFrames are fully materialised, so the connection is
            # no longer needed
            vulnerabilities_connection.close()

        tags_df = pd.read_sql("SELECT * FROM tags", prospector_connection)
    finally:
        prospector_connection.close()

    # Create repository_url_df: one row per distinct repository URL, with a
    # project name derived from the URL. The scheme and every non-word
    # character become spaces, the text goes through
    # rank.simpler_filter_text, and generic hosting tokens are dropped.
    ignored_tokens = {'github', 'com', 'git', 'org'}
    rows = []
    for repo_url in vulnerabilities_df.repo_url.unique():
        project_name = rank.simpler_filter_text(
            re.sub(r'^https?://|[^\w]', ' ', repo_url)).lower()
        project_name = ' '.join(token for token in project_name.split(' ')
                                if token not in ignored_tokens)
        rows.append((repo_url, project_name))
    # Building the frame in one go keeps both columns present even when
    # there are no repositories at all.
    repository_url_df = pd.DataFrame(rows,
                                     columns=['repo_url', 'project_name'])

    return vulnerabilities_df, db_references_df, advisory_references_df, tags_df, repository_url_df, fixes_df
        ],
        'fix_commits': ['2bb79861dbaf7e8a9646fcd70359523fdb464d9c'],
        'project_name':
        'github JPCERTCC LogonTracer',
        'nvd_reference_content':
        'git GitHub hub cve-2018 logon tracer vulnerability jpcert base score cookie cvss av ac sign code use learn product release',
        'preprocessed_description':
        'LogonTracer logon tracer early allow remote attacker conduct xml external entity xxe attack unspecified vector'
    }
    return example_vulnerability


# Module-level connections to the test databases, opened once at import
# time. The database files themselves are created in the notebook
# database_creation.ipynb. The vulnerabilities connection and cursor are
# used by test_database_coverage below.
# NOTE(review): nothing in this view closes these connections — confirm
# whether teardown happens elsewhere.
vulnerabilities_connection, vulnerabilities_cursor = database.connect_with_vulnerabilities_database(
    'test-vulnerabilities.db')
prospector_connection, prospector_cursor = database.connect_with_database(
    'test-commits.db')


@pytest.mark.database
def test_database_coverage(example_vulnerability):
    """Insert the example vulnerability and check it is stored exactly once.

    Uses the module-level ``vulnerabilities_connection`` and
    ``vulnerabilities_cursor``.
    """
    vuln_id = example_vulnerability['vulnerability_id']
    published = str(example_vulnerability['nvd_published_timestamp'])

    database.add_vulnerabiliy_to_database(
        vulnerabilities_connection,
        vuln_id,
        example_vulnerability['repo_url'],
        example_vulnerability['description'],
        published)

    # count the rows carrying this vulnerability id
    count_query = "SELECT COUNT(vulnerability_id) FROM vulnerabilities WHERE vulnerability_id = :vulnerability_id;"
    result = vulnerabilities_cursor.execute(count_query,
                                            {'vulnerability_id': vuln_id})
    count_row = result.fetchone()
    assert count_row['COUNT(vulnerability_id)'] == 1
def connect_with_commits_database(path):
    """Open the commits database located at ``path``.

    Thin alias for ``database.connect_with_database``; whatever that
    function returns is handed back unchanged.
    """
    handles = database.connect_with_database(path)
    return handles
Ejemplo n.º 5
0
def main(vulnerability_id,
         verbose,
         description=None,
         published_timestamp=None,
         repo_url=None,
         project_name=None,
         references=None,
         k=10,
         vulnerability_specific_scaling=False):
    """Build an advisory record for a vulnerability and rank candidate commits.

    The function loads the ranking model and the pretrained scaler, opens
    the vulnerabilities and commits databases, and completes the
    vulnerability's metadata. Metadata comes from the arguments, the
    database, the NVD, or interactive prompts on stdin. It then gathers
    candidate commits, computes and scales their ranking vectors, ranks
    them, and prints the result.

    Args:
        vulnerability_id: Identifier of the vulnerability to process.
        verbose: When truthy, progress information is printed. The flag is
            also forwarded to the database helpers.
        description: Vulnerability description. When ``None`` it is taken
            from the database, the NVD, or the user, in that order.
        published_timestamp: Publication timestamp. Resolved like
            ``description`` when ``None``.
        repo_url: URL of the affected repository. For a new vulnerability
            a suggestion is made and confirmed interactively when ``None``.
        project_name: Name of the affected project. Suggested from
            ``repo_url`` and confirmed interactively when ``None``.
        references: Iterable of reference URLs. When ``None`` they are read
            from the database, the NVD, or the user.
        k: Forwarded to ``advisory_record_to_output``.
        vulnerability_specific_scaling: Forwarded to
            ``compute_ranking_vectors``. When it equals ``False`` the
            columns are scaled here instead.

    Returns:
        The populated advisory record, or ``None`` when the user answers
        ``SKIP!`` at the description prompt.
    """
    model = load(model_path)
    universal_columns_scaler = load(min_max_scaler_path)

    # databases are created in the notebook database_creation.ipynb
    # the vulnerabilities database
    vulnerabilities_connection, vulnerabilities_cursor = database.connect_with_vulnerabilities_database(
        'data/prospector-vulnerabilities.db', verbose=verbose)
    # the commits database
    prospector_connection, prospector_cursor = database.connect_with_database(
        'data/prospector-commits.db', verbose=verbose)

    # if the vulnerability is already in the database
    # (explicit comparison kept so non-bool return values behave as before)
    if database.if_new_vulnerability(vulnerabilities_cursor,
                                     vulnerability_id) == False:
        vulnerability = vulnerabilities_cursor.execute(
            "SELECT * FROM vulnerabilities WHERE vulnerability_id = :vulnerability_id",
            {
                'vulnerability_id': vulnerability_id
            }).fetchone()

        # keep the manually provided value if it has been provided, otherwise select the one in the DB
        if repo_url is None:
            repo_url = vulnerability['repo_url']
        if published_timestamp is None:
            published_timestamp = vulnerability['published_date']

        if description is None:
            description = vulnerability['description']
            preprocessed_description = vulnerability[
                'preprocessed_description']
        else:
            preprocessed_description = rank.simpler_filter_text(description)

        if references is not None:
            database.add_vulnerability_references_to_database(
                vulnerabilities_connection,
                vulnerability_id,
                references,
                driver=None,
                verbose=verbose)
        else:
            # no references supplied: fall back to the ones already stored
            references = [
                nvd_reference['url']
                for nvd_reference in vulnerabilities_cursor.execute(
                    "SELECT url FROM vulnerability_references WHERE vulnerability_id = :vulnerability_id",
                    {'vulnerability_id': vulnerability_id})
            ]

    else:
        if verbose:
            print("Vulnerability {} is a new vulnerability".format(
                vulnerability_id))

        # gather information for the new vulnerability if needed
        if description is None or published_timestamp is None or references is None:
            try:
                nvd_description, nvd_published_timestamp, nvd_references = database.extract_nvd_content(
                    vulnerability_id)
            except Exception:  # the vulnerability is not in the NVD
                # Best-effort lookup: any failure means "ask the user".
                # Catching Exception rather than a bare except lets
                # Ctrl-C and SystemExit still propagate.
                nvd_description, nvd_published_timestamp, nvd_references = None, None, None

            if description is None:
                if nvd_description is None:
                    # prompt interactively instead of raising
                    print(
                        "Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability description manually."
                        .format(vulnerability_id))
                    description = input()

                    if description == "SKIP!":
                        print('skipping this one')
                        # release both databases before bailing out
                        vulnerabilities_connection.close()
                        prospector_connection.close()
                        return
                else:
                    description = nvd_description

            if published_timestamp is None:
                if nvd_published_timestamp is None:
                    print(
                        "Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability timestamp manually."
                        .format(vulnerability_id))
                    published_timestamp = input()
                else:
                    published_timestamp = nvd_published_timestamp

            if references is None:
                if nvd_references is None:
                    print(
                        "Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability references manually (comma seperated)."
                        .format(vulnerability_id))
                    # tolerate "a, b" style input and stray commas
                    references = [
                        url.strip() for url in input().split(',')
                        if url.strip()
                    ]
                else:
                    references = nvd_references

        # determine the repo_url
        if repo_url is None:
            if verbose: print('Suggesting a repository URL')
            repo_url = rank.map_description_to_repository_url(
                vulnerabilities_connection, vulnerability_id, description)

            print(
                'Does the vulnerability affect the following repository: {} [Y/n]'
                .format(repo_url))
            choice = input()
            if choice.lower() in [
                    '', 'y', 'yes'
            ]:  #@TODO: can be a while, where it is either yes or no, not enter
                print('Confirmed')
            else:
                print('Provide the (GitHub) URL of the affected repository:')
                repo_url = input()
                # drop a trailing ".git" or "/" from the typed URL
                repo_url = re.sub(r'\.git$|/$', '', repo_url)
            print('repo_url:', repo_url)

        # add to the database
        preprocessed_description = rank.simpler_filter_text(description)
        with vulnerabilities_connection:
            vulnerabilities_cursor.execute(
                "INSERT INTO vulnerabilities VALUES (:vulnerability_id, :repo_url, :description, :published_timestamp, :preprocessed_description)",
                {
                    'vulnerability_id': vulnerability_id,
                    'repo_url': repo_url,
                    'description': description,
                    'published_timestamp': str(published_timestamp),
                    'preprocessed_description': preprocessed_description
                })

        # add the references to the database
        database.add_vulnerability_references_to_database(
            vulnerabilities_connection,
            vulnerability_id,
            references,
            driver=None,
            verbose=verbose)

    # determine the project_name
    if project_name is None:
        if verbose: print('Suggesting a project name')
        project_name = rank.extract_project_name_from_repository_url(repo_url)
        print('Does the vulnerability affect the following project: {} [Y/n]'.
              format(project_name))
        choice = input()
        if choice.lower() in [
                '', 'y', 'yes'
        ]:  #@TODO: can be a while, where it is either yes or no, not enter
            print('Confirmed')
        else:
            print('Provide the name of the affected project:')
            project_name = input()

    # Fetch the preprocessed content of the selected references. The URLs
    # may come straight from user input, so they are bound as parameters
    # instead of being formatted into the SQL text. That also removes the
    # need to special-case a single-element list.
    reference_urls = list(references)
    placeholders = ', '.join('?' * len(reference_urls))
    references_content = tuple(
        pd.read_sql(
            "SELECT vulnerability_id, url, preprocessed_content FROM vulnerability_references WHERE url IN ({}) and vulnerability_id = ?"
            .format(placeholders),
            vulnerabilities_connection,
            params=reference_urls + [vulnerability_id]).preprocessed_content)
    references_content = rank.extract_n_most_occurring_words(
        rank.remove_forbidden_words_from_string(
            string=' '.join(references_content),
            forbidden_words=rank.reference_stopwords +
            project_name.split(' ')),
        n=20)

    # @TODO: now adding all advisory references --> change to only using the provided references
    advisory_references = [
        advisory_reference['url']
        for advisory_reference in vulnerabilities_cursor.execute(
            "SELECT url FROM advisory_references WHERE vulnerability_id = :vulnerability_id",
            {'vulnerability_id': vulnerability_id})
    ]

    # creating advisory record
    advisory_record = rank.Advisory_record(
        vulnerability_id,
        published_timestamp,
        repo_url,
        references,
        references_content,
        advisory_references,
        description,
        prospector_connection,
        preprocessed_vulnerability_description=preprocessed_description,
        relevant_tags=None,
        verbose=verbose,
        since=None,
        until=None)

    if verbose:
        print("\nThe following advisory record has been created:")
        print(" - Vulnerability ID: {}".format(advisory_record.id))
        print(" - Vulnerability description: {}".format(
            advisory_record.description))
        print(" - Vulnerability published timestamp: {}".format(
            advisory_record.published_timestamp))
        print(" - Affected project: {}".format(advisory_record.project_name))
        print(" - Affected repository: {}".format(advisory_record.repo_url))
        print(" - References content extracted: {}".format(
            advisory_record.references_content))

    if verbose: print("\nGathering candidate commits:")
    advisory_record.gather_candidate_commits()

    if verbose: print("\nComputing ranking vectors:")
    advisory_record.compute_ranking_vectors(vulnerability_specific_scaling)

    if vulnerability_specific_scaling == False:
        if verbose:
            print(
                "\nscaling some columns using the pretrained scaler, and some vulnerability specific"
            )
        # vulnerability-specific columns are rescaled on this record only;
        # universal columns reuse the pretrained scaler
        advisory_record.ranking_vectors[
            vulnerability_specific_columns] = MinMaxScaler().fit_transform(
                advisory_record.ranking_vectors[vulnerability_specific_columns]
            )
        advisory_record.ranking_vectors[
            universal_columns] = universal_columns_scaler.transform(
                advisory_record.ranking_vectors[universal_columns])
    advisory_record.ranking_vectors.drop(columns=columns_to_drop, inplace=True)

    if verbose: print("\nRanking the candidate commits:")
    advisory_record.ranked_candidate_commits = rank.rank_candidates(
        model, advisory_record.ranking_vectors)

    if verbose: print('\nResults:')
    advisory_record.ranking_vectors.set_index('commit_id', inplace=True)
    output = advisory_record_to_output(advisory_record,
                                       model,
                                       prospector_cursor,
                                       k=k)
    print(output)

    # succeeded
    vulnerabilities_connection.close()
    prospector_connection.close()
    return advisory_record
Ejemplo n.º 6
0
def test_database_creation(example_vulnerability,
                           example_vulnerability_git_repo):
    """Check that connecting to an in-memory database yields sqlite3 objects.

    ``database.connect_with_database(':memory:')`` must return a
    connection and a cursor that are sqlite3 ``Connection`` and ``Cursor``
    instances (subclasses are accepted).
    """
    connection, cursor = database.connect_with_database(':memory:')
    try:
        assert isinstance(connection, sqlite3.Connection)
        assert isinstance(cursor, sqlite3.Cursor)
    finally:
        # close the connection even when an assertion fails
        connection.close()