def get_vulnerability_data(vulnerability_id, vulnerabilities_df,
                           db_references_df):
    """Collect the advisory fields for one vulnerability.

    When ``vulnerability_id`` is part of the index of ``vulnerabilities_df``
    the fields are read from that row and the reference URLs are taken from
    ``db_references_df``. Otherwise the content is requested through
    ``database.extract_nvd_content`` and the description is preprocessed
    with ``rank.simpler_filter_text``.

    Args:
        vulnerability_id: identifier to look up.
        vulnerabilities_df: DataFrame indexed by vulnerability id whose rows
            unpack to (repository URL, description, published timestamp,
            preprocessed description), or ``None`` when nothing is loaded.
        db_references_df: DataFrame with ``vulnerability_id`` and ``url``
            columns; only read when the vulnerability is known.

    Returns:
        Tuple ``(description, published_timestamp,
        preprocessed_description, references)``. ``references`` is a list of
        URLs and is always empty for a vulnerability that is not present in
        ``vulnerabilities_df``.

    Raises:
        Any exception raised by ``database.extract_nvd_content`` is
        propagated to the caller.
    """
    is_known = (vulnerabilities_df is not None
                and vulnerability_id in vulnerabilities_df.index)

    if is_known:
        # The row also carries the repository URL, which is not returned.
        (_, cve_description, cve_published_timestamp,
         preprocessed_description) = vulnerabilities_df.loc[vulnerability_id]
        matches_id = db_references_df.vulnerability_id == vulnerability_id
        references = list(db_references_df[matches_id].url)
    else:
        cve_description, cve_published_timestamp, _ = database.extract_nvd_content(
            vulnerability_id)
        # The references extracted here are discarded and an empty list is
        # returned instead (unchanged behaviour).
        # NOTE(review): dashboard_page extracts them again once the advisory
        # record is confirmed - confirm the double extraction is intended.
        references = []
        preprocessed_description = rank.simpler_filter_text(cve_description)

    return (cve_description, cve_published_timestamp,
            preprocessed_description, references)
def dashboard_page(state):
    """Render the Prospector dashboard page with Streamlit.

    The page is built top to bottom on every run: an introduction, the
    advisory-record form (vulnerability id, description, project name,
    repository URL, published date) and, once the record is confirmed, the
    optional advanced settings and the "Search prospects!" action that ranks
    candidate commits and shows the result together with a download link.

    Args:
        state: session-state object. The attributes read and written here
            are ``vulnerability_id``, ``advisory_record_confirmed``,
            ``advanced_settings`` and the DataFrames ``vulnerabilities_df``,
            ``db_references_df``, ``advisory_references_df``, ``tags_df``,
            ``repository_url_df`` and ``fixes_df``. ``state.clear()`` is
            called when "CLEAR FIELDS" is pressed.

    Returns:
        None. Output goes through Streamlit widgets. A vulnerability that is
        not in ``state.vulnerabilities_df`` is also written to the
        vulnerabilities database once the record is confirmed.
    """
    st.title("PROSPECTOR")

    st.subheader(
        "The search engine for fix-commits for security vulnerabilities in OSS"
    )
    st.write('By SAP - Antonino SABETTA & Daan HOMMERSOM')
    st.write('''
        How to use Prospector:
        \n1) Provide a vulnerability description, (GitHub) repository URL and a release date (or pick a CVE).
        \n2) Check whether Prospector fills in the rest correctly, and provide additional information if needed.
        \n3) Find security fixes!
    ''')

    # with st.beta_expander(label="Find out more", expanded=False):
    st.write('''
        The objective of Prospector is to minimize the (manual) effort needed for finding
        the fix commit of a known vulnerability in an open-source software project.
        Since these repositories can contain hundreds thousands commits, the commits are
        firstly filtered by only selecting all commits within two years before and
        one hundred days after the release date with a maximum of respectively 5215 and 100 commits.
        A study has shown that this selection has 93% recall.
        \n
        Firstly, an advisory record is created containing information on the vulnerability.
        This advisory record is used to select candidate commits. For these candidate commits,
        ranking vectors are computed. These ranking vectors consist of several components that
        can be used to predict whether a candidate commit is the fix commit we are looking for.
        These candidates are then ranked on this probability score.

        In 77.68% of the cases, the fix is in the top 5. In 84.03% in the top 10,
        and in 88.59% in the top 20.
    ''')

    st.subheader("ADVISORY RECORD")
    state.vulnerability_id = st.text_input(
        "Vulnerability identifyer:",
        value=state.vulnerability_id
        if state.vulnerability_id else '').upper()

    if state.vulnerability_id:
        try:
            (cve_description, cve_published_timestamp,
             preprocessed_description, references) = get_vulnerability_data(
                 state.vulnerability_id, state.vulnerabilities_df,
                 state.db_references_df)
        except Exception:
            # Best-effort lookup: on any failure fall back to manual input.
            # ``Exception`` (not a bare except) keeps KeyboardInterrupt and
            # SystemExit from being swallowed.
            # NOTE(review): an empty text field yields [''] rather than [],
            # so the NVD retry further down is skipped in that case.
            references = st.text_input(
                "Please provide useful references (separated by commas)"
            ).split(',')
            cve_description = ''
            cve_published_timestamp = time.time()
            preprocessed_description = None
    else:
        cve_description = ''
        cve_published_timestamp = time.time()
        preprocessed_description = None
        references = []

    vulnerability_description = st.text_area("Vulnerability description",
                                             value=cve_description)

    # Proper nouns (tag 'NNP') in the description seed the project name.
    proper_nouns = [
        token.text for token in nlp(vulnerability_description)
        if token.tag_ == 'NNP'
    ]
    project_name = st.text_input("Project name", value=' '.join(proper_nouns))

    if project_name != '':
        suggested_repo_url = map_description_to_repository_url(
            vulnerability_id=state.vulnerability_id,
            description=project_name,
            vulnerabilities_df=state.vulnerabilities_df,
            repository_url_df=state.repository_url_df)
    else:
        suggested_repo_url = ''
    repo_url = st.text_input("Repository URL", value=suggested_repo_url)

    published_date = st.date_input("Vulnerability published date",
                                   value=datetime.fromtimestamp(
                                       int(cve_published_timestamp)))
    published_timestamp = int(time.mktime(published_date.timetuple()))

    # Once confirmed, the record stays confirmed across reruns.
    state.advisory_record_confirmed = st.button(
        "CONFIRM ADVISORY RECORD"
    ) if not state.advisory_record_confirmed else True
    if not state.advisory_record_confirmed:
        return

    # option to clear the state
    if st.button("CLEAR FIELDS"):
        state.clear()

    # if it was a new vulnerability, add it to the DB
    is_new_vulnerability = (
        state.vulnerabilities_df is None
        or state.vulnerability_id not in state.vulnerabilities_df.index)
    if is_new_vulnerability:
        vulnerabilities_connection, _ = database.connect_with_vulnerabilities_database(
            vulnerabilities_db_path)
        database.add_vulnerability_to_database(vulnerabilities_connection,
                                               state.vulnerability_id,
                                               repo_url,
                                               vulnerability_description,
                                               published_timestamp)

        # if it was not an NVD CVE, or the extraction failed
        if not references:
            try:
                # Only the references are needed here; description and date
                # were already taken from the form above.
                _, _, references = database.extract_nvd_content(
                    state.vulnerability_id)
                references = list(references)
            except Exception:
                references = st.text_input(
                    "Please provide useful references (separated by commas)"
                ).split(',')

        database.add_vulnerability_references_to_database(
            vulnerabilities_connection,
            state.vulnerability_id,
            references,
            driver=None)
        prospector_connection, _ = connect_with_commits_database(
            commits_db_path)
        database.add_tags_to_database(prospector_connection,
                                      tags=None,
                                      git_repo=None,
                                      repo_url=repo_url,
                                      verbose=True)
        # Reload so the DataFrames include the rows that were just written.
        (state.vulnerabilities_df, state.db_references_df,
         state.advisory_references_df, state.tags_df,
         state.repository_url_df, state.fixes_df) = load_vulnerabilities()

    # gather values
    repository_tags = gather_tags(repo_url, state.tags_df)
    versions_in_description = filter.retreive_all_versions_from_description(
        vulnerability_description)
    # dict.fromkeys removes duplicate tags while keeping first-seen order.
    tags_in_description = list(
        dict.fromkeys(
            tag for version in versions_in_description
            for tag in filter.get_tag_for_version(repository_tags, version)))
    references = [
        state.db_references_df.at[index, 'url'] for index in
        state.db_references_df[state.db_references_df.vulnerability_id ==
                               state.vulnerability_id].index
    ]

    advisory_references = list(state.advisory_references_df[
        state.advisory_references_df.vulnerability_id ==
        state.vulnerability_id].url)

    # allow the user to influence the filtering
    state.advanced_settings = st.checkbox("Show advanced settings",
                                          state.advanced_settings)
    if state.advanced_settings:

        # the adding of references can be gone wrong
        first_commit_timestamp = rank.get_first_commit_timestamp(
            repo_url
        )  #@TODO: add a column to the database containing this value
        first_commit_date = datetime.fromtimestamp(
            int(first_commit_timestamp)).date()
        today = datetime.fromtimestamp(int(time.time())).date()

        # Default window: 730 days before to 100 days after publication,
        # clamped to the lifetime of the repository so far.
        lower_bound = max(published_date - timedelta(days=730),
                          first_commit_date)
        upper_bound = min(published_date + timedelta(days=100), today)

        since, until = st.slider("Published date based interval",
                                 min_value=first_commit_date,
                                 max_value=today,
                                 value=(lower_bound, upper_bound))
        since = int(time.mktime(since.timetuple()))
        until = int(time.mktime(until.timetuple()))

        # references
        additional_references = st.text_input(
            "Additional references (separated by commas)")
        if additional_references:
            references += additional_references.split(',')
            vulnerabilities_connection, _ = database.connect_with_vulnerabilities_database(
                vulnerabilities_db_path)
            database.add_vulnerability_references_to_database(
                vulnerabilities_connection,
                state.vulnerability_id,
                references,
                driver=None)

        selected_references = st.multiselect('Advisory references',
                                             tuple(references),
                                             default=tuple(references))

        # affected versions
        relevant_tags = st.multiselect(
            'Relevant tags',
            tuple(repository_tags),
            default=tuple(tags_in_description)
            if tags_in_description else None)
        # st input int k
        k = st.number_input("The number of results to show",
                            min_value=1,
                            max_value=50,
                            value=10,
                            step=1)
    else:
        selected_references = references
        relevant_tags = tags_in_description
        since, until = None, None
        k = 10

    if st.button("Search prospects!"):
        model, min_max_scaler = load_model()
        prospector_connection, prospector_cursor = connect_with_commits_database(
            commits_db_path)

        # Preprocess the (possibly edited) description from the form.
        preprocessed_description = rank.simpler_filter_text(
            vulnerability_description)

        is_selected_reference = (
            (state.db_references_df.vulnerability_id ==
             state.vulnerability_id)
            & (state.db_references_df.url.isin(selected_references)))
        references_content = tuple(
            state.db_references_df[is_selected_reference].
            preprocessed_content)
        references_content = rank.extract_n_most_occurring_words(
            rank.remove_forbidden_words_from_string(
                string=' '.join(references_content),
                forbidden_words=rank.reference_stopwords +
                project_name.split(' ')),
            n=20)

        st.write(references_content)

        advisory_record = rank.Advisory_record(
            state.vulnerability_id,
            published_timestamp,
            repo_url,
            selected_references,
            references_content,
            advisory_references,
            vulnerability_description,
            prospector_connection,
            preprocessed_vulnerability_description=preprocessed_description,
            relevant_tags=relevant_tags,
            verbose=True,
            since=since,
            until=until)

        print(
            "\nGathering candidate commits and computing ranking vectors.")
        advisory_record.gather_candidate_commits()
        advisory_record.compute_ranking_vectors()

        # scaling some columns using the pretrained scaler, and some vulnerability specific
        advisory_record.ranking_vectors[
            vulnerability_specific_columns] = MinMaxScaler().fit_transform(
                advisory_record.
                ranking_vectors[vulnerability_specific_columns])
        advisory_record.ranking_vectors[
            universal_columns] = min_max_scaler.transform(
                advisory_record.ranking_vectors[universal_columns])
        advisory_record.ranking_vectors.drop(columns=columns_to_drop,
                                             inplace=True)

        advisory_record.ranked_candidate_commits = rank.rank_candidates(
            model, advisory_record.ranking_vectors)

        advisory_record.ranking_vectors.set_index('commit_id',
                                                  inplace=True)
        output = prospector_main.advisory_record_to_output(
            advisory_record, model, prospector_cursor, k=k)
        tmp_download_link = download_link(
            output,
            'Prospector_results-{}.txt'.format(state.vulnerability_id),
            "Click here to download Prospector's results as a txt file!")

        st.header("Results")

        st.markdown(tmp_download_link, unsafe_allow_html=True)

        st.write(
            "Showing the top {} candidates from {} candidates considered".
            format(k, len(advisory_record.ranking_vectors)))
        st.write(output)
Beispiel #3
0
def test_extract_nvd_content_errors():
    """An identifier that is not a valid vulnerability id must fail.

    ``database.extract_nvd_content`` is expected to raise ``AssertionError``
    for such input.
    """
    invalid_vulnerability_id = 'BLA-BLA-BLA'
    with pytest.raises(AssertionError):
        database.extract_nvd_content(invalid_vulnerability_id)
Beispiel #4
0
def main(vulnerability_id,
         verbose,
         description=None,
         published_timestamp=None,
         repo_url=None,
         project_name=None,
         references=None,
         k=10,
         vulnerability_specific_scaling=False):
    """Run Prospector for one vulnerability from the command line.

    Loads the model and scaler, completes the advisory information from the
    vulnerabilities database, the NVD or interactive ``input()`` prompts,
    stores a new vulnerability in the database, then gathers, scores and
    ranks candidate commits and prints the result.

    Args:
        vulnerability_id: identifier of the vulnerability.
        verbose: when truthy, progress information is printed.
        description: vulnerability description; looked up when ``None``.
        published_timestamp: publication timestamp; looked up when ``None``.
        repo_url: URL of the affected repository; suggested and confirmed
            interactively when ``None`` and the vulnerability is new.
        project_name: name of the affected project; suggested and confirmed
            interactively when ``None``.
        references: iterable of reference URLs; looked up when ``None``.
        k: passed on to ``advisory_record_to_output`` as ``k``.
        vulnerability_specific_scaling: forwarded to
            ``compute_ranking_vectors``; when falsy, the columns are scaled
            here instead.

    Returns:
        The populated advisory record, or ``None`` when the user answers the
        description prompt with ``SKIP!``.
    """
    model = load(model_path)
    universal_columns_scaler = load(min_max_scaler_path)

    # databases are created in the notebook database_creation.ipynb
    # the vulnerabilities database
    vulnerabilities_connection, vulnerabilities_cursor = database.connect_with_vulnerabilities_database(
        'data/prospector-vulnerabilities.db', verbose=verbose)
    # the commits database
    prospector_connection, prospector_cursor = database.connect_with_database(
        'data/prospector-commits.db', verbose=verbose)

    # if the vulnerability is already in the database
    if not database.if_new_vulnerability(vulnerabilities_cursor,
                                         vulnerability_id):
        vulnerability = vulnerabilities_cursor.execute(
            "SELECT * FROM vulnerabilities WHERE vulnerability_id = :vulnerability_id",
            {
                'vulnerability_id': vulnerability_id
            }).fetchone()

        # keep the manually provided value if it has been provided, otherwise select the one in the DB
        if repo_url is None:
            repo_url = vulnerability['repo_url']
        if published_timestamp is None:
            published_timestamp = vulnerability['published_date']

        if description is None:
            description = vulnerability['description']
            preprocessed_description = vulnerability[
                'preprocessed_description']
        else:
            preprocessed_description = rank.simpler_filter_text(description)

        if references is not None:
            database.add_vulnerability_references_to_database(
                vulnerabilities_connection,
                vulnerability_id,
                references,
                driver=None,
                verbose=verbose)
        else:
            references = [
                nvd_reference['url']
                for nvd_reference in vulnerabilities_cursor.execute(
                    "SELECT url FROM vulnerability_references WHERE vulnerability_id = :vulnerability_id",
                    {'vulnerability_id': vulnerability_id})
            ]

    else:
        if verbose:
            print("Vulnerability {} is a new vulnerability".format(
                vulnerability_id))

        # gather information for the new vulnerability if needed
        if description is None or published_timestamp is None or references is None:
            try:
                nvd_description, nvd_published_timestamp, nvd_references = database.extract_nvd_content(
                    vulnerability_id)
            except Exception:  # if the vulnerability is not in the NVD
                nvd_description, nvd_published_timestamp, nvd_references = None, None, None

            if description is None:
                if nvd_description is None:
                    print(
                        "Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability description manually."
                        .format(vulnerability_id))
                    description = input()

                    if description == "SKIP!":
                        print('skipping this one')
                        # Release both database handles before bailing out.
                        vulnerabilities_connection.close()
                        prospector_connection.close()
                        return
                else:
                    description = nvd_description

            if published_timestamp is None:
                if nvd_published_timestamp is None:
                    print(
                        "Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability timestamp manually."
                        .format(vulnerability_id))
                    # NOTE(review): this stays a string; confirm that
                    # Advisory_record accepts a non-numeric timestamp.
                    published_timestamp = input()
                else:
                    published_timestamp = nvd_published_timestamp

            if references is None:
                if nvd_references is None:
                    print(
                        "Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability references manually (comma seperated)."
                        .format(vulnerability_id))
                    # Strip whitespace around each URL and drop empty
                    # entries, so "a, b" or an empty answer does not store
                    # ' b' or '' as a reference.
                    references = [
                        reference.strip()
                        for reference in input().split(',')
                        if reference.strip()
                    ]
                else:
                    references = nvd_references

        # determine the repo_url
        if repo_url is None:
            if verbose:
                print('Suggesting a repository URL')
            repo_url = rank.map_description_to_repository_url(
                vulnerabilities_connection, vulnerability_id, description)

            print(
                'Does the vulnerability affect the following repository: {} [Y/n]'
                .format(repo_url))
            choice = input()
            if choice.lower() in [
                    '', 'y', 'yes'
            ]:  #@TODO: can be a while, where it is either yes or no, not enter
                print('Confirmed')
            else:
                print('Provide the (GitHub) URL of the affected repository:')
                repo_url = input()
                # Drop a trailing ".git" or "/" from the URL.
                repo_url = re.sub(r'\.git$|/$', '', repo_url)
            print('repo_url:', repo_url)

        # add to the database
        preprocessed_description = rank.simpler_filter_text(description)
        with vulnerabilities_connection:
            vulnerabilities_cursor.execute(
                "INSERT INTO vulnerabilities VALUES (:vulnerability_id, :repo_url, :description, :published_timestamp, :preprocessed_description)",
                {
                    'vulnerability_id': vulnerability_id,
                    'repo_url': repo_url,
                    'description': description,
                    'published_timestamp': str(published_timestamp),
                    'preprocessed_description': preprocessed_description
                })

        # add the references to the database
        database.add_vulnerability_references_to_database(
            vulnerabilities_connection,
            vulnerability_id,
            references,
            driver=None,
            verbose=verbose)

    # determine the project_name
    if project_name is None:
        if verbose:
            print('Suggesting a project name')
        project_name = rank.extract_project_name_from_repository_url(repo_url)
        print('Does the vulnerability affect the following project: {} [Y/n]'.
              format(project_name))
        choice = input()
        if choice.lower() in [
                '', 'y', 'yes'
        ]:  #@TODO: can be a while, where it is either yes or no, not enter
            print('Confirmed')
        else:
            print('Provide the name of the affected project:')
            project_name = input()

    # Fetch the preprocessed content of the selected references. Values are
    # bound as query parameters instead of being formatted into the SQL
    # text, so a URL containing a quote cannot break or alter the statement.
    if references:
        placeholders = ', '.join(['?'] * len(references))
        references_frame = pd.read_sql(
            "SELECT vulnerability_id, url, preprocessed_content "
            "FROM vulnerability_references "
            "WHERE url IN ({}) and vulnerability_id = ?".format(placeholders),
            vulnerabilities_connection,
            params=list(references) + [vulnerability_id])
        references_content = tuple(references_frame.preprocessed_content)
    else:
        # Nothing to look up; matches what an empty IN list would return.
        references_content = ()
    references_content = rank.extract_n_most_occurring_words(
        rank.remove_forbidden_words_from_string(
            string=' '.join(references_content),
            forbidden_words=rank.reference_stopwords +
            project_name.split(' ')),
        n=20)

    # @TODO: now adding all advisory references --> change to only using the provided references
    advisory_references = [
        advisory_reference['url']
        for advisory_reference in vulnerabilities_cursor.execute(
            "SELECT url FROM advisory_references WHERE vulnerability_id = :vulnerability_id",
            {'vulnerability_id': vulnerability_id})
    ]

    # creating advisory record
    advisory_record = rank.Advisory_record(
        vulnerability_id,
        published_timestamp,
        repo_url,
        references,
        references_content,
        advisory_references,
        description,
        prospector_connection,
        preprocessed_vulnerability_description=preprocessed_description,
        relevant_tags=None,
        verbose=verbose,
        since=None,
        until=None)

    if verbose:
        print("\nThe following advisory record has been created:")
        print(" - Vulnerability ID: {}".format(advisory_record.id))
        print(" - Vulnerability description: {}".format(
            advisory_record.description))
        print(" - Vulnerability published timestamp: {}".format(
            advisory_record.published_timestamp))
        print(" - Affected project: {}".format(advisory_record.project_name))
        print(" - Affected repository: {}".format(advisory_record.repo_url))
        print(" - References content extracted: {}".format(
            advisory_record.references_content))

    if verbose:
        print("\nGathering candidate commits:")
    advisory_record.gather_candidate_commits()

    if verbose:
        print("\nComputing ranking vectors:")
    advisory_record.compute_ranking_vectors(vulnerability_specific_scaling)

    if not vulnerability_specific_scaling:
        if verbose:
            print(
                "\nscaling some columns using the pretrained scaler, and some vulnerability specific"
            )
        advisory_record.ranking_vectors[
            vulnerability_specific_columns] = MinMaxScaler().fit_transform(
                advisory_record.ranking_vectors[vulnerability_specific_columns]
            )
        advisory_record.ranking_vectors[
            universal_columns] = universal_columns_scaler.transform(
                advisory_record.ranking_vectors[universal_columns])
    advisory_record.ranking_vectors.drop(columns=columns_to_drop, inplace=True)

    if verbose:
        print("\nRanking the candidate commits:")
    advisory_record.ranked_candidate_commits = rank.rank_candidates(
        model, advisory_record.ranking_vectors)

    if verbose:
        print('\nResults:')
    advisory_record.ranking_vectors.set_index('commit_id', inplace=True)
    output = advisory_record_to_output(advisory_record,
                                       model,
                                       prospector_cursor,
                                       k=k)
    print(output)

    # # succeeded
    vulnerabilities_connection.close()
    prospector_connection.close()
    return advisory_record
Beispiel #5
0
def test_extract_nvd_content(example_vulnerability):
    """The extracted NVD content must match the example vulnerability.

    The result of ``database.extract_nvd_content`` is compared position by
    position: description, published timestamp, references.
    """
    extracted = database.extract_nvd_content(
        example_vulnerability['vulnerability_id'])

    expected_keys = ('description', 'nvd_published_timestamp', 'references')
    for position, key in enumerate(expected_keys):
        assert extracted[position] == example_vulnerability[key]