def create_advisory_record(vulnerability_id, published_timestamp, repo_url,
                           references, advisory_references,
                           vulnerability_description, prospector_connection,
                           preprocessed_description, relevant_tags):
    """Build a ``rank.Advisory_record`` from the given values.

    All nine arguments are forwarded positionally, in the order they are
    declared here, to the ``rank.Advisory_record`` constructor, and the
    resulting object is returned unchanged.
    """
    constructor_arguments = (
        vulnerability_id,
        published_timestamp,
        repo_url,
        references,
        advisory_references,
        vulnerability_description,
        prospector_connection,
        preprocessed_description,
        relevant_tags,
    )
    return rank.Advisory_record(*constructor_arguments)
def test_commit_coverage(example_vulnerability):
    """Check how many candidate commits are gathered for the example.

    The advisory references of the example vulnerability are read through
    the module-level ``vulnerabilities_cursor``; an advisory record is then
    built and asked to gather its candidate commits, of which exactly 54
    are expected.
    """
    vulnerabilities_cursor.execute(
        "SELECT url FROM advisory_references WHERE vulnerability_id = :vulnerability_id;",
        {'vulnerability_id': example_vulnerability['vulnerability_id']})
    advisory_references = []
    for row in vulnerabilities_cursor:
        advisory_references.append(row['url'])

    # The description handed to the record is the preprocessed description
    # followed directly by the NVD reference content.
    record_arguments = dict(
        vulnerability_id=example_vulnerability['vulnerability_id'],
        published_timestamp=example_vulnerability['nvd_published_timestamp'],
        repo_url=example_vulnerability['repo_url'],
        nvd_references=example_vulnerability['nvd_references'],
        advisory_references=advisory_references,
        vulnerability_description=example_vulnerability['description'],
        connection=prospector_connection,
        preprocessed_vulnerability_description=(
            example_vulnerability['preprocessed_description'] +
            example_vulnerability['nvd_reference_content']),
    )
    advisory_record = rank.Advisory_record(**record_arguments)

    advisory_record.gather_candidate_commits()
    candidate_count = len(advisory_record.candidate_commits)
    assert candidate_count == 54
def dashboard_page(state):
    """Render the Prospector dashboard and run a search on request.

    The page is drawn top to bottom on every call:

    1. A title and usage text.
    2. The advisory record form (vulnerability id, description, project
       name, repository URL, published date), pre-filled from
       ``get_vulnerability_data`` when the id can be looked up.
    3. Once the record is confirmed: a vulnerability that is not yet in
       ``state.vulnerabilities_df`` is written to the vulnerabilities
       database together with its references and the repository tags, and
       the cached data frames on ``state`` are reloaded.
    4. Optional advanced settings (date interval, extra references,
       relevant tags, number of results).
    5. When "Search prospects!" is pressed: an advisory record is built,
       candidate commits are gathered, scaled and ranked, and the result
       is shown together with a download link.

    Args:
        state: session-state object. This function reads and writes
            ``vulnerability_id``, ``advisory_record_confirmed``,
            ``advanced_settings`` and the ``*_df`` data-frame attributes,
            and may call ``state.clear()``.

    Returns:
        None. All output goes through Streamlit widgets.
    """
    st.title("PROSPECTOR")
    st.subheader(
        "The search engine for fix-commits for security vulnerabilities in OSS"
    )
    st.write('By SAP - Antonino SABETTA & Daan HOMMERSOM')
    st.write(
        " How to use Prospector: "
        "\n1) Provide a vulnerability description, (GitHub) repository URL "
        "and a release date (or pick a CVE). "
        "\n2) Check whether Prospector fills in the rest correctly, "
        "and provide additional information if needed. "
        "\n3) Find security fixes! "
    )
    st.write(
        " The objective of Prospector is to minimize the (manual) effort "
        "needed for finding the fix commit of a known vulnerability in an "
        "open-source software project. Since these repositories can contain "
        "hundreds thousands commits, the commits are firstly filtered by "
        "only selecting all commits within two years before and one hundred "
        "days after the release date with a maximum of respectively 5215 "
        "and 100 commits. A study has shown that this selection has 93% "
        "recall. "
        "\n Firstly, an advisory record is created containing information "
        "on the vulnerability. This advisory record is used to select "
        "candidate commits. For these candidate commits, ranking vectors "
        "are computed. These ranking vectors consist of several components "
        "that can be used to predict whether a candidate commit is the fix "
        "commit we are looking for. These candidates are then ranked on "
        "this probability score. In 77.68% of the cases, the fix is in the "
        "top 5. In 84.03% in the top 10, and in 88.59% in the top 20. \n"
    )

    st.subheader("ADVISORY RECORD")
    state.vulnerability_id = st.text_input(
        "Vulnerability identifyer:",
        value=state.vulnerability_id or '').upper()

    if state.vulnerability_id:
        try:
            (cve_description, cve_published_timestamp,
             preprocessed_description,
             references) = get_vulnerability_data(state.vulnerability_id,
                                                  state.vulnerabilities_df,
                                                  state.db_references_df)
        except Exception:
            # The lookup failed: fall back to empty defaults and let the
            # user type the references by hand. ``Exception`` (instead of a
            # bare ``except``) keeps BaseException-derived control-flow
            # signals from being swallowed here.
            references = st.text_input(
                "Please provide useful references (separated by commas)")
            references = references.split(',')
            cve_description = ''
            cve_published_timestamp = time.time()
            preprocessed_description = None
    else:
        cve_description = ''
        cve_published_timestamp = time.time()
        preprocessed_description = None
        references = []

    vulnerability_description = st.text_area("Vulnerability description",
                                             value=cve_description)
    # Suggest a project name from the tokens tagged 'NNP' in the description.
    project_name = st.text_input(
        "Project name",
        value=' '.join(token.text
                       for token in nlp(vulnerability_description)
                       if token.tag_ == 'NNP'))
    repo_url = st.text_input(
        "Repository URL",
        value=map_description_to_repository_url(
            vulnerability_id=state.vulnerability_id,
            description=project_name,
            vulnerabilities_df=state.vulnerabilities_df,
            repository_url_df=state.repository_url_df)
        if project_name != '' else '')
    published_date = st.date_input(
        "Vulnerability published date",
        value=datetime.fromtimestamp(int(cve_published_timestamp)))
    published_timestamp = int(time.mktime(published_date.timetuple()))

    # Once confirmed, the confirmation button is no longer drawn.
    state.advisory_record_confirmed = st.button(
        "CONFIRM ADVISORY RECORD"
    ) if not state.advisory_record_confirmed else True

    if state.advisory_record_confirmed:
        # option to clear the state
        if st.button("CLEAR FIELDS"):
            state.clear()

        # if it was a new vulnerability, add it to the DB
        if state.vulnerabilities_df is None or state.vulnerability_id not in list(
                state.vulnerabilities_df.index):
            vulnerabilities_connection, _ = database.connect_with_vulnerabilities_database(
                vulnerabilities_db_path)
            database.add_vulnerability_to_database(vulnerabilities_connection,
                                                   state.vulnerability_id,
                                                   repo_url,
                                                   vulnerability_description,
                                                   published_timestamp)

            # if it was not an NVD CVE, or the extraction failed
            if len(references) == 0:
                try:
                    # Only the references of the NVD entry are used here.
                    _, _, references = database.extract_nvd_content(
                        state.vulnerability_id)
                    references = list(references)
                except Exception:
                    references = st.text_input(
                        "Please provide useful references (separated by commas)"
                    )
                    references = references.split(',')

            database.add_vulnerability_references_to_database(
                vulnerabilities_connection,
                state.vulnerability_id,
                references,
                driver=None)

            prospector_connection, _ = connect_with_commits_database(
                commits_db_path)
            database.add_tags_to_database(prospector_connection,
                                          tags=None,
                                          git_repo=None,
                                          repo_url=repo_url,
                                          verbose=True)

            # Reload the cached frames so they include what was just added.
            (state.vulnerabilities_df, state.db_references_df,
             state.advisory_references_df, state.tags_df,
             state.repository_url_df,
             state.fixes_df) = load_vulnerabilities()

        # gather values
        repository_tags = gather_tags(repo_url, state.tags_df)
        versions_in_description = filter.retreive_all_versions_from_description(
            vulnerability_description)
        # dict.fromkeys removes duplicate tags while keeping their order.
        tags_in_description = list(
            dict.fromkeys(
                tag for version in versions_in_description
                for tag in filter.get_tag_for_version(repository_tags,
                                                      version)))
        references = [
            state.db_references_df.at[index, 'url']
            for index in state.db_references_df[
                state.db_references_df.vulnerability_id ==
                state.vulnerability_id].index
        ]
        advisory_references = list(state.advisory_references_df[
            state.advisory_references_df.vulnerability_id ==
            state.vulnerability_id].url)

        # allow the user to influence the filtering
        state.advanced_settings = st.checkbox("Show advanced settings",
                                              state.advanced_settings)
        if state.advanced_settings:
            # TODO: add a column to the database containing this value
            first_commit_timestamp = rank.get_first_commit_timestamp(repo_url)
            first_commit_date = datetime.fromtimestamp(
                int(first_commit_timestamp)).date()
            today = datetime.fromtimestamp(int(time.time())).date()

            # Default interval: 730 days before to 100 days after the
            # published date, clamped to [first commit, today].
            lower_bound = max(published_date - timedelta(days=730),
                              first_commit_date)
            upper_bound = min(published_date + timedelta(days=100), today)
            since, until = st.slider("Published date based interval",
                                     min_value=first_commit_date,
                                     max_value=today,
                                     value=(lower_bound, upper_bound))
            since = int(time.mktime(since.timetuple()))
            until = int(time.mktime(until.timetuple()))

            # references
            additional_references = st.text_input(
                "Additional references (separated by commas)")
            if additional_references:
                references += additional_references.split(',')
                vulnerabilities_connection, _ = database.connect_with_vulnerabilities_database(
                    vulnerabilities_db_path)
                database.add_vulnerability_references_to_database(
                    vulnerabilities_connection,
                    state.vulnerability_id,
                    references,
                    driver=None)
            selected_references = st.multiselect('Advisory references',
                                                 tuple(references),
                                                 default=tuple(references))

            # affected versions
            relevant_tags = st.multiselect(
                'Relevant tags',
                tuple(repository_tags),
                default=tuple(tags_in_description)
                if tags_in_description else None)

            # number of results to show
            k = st.number_input("The number of results to show",
                                min_value=1,
                                max_value=50,
                                value=10,
                                step=1)
        else:
            selected_references = references
            relevant_tags = tags_in_description
            since, until = None, None
            k = 10

        if st.button("Search prospects!"):
            model, min_max_scaler = load_model()
            prospector_connection, prospector_cursor = connect_with_commits_database(
                commits_db_path)

            preprocessed_description = rank.simpler_filter_text(
                vulnerability_description)

            # Preprocessed content of the selected references only.
            references_content = tuple(state.db_references_df[
                (state.db_references_df.vulnerability_id ==
                 state.vulnerability_id)
                & (state.db_references_df.url.isin(selected_references))].
                                       preprocessed_content)
            references_content = rank.extract_n_most_occurring_words(
                rank.remove_forbidden_words_from_string(
                    string=' '.join(references_content),
                    forbidden_words=rank.reference_stopwords +
                    project_name.split(' ')),
                n=20)
            st.write(references_content)

            advisory_record = rank.Advisory_record(
                state.vulnerability_id,
                published_timestamp,
                repo_url,
                selected_references,
                references_content,
                advisory_references,
                vulnerability_description,
                prospector_connection,
                preprocessed_vulnerability_description=preprocessed_description,
                relevant_tags=relevant_tags,
                verbose=True,
                since=since,
                until=until)

            print(
                "\nGathering candidate commits and computing ranking vectors.")
            advisory_record.gather_candidate_commits()
            advisory_record.compute_ranking_vectors()

            # scaling some columns using the pretrained scaler, and some
            # vulnerability specific (scaler fitted on these candidates)
            advisory_record.ranking_vectors[
                vulnerability_specific_columns] = MinMaxScaler().fit_transform(
                    advisory_record.
                    ranking_vectors[vulnerability_specific_columns])
            advisory_record.ranking_vectors[
                universal_columns] = min_max_scaler.transform(
                    advisory_record.ranking_vectors[universal_columns])
            advisory_record.ranking_vectors.drop(columns=columns_to_drop,
                                                 inplace=True)

            advisory_record.ranked_candidate_commits = rank.rank_candidates(
                model, advisory_record.ranking_vectors)
            advisory_record.ranking_vectors.set_index('commit_id',
                                                      inplace=True)
            output = prospector_main.advisory_record_to_output(
                advisory_record, model, prospector_cursor, k=k)

            tmp_download_link = download_link(
                output,
                'Prospector_results-{}.txt'.format(state.vulnerability_id),
                "Click here to download Prospector's results as a txt file!")

            st.header("Results")
            st.markdown(tmp_download_link, unsafe_allow_html=True)
            st.write(
                "Showing the top {} candidates from {} candidates considered".
                format(k, len(advisory_record.ranking_vectors)))
            st.write(output)
def main(vulnerability_id,
         verbose,
         description=None,
         published_timestamp=None,
         repo_url=None,
         project_name=None,
         references=None,
         k=10,
         vulnerability_specific_scaling=False):
    """Run Prospector for one vulnerability and print the ranked candidates.

    For a vulnerability that is already in the vulnerabilities database,
    values that were not passed in are taken from the stored row. For a new
    vulnerability, missing values are taken from the NVD when
    ``database.extract_nvd_content`` succeeds and are otherwise requested
    on standard input; the vulnerability and its references are then
    stored. Afterwards an advisory record is created, candidate commits are
    gathered, ranking vectors are computed and scaled, and the candidates
    are ranked with the loaded model.

    Args:
        vulnerability_id: identifier used for all database lookups.
        verbose: when truthy, progress information is printed; also
            forwarded to the database helpers and the advisory record.
        description: vulnerability description, or None to look it up.
        published_timestamp: publication timestamp, or None to look it up.
        repo_url: URL of the affected repository, or None to use the stored
            one (known vulnerability) or a suggested one the user is asked
            to confirm (new vulnerability).
        project_name: name of the affected project, or None to have one
            suggested and confirmed interactively.
        references: list of reference URLs, or None to look them up.
        k: forwarded to ``advisory_record_to_output`` as ``k``.
        vulnerability_specific_scaling: forwarded to
            ``compute_ranking_vectors``; when it equals False the scaling
            is done here instead.

    Returns:
        The advisory record, or None when the user answers ``SKIP!`` to the
        prompt for a description.
    """
    model = load(model_path)
    universal_columns_scaler = load(min_max_scaler_path)

    # databases are created in the notebook database_creation.ipynb
    # the vulnerabilities database
    vulnerabilities_connection, vulnerabilities_cursor = database.connect_with_vulnerabilities_database(
        'data/prospector-vulnerabilities.db', verbose=verbose)

    # the commits database
    prospector_connection, prospector_cursor = database.connect_with_database(
        'data/prospector-commits.db', verbose=verbose)

    # if the vulnerability is already in the database
    # (kept as ``== False``: a None result must still count as "new")
    if database.if_new_vulnerability(vulnerabilities_cursor,
                                     vulnerability_id) == False:
        vulnerability = vulnerabilities_cursor.execute(
            "SELECT * FROM vulnerabilities WHERE vulnerability_id = :vulnerability_id",
            {
                'vulnerability_id': vulnerability_id
            }).fetchone()

        # keep the manually provided value if it has been provided,
        # otherwise select the one in the DB
        if repo_url is None:
            repo_url = vulnerability['repo_url']
        if published_timestamp is None:
            published_timestamp = vulnerability['published_date']

        if description is None:
            description = vulnerability['description']
            preprocessed_description = vulnerability[
                'preprocessed_description']
        else:
            preprocessed_description = rank.simpler_filter_text(description)

        if references is not None:
            database.add_vulnerability_references_to_database(
                vulnerabilities_connection,
                vulnerability_id,
                references,
                driver=None,
                verbose=verbose)
        else:
            references = [
                nvd_reference['url']
                for nvd_reference in vulnerabilities_cursor.execute(
                    "SELECT url FROM vulnerability_references WHERE vulnerability_id = :vulnerability_id",
                    {'vulnerability_id': vulnerability_id})
            ]
    else:
        if verbose:
            print("Vulnerability {} is a new vulnerability".format(
                vulnerability_id))

        # gather information for the new vulnerability if needed
        if description is None or published_timestamp is None or references is None:
            try:
                nvd_description, nvd_published_timestamp, nvd_references = database.extract_nvd_content(
                    vulnerability_id)
            except Exception:
                # if the vulnerability is not in the NVD
                nvd_description, nvd_published_timestamp, nvd_references = None, None, None

            if description is None:
                if nvd_description is None:
                    print(
                        "Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability description manually."
                        .format(vulnerability_id))
                    description = input()
                    if description == "SKIP!":
                        print('skipping this one')
                        # Release both connections before leaving early.
                        vulnerabilities_connection.close()
                        prospector_connection.close()
                        return
                else:
                    description = nvd_description

            if published_timestamp is None:
                if nvd_published_timestamp is None:
                    print(
                        "Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability timestamp manually."
                        .format(vulnerability_id))
                    published_timestamp = input()
                else:
                    published_timestamp = nvd_published_timestamp

            if references is None:
                if nvd_references is None:
                    print(
                        "Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability references manually (comma seperated)."
                        .format(vulnerability_id))
                    references = input()
                    references = references.split(',')
                else:
                    references = nvd_references

        # determine the repo_url
        if repo_url is None:
            if verbose:
                print('Suggesting a repository URL')
            repo_url = rank.map_description_to_repository_url(
                vulnerabilities_connection, vulnerability_id, description)
            print(
                'Does the vulnerability affect the following repository: {} [Y/n]'
                .format(repo_url))
            choice = input()
            # TODO: can be a while, where it is either yes or no, not enter
            if choice.lower() in ['', 'y', 'yes']:
                print('Confirmed')
            else:
                print('Provide the (GitHub) URL of the affected repository:')
                repo_url = input()
        # Strip a trailing ".git" or "/" from the URL.
        repo_url = re.sub(r'\.git$|/$', '', repo_url)
        print('repo_url:', repo_url)

        # add to the database
        preprocessed_description = rank.simpler_filter_text(description)
        with vulnerabilities_connection:
            vulnerabilities_cursor.execute(
                "INSERT INTO vulnerabilities VALUES (:vulnerability_id, :repo_url, :description, :published_timestamp, :preprocessed_description)",
                {
                    'vulnerability_id': vulnerability_id,
                    'repo_url': repo_url,
                    'description': description,
                    'published_timestamp': str(published_timestamp),
                    'preprocessed_description': preprocessed_description
                })

        # add the references to the database
        database.add_vulnerability_references_to_database(
            vulnerabilities_connection,
            vulnerability_id,
            references,
            driver=None,
            verbose=verbose)

    # determine the project_name
    if project_name is None:
        if verbose:
            print('Suggesting a project name')
        project_name = rank.extract_project_name_from_repository_url(repo_url)
        print('Does the vulnerability affect the following project: {} [Y/n]'.
              format(project_name))
        choice = input()
        # TODO: can be a while, where it is either yes or no, not enter
        if choice.lower() in ['', 'y', 'yes']:
            print('Confirmed')
        else:
            print('Provide the name of the affected project:')
            project_name = input()

    # Reference URLs can be typed in by the user, so they are bound as
    # parameters instead of being formatted into the SQL text. Only the
    # "?" placeholders (one per URL) are interpolated.
    url_placeholders = ', '.join('?' for _ in references)
    references_content = tuple(
        pd.read_sql(
            "SELECT vulnerability_id, url, preprocessed_content FROM vulnerability_references WHERE url IN ({}) and vulnerability_id = ?"
            .format(url_placeholders),
            vulnerabilities_connection,
            params=list(references) + [vulnerability_id]).preprocessed_content)
    references_content = rank.extract_n_most_occurring_words(
        rank.remove_forbidden_words_from_string(
            string=' '.join(references_content),
            forbidden_words=rank.reference_stopwords +
            project_name.split(' ')),
        n=20)

    # TODO: now adding all advisory references --> change to only using the
    # provided references
    advisory_references = [
        advisory_reference['url']
        for advisory_reference in vulnerabilities_cursor.execute(
            "SELECT url FROM advisory_references WHERE vulnerability_id = :vulnerability_id",
            {'vulnerability_id': vulnerability_id})
    ]

    # creating advisory record
    advisory_record = rank.Advisory_record(
        vulnerability_id,
        published_timestamp,
        repo_url,
        references,
        references_content,
        advisory_references,
        description,
        prospector_connection,
        preprocessed_vulnerability_description=preprocessed_description,
        relevant_tags=None,
        verbose=verbose,
        since=None,
        until=None)

    if verbose:
        print("\nThe following advisory record has been created:")
        print(" - Vulnerability ID: {}".format(advisory_record.id))
        print(" - Vulnerability description: {}".format(
            advisory_record.description))
        print(" - Vulnerability published timestamp: {}".format(
            advisory_record.published_timestamp))
        print(" - Affected project: {}".format(advisory_record.project_name))
        print(" - Affected repository: {}".format(advisory_record.repo_url))
        print(" - References content extracted: {}".format(
            advisory_record.references_content))

    if verbose:
        print("\nGathering candidate commits:")
    advisory_record.gather_candidate_commits()

    if verbose:
        print("\nComputing ranking vectors:")
    advisory_record.compute_ranking_vectors(vulnerability_specific_scaling)

    # ``== False`` is kept on purpose so that only a value equal to False
    # triggers the scaling below.
    if vulnerability_specific_scaling == False:
        if verbose:
            print(
                "\nscaling some columns using the pretrained scaler, and some vulnerability specific"
            )
        advisory_record.ranking_vectors[
            vulnerability_specific_columns] = MinMaxScaler().fit_transform(
                advisory_record.ranking_vectors[vulnerability_specific_columns]
            )
        advisory_record.ranking_vectors[
            universal_columns] = universal_columns_scaler.transform(
                advisory_record.ranking_vectors[universal_columns])
    # NOTE(review): the drop is assumed to apply in both scaling modes, as
    # it does unconditionally in dashboard_page - confirm.
    advisory_record.ranking_vectors.drop(columns=columns_to_drop,
                                         inplace=True)

    if verbose:
        print("\nRanking the candidate commits:")
    advisory_record.ranked_candidate_commits = rank.rank_candidates(
        model, advisory_record.ranking_vectors)

    if verbose:
        print('\nResults:')
    advisory_record.ranking_vectors.set_index('commit_id', inplace=True)
    output = advisory_record_to_output(advisory_record,
                                       model,
                                       prospector_cursor,
                                       k=k)
    print(output)

    vulnerabilities_connection.close()
    prospector_connection.close()
    return advisory_record