def extract_commit_message_reference_content(commit_message, repo_url, driver=None):
    '''
    Can be used to find references in commit messages and extract the content
    from these references

    Input:
        commit_message (list/str): the commit message
        repo_url (str): the repository URL (when commits refer to a Git issue)
        driver: a webdriver can be provided to avoid javascript required pages

    Returns:
        list: a list containing the preprocessed content of the references that have been found
    '''
    if isinstance(commit_message, list):
        commit_message = ' '.join(commit_message)
    # normalize the repository URL (drop a trailing ".git" or "/")
    repo_url = re.sub(r'\.git$|/$', '', repo_url)

    references = rank.find_references(commit_message)
    references_content = list()

    for reference in references:
        time.sleep(0.5)  # be polite to the scraped servers
        try:
            if 'http' not in reference:
                # a bare "#123" style reference: resolve it against the repository's issue tracker
                url = repo_url + '/issues/' + reference.lstrip('#')
                r = requests.get(url)
                soup = BeautifulSoup(r.content, "html.parser")
                # check if reference is found and whether it is an issue or pull page
                if reference.lstrip('#') in r.url and ('/issues/' in r.url or '/pull/' in r.url):
                    references_content.append(
                        rank.simpler_filter_text(' '.join([
                            string for string in soup.stripped_strings
                            if string not in strings_on_every_GitHub_page
                        ])))
            else:
                if 'securityfocus.com' in reference.strip('/.'):
                    # securityfocus.com requires a selection in a menu
                    reference = reference.strip('/.') + '/discuss'
                try:
                    r = requests.get(reference.strip('.'))  # can be end of the sentence
                    soup = BeautifulSoup(r.content, "html.parser")
                    reference_content = ' '.join([string for string in soup.stripped_strings])
                    # Apache pony mail requires the webdriver to see the content
                    if 'requires JavaScript enabled' in reference_content and driver is not None:
                        driver.get(reference.strip('.'))
                        time.sleep(0.5)
                        soup = BeautifulSoup(driver.page_source, "html.parser")
                        reference_content = ' '.join([string for string in soup.stripped_strings])
                    references_content.append(rank.simpler_filter_text(reference_content))
                except Exception:
                    # plain requests failed: fall back on the webdriver when one is available
                    if driver is not None:
                        driver.get(reference.strip('.'))
                        time.sleep(0.5)
                        soup = BeautifulSoup(driver.page_source, "html.parser")
                        reference_content = ' '.join([string for string in soup.stripped_strings])
                        references_content.append(rank.simpler_filter_text(reference_content))
        except Exception:
            # best effort: one broken reference should not abort the whole extraction
            print('Failed in obtaining content for reference {}'.format(reference))
    return references_content
def test_extract_n_most_occurring_words(example_commit_content):
    '''
    The most frequent non-stopword must be returned, both for a plain string
    and for real commit content (message as list or as joined string).
    '''
    wordy_description = rank.simpler_filter_text(
        'Messages contain fix indicating words like fixing, fix or fixes, can also contain a lot of different words. And we do not want a lot of stopwords! From this description, fix should be the returned word and and and not not not a stopword.'
    )
    assert rank.extract_n_most_occurring_words(wordy_description, n=1) == 'fix'

    message = example_commit_content['message']
    assert rank.extract_n_most_occurring_words(
        rank.simpler_filter_text(message), n=1) == 'add'
    assert rank.extract_n_most_occurring_words(
        rank.simpler_filter_text(' '.join(message)), n=1) == 'add'
def test_simpler_filter_text(example_commit_content):
    '''
    The function should be able to handle real commit content, where the
    message and diff are provided as list
    '''
    message = example_commit_content['message']
    expected = 'add changelog merge'
    # a list input and the equivalent joined string must give the same result
    assert rank.simpler_filter_text(text=message) == expected
    assert rank.simpler_filter_text(text=' '.join(message)) == expected

    sentence = 'This is an example sentence to test the functionalities of filtered_text'
    assert rank.simpler_filter_text(
        text=sentence) == 'example sentence test functionality filtered_text filter text'
def map_description_to_repository_url(vulnerability_id, description, vulnerabilities_df, repository_url_df):
    '''
    Map a vulnerability onto a repository URL.

    Input:
        vulnerability_id (str): the identifier of the vulnerability
        description (str): the vulnerability description (or project name) to match
        vulnerabilities_df (pd.DataFrame/None): known vulnerabilities, indexed by vulnerability_id
        repository_url_df (pd.DataFrame): columns 'repo_url' and 'project_name'

    Returns:
        str/None: the stored URL for a known vulnerability, otherwise the URL whose
            project name is lexically most similar to the description; None when
            vulnerabilities_df is None
    '''
    # if the vulnerabilities df is empty there is nothing stored to look up
    if vulnerabilities_df is None:
        return
    # known vulnerability: return the URL stored in the database
    if vulnerability_id in list(vulnerabilities_df.index):
        return vulnerabilities_df.at[vulnerability_id, 'repo_url']

    # else return the url with the highest lexical (TF-IDF cosine) similarity
    repo_urls = list(repository_url_df.repo_url)
    project_names = list(repository_url_df.project_name)
    preprocessed_description = rank.simpler_filter_text([
        re.sub(r'[^\w]', ' ', token.text) for token in nlp(description)
    ]).lower()
    # first row is the description, the rest are the candidate project names
    tfidf_vectorized_strings = TfidfVectorizer().fit_transform(
        [preprocessed_description] + project_names)
    scores = {
        repo_url: cosine_similarity(tfidf_vectorized_strings[0],
                                    tfidf_vectorized_strings[i + 1])[0][0]
        for i, repo_url in enumerate(repo_urls)
    }
    # first URL with the maximum score (same tie-breaking as a stable reverse sort)
    return max(scores, key=scores.get)
def add_vulnerability_to_database(connection, vulnerability_id, repo_url, description=None, published_timestamp=None, references=None, driver=None, verbose=True):
    '''
    Add a vulnerability (and its references) to the vulnerabilities database.

    Input:
        connection (sqlite3.connection): the connection with the database
        vulnerability_id (str): the identifier of the vulnerability
        repo_url (str): the repository url
        description (str): the description of the vulnerability can be provided manually, or will be extracted from the NVD
        published_timestamp (str): vulnerability published timestamp can be provided manually, or will be extracted from the NVD
        references (list): vulnerability references can be provided manually, or will be extracted from the NVD
        driver: i.e. a chromedriver can be provided to scrape with when requests does not succeed
        verbose (bool): "Definition of verbose: containing more words than necessary: WORDY"

    Raises:
        ValueError: when the vulnerability cannot be found in the NVD and the
            description, published timestamp or references are not provided manually
    '''
    if isinstance(published_timestamp, int):
        published_timestamp = str(published_timestamp)

    # preprocess repo_url entry (drop a trailing ".git" or "/")
    repo_url = re.sub(r'\.git$|/$', '', repo_url)
    cursor = connection.cursor()

    if if_new_vulnerability(cursor, vulnerability_id):
        # gather information for the new vulnerability if needed
        if description is None or published_timestamp is None or references is None:
            try:
                nvd_description, nvd_published_timestamp, nvd_references = extract_nvd_content(vulnerability_id)
            except Exception:
                # the vulnerability is not in the NVD
                nvd_description, nvd_published_timestamp, nvd_references = None, None, None

            if description is None:
                if nvd_description is None:
                    raise ValueError("Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability description manually.".format(vulnerability_id))
                description = nvd_description
            if published_timestamp is None:
                if nvd_published_timestamp is None:
                    raise ValueError("Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability published timestamp manually.".format(vulnerability_id))
                published_timestamp = nvd_published_timestamp
            if references is None:
                if nvd_references is None:
                    raise ValueError("Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a advisory references manually.".format(vulnerability_id))
                references = nvd_references

        # add to the database
        preprocessed_description = rank.simpler_filter_text(description)
        with connection:
            cursor.execute(
                "INSERT INTO vulnerabilities VALUES (:vulnerability_id, :repo_url, :description, :published_timestamp, :preprocessed_description)",
                {
                    'vulnerability_id': vulnerability_id,
                    'repo_url': repo_url,
                    'description': description,
                    'published_timestamp': str(published_timestamp),
                    'preprocessed_description': preprocessed_description
                })

        # add the references to the database
        if references is not None and len(references) > 0:
            add_vulnerability_references_to_database(connection, vulnerability_id, references, driver=driver, verbose=verbose)
    elif verbose:
        print(" There is already a vulnerability with ID {} in the database".format(vulnerability_id))
    cursor.close()
    return
def get_vulnerability_data(vulnerability_id, vulnerabilities_df, db_references_df):
    '''
    Collect the data for one vulnerability, preferring the local database over the NVD.

    Input:
        vulnerability_id (str): the identifier of the vulnerability
        vulnerabilities_df (pd.DataFrame/None): known vulnerabilities, indexed by vulnerability_id
        db_references_df (pd.DataFrame): reference rows with 'vulnerability_id' and 'url' columns

    Returns:
        tuple: (cve_description, cve_published_timestamp, preprocessed_description, references)
    '''
    if vulnerabilities_df is not None and vulnerability_id in list(vulnerabilities_df.index):
        # vulnerability is in the local database
        repo_url, cve_description, cve_published_timestamp, preprocessed_description = vulnerabilities_df.loc[vulnerability_id]
        # cve_project_name = ' '.join(re.split('/|-|\.', cve_repo_url.lstrip('https?://')))
        references = list(db_references_df[db_references_df.vulnerability_id == vulnerability_id].url)
    else:
        # fall back on the NVD
        cve_description, cve_published_timestamp, references = database.extract_nvd_content(vulnerability_id)
        # NOTE(review): the NVD references are immediately discarded here —
        # presumably so the caller gathers/stores them itself; TODO confirm
        references = []
        preprocessed_description = rank.simpler_filter_text(cve_description)
    return cve_description, cve_published_timestamp, preprocessed_description, references
def load_vulnerabilities():
    '''
    Load all vulnerability-related tables from the commits and vulnerabilities databases.

    Returns:
        tuple: (vulnerabilities_df, db_references_df, advisory_references_df,
                tags_df, repository_url_df, fixes_df)
    '''
    prospector_connection, prospector_cursor = database.connect_with_database(
        commits_db_path)
    vulnerabilities_connection, vulnerabilities_cursor = database.connect_with_vulnerabilities_database(
        vulnerabilities_db_path)

    print("Reading vulnerabilities")
    vulnerabilities_df = pd.read_sql(
        "SELECT * FROM vulnerabilities",
        vulnerabilities_connection).set_index("vulnerability_id")
    db_references_df = pd.read_sql(
        "SELECT vulnerability_id, url, preprocessed_content FROM vulnerability_references",
        vulnerabilities_connection)
    advisory_references_df = pd.read_sql(
        "SELECT vulnerability_id, url FROM advisory_references",
        vulnerabilities_connection)
    fixes_df = pd.read_sql("SELECT * FROM fix_commits",
                           vulnerabilities_connection)
    tags_df = pd.read_sql("SELECT * FROM tags", prospector_connection)

    # Create repository_url_df: one row per unique repository URL with a
    # normalized project name (hosting boilerplate tokens are removed).
    # Built as a list of rows in one pass instead of incremental .at writes
    # followed by a second apply() pass.
    hosting_tokens = ['github', 'com', 'git', 'org']
    rows = []
    for repo_url in vulnerabilities_df.repo_url.unique():
        project_name = rank.simpler_filter_text(
            re.sub(r'^https?://|[^\w]', ' ', repo_url)).lower()
        project_name = ' '.join(token for token in project_name.split(' ')
                                if token not in hosting_tokens)
        rows.append({'repo_url': repo_url, 'project_name': project_name})
    repository_url_df = pd.DataFrame(rows, columns=['repo_url', 'project_name'])

    return vulnerabilities_df, db_references_df, advisory_references_df, tags_df, repository_url_df, fixes_df
def dashboard_page(state):
    '''
    Render the main Prospector Streamlit dashboard page.

    Input:
        state: the Streamlit session state object, used to persist values
            (vulnerability id, loaded dataframes, confirmation flags) across reruns

    Side effects:
        Draws the page; may insert vulnerabilities, references and tags into the
        databases and refresh the dataframes stored on `state`.
    '''
    # --- page header and usage instructions ---
    st.title("PROSPECTOR")
    st.subheader(
        "The search engine for fix-commits for security vulnerabilities in OSS"
    )
    st.write('By SAP - Antonino SABETTA & Daan HOMMERSOM')
    st.write(''' How to use Prospector: \n1) Provide a vulnerability description, (GitHub) repository URL and a release date (or pick a CVE). \n2) Check whether Prospector fills in the rest correctly, and provide additional information if needed. \n3) Find security fixes! ''')
    # with st.beta_expander(label="Find out more", expanded=False):
    st.write(''' The objective of Prospector is to minimize the (manual) effort needed for finding the fix commit of a known vulnerability in an open-source software project. Since these repositories can contain hundreds thousands commits, the commits are firstly filtered by only selecting all commits within two years before and one hundred days after the release date with a maximum of respectively 5215 and 100 commits. A study has shown that this selection has 93% recall. \n Firstly, an advisory record is created containing information on the vulnerability. This advisory record is used to select candidate commits. For these candidate commits, ranking vectors are computed. These ranking vectors consist of several components that can be used to predict whether a candidate commit is the fix commit we are looking for. These candidates are then ranked on this probability score. In 77.68% of the cases, the fix is in the top 5. In 84.03% in the top 10, and in 88.59% in the top 20. ''')

    # --- advisory record input widgets ---
    st.subheader("ADVISORY RECORD")
    state.vulnerability_id = st.text_input(
        "Vulnerability identifyer:",
        value=state.vulnerability_id if state.vulnerability_id else '').upper()
    if state.vulnerability_id:
        try:
            # known CVE: prefill description / timestamp / references from DB or NVD
            cve_description, cve_published_timestamp, preprocessed_description, references = get_vulnerability_data(
                state.vulnerability_id, state.vulnerabilities_df,
                state.db_references_df)
        except:
            # lookup failed: fall back on manually provided references
            references = st.text_input(
                "Please provide useful references (separated by commas)")
            references = references.split(',')
            cve_description, cve_published_timestamp, preprocessed_description = '', time.time(
            ), None
    else:
        cve_description, cve_published_timestamp, preprocessed_description, references = '', time.time(
        ), None, []
    vulnerability_description = st.text_area("Vulnerability description",
                                             value=cve_description)
    # suggest a project name from the proper nouns (NNP) found in the description
    project_name = st.text_input(
        "Project name",
        value=' '.join([
            token.text for token in nlp(vulnerability_description)
            if token.tag_ == 'NNP'
        ]))
    repo_url = st.text_input(
        "Repository URL",
        value=map_description_to_repository_url(
            vulnerability_id=state.vulnerability_id,
            description=project_name,
            vulnerabilities_df=state.vulnerabilities_df,
            repository_url_df=state.repository_url_df)
        if project_name != '' else '')
    published_date = st.date_input("Vulnerability published date",
                                   value=datetime.fromtimestamp(
                                       int(cve_published_timestamp)))
    published_timestamp = int(time.mktime(published_date.timetuple()))
    # once confirmed, stay confirmed across Streamlit reruns
    state.advisory_record_confirmed = st.button(
        "CONFIRM ADVISORY RECORD"
    ) if not state.advisory_record_confirmed else True

    if state.advisory_record_confirmed:
        # option to clear the state
        if st.button("CLEAR FIELDS"):
            state.clear()

        # if it was a new vulnerability, add it to the DB
        if type(state.vulnerabilities_df) == type(
                None) or state.vulnerability_id not in list(
                    state.vulnerabilities_df.index):
            vulnerabilities_connection, vulnerabilities_cursor = database.connect_with_vulnerabilities_database(
                vulnerabilities_db_path)
            database.add_vulnerability_to_database(vulnerabilities_connection,
                                                   state.vulnerability_id,
                                                   repo_url,
                                                   vulnerability_description,
                                                   published_timestamp)
            # if it was not an NVD CVE, or the extraction failed
            if len(references) == 0:
                try:
                    cve_description, cve_published_timestamp, references = database.extract_nvd_content(
                        state.vulnerability_id)
                    references = [reference for reference in references]
                except:
                    references = st.text_input(
                        "Please provide useful references (separated by commas)"
                    )
                    references = references.split(',')
            database.add_vulnerability_references_to_database(
                vulnerabilities_connection,
                state.vulnerability_id,
                references,
                driver=None)
            prospector_connection, prospector_cursor = connect_with_commits_database(
                commits_db_path)
            database.add_tags_to_database(prospector_connection,
                                          tags=None,
                                          git_repo=None,
                                          repo_url=repo_url,
                                          verbose=True)
            # reload all dataframes so the freshly added vulnerability shows up
            state.vulnerabilities_df, state.db_references_df, state.advisory_references_df, state.tags_df, state.repository_url_df, state.fixes_df = load_vulnerabilities(
            )

        # gather values
        repository_tags = gather_tags(repo_url, state.tags_df)
        versions_in_description = filter.retreive_all_versions_from_description(
            vulnerability_description)
        # deduplicate while preserving order
        tags_in_description = list(
            dict.fromkeys([
                tag for version in versions_in_description
                for tag in filter.get_tag_for_version(repository_tags, version)
            ]))
        references = [
            state.db_references_df.at[index, 'url']
            for index in state.db_references_df[
                state.db_references_df.vulnerability_id ==
                state.vulnerability_id].index
        ]
        advisory_references = list(state.advisory_references_df[
            state.advisory_references_df.vulnerability_id ==
            state.vulnerability_id].url)

        # allow the user to influence the filtering
        state.advanced_settings = st.checkbox("Show advanced settings",
                                              state.advanced_settings)
        if state.advanced_settings:
            # the adding of references can be gone wrong
            first_commit_timestamp = rank.get_first_commit_timestamp(
                repo_url
            )  #@TODO: add a column to the database containing this value
            first_commit_date, today = datetime.fromtimestamp(
                int(first_commit_timestamp)).date(), datetime.fromtimestamp(
                    int(time.time())).date()
            # default window: two years before / 100 days after the published
            # date, clamped to the repository's lifetime
            lower_bound = published_date - timedelta(
                days=730) if published_date - timedelta(
                    days=730) > first_commit_date else first_commit_date
            upper_bound = published_date + timedelta(
                days=100) if published_date + timedelta(
                    days=100) < today else today
            since, until = st.slider("Published date based interval",
                                     min_value=first_commit_date,
                                     max_value=today,
                                     value=(lower_bound, upper_bound))
            since, until = int(time.mktime(since.timetuple())), int(
                time.mktime(until.timetuple()))
            # references
            additional_references = st.text_input(
                "Additional references (separated by commas)")
            if additional_references:
                references += additional_references.split(',')
                vulnerabilities_connection, vulnerabilities_cursor = database.connect_with_vulnerabilities_database(
                    vulnerabilities_db_path)
                database.add_vulnerability_references_to_database(
                    vulnerabilities_connection,
                    state.vulnerability_id,
                    references,
                    driver=None)
            selected_references = st.multiselect('Advisory references',
                                                 tuple(references),
                                                 default=tuple(references))
            # affected versions
            relevant_tags = st.multiselect(
                'Relevant tags',
                tuple(repository_tags),
                default=tuple(tags_in_description)
                if len(tags_in_description) != 0 else None)
            # st input int k
            k = st.number_input("The number of results to show",
                                min_value=1,
                                max_value=50,
                                value=10,
                                step=1)
        else:
            # defaults when the advanced settings are hidden
            selected_references = references
            relevant_tags = tags_in_description
            since, until = None, None
            k = 10

        # st.write('vulnerability_description:', vulnerability_description)
        # st.write('references_content:', references_content)
        # st.write('vulnerability_id:', state.vulnerability_id)
        # st.write('since - published_timestamp - until:', since, published_timestamp, until)
        # st.write('repo_url:', repo_url)
        # # st.write('references:', references)
        # # st.write('advisory_references:', advisory_references)
        # st.write('relevant_tags:', relevant_tags)

        if st.button("Search prospects!"):
            model, min_max_scaler = load_model()
            prospector_connection, prospector_cursor = connect_with_commits_database(
                commits_db_path)
            preprocessed_description = rank.simpler_filter_text(
                vulnerability_description)
            # the 20 most frequent non-forbidden words of the selected references' content
            references_content = tuple(state.db_references_df[
                (state.db_references_df.vulnerability_id ==
                 state.vulnerability_id)
                & (state.db_references_df.url.isin(selected_references))].
                                       preprocessed_content)
            references_content = rank.extract_n_most_occurring_words(
                rank.remove_forbidden_words_from_string(
                    string=' '.join(references_content),
                    forbidden_words=rank.reference_stopwords +
                    project_name.split(' ')),
                n=20)
            st.write(references_content)
            advisory_record = rank.Advisory_record(
                state.vulnerability_id,
                published_timestamp,
                repo_url,
                selected_references,
                references_content,
                advisory_references,
                vulnerability_description,
                prospector_connection,
                preprocessed_vulnerability_description=preprocessed_description,
                relevant_tags=relevant_tags,
                verbose=True,
                since=since,
                until=until)
            print(
                "\nGathering candidate commits and computing ranking vectors.")
            advisory_record.gather_candidate_commits()
            advisory_record.compute_ranking_vectors()
            # scaling some columns using the pretrained scaler, and some vulnerability specific
            advisory_record.ranking_vectors[
                vulnerability_specific_columns] = MinMaxScaler().fit_transform(
                    advisory_record.
                    ranking_vectors[vulnerability_specific_columns])
            advisory_record.ranking_vectors[
                universal_columns] = min_max_scaler.transform(
                    advisory_record.ranking_vectors[universal_columns])
            advisory_record.ranking_vectors.drop(columns=columns_to_drop,
                                                 inplace=True)
            # rank the candidates and render the results
            advisory_record.ranked_candidate_commits = rank.rank_candidates(
                model, advisory_record.ranking_vectors)
            advisory_record.ranking_vectors.set_index('commit_id',
                                                      inplace=True)
            output = prospector_main.advisory_record_to_output(
                advisory_record, model, prospector_cursor, k=k)
            tmp_download_link = download_link(
                output,
                'Prospector_results-{}.txt'.format(state.vulnerability_id),
                "Click here to download Prospector's results as a txt file!")
            st.header("Results")
            st.markdown(tmp_download_link, unsafe_allow_html=True)
            st.write(
                "Showing the top {} candidates from {} candidates considered".
                format(k, len(advisory_record.ranking_vectors)))
            st.write(output)
def main(vulnerability_id,
         verbose,
         description=None,
         published_timestamp=None,
         repo_url=None,
         project_name=None,
         references=None,
         k=10,
         vulnerability_specific_scaling=False):
    '''
    Run the full Prospector pipeline for one vulnerability from the command line.

    Input:
        vulnerability_id (str): the identifier of the vulnerability
        verbose (bool): whether to print progress information
        description (str): optional manual description; otherwise taken from the DB / NVD / stdin
        published_timestamp: optional manual timestamp; otherwise taken from the DB / NVD / stdin
        repo_url (str): optional repository URL; otherwise suggested and confirmed interactively
        project_name (str): optional project name; otherwise suggested and confirmed interactively
        references (list): optional references; otherwise taken from the DB / NVD / stdin
        k (int): the number of candidate commits to show
        vulnerability_specific_scaling (bool): when False the pretrained universal scaler is applied

    Returns:
        the created rank.Advisory_record with ranked candidate commits
        (None when the user types "SKIP!" at the description prompt)
    '''
    model = load(model_path)
    universal_columns_scaler = load(min_max_scaler_path)

    # databases are created in the notebook database_creation.ipynb
    # the vulnerabilities database
    vulnerabilities_connection, vulnerabilities_cursor = database.connect_with_vulnerabilities_database(
        'data/prospector-vulnerabilities.db', verbose=verbose)
    # the commits database
    prospector_connection, prospector_cursor = database.connect_with_database(
        'data/prospector-commits.db', verbose=verbose)

    # if the vulnerability is already in the database
    if database.if_new_vulnerability(vulnerabilities_cursor,
                                     vulnerability_id) == False:
        vulnerability = vulnerabilities_cursor.execute(
            "SELECT * FROM vulnerabilities WHERE vulnerability_id = :vulnerability_id",
            {
                'vulnerability_id': vulnerability_id
            }).fetchone()
        # keep the manually provided value if it has been provided, otherwise select the one in the DB
        repo_url = repo_url if repo_url != None else vulnerability['repo_url']
        published_timestamp = published_timestamp if published_timestamp != None else vulnerability[
            'published_date']
        if description == None:
            description = vulnerability['description']
            preprocessed_description = vulnerability[
                'preprocessed_description']
        else:
            preprocessed_description = rank.simpler_filter_text(description)
        if references != None:
            # manually provided references: store them alongside the vulnerability
            database.add_vulnerability_references_to_database(
                vulnerabilities_connection,
                vulnerability_id,
                references,
                driver=None,
                verbose=verbose)
        else:
            references = references if references != None else [
                nvd_reference['url']
                for nvd_reference in vulnerabilities_cursor.execute(
                    "SELECT url FROM vulnerability_references WHERE vulnerability_id = :vulnerability_id",
                    {'vulnerability_id': vulnerability_id})
            ]
    else:
        if verbose:
            print("Vulnerability {} is a new vulnerability".format(
                vulnerability_id))

        # gather information for the new vulnerability if needed,
        # falling back on interactive stdin prompts when the NVD has no entry
        if description == None or published_timestamp == None or references == None:
            try:
                nvd_description, nvd_published_timestamp, nvd_references = database.extract_nvd_content(
                    vulnerability_id)
            except:  #if the vulnerability is not in the NVD
                nvd_description, nvd_published_timestamp, nvd_references = None, None, None
            if description == None:
                if nvd_description == None:
                    # raise ValueError("Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability description manually.".format(vulnerability_id))
                    print(
                        "Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability description manually."
                        .format(vulnerability_id))
                    description = input()
                    if description == "SKIP!":
                        print('skipping this one')
                        return
                else:
                    description = nvd_description
            if published_timestamp == None:
                if nvd_published_timestamp == None:
                    # raise ValueError("Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability timestamp manually.".format(vulnerability_id))
                    print(
                        "Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability timestamp manually."
                        .format(vulnerability_id))
                    published_timestamp = input()
                else:
                    published_timestamp = nvd_published_timestamp
            if references == None:
                if nvd_references == None:
                    # raise ValueError("Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability description manually.".format(vulnerability_id))
                    print(
                        "Since the provided vulnerability ID {} cannot be found in the NVD, you must provide a vulnerability references manually (comma seperated)."
                        .format(vulnerability_id))
                    references = input()
                    references = references.split(',')
                else:
                    references = nvd_references

        # determine the repo_url
        if repo_url == None:
            if verbose:
                print('Suggesting a repository URL')
            repo_url = rank.map_description_to_repository_url(
                vulnerabilities_connection, vulnerability_id, description)
            print(
                'Does the vulnerability affect the following repository: {} [Y/n]'
                .format(repo_url))
            choice = input()
            if choice.lower() in [
                    '', 'y', 'yes'
            ]:  #@TODO: can be a while, where it is either yes or no, not enter
                print('Confirmed')
            else:
                print('Provide the (GitHub) URL of the affected repository:')
                repo_url = input()
            # normalize the URL (drop a trailing ".git" or "/")
            repo_url = re.sub('\.git$|/$', '', repo_url)
            print('repo_url:', repo_url)

        # add to the database
        preprocessed_description = rank.simpler_filter_text(description)
        with vulnerabilities_connection:
            vulnerabilities_cursor.execute(
                "INSERT INTO vulnerabilities VALUES (:vulnerability_id, :repo_url, :description, :published_timestamp, :preprocessed_description)",
                {
                    'vulnerability_id': vulnerability_id,
                    'repo_url': repo_url,
                    'description': description,
                    'published_timestamp': str(published_timestamp),
                    'preprocessed_description': preprocessed_description
                })
        # add the references to the database
        database.add_vulnerability_references_to_database(
            vulnerabilities_connection,
            vulnerability_id,
            references,
            driver=None,
            verbose=verbose)

    # determine the project_name
    if project_name == None:
        if verbose:
            print('Suggesting a project name')
        project_name = rank.extract_project_name_from_repository_url(repo_url)
        print('Does the vulnerability affect the following project: {} [Y/n]'.
              format(project_name))
        choice = input()
        if choice.lower() in [
                '', 'y', 'yes'
        ]:  #@TODO: can be a while, where it is either yes or no, not enter
            print('Confirmed')
        else:
            print('Provide the name of the affected project:')
            project_name = input()

    # build a SQL tuple literal; a one-element Python tuple would render as
    # ('url',) which is not valid SQL, hence the special case
    # NOTE(review): an empty references list renders as "()" which is invalid
    # SQL — presumably references is never empty here; TODO confirm
    references_for_query = ''
    if len(references) == 1:
        references_for_query = "('" + references[0] + "')"
    else:
        references_for_query = tuple(references)
    references_content = tuple(
        pd.read_sql(
            "SELECT vulnerability_id, url, preprocessed_content FROM vulnerability_references WHERE url IN {} and vulnerability_id = '{}'"
            .format(references_for_query, vulnerability_id),
            vulnerabilities_connection).preprocessed_content)
    # keep the 20 most frequent non-forbidden words of the references' content
    references_content = rank.extract_n_most_occurring_words(
        rank.remove_forbidden_words_from_string(
            string=' '.join(references_content),
            forbidden_words=rank.reference_stopwords +
            project_name.split(' ')),
        n=20)

    # @TODO: now adding all advisory references --> change to only using the provided references
    advisory_references = [
        advisory_reference['url']
        for advisory_reference in vulnerabilities_cursor.execute(
            "SELECT url FROM advisory_references WHERE vulnerability_id = :vulnerability_id",
            {'vulnerability_id': vulnerability_id})
    ]

    # creating advisory record
    advisory_record = rank.Advisory_record(
        vulnerability_id,
        published_timestamp,
        repo_url,
        references,
        references_content,
        advisory_references,
        description,
        prospector_connection,
        preprocessed_vulnerability_description=preprocessed_description,
        relevant_tags=None,
        verbose=verbose,
        since=None,
        until=None)
    if verbose:
        print("\nThe following advisory record has been created:")
        print(" - Vulnerability ID: {}".format(advisory_record.id))
        print(" - Vulnerability description: {}".format(
            advisory_record.description))
        print(" - Vulnerability published timestamp: {}".format(
            advisory_record.published_timestamp))
        print(" - Affected project: {}".format(advisory_record.project_name))
        print(" - Affected repository: {}".format(advisory_record.repo_url))
        print(" - References content extracted: {}".format(
            advisory_record.references_content))

    if verbose:
        print("\nGathering candidate commits:")
    advisory_record.gather_candidate_commits()

    if verbose:
        print("\nComputing ranking vectors:")
    advisory_record.compute_ranking_vectors(vulnerability_specific_scaling)

    if vulnerability_specific_scaling == False:
        if verbose:
            print(
                "\nscaling some columns using the pretrained scaler, and some vulnerability specific"
            )
        advisory_record.ranking_vectors[
            vulnerability_specific_columns] = MinMaxScaler().fit_transform(
                advisory_record.ranking_vectors[vulnerability_specific_columns]
            )
        advisory_record.ranking_vectors[
            universal_columns] = universal_columns_scaler.transform(
                advisory_record.ranking_vectors[universal_columns])
    advisory_record.ranking_vectors.drop(columns=columns_to_drop,
                                         inplace=True)

    if verbose:
        print("\nRanking the candidate commits:")
    advisory_record.ranked_candidate_commits = rank.rank_candidates(
        model, advisory_record.ranking_vectors)

    if verbose:
        print('\nResults:')
    advisory_record.ranking_vectors.set_index('commit_id', inplace=True)
    output = advisory_record_to_output(advisory_record,
                                       model,
                                       prospector_cursor,
                                       k=k)
    print(output)

    # # succeeded
    vulnerabilities_connection.close()
    prospector_connection.close()
    return advisory_record
def add_commits_to_database(connection, commit_ids, git_repo=None, repository_url=None, driver=None, with_message_references_content=False, verbose=True):
    '''
    Add commits to the database

    Input:
        connection (sqlite3.connection): the connection to the database
        commit_ids (list): a list of commit_ids
        git_repo (git_explorer.core.Git): to use for extracting the content
        repository_url (str): if git_repo is not provided, a repository url is needed to initialize the git_repo
        driver: a webdriver can be provided to avoid javascript required pages
        with_message_references_content (bool): to add commits references (requires additional time)
        verbose (bool): "Definition of verbose: containing more words than necessary: WORDY"

    Raises:
        ValueError: when neither a git_repo nor a repository_url is provided
    '''
    if git_repo is None and repository_url is None:
        raise ValueError('Provide a git_repo or a repository_url')
    if git_repo is None:
        git_repo = Git(repository_url, cache_path=GIT_CACHE)
        git_repo.clone(skip_existing=True)
    if repository_url is None:
        repository_url = git_repo.get_url()
    # normalize the repository URL (drop a trailing ".git" or "/")
    repository_url = re.sub(r'\.git$|/$', '', repository_url)

    if isinstance(commit_ids, str):
        commit_ids = [commit_ids]
    if len(commit_ids) == 0:
        print('No commit IDs were provided')
        return

    cursor = connection.cursor()

    # to not add duplicates
    commit_ids = list(dict.fromkeys(commit_ids))  # to get only unique ids
    # the first id is duplicated so the rendered SQL tuple stays valid for a
    # single id (no trailing comma); values come from git, not user input
    commits_already_in_the_db = list(
        pd.read_sql(
            "SELECT id FROM commits WHERE id IN {} and repository_url = '{}'".
            format(tuple(commit_ids + [commit_ids[0]]), repository_url),
            connection).id)
    commits_to_add = [
        commit_id for commit_id in commit_ids
        if commit_id not in commits_already_in_the_db
    ]
    if len(commits_to_add) == 0:
        cursor.close()
        return
    if verbose:
        print(' {} / {} are already in the database, now adding the rest.'.
              format(len(commits_already_in_the_db), len(commit_ids)))

    for commit_id in tqdm(commits_to_add):
        try:
            # initialize commit object
            commit = Commit(git_repo, commit_id)

            # message execution is combined with timestamp execution to speed up to process
            message = commit._exec.run(
                ['git', 'log', '--format=%B%n%ct', '-n1', commit._id])
            timestamp = message.pop(-1)
            diff = commit._exec.run([
                'git', 'diff', '--unified=1', commit._id + "^.." + commit._id
            ])
            changed_files = get_changed_files_from_diff(diff)
            hunks = get_hunks_from_diff(diff)

            preprocessed_message = rank.simpler_filter_text(message)
            preprocessed_diff = rank.simpler_filter_text(
                re.sub(
                    r'[^A-Za-z0-9]+', ' ',
                    ' '.join(rank.extract_relevant_lines_from_commit_diff(diff))))
            preprocessed_changed_files = rank.simpler_filter_text(changed_files)

            if with_message_references_content:
                commit_message_reference_content = extract_commit_message_reference_content(
                    message, repository_url, driver)
                preprocessed_commit_message_reference_content = rank.extract_n_most_occurring_words(
                    commit_message_reference_content, n=20)
            else:
                commit_message_reference_content, preprocessed_commit_message_reference_content = None, None

            # add to database
            with connection:
                cursor.execute(
                    "INSERT INTO commits VALUES (:repository_url, :id, :timestamp, :message, :changed_files, :diff, :hunks, :commit_message_reference_content, :preprocessed_message, :preprocessed_diff, :preprocessed_changed_files, :preprocessed_commit_message_reference_content)",
                    {
                        'repository_url': repository_url,
                        'id': commit_id,
                        'timestamp': str(timestamp),
                        'message': str(message),
                        'changed_files': str(changed_files),
                        'diff': str(diff),
                        'hunks': str(hunks),
                        'commit_message_reference_content': commit_message_reference_content,
                        'preprocessed_message': preprocessed_message,
                        'preprocessed_diff': preprocessed_diff,
                        'preprocessed_changed_files': preprocessed_changed_files,
                        'preprocessed_commit_message_reference_content': preprocessed_commit_message_reference_content
                    })
        except Exception:
            # best effort: keep processing the remaining commits
            print(' Failed to add commit {}'.format(commit_id))
    if verbose:
        print(' All commits have been added to the database.')
    cursor.close()
    return
def add_vulnerability_references_to_database(connection, vulnerability_id, references, driver=None, verbose=True):
    '''
    Scrape the content of vulnerability references and store them in the database.

    Input:
        connection (sqlite3.connection): the connection with the database
        vulnerability_id (str): the identifier of the vulnerability
        references (list): the (NVD) references
        verbose (bool): "Definition of verbose: containing more words than necessary: WORDY"
        driver: a webdriver can be provided to avoid javascript required pages
    '''
    if isinstance(references, str):
        references = [references]
    cursor = connection.cursor()

    for reference in references:
        # skip references that are already linked to this vulnerability
        if cursor.execute(
                "SELECT EXISTS(SELECT 1 FROM vulnerability_references WHERE url = :url AND vulnerability_id = :vulnerability_id LIMIT 1) AS 'exists';",
                {
                    'url': reference,
                    'vulnerability_id': vulnerability_id
                }).fetchone()['exists'] == 0:
            time.sleep(random.random())  # polite scraping delay
            if reference.endswith('.pdf') and verbose:
                print(' Skipping reference since reference is a pdf')
            elif not any(term in reference for term in test_url_terms):
                try:
                    if 'securityfocus.com' in reference.strip('/.'):
                        # securityfocus.com requires a selection in a menu
                        reference = reference.strip('/.') + '/discuss'
                    try:
                        r = requests.get(reference.strip('.'))  # can be end of the sentence
                        soup = BeautifulSoup(r.content, "html.parser")
                        reference_content = ' '.join(
                            [string for string in soup.stripped_strings])
                        # Apache pony mail requires the webdriver to see the content
                        if 'requires JavaScript enabled' in reference_content and driver is not None:
                            driver.get(reference.strip('.'))
                            time.sleep(0.5)
                            soup = BeautifulSoup(driver.page_source, "html.parser")
                            reference_content = ' '.join(
                                [string for string in soup.stripped_strings])
                    except Exception:
                        # plain requests failed: retry with the webdriver when available
                        if driver is not None:
                            driver.get(reference.strip('.'))
                            time.sleep(0.5)
                            soup = BeautifulSoup(driver.page_source, "html.parser")
                            reference_content = ' '.join(
                                [string for string in soup.stripped_strings])
                    preprocessed_reference_content = rank.simpler_filter_text(reference_content)

                    # add to database
                    with connection:
                        cursor.execute(
                            "INSERT INTO vulnerability_references VALUES (:url, :vulnerability_id, :preprocessed_content)",
                            {
                                'url': reference.strip('/'),
                                'vulnerability_id': vulnerability_id,
                                'preprocessed_content': str(preprocessed_reference_content)
                            })
                    try:
                        # add the urls referred to on these pages to the advisory references DB
                        urls_found = [
                            link.get('href').strip('/')
                            for link in soup.find_all('a')
                            if link.get('href') and 'http' in link.get('href')
                        ]
                        add_advisory_references_to_database(
                            connection, vulnerability_id, urls_found)
                    except Exception:
                        print("Failed in adding advisory references")
                except Exception:
                    print(' reference {} could not be added to the db'.format(reference))
        elif verbose:
            print(' reference {} is already in the db'.format(reference))
    cursor.close()
    return