def main(run_date):
    # keep a record of key data items so we can log what we've done
    with Auditor(data_set=JOB_NAME) as auditor:
        graph = load_cmdb_graph(JOB_NAME, SOURCE_PROJECT, SOURCE_BUCKET,
                                CMDB_GRAPH_BLOB)

        # set up a temp file for saving to
        # set the auditor to automatically track the written records
        temp_file = dt.temp_file(JOB_NAME, auditor)

        # the main processing loop
        for blob in dt.get_list_of_blobs(
                SOURCE_PROJECT, SOURCE_BUCKET,
                VM_FINDINGS_BLOB + '.*' +
                datetime.date.strftime(run_date, '%Y-%m-%d')):
            print(blob.name)
            for line in dt.read_blob_lines(SOURCE_PROJECT, SOURCE_BUCKET,
                                           blob.name):
                auditor.records_read += 1
                vm_finding = json.loads(line)
                by_ip = find_record_in_graph(graph, vm_finding.get('IP'),
                                             vm_finding.get('NETBIOS'))
                merged = {**vm_finding, **by_ip}
                temp_file.write_json_line(merged)

        blob_name = TARGET_BLOB.replace('%date', '%Y-%m-%d')
        blob_name = run_date.strftime(blob_name)
        temp_file.save_to_bucket(TARGET_PROJECT, TARGET_BUCKET, blob_name)
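# Illustrative only: find_record_in_graph() is defined elsewhere in this job. A minimal sketch of the kind
# of lookup it is assumed to perform is shown below: scan the CMDB graph's node attributes for a matching
# IP, fall back to the NETBIOS name, and return that node's attributes (or an empty dict so the merge
# above stays safe). The attribute names 'ip_address' and 'netbios' are assumptions, not the real schema.
def find_record_in_graph_sketch(graph, ip, netbios):
    """Hypothetical lookup: return the attribute dict of the first node matching ip or netbios."""
    for _, attrs in graph.nodes(data=True):
        if ip and attrs.get('ip_address') == ip:
            return dict(attrs)
    for _, attrs in graph.nodes(data=True):
        if netbios and attrs.get('netbios') == netbios:
            return dict(attrs)
    return {}  # no match; merging an empty dict leaves the finding unchanged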
def main(run_date):
    # keep a record of key data items so we can log what we've done
    with Auditor(JOB_NAME, r'../../config/va_auditor.yaml') as auditor:
        # set up a temp file to save the records to
        temp_file = dt.temp_file(JOB_NAME, auditor)

        records = {}

        # the main loop
        for blob in dt.get_list_of_blobs(
                SOURCE_PROJECT, SOURCE_BUCKET,
                SOURCE_BLOB + '.*' +
                datetime.date.strftime(run_date, '%Y-%m-%d')):
            for line in dt.read_blob_lines(SOURCE_PROJECT, SOURCE_BUCKET,
                                           blob.name):
                details = extract_details(line)
                if details[0] in records:
                    records[details[0]].append(details[1])
                else:
                    records[details[0]] = [details[1]]

        for record in records:
            json_line = {"QID": record, "CVES": records[record]}
            temp_file.write_json_line(json_line)

        blob_name = TARGET_BLOB.replace('%date', '%Y-%m-%d')
        blob_name = run_date.strftime(blob_name)
        temp_file.save_to_bucket(TARGET_PROJECT, TARGET_BUCKET, blob_name)
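# Illustrative only: extract_details() lives elsewhere in this job. Given how its result is used above
# (details[0] keys the dict, details[1] is appended to the list of CVEs), it is assumed to parse one JSON
# line and return a (QID, CVE) pair. The field names 'QID' and 'CVE' match the output record built above
# but are otherwise unverified; this sketch relies on the module's existing json import.
def extract_details_sketch(line):
    """Hypothetical parser: return (QID, CVE) from a single JSON line."""
    record = json.loads(line)
    return record.get('QID'), record.get('CVE')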
def load_cmdb_graph(JOB_NAME, SOURCE_PROJECT, SOURCE_BUCKET, SOURCE_BLOB):
    graph_file = dt.temp_file(JOB_NAME + '_GRAPH')

    # copy the graph file
    for blob in dt.get_list_of_blobs(SOURCE_PROJECT, SOURCE_BUCKET,
                                     SOURCE_BLOB):
        with open(graph_file.file_name, 'wb') as file:
            blob.download_to_file(file)

    # open the graph file
    graph = nx.read_graphml(graph_file.file_name)
    print("Status: {} nodes and {} edges".format(len(graph.nodes),
                                                 len(graph.edges)))
    return graph
def process_job(job_name, source_blob, target_blob):
    # keep a record of key data items so we can log what we've done
    with Auditor(data_set=job_name) as auditor:
        # set up a temp file for saving to
        # set the auditor to automatically track the written records
        temp_file = dt.temp_file(job_name, auditor)

        # we can't be sure today's files will be present, so look for the latest files
        for blob in dt.get_list_of_blobs(SOURCE_PROJECT, SOURCE_BUCKET,
                                         source_blob):
            # we want the whole file, so download it all at once.
            payload = blob.download_as_string()
            json_block = json.loads(payload)
            for json_record in json_block:
                auditor.records_read += 1
                temp_file.write_json_line(json_record)

        temp_file.save_to_bucket(TARGET_PROJECT, TARGET_BUCKET, target_blob)
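# A hedged usage sketch: process_job() is generic over the job name and blob paths, so a single entry
# point could republish several feeds. The job name and blob patterns below are placeholders for
# illustration only, not values from this repo's config, and the repo's real entry point is not shown here.
if __name__ == '__main__':
    process_job('example_copy_job', 'incoming/findings_.*.json', 'published/findings.json')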
def main(run_date):
    # keep a record of key data items so we can log what we've done
    auditor = Auditor(JOB_NAME, r'../../config/va_auditor.yaml')
    auditor.commencement_time = datetime.datetime.today()

    # set up a temp file for saving to
    # set the auditor to automatically track the written records
    temp_file = dt.temp_file(JOB_NAME, auditor)

    # Create QVM all report.
    # Takes in CVE summary (CVEId, CVSS data, QID, MFL/Exploit data). Key by CVEId.
    # Takes in QID-CVE map. Can search by CVE or QID, many->many relationship.
    # Takes in Asset findings (QVM == Qualys machine scan results, along with CMDB data). Key by QID, IP address.
    # Takes in QID summaries (Qualys descriptions and such like). Key by QID.
    # Once all data is available, create a triage rating based upon the OLD triage algo and add it. Then output as CSV(?)
    # Generator across Asset findings (each will have an IP, some CMDB data and a QID). Then get CVEs from the QID-CVE
    # map to get CVE summary data. Get triage based upon the compounded data from the Triage subroutine, and add any
    # QID description needed. Then output as csv (possibly? Still to do...)

    # Get CVE summary data
    CVESummaries = {}
    for blob in dt.get_list_of_blobs(
            SOURCE_PROJECT, SOURCE_BUCKET,
            CVE_SUMMARY_SOURCE_BLOB_PATH + '.*' +
            datetime.datetime.strftime(run_date, '%Y-%m-%d')):
        for line in dt.read_blob_lines(SOURCE_PROJECT, SOURCE_BUCKET,
                                       blob.name):
            data_record = json.loads(line)
            CVESummaries[data_record['CVE']] = data_record

    # Likewise QID summaries (will have the QID verbose description on it)
    QIDSummaries = {}
    for blob in dt.get_list_of_blobs(
            SOURCE_PROJECT, SOURCE_BUCKET,
            QID_SUMMARY_SOURCE_BLOB_PATH + '.*' +
            datetime.datetime.strftime(run_date, '%Y-%m-%d')):
        for line in dt.read_blob_lines(SOURCE_PROJECT, SOURCE_BUCKET,
                                       blob.name):
            data_record = json.loads(line)
            QIDSummaries[data_record['QID']] = data_record

    # And finally likewise the QID -> CVE map data. This is many <-> many, so collect it as sets of CVE Ids
    # keyed by the QID in question, as it will be searched by QID.
    CVEsForAllQIDs = {}
    for blob in dt.get_list_of_blobs(
            SOURCE_PROJECT, SOURCE_BUCKET,
            QID_CVE_SOURCE_BLOB_PATH + '.*' +
            datetime.datetime.strftime(run_date, '%Y-%m-%d')):
        for line in dt.read_blob_lines(SOURCE_PROJECT, SOURCE_BUCKET,
                                       blob.name):
            data_record = json.loads(line)
            if data_record['QID'] in CVEsForAllQIDs:
                # Add to the existing set
                CVEsForAllQIDs[data_record['QID']].add(data_record['CVE'])
            else:
                # New item on the dict, creating a new set
                CVEsForAllQIDs[data_record['QID']] = {data_record['CVE']}

    # Now parse the whole finding set, retrieving the enrichment data from the indices built above
    for blob in dt.get_list_of_blobs(
            SOURCE_PROJECT, SOURCE_BUCKET,
            ASSET_FINDINGS_SOURCE_BLOB_PATH + '.*' +
            datetime.datetime.strftime(run_date, '%Y-%m-%d')):
        for line in dt.read_blob_lines(SOURCE_PROJECT, SOURCE_BUCKET,
                                       blob.name):
            finding = json.loads(line)

            # Do some column renames where appropriate to match VSM reporting names
            finding['VulnID'] = finding.pop('QID')
            finding['ScanScore'] = finding.pop('SEVERITY')

            if ('ENVIRONMENT' in finding
                    and finding['ENVIRONMENT'] is not None
                    and finding['ENVIRONMENT'].upper()[:4] == 'PROD'):
                serverIsProduction = True
            else:
                serverIsProduction = False

            if 'CBP' in finding:
                CBP = getMaxCBP(finding['CBP'])
                # Homogenise the values
                if 'NONE' in CBP.upper():
                    CBP = ''
            else:
                CBP = ''  # Presumes no CBP if no data returned. May need to revisit

            # Return the CBP value to the findings dict so that its duplicates are eliminated
            finding['CBP'] = CBP

            # Add various keys that are missing in some cases with empty values to the
            # finding so that the output data is consistent in the fields it presents
            if 'PORT' not in finding or finding['PORT'] is None:
                finding['PORT'] = ''
            if 'SOX' not in finding or finding['SOX'] is None:
                finding['SOX'] = 'false'
            if 'STEWARD' not in finding or finding['STEWARD'] is None:
                finding['STEWARD'] = ''
            if 'CMDB_OS' not in finding or finding['CMDB_OS'] is None:
                finding['CMDB_OS'] = ''
            if 'CMDB_OS_VERSION' not in finding or finding['CMDB_OS_VERSION'] is None:
                finding['CMDB_OS_VERSION'] = ''

            # Retrieve the QID summary for the finding
            if finding['VulnID'] in QIDSummaries:
                qidSummary = QIDSummaries[finding['VulnID']]
            else:
                # Got a QID with no summary, so build a dummy one. Should really not happen.
                qidSummary = {
                    'QID': finding['VulnID'],
                    'Patchable': 'Unknown',
                    'Published_Date': 'Unknown',
                    'baseScore': 0,
                    'availabilityImpact': 'NONE',
                    'confidentialityImpact': 'NONE',
                    'integrityImpact': 'NONE',
                    'VulnerabilityName': '',
                    'Category': '',
                    'Solution': '',
                    'VendorReferences': ''
                }

            # Get all the CVEs associated with the finding (may be more than one)
            if finding['VulnID'] in CVEsForAllQIDs:
                # Generate triage based upon the matching CVE data
                CVEIdsForQID = CVEsForAllQIDs[finding['VulnID']]

                # Get all the summaries. The selector is a dictionary comprehension and reads as:
                # 'create a new dictionary from the entries of CVESummaries whose key is in CVEIdsForQID'.
                CVESummariesForQID = {
                    k: v
                    for (k, v) in CVESummaries.items() if k in CVEIdsForQID
                }

                # Get a single-line rollup of all the CVE data for the QID that can then be used for both
                # triage and return data (an illustrative sketch of such a rollup follows this function).
                cveSummaryForQID = CVESummaryForQID(CVESummariesForQID)

                # The triage relies on the highest/worst values for any of the CVEs returned,
                # so pass the rolled-up values into the triage routine.
                TriageString = Triage(cveSummaryForQID['MFL'],
                                      cveSummaryForQID['BaseScore'],
                                      cveSummaryForQID['Exploit_Known'],
                                      cveSummaryForQID['UserInteraction'],
                                      serverIsProduction, CBP,
                                      cveSummaryForQID['Confidentiality'],
                                      cveSummaryForQID['Integrity'],
                                      cveSummaryForQID['Availability'])

                # Finally, bundle the whole lot together as a dict of output data.
                data_out = dict(finding, **cveSummaryForQID)  # concatenates these dicts
            else:
                # QID has no matching CVE/CVSS data. Generate triage based off Qualys data.
                # TODO Find the correct algo for this.
                # Prepare a dict to look like the CVSS one. Score and vectors are taken from the QID summary.
                # UI is presumed to be false, as this data is not available for QID findings (and QID findings
                # tend to be stuff like unpatched software which requires no UI anyway).
                fakeCVESummary = {
                    'CVE': '',
                    'Confidentiality': qidSummary['confidentialityImpact'].upper(),
                    'Integrity': qidSummary['integrityImpact'].upper(),
                    'Availability': qidSummary['availabilityImpact'].upper(),
                    'UserInteraction': False,
                    'BaseScore': float(qidSummary['baseScore']),
                    'MFL': False,
                    'Exploit_Known': False,
                    'MFLCVEs': '',
                    'MFLCount': 0
                }

                # Prepare a Triage string based upon the QID data as loaded into the fake CVE summary above
                TriageString = Triage(fakeCVESummary['MFL'],
                                      fakeCVESummary['BaseScore'],
                                      fakeCVESummary['Exploit_Known'],
                                      fakeCVESummary['UserInteraction'],
                                      serverIsProduction, CBP,
                                      fakeCVESummary['Confidentiality'],
                                      fakeCVESummary['Integrity'],
                                      fakeCVESummary['Availability'])

                # And create the report line much as before
                data_out = dict(finding, **fakeCVESummary)  # concatenates these dicts

            # Add the required fields from the QID summary to the output
            data_out['Patchable'] = qidSummary['Patchable']
            data_out['Published_Date'] = qidSummary['Published_Date']
            data_out['VulnerabilityName'] = qidSummary.get('VulnerabilityName') or ''
            data_out['Category'] = qidSummary.get('Category') or ''
            data_out['Solution'] = qidSummary.get('Solution') or ''
            data_out['VendorReferences'] = qidSummary.get('VendorReferences') or ''

            # Add the triaged value to the return dict
            data_out['TriagedRating'] = TriageString

            # Derive the ScanType from the supplied ASSET_TYPE if it is present
            if 'ASSET_TYPE' not in finding or finding['ASSET_TYPE'] is None:
                data_out['ScanType'] = ''  # Don't set this if there is no ASSET_TYPE. May change.
            elif finding['ASSET_TYPE'] == 'server':
                data_out['ScanType'] = 'I'  # Internal
            elif finding['ASSET_TYPE'] == 'workstation':
                data_out['ScanType'] = 'E'  # Endpoint
            else:
                data_out['ScanType'] = ''  # Should never be hit, but ensures a value of some sort is returned

            # Add the derived date-based data
            data_out['ReportDate'] = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
            data_out['Cycle'] = datetime.datetime.now().strftime('%m %Y')

            firstFoundDate = datetime.datetime.strptime(
                finding['FIRST_FOUND_DATETIME'], '%Y-%m-%dT%H:%M:%SZ')
            delta = datetime.datetime.now() - firstFoundDate
            data_out['DaysSinceFirstFound'] = delta.days

            if 'High' in TriageString:
                targetRemediationDate = firstFoundDate + timedelta(weeks=4)
            elif 'Medium' in TriageString:
                # 6 months is a variable length of time; 183 days is a good approximation
                targetRemediationDate = firstFoundDate + timedelta(days=183)
            else:  # Low
                # as is one year (think leap years); again, approximate
                targetRemediationDate = firstFoundDate + timedelta(days=365)

            data_out['RemediationDue'] = targetRemediationDate.strftime('%Y-%m-%dT%H:%M:%SZ')
            data_out['TargetBreached'] = targetRemediationDate < datetime.datetime.now()

            # Other fields
            data_out['Concat'] = finding['ID'] + '-' + finding['VulnID']

            # Write out the line to the temp file (calls json.dumps to write the string out)
            temp_file.write_json_line(data_out)

    # finally write out the temp file to the bucket after incorporating the run_date
    preFormat = TARGET_BLOB.replace('%date', '%Y-%m-%d')
    destinationFile = run_date.strftime(preFormat)
    temp_file.save_to_bucket(TARGET_PROJECT, TARGET_BUCKET, destinationFile)

    # No need to explicitly remove the local file. The temp_file class has a destructor that will do that.
    temp_file = None

    auditor.completion_time = datetime.datetime.today()
    auditor.log_event()
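# Illustrative only: CVESummaryForQID() and Triage() are defined elsewhere in this job. The comments above
# say the rollup feeds triage with the highest/worst value seen across the QID's CVEs, so a sketch of that
# worst-case rollup is given here. The impact ordering and the OR-ing of MFL/Exploit_Known flags are
# assumptions about the real routine, not a copy of it; the output keys mirror fakeCVESummary above.
def cve_summary_for_qid_sketch(cve_summaries):
    """Hypothetical rollup: collapse {CVE: summary} into one worst-case summary dict."""
    impact_rank = {'': 0, 'NONE': 0, 'LOW': 1, 'PARTIAL': 1, 'HIGH': 2, 'COMPLETE': 2}

    def worst(field):
        # pick the value with the highest assumed impact ranking across all CVEs
        return max((s.get(field) or 'NONE' for s in cve_summaries.values()),
                   key=lambda v: impact_rank.get(str(v).upper(), 0),
                   default='NONE')

    mfl_cves = sorted(cve for cve, s in cve_summaries.items() if s.get('MFL'))
    return {
        'CVE': ','.join(sorted(cve_summaries)),
        'BaseScore': max((float(s.get('BaseScore') or 0) for s in cve_summaries.values()), default=0.0),
        'Confidentiality': worst('Confidentiality'),
        'Integrity': worst('Integrity'),
        'Availability': worst('Availability'),
        'UserInteraction': any(s.get('UserInteraction') for s in cve_summaries.values()),
        'MFL': bool(mfl_cves),
        'Exploit_Known': any(s.get('Exploit_Known') for s in cve_summaries.values()),
        'MFLCVEs': ','.join(mfl_cves),
        'MFLCount': len(mfl_cves),
    }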
def main():
    # keep a record of key data items so we can log what we've done
    with Auditor(JOB_NAME, r'../../config/va_auditor.yaml') as auditor:
        # set up a temp file for saving to
        # set the auditor to automatically track the written records
        temp_file = dt.temp_file(JOB_NAME, auditor)

        # create a list of the CVEs in these two sets
        mfl_blob = dt.select_file_records(SOURCE_PROJECT, SOURCE_BUCKET,
                                          MFL_LIST_BLOB)
        mfl_index = set(jl.create_index(mfl_blob, 'CVE'))
        edb_blob = dt.select_file_records(SOURCE_PROJECT, SOURCE_BUCKET,
                                          CVES_WITH_EXPLOITS_BLOB)
        edb_index = set(jl.create_index(edb_blob, 'CVE'))

        # the main loop
        for blob in dt.get_list_of_blobs(SOURCE_PROJECT, SOURCE_BUCKET,
                                         NVD_CVE_SUMMARY_BLOB):
            for nvd_cve_summary_line in dt.read_blob_lines(
                    SOURCE_PROJECT, SOURCE_BUCKET, blob.name):
                record = json.loads(nvd_cve_summary_line)
                result = {}
                result['CVE'] = record.get('CVE')

                if record['v2.0'] != {}:
                    result['Confidentiality'] = record['v2.0'].get('confidentialityImpact')
                    result['Integrity'] = record['v2.0'].get('integrityImpact')
                    result['Availability'] = record['v2.0'].get('availabilityImpact')
                    result['UserInteraction'] = record['v2.0'].get('userInteractionRequired')
                    result['BaseScore'] = record['v2.0'].get('baseScore')
                elif record['v3.0'] != {}:
                    result['Confidentiality'] = record['v3.0'].get('confidentialityImpact')
                    result['Integrity'] = record['v3.0'].get('integrityImpact')
                    result['Availability'] = record['v3.0'].get('availabilityImpact')
                    result['UserInteraction'] = record['v3.0'].get('userInteraction')
                    result['BaseScore'] = record['v3.0'].get('baseScore')
                else:
                    result['Confidentiality'] = ''
                    result['Integrity'] = ''
                    result['Availability'] = ''
                    result['UserInteraction'] = ''
                    result['BaseScore'] = ''

                # could also have been implemented by adding an MFL=True
                # column to the MFL set and joining on CVE
                result = jl.set_value(result, 'MFL',
                                      lambda x: x.get('CVE') in mfl_index)
                result = jl.set_value(result, 'Exploit_Known',
                                      lambda x: x.get('CVE') in edb_index)

                temp_file.write_json_line(result)

        # save the temp file to the bucket
        temp_file.save_to_bucket(TARGET_PROJECT, TARGET_BUCKET, TARGET_BLOB)
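# Illustrative only: jl.create_index() and jl.set_value() come from the project's json-list helpers and are
# not shown here. Going by how they are used above, rough stand-ins might look like the sketches below; the
# real signatures and behaviour may differ.
def create_index_sketch(records, field):
    """Hypothetical: yield the value of `field` from each record (used above to build CVE sets)."""
    for record in records:
        if field in record:
            yield record[field]


def set_value_sketch(record, field, value_fn):
    """Hypothetical: set record[field] from a callable applied to the record, returning the record."""
    record[field] = value_fn(record)
    return record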
def main():
    # keep a record of key data items so we can log what we've done
    auditor = Auditor(JOB_NAME, r'../../config/va_auditor.yaml')
    auditor.commencement_time = datetime.datetime.today()

    # set up a temp file for saving to
    # set the auditor to automatically track the written records
    temp_File = dt.temp_file(JOB_NAME, auditor)

    # Read in data from ExploitDB. Gives a dataframe with one row per Exploit Id
    ExploitDBDataRaw = pd.read_csv(SOURCE_URL_EXPLOITDB)
    ExploitDBData = ExploitDBDataRaw.fillna('')  # deal with missing port numbers

    # Rename the id column to match the desired column name for later use
    ExploitDBData['ExploitId'] = ExploitDBData.pop('id')

    # Read in data from the CVE-EDB Xrefs. Output a list of json lines mapping CVE to EDB Id. Note that these
    # are many to many, so keyed by both fields.
    edb = pd.read_html(SOURCE_URL_CVE_EDB_XREF)
    edb = edb[3]  # Use the fourth table, which maps EDB -> CVEs

    # Each row of this DF contains a single ExploitId and a string of one or more CVEs.
    for i in range(len(edb)):
        if len(edb[0][i].split(':')) < 2:
            continue  # Line returned has invalid data. Do not attempt to process

        # converts 'ExploitDB:<number>' into the number as an int
        try:
            sploitNumber = int(edb[0][i].split(':')[1])
        except ValueError:
            continue  # Again, data returned is invalid: it doesn't have an integer ExploitDB Id

        # Check if the Mitre EDB Id actually exists on the EDB. If it doesn't, then ignore this row.
        if sploitNumber in ExploitDBData['ExploitId'].values:
            # Split the CVEs into an array in their own right
            # (an illustrative sketch of this helper follows the function)
            CVEs = dt.find_cves(str(edb[1][i]))

            # and get the ExploitDB row that will be used to enrich each CVE.
            #
            # .loc finds for field=value. The 'records' parameter ensures that only the records, not the
            # index numbers, are taken into the output. The final [0] is because to_dict('records') generates
            # a list, not a dict.
            ExploitDBRowDict = ExploitDBData.loc[
                ExploitDBData['ExploitId'] == sploitNumber].to_dict('records')[0]

            for cve in CVEs:
                # Generate output combining the ExploitDB row with the CVEId
                row = dict({'CVE': cve}, **ExploitDBRowDict)
                temp_File.write_json_line(row)

    # finally write out the temp file to the bucket after incorporating the run_date
    preFormat = TARGET_BLOB.replace('%date', '%Y-%m-%d')
    # always write this as at today, as the source data is always as at today
    destinationFile = datetime.datetime.today().strftime(preFormat)
    temp_File.save_to_bucket(TARGET_PROJECT, TARGET_BUCKET, destinationFile)

    # No need to explicitly remove the local file. The temp_file class has a destructor that will do that.
    temp_File = None

    # Write out the two source data files so that any confirmation of validity from sources can use
    # the source data as it was at the time of creation.
    #
    # Mitre CVE -> EDB XRef table to temporary file...
    temp_File = dt.temp_file('XREF', auditor)
    for line in edb.iterrows():
        temp_File.write_text_line(line[1].to_json())

    # ... and file to blob in GVA
    preFormat = BLOB_FOR_MITRE_CVE_EXPLOITS.replace('%date', '%Y-%m-%d')
    # always write this as at today, as the source data is always as at today
    destinationFile = datetime.datetime.today().strftime(preFormat)
    temp_File.save_to_bucket(TARGET_PROJECT, TARGET_BUCKET, destinationFile)
    temp_File = None

    # ExploitDB source data to temporary file...
    temp_File = dt.temp_file('ExploitDB', auditor)
    for line in ExploitDBDataRaw.iterrows():
        temp_File.write_text_line(line[1].to_json())

    # ... and again to blob in GVA
    preFormat = BLOB_FOR_EXPLOITDB.replace('%date', '%Y-%m-%d')
    # always write this as at today, as the source data is always as at today
    destinationFile = datetime.datetime.today().strftime(preFormat)
    temp_File.save_to_bucket(TARGET_PROJECT, TARGET_BUCKET, destinationFile)
    temp_File = None

    # Tidy up auditor
    auditor.completion_time = datetime.datetime.today()
    auditor.log_event()
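# Illustrative only: dt.find_cves() is part of the shared va_dt_common helpers and is not shown here. Given
# its use above (pulling every CVE id out of a free-text cell), a plausible stand-in is a simple regex scan;
# the exact behaviour of the real helper (deduplication, ordering) is an assumption.
import re


def find_cves_sketch(text):
    """Hypothetical: return the distinct CVE identifiers found in a string, in order of appearance."""
    seen = []
    for cve in re.findall(r'CVE-\d{4}-\d{4,}', text):
        if cve not in seen:
            seen.append(cve)
    return seen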
import os
import re
import sys

sys.path.insert(1, os.path.join(sys.path[0], '..'))
sys.path.insert(1, os.path.join(sys.path[0], '../..'))

import va_dt_common.common as dt
import graph_tools as gt

try:
    import ujson as json
except ImportError:
    import json

CONFIG_FILE = 'config.yaml'
config = dt.read_config(CONFIG_FILE)

JOB_NAME = config.get('job_name')
TEMP_FILE = dt.temp_file(JOB_NAME)

SOURCE_PROJECT = config.get('source_project')
SOURCE_BUCKET = config.get('source_bucket')
CI_SERVER_BLOB = config.get('ci_server_blob')
RELATIONSHIP_BLOB = config.get('relationship_blob')
CI_SERVICE_DISCOVERED_BLOB = config.get('service_discovered_blob')
PC_HARDWARE_BLOB = config.get('pc_hardware_blob')

TARGET_PROJECT = config.get('target_project')
TARGET_BUCKET = config.get('target_bucket')
TARGET_BLOB = config.get('target_blob')

ERRORBIN_PROJECT = config.get('errorbin_project')
ERRORBIN_BUCKET = config.get('errorbin_bucket')
ERRORBIN_BLOB = config.get('errorbin_blob')
def main(run_date):
    # keep a record of key data items so we can log what we've done
    with Auditor(JOB_NAME, r'../../config/va_auditor.yaml') as auditor:
        graph = load_cmdb_graph(JOB_NAME, SOURCE_PROJECT, SOURCE_BUCKET,
                                CMDB_GRAPH_BLOB)
        cat_a = gt.search_nodes(graph, {'cbp_category': 'Tier A'})

        # set up a temp file for saving to
        # set the auditor to automatically track the written records
        temp_file = dt.temp_file(JOB_NAME, auditor)

        # the main processing loop
        # walk the graph from each cat A application
        # (illustrative sketches of the graph_tools helpers used here follow this function)
        output = []
        for application in cat_a.nodes():
            app = graph.nodes()[application]
            app_graph = gt.walk_from(graph, [application], depth=1, reverse=True)
            for app_server in gt.select_nodes_by_type(app_graph, 'ci_server'):
                server = graph.nodes()[app_server]
                svr_graph = gt.walk_from(graph, [app_server], depth=1, reverse=True)

                has_ip = False
                for ip_addr in gt.select_nodes_by_type(svr_graph, 'ip_address'):
                    ip = graph.nodes()[ip_addr]
                    has_ip = True
                    line = {
                        'APPLICATION': app.get('display_name'),
                        'HOST_NAME': server.get('display_name'),
                        'DNS': server.get('dns_domain'),
                        'FQDN': server.get('fqdn'),
                        'CMDB_OS': server.get('os'),
                        'CMDB_OS_VERSION': server.get('os_version'),
                        'ENVIRONMENT': server.get('environment'),
                        'IP': ip.get('display_name')
                    }
                    output.append(line)
                    temp_file.write_json_line(line)

                # pick up the entries that don't have an IP address
                if not has_ip:
                    line = {
                        'APPLICATION': app.get('display_name'),
                        'HOST_NAME': server.get('display_name'),
                        'DNS': server.get('dns_domain'),
                        'FQDN': server.get('fqdn'),
                        'CMDB_OS': server.get('os'),
                        'CMDB_OS_VERSION': server.get('os_version'),
                        'ENVIRONMENT': server.get('environment'),
                        'IP': None
                    }
                    output.append(line)
                    temp_file.write_json_line(line)

        blob_name = TARGET_BLOB.replace('%date', '%Y-%m-%d')
        blob_name = run_date.strftime(blob_name)
        temp_file.save_to_bucket(TARGET_PROJECT, TARGET_BUCKET, blob_name)

        csv_blob_name = TARGET_CSV_BLOB.replace('%date', '%Y-%m-%d')
        csv_blob_name = run_date.strftime(csv_blob_name)

        # json_lists.save_as_csv() expects a filename rather than a file object, and NamedTemporaryFile
        # returns an object, so the CSV-writing logic is reused inline here.
        with tempfile.NamedTemporaryFile(mode='w', encoding='utf8',
                                         newline='') as temp_csv_file:
            record = output[0]
            columns = record.keys()
            csv_file = csv.DictWriter(temp_csv_file, fieldnames=columns)
            csv_file.writeheader()
            for record in output:
                record = jl.select_record_fields(record, columns)
                csv_file.writerow(record)
            # make sure the buffered rows hit disk before the file is re-read by name
            temp_csv_file.flush()
            dt.save_file_to_bucket(temp_csv_file.name, TARGET_PROJECT,
                                   TARGET_BUCKET, csv_blob_name)
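# Illustrative only: gt.search_nodes(), gt.walk_from() and gt.select_nodes_by_type() come from the local
# graph_tools module, which is not reproduced here. As a rough indication of the kind of node filtering the
# walk above relies on, here are minimal stand-ins assuming a networkx graph; the attribute name 'type' and
# the subgraph semantics are assumptions about that module, not its actual implementation.
def search_nodes_sketch(graph, attrs):
    """Hypothetical: subgraph of nodes whose attributes contain all the requested key/value pairs."""
    matching = [n for n, data in graph.nodes(data=True)
                if all(data.get(k) == v for k, v in attrs.items())]
    return graph.subgraph(matching)


def select_nodes_by_type_sketch(graph, node_type):
    """Hypothetical: list the node ids whose 'type' attribute matches node_type."""
    return [n for n, data in graph.nodes(data=True) if data.get('type') == node_type]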