def getRoutingFileForDate(routing_date, files_path, DEBUG):
    db_handler = DBHandler('')
    available_routing_files = db_handler.getPathsToRoutingFilesForDate(
        routing_date)
    db_handler.close()

    routing_file = ''

    if 'bgprib.mrt' in available_routing_files:
        routing_file = available_routing_files['bgprib.mrt']
    elif 'dmp.gz' in available_routing_files and\
        'v6.dmp.gz' in available_routing_files:
        # If there is no bgprib.mrt file available, but the two dmp files
        # (for v4 and v6) are available, I use them
        dmp_file = available_routing_files['dmp.gz']
        readable_dmp = BGPDataHandler.getReadableFile(
            dmp_file, False, files_path, DEBUG)
        v6dmp_file = available_routing_files['v6.dmp.gz']
        readable_v6dmp = BGPDataHandler.getReadableFile(
            v6dmp_file, False, files_path, DEBUG)
        routing_file = BGPDataHandler.concatenateFiles(
            '{}/{}_v4andv6.dmp.readable'.format(files_path, routing_date),
            readable_dmp, readable_v6dmp)
    elif 'dmp.gz' in available_routing_files:
        # If there is only one of the dmp files available, I will work with it
        routing_file = available_routing_files['dmp.gz']
    elif 'v6.dmp.gz' in available_routing_files:
        routing_file = available_routing_files['v6.dmp.gz']

    return routing_file
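def _exampleGetRoutingFileForDate():
    # A minimal usage sketch, not part of the original script. It assumes the
    # DB behind DBHandler is reachable and that the archive index table is
    # populated; the date and the destination folder below are made up for
    # illustration. The fallback order is the one implemented above:
    # bgprib.mrt first, then the concatenated v4+v6 dmp files, then a single
    # dmp file; an empty string means no routing file was found for the date.
    import sys
    from datetime import date
    routing_file = getRoutingFileForDate(date(2016, 1, 1),
                                         '/tmp/BGP_files', False)
    if routing_file == '':
        sys.stderr.write('No routing files available for that date.\n')
    else:
        sys.stdout.write('Will work with {}\n'.format(routing_file))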
def data(self, NomLogique=None, DataType=None):
    if NomLogique is None or DataType is None:
        return "{}"
    else:
        db_handler = DBHandler(self.db_name)
        dds_data_dict = format_to_dict(
            db_handler.get_dds_data_list(NomLogique, DataType))
        for key in dds_data_dict:
            dds_data_dict[key]["dataType"] = DataType
            try:
                idObjet = dds_data_dict[key]["idObjet"]
                idInfo = dds_data_dict[key]["idInfo"]
                dds_data_dict[key]["libelle"] = self.code_to_libelle_dict[
                    self.nom_to_type_dict[idObjet]][DataType][idInfo]
            except KeyError:
                # Unknown object/info code: fall back to a generic label
                # ("Inconnu" = "Unknown")
                dds_data_dict[key]["libelle"] = "Inconnu"
        dds_data_json = format_to_json(dds_data_dict)
        db_handler.close()
        return dds_data_json
def descobjet(self):
    db_handler = DBHandler(self.db_name)
    raw_descobjet_dict = format_to_dict(db_handler.get_descobjet_list())

    # Specific format for DescObjet
    descobjet_dict = defaultdict(
        lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(str))))

    for obj in raw_descobjet_dict:
        groupe = raw_descobjet_dict[obj]["idGroupe"]
        espace = raw_descobjet_dict[obj]["idEspace"]
        libelle = raw_descobjet_dict[obj]["libelle"]
        nom = raw_descobjet_dict[obj]["nom"]
        obj_type = raw_descobjet_dict[obj]["type"]
        descobjet_dict[groupe][espace][libelle]["nom"] = nom
        descobjet_dict[groupe][espace][libelle]["type"] = obj_type
        self.nom_to_type_dict[nom] = obj_type

    descobjet_json = json.dumps(descobjet_dict, sort_keys=True, indent=4)
    db_handler.close()
    return descobjet_json
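def _exampleNestedDefaultdict():
    # A minimal sketch, not part of the original code, of why descobjet()
    # builds a four-level defaultdict: intermediate levels are created on
    # first access, so a deep assignment needs no explicit initialization,
    # and json.dumps serializes the result like plain nested dicts.
    from collections import defaultdict
    import json
    d = defaultdict(
        lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(str))))
    d['groupe1']['espace1']['libelle1']['nom'] = 'objet1'
    print json.dumps(d, sort_keys=True, indent=4)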
db_routing_data_v4_dates = set(
    db_handler.getListOfDatesFromArchiveIndex_v4Only())
db_routing_data_v6_dates = set(
    db_handler.getListOfDatesFromArchiveIndex_v6Only())
db_routing_data_v4andv6_dates = set(
    db_handler.getListOfDatesFromArchiveIndex_v4andv6())

# A date has complete routing data if it appears in the v4andv6 set,
# or in both the v4-only and the v6-only sets.
missing_routing = complete_dates_set - db_routing_data_v4andv6_dates.union(
    db_routing_data_v4_dates.intersection(db_routing_data_v6_dates))
missing_routing_v4 = missing_routing - db_routing_data_v6_dates
missing_routing_v6 = missing_routing - db_routing_data_v4_dates

db_updates_dates = set(db_handler.getListOfDatesForUpdates())
missing_updates = complete_dates_set - db_updates_dates

db_handler.close()

print "Dates missing in the DB"
print "{} dates missing for prefixes.".format(len(missing_pref))
print missing_pref
print "{} dates missing for ASes.".format(len(missing_ASes))
print missing_ASes
print "{} dates missing for routing data.".format(len(missing_routing))
print missing_routing
print "{} dates missing for v4 routing data.".format(len(missing_routing_v4))
print missing_routing_v4
print "{} dates missing for v6 routing data.".format(len(missing_routing_v6))
print missing_routing_v6
print "{} dates missing for updates.".format(len(missing_updates))
print missing_updates
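def _exampleMissingDatesSetArithmetic():
    # A worked sketch (made-up dates, not from the archive) of the set
    # arithmetic above: a date counts as covered if it is in the v4andv6 set,
    # or in both the v4-only and v6-only sets; every other date in the
    # complete set is missing.
    complete_dates_set = {'20170101', '20170102', '20170103', '20170104'}
    v4_dates = {'20170101', '20170102'}
    v6_dates = {'20170101', '20170103'}
    v4andv6_dates = {'20170104'}
    missing = complete_dates_set - v4andv6_dates.union(
        v4_dates.intersection(v6_dates))
    print missing             # set(['20170102', '20170103'])
    print missing - v6_dates  # missing dates with no v6 data: set(['20170102'])
    print missing - v4_dates  # missing dates with no v4 data: set(['20170103'])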
def printUsage():
    print 'Usage: {} -h | -t <visibility/routing> (-A <archive folder> -n <process number> | -f <readable routing file>) [-D]'.format(sys.argv[0])
    print "t: Data type. Type of data to be inserted into the DB."
    print "Visibility -> To insert the dates during which prefixes, origin ASes and middle ASes were seen in the routing table."
    print "Routing -> To insert into the archive_index table the list of rows in the BGP routing table for the available dates."
    print "Visibility will be used by default."
    print "A: Provide the path to the folder containing historical routing data."
    print "AND"
    print "n: Provide a process number from 1 to 5, which allows the script to process a specific subset of the available files so that different scripts can process different files."
    print "OR"
    print "f: Provide the path to a routing file."
    print "D: DEBUG mode"


def main(argv):
    routing_file = ''
    readables_path = ''
    archive_folder = '/data/wattle/bgplog'
    proc_num = -1
    data_type = 'visibility'
    DEBUG = False

    try:
        opts, args = getopt.getopt(argv, "ht:A:f:n:D",
                                   ['data_type=', 'archive_folder=',
                                    'procNumber=', 'routingFile='])
    except getopt.GetoptError:
        printUsage()
        sys.exit()

    for opt, arg in opts:
        if opt == '-h':
            printUsage()
            sys.exit()
        elif opt == '-t':
            data_type = arg
            if data_type != 'visibility' and data_type != 'routing':
                print "Wrong data type! You MUST choose between 'visibility' and 'routing'."
                sys.exit(-1)
        elif opt == '-A':
            archive_folder = os.path.abspath(arg)
        elif opt == '-n':
            try:
                proc_num = int(arg)
            except ValueError:
                print "The process number MUST be a number."
                sys.exit(-1)
        elif opt == '-f':
            routing_file = os.path.abspath(arg)
        elif opt == '-D':
            DEBUG = True
        else:
            assert False, 'Unhandled option'

    if proc_num == -1:
        if routing_file == '':
            print "If you don't provide the path to a routing file you MUST provide a process number."
            sys.exit(-1)
        else:
            # Each process number covers a fixed range of years, so that
            # different instances of the script work on disjoint subsets
            # of the archive.
            file_date = BGPDataHandler.getDateFromFileName(routing_file)
            if file_date.year in [2007, 2008, 2009]:
                proc_num = 1
            elif file_date.year in [2010, 2011]:
                proc_num = 2
            elif file_date.year in [2012, 2013]:
                proc_num = 3
            elif file_date.year in [2014, 2015]:
                proc_num = 4
            elif file_date.year in [2016, 2017]:
                proc_num = 5
            else:
                print "Routing file corresponds to date out of the considered range."
                sys.exit(-1)

    readables_path = '/home/sofia/BGP_stats_files/hist_part{}'.format(proc_num)
    files_path = '/home/sofia/BGP_stats_files/Visibility_Routing_CSVs/CSVs{}'.format(proc_num)
    output_file = '{}/CSVgeneration_{}_{}_{}.output'.format(
        files_path, data_type, proc_num, datetime.today().date())

    bgp_handler = BGPDataHandler(DEBUG, readables_path)

    if routing_file != '':
        generateFilesFromRoutingFile(files_path, routing_file, bgp_handler,
                                     data_type, dict(), output_file,
                                     archive_folder, DEBUG)
    else:
        db_handler = DBHandler()
        dates_ready = dict()

        if data_type == 'visibility':
            sys.stdout.write('Checking for dates already in the DB\n')
            existing_dates_pref = set(db_handler.getListOfDatesForPrefixes())
            for ex_date in existing_dates_pref:
                # We don't want to insert duplicated data, therefore we assume
                # that if the date is present in the prefixes table, all the
                # prefixes for that date, v4 and v6, have already been
                # inserted. After finishing with the bulk insertion, all the
                # dates need to be checked to determine if there is any
                # missing data.
                if ex_date not in dates_ready:
                    dates_ready[ex_date] = dict()
                if 'prefixes' not in dates_ready[ex_date]:
                    dates_ready[ex_date]['prefixes'] = defaultdict(bool)
                dates_ready[ex_date]['prefixes']['v4'] = True
                dates_ready[ex_date]['prefixes']['v6'] = True

            existing_dates_orASes = set(db_handler.getListOfDatesForOriginASes())
            for ex_date in existing_dates_orASes:
                if ex_date not in dates_ready:
                    dates_ready[ex_date] = dict()
                if 'originASes' not in dates_ready[ex_date]:
                    dates_ready[ex_date]['originASes'] = defaultdict(bool)
                dates_ready[ex_date]['originASes']['v4'] = True
                dates_ready[ex_date]['originASes']['v6'] = True

            existing_dates_midASes = set(db_handler.getListOfDatesForMiddleASes())
            for ex_date in existing_dates_midASes:
                if ex_date not in dates_ready:
                    dates_ready[ex_date] = dict()
                if 'middleASes' not in dates_ready[ex_date]:
                    dates_ready[ex_date]['middleASes'] = defaultdict(bool)
                dates_ready[ex_date]['middleASes']['v4'] = True
                dates_ready[ex_date]['middleASes']['v6'] = True

        elif data_type == 'routing':
            existing_dates_v4 = set(db_handler.getListOfDatesFromArchiveIndex_v4Only())
            for ex_date in existing_dates_v4:
                if ex_date not in dates_ready:
                    dates_ready[ex_date] = defaultdict(bool)
                dates_ready[ex_date]['routing_v4'] = True

            existing_dates_v6 = set(db_handler.getListOfDatesFromArchiveIndex_v6Only())
            for ex_date in existing_dates_v6:
                if ex_date not in dates_ready:
                    dates_ready[ex_date] = defaultdict(bool)
                dates_ready[ex_date]['routing_v6'] = True

            existing_dates_v4andv6 = set(db_handler.getListOfDatesFromArchiveIndex_v4andv6())
            for ex_date in existing_dates_v4andv6:
                if ex_date not in dates_ready:
                    dates_ready[ex_date] = defaultdict(bool)
                dates_ready[ex_date]['routing_v4andv6'] = True

        db_handler.close()

        sys.stdout.write('Checking for existing CSV files\n')
        dates_ready = getDatesOfExistingCSVs(files_path, data_type, dates_ready)

        sys.stdout.write('Starting to generate CSV files from readable files\n')
        dates_ready = generateFilesFromReadables(readables_path, data_type,
                                                 dates_ready, files_path,
                                                 bgp_handler, output_file,
                                                 archive_folder, DEBUG)

        sys.stdout.write('Starting to generate CSV files from bgprib.mrt files\n')
        dates_ready = generateFilesFromOtherRoutingFiles(
            archive_folder, data_type, dates_ready, files_path, bgp_handler,
            proc_num, 'bgprib.mrt', output_file, DEBUG)

        sys.stdout.write('Starting to generate CSV files from dmp.gz files\n')
        dates_ready = generateFilesFromOtherRoutingFiles(
            archive_folder, data_type, dates_ready, files_path, bgp_handler,
            proc_num, 'dmp.gz', output_file, DEBUG)

        completeDatesSet = getCompleteDatesSet(proc_num)

        with open(output_file, 'a') as output:
            if data_type == 'visibility':
                output.write('Dates that are not in the prefixes or in the asns tables in the DB and for which some of the CSV files were not created.\n')
                for ex_date in completeDatesSet:
                    if ex_date not in dates_ready:
                        output.write('Visibility data not ready for date {}\n'.format(ex_date))
                    else:
                        for item in ['prefixes', 'originASes', 'middleASes']:
                            if item not in dates_ready[ex_date]:
                                output.write('Visibility data for {} not ready for date {}\n'.format(item, ex_date))
                            else:
                                for v in ['v4', 'v6']:
                                    if not dates_ready[ex_date][item][v]:
                                        output.write('Visibility data for {} coming from {} file not ready for date {}.\n'.format(item, v, ex_date))
            elif data_type == 'routing':
                output.write('Dates that are not in the archive_index table in the DB and for which some of the CSV files were not created.\n')
                for ex_date in completeDatesSet:
                    if ex_date not in dates_ready:
                        output.write('Routing data about v4 prefixes not ready for date {}\n'.format(ex_date))
                        output.write('Routing data about v6 prefixes not ready for date {}\n'.format(ex_date))
                    else:
                        if not dates_ready[ex_date]['routing_v4']:
                            output.write('Routing data about v4 prefixes not ready for date {}\n'.format(ex_date))
                        if not dates_ready[ex_date]['routing_v6']:
                            output.write('Routing data about v6 prefixes not ready for date {}\n'.format(ex_date))

        sys.stdout.write('Finished generating CSV files. Output file {} created.\n'.format(output_file))
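def _exampleDatesReadyStructure():
    # A minimal sketch, not part of the original script, of the bookkeeping
    # structure main() builds. For 'visibility', each date maps per item
    # ('prefixes', 'originASes', 'middleASes') to a defaultdict(bool), so the
    # report loop can read unseen v4/v6 flags as False instead of raising
    # KeyError; for 'routing', the per-date dict itself is a defaultdict(bool).
    from collections import defaultdict
    dates_ready = dict()
    ex_date = '20170101'
    dates_ready[ex_date] = {'prefixes': defaultdict(bool)}
    dates_ready[ex_date]['prefixes']['v4'] = True
    print dates_ready[ex_date]['prefixes']['v4']  # True
    print dates_ready[ex_date]['prefixes']['v6']  # False: not inserted yet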
def generateCSVFromUpdatesFile(updates_file, files_path, readables_path,
                               DEBUG, output_file):
    sys.stdout.write(
        'Starting to generate CSV file from {}\n'.format(updates_file))

    db_handler = DBHandler('')
    file_already_exists = db_handler.checkIfUpdatesFileExists(
        updates_file, BGPDataHandler.getDateFromFileName(updates_file).year)
    db_handler.close()

    if file_already_exists:
        return ''

    filename = updates_file.split('/')[-1]
    csv_file = '{}/{}.csv'.format(files_path, filename)

    if os.path.exists(csv_file):
        with open(output_file, 'a') as output:
            output.write(
                'CSV file for updates file {} already exists.\n'.format(
                    updates_file))
        return 'already_existed'

    if updates_file.endswith('log.gz'):
        unzipped_file = '{}/{}'.format(files_path, filename[:-3])
        if not os.path.exists(unzipped_file):
            with gzip.open(updates_file, 'rb') as gzip_file,\
                open(unzipped_file, 'wb') as unzipped:
                try:
                    unzipped.write(gzip_file.read())
                except IOError:
                    with open(output_file, 'a') as output:
                        output.write('IOError unzipping file {}\n'.format(
                            updates_file))
                    return ''

        # Equivalent to: grep debugging <unzipped_file> | grep rcvd
        filtered_file = '{}.filtered'.format(unzipped_file)
        if not os.path.exists(filtered_file):
            with open(filtered_file, 'w') as filtered:
                cmd = shlex.split('grep debugging {}'.format(unzipped_file))
                p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
                cmd2 = shlex.split('grep rcvd')
                p2 = subprocess.Popen(cmd2, stdin=p.stdout, stdout=filtered)
                # Close our copy of the pipe so the first grep receives
                # SIGPIPE if the second one exits early.
                p.stdout.close()
                p2.communicate()

        announcements_file = '{}.announcements'.format(unzipped_file)
        if not os.path.exists(announcements_file):
            with open(announcements_file, 'w') as announcements_f:
                cmd = shlex.split('grep -v withdrawn {}'.format(filtered_file))
                p = subprocess.Popen(cmd, stdout=announcements_f)
                p.communicate()

        withdrawals_file = '{}.withdrawals'.format(unzipped_file)
        if not os.path.exists(withdrawals_file):
            with open(withdrawals_file, 'w') as withdrawals_f:
                cmd = shlex.split('grep withdrawn {}'.format(filtered_file))
                p = subprocess.Popen(cmd, stdout=withdrawals_f)
                p.communicate()

        # Withdrawal lines look like:
        # 2015/08/01 00:01:31 debugging: BGP: 202.12.28.1 rcvd UPDATE about 199.60.233.0/24 -- withdrawn
        # We first get a TextFileReader to read the file in chunks
        # (in case it is too big)
        withdrawals_reader = pd.read_csv(withdrawals_file, iterator=True,
                                         chunksize=1000, header=None, sep=' ',
                                         index_col=False, usecols=[0, 1, 4, 8],
                                         names=['update_date', 'update_time',
                                                'bgp_neighbor', 'prefix'])
        # We then put the chunks into a single DataFrame
        withdrawals_df = pd.concat(withdrawals_reader, ignore_index=True)
        withdrawals_df['upd_type'] = 'W'
        # Withdrawal lines do not carry the peer AS
        withdrawals_df['peerAS'] = -1
        withdrawals_df['source_file'] = updates_file
        withdrawals_df.to_csv(csv_file, header=False, index=False, quoting=2,
                              columns=['update_date', 'update_time',
                                       'upd_type', 'bgp_neighbor', 'peerAS',
                                       'prefix', 'source_file'])

        with open(announcements_file, 'r') as announcements_f,\
            open(csv_file, 'a') as csv_f:
            update_date = ''
            update_time = ''
            bgp_neighbor = ''
            peerAS = -1
            prefixes = []

            for line in announcements_f:
                if 'flapped' in line:
                    continue
                line_parts = line.strip().split()

                # A new announcement starts with an UPDATE line carrying the
                # attributes, followed by one line per announced prefix:
                # 2015/08/01 00:01:26 debugging: BGP: 64.71.180.177 rcvd UPDATE w/ attr: nexthop 64.71.180.177, origin i, path 6939 3491 12389 57617
                # 2015/08/01 00:01:26 debugging: BGP: 64.71.180.177 rcvd 91.106.234.0/24
                # 2015/08/01 00:01:26 debugging: BGP: 64.71.180.177 rcvd 37.1.77.0/24
                # 2015/08/01 00:01:26 debugging: BGP: 64.71.180.177 rcvd 37.1.64.0/20
                # 2015/08/01 00:01:26 debugging: BGP: 64.71.180.177 rcvd 91.106.232.0/21
                if 'UPDATE' in line:
                    # If we were processing another announcement, we write it
                    # to the csv file
                    if len(prefixes) > 0:
                        for prefix in prefixes:
                            csv_f.write('"{}","{}","{}","{}",{},"{}","{}"\n'\
                                        .format(update_date, update_time, 'A',
                                                bgp_neighbor, peerAS, prefix,
                                                updates_file))
                    update_date = line_parts[0]
                    update_time = line_parts[1]
                    bgp_neighbor = line_parts[4]
                    if 'path' in line:
                        peerAS = line.split('path')[1].split()[0]
                        if '.' in peerAS:
                            # 4-byte AS number in asdot notation:
                            # convert it to asplain
                            left, right = peerAS.split('.')
                            peerAS = int(left) * 65536 + int(right)
                        else:
                            peerAS = int(peerAS)
                    else:
                        peerAS = -1
                    prefixes = []
                else:
                    prefixes.append(line_parts[6].replace('...duplicate', ''))

            # We have to write to the csv file the last announcement
            if len(prefixes) > 0:
                for prefix in prefixes:
                    csv_f.write('"{}","{}","{}","{}",{},"{}","{}"\n'\
                                .format(update_date, update_time, 'A',
                                        bgp_neighbor, peerAS, prefix,
                                        updates_file))

        os.remove(unzipped_file)
        os.remove(filtered_file)
        os.remove(announcements_file)
        os.remove(withdrawals_file)

    elif updates_file.endswith('bgpupd.mrt'):
        readable_file = BGPDataHandler.getReadableFile(updates_file, False,
                                                       readables_path, DEBUG)

        readable_woSTATE = '{}.woSTATE'.format(readable_file)
        if not os.path.exists(readable_woSTATE):
            with open(readable_woSTATE, 'w') as woSTATE:
                cmd = shlex.split('grep -v STATE {}'.format(readable_file))
                p = subprocess.Popen(cmd, stdout=woSTATE)
                p.communicate()

        readable_announcements = '{}.announcements'.format(readable_file)
        if not os.path.exists(readable_announcements):
            with open(readable_announcements, 'w') as announcements:
                cmd = shlex.split('grep \'|A|\' {}'.format(readable_woSTATE))
                p = subprocess.Popen(cmd, stdout=announcements)
                p.communicate()

        announcements_df = getDF(readable_announcements, 'A', updates_file)

        readable_withdrawals = '{}.withdrawals'.format(readable_file)
        if not os.path.exists(readable_withdrawals):
            with open(readable_withdrawals, 'w') as withdrawals:
                cmd = shlex.split('grep \'|W|\' {}'.format(readable_woSTATE))
                p = subprocess.Popen(cmd, stdout=withdrawals)
                p.communicate()

        withdrawals_df = getDF(readable_withdrawals, 'W', updates_file)

        updates_df = pd.concat([announcements_df, withdrawals_df])
        updates_df.to_csv(csv_file, header=False, index=False, quoting=2,
                          columns=['update_date', 'update_time', 'upd_type',
                                   'bgp_neighbor', 'peerAS', 'prefix',
                                   'source_file'])

        os.remove(readable_file)
        os.remove(readable_woSTATE)
        os.remove(readable_announcements)
        os.remove(readable_withdrawals)

    return csv_file
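def _exampleAsdotToAsplain(asdot_str):
    # A hedged sketch (this helper name is ours, not the original author's)
    # of the conversion done above for 4-byte AS numbers written in asdot
    # notation (RFC 5396): 'X.Y' maps to X * 65536 + Y in asplain.
    if '.' in asdot_str:
        left, right = asdot_str.split('.')
        return int(left) * 65536 + int(right)
    return int(asdot_str)

# For instance, _exampleAsdotToAsplain('3.4') == 3 * 65536 + 4 == 196612.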
def loadUpdatesDFs(self, updates_date):
    db_handler = DBHandler('')
    self.updates_prefixes = db_handler.getUpdatesDF_prefix(updates_date)
    self.updates_peerASes = db_handler.getUpdatesDF_peerAS(updates_date)
    db_handler.close()
    return True
def loadStructuresFromArchive(self, routing_date=''):
    if routing_date == '':
        db_handler = DBHandler('')
        routing_files = db_handler.getPathsToMostRecentRoutingFiles()
        db_handler.close()
        if len(routing_files) == 0:
            sys.stderr.write("There are no files in the archive!\n")
            return False
    else:
        db_handler = DBHandler('')
        routing_files = db_handler.getPathsToRoutingFilesForDate(routing_date)
        db_handler.close()
        if len(routing_files) == 0:
            sys.stderr.write(
                "There are no paths to routing files for the date provided in the DB.\n")
            return False

    bgprib_file = []
    dmp_files = []
    for extension in routing_files:
        if extension == 'bgprib.mrt':
            bgprib_file.append(routing_files[extension])
            break
        else:
            # extension == 'dmp.gz' or extension == 'v6.dmp.gz'
            dmp_files.append(routing_files[extension])

    # If a bgprib file is available, we use it
    if len(bgprib_file) > 0:
        routing_files = bgprib_file
    # If not, we use the dmp files
    else:
        routing_files = dmp_files

    files_date, bgp_df, ipv4Prefixes_radix, ipv6Prefixes_radix,\
        ipv4_longest_pref, ipv6_longest_pref =\
        BGPDataHandler.processMultipleRoutingFiles(routing_files, True, False,
                                                   self.files_path, self.DEBUG)

    # The Unix epoch (1970-01-01) is used as a sentinel for
    # "no valid date could be determined for the files".
    aux_date = datetime.strptime('1970', '%Y').date()

    routing_loaded = False
    updates_loaded = False

    if files_date != aux_date and files_date is not None:
        routing_loaded = True
        self.routingDate = files_date
        updates_loaded = self.loadUpdatesDFs(files_date)

        if bgp_df.shape[0] != 0:
            self.bgp_df = bgp_df
        if len(ipv4Prefixes_radix.prefixes()) != 0:
            self.ipv4Prefixes_radix = ipv4Prefixes_radix
        if len(ipv6Prefixes_radix.prefixes()) != 0:
            self.ipv6Prefixes_radix = ipv6Prefixes_radix
        if ipv4_longest_pref != -1:
            self.ipv4_longest_pref = ipv4_longest_pref
        else:
            self.ipv4_longest_pref = 32
        if ipv6_longest_pref != -1:
            self.ipv6_longest_pref = ipv6_longest_pref
        else:
            self.ipv6_longest_pref = 64

    if routing_loaded and updates_loaded:
        sys.stdout.write("Class data structures were loaded successfully!\n")
        return True
    else:
        return False
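def _exampleEpochSentinel():
    # A minimal sketch, not part of the original class, of the sentinel check
    # above: datetime.strptime('1970', '%Y').date() is simply 1970-01-01,
    # the Unix epoch, used as a "no valid date" marker.
    from datetime import datetime, date
    aux_date = datetime.strptime('1970', '%Y').date()
    print aux_date == date(1970, 1, 1)  # True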