def get_peers(clctr,ym,rib_location): # should end with .bz2/.gz print rib_location loc = hdname + 'archive.routeviews.org/' peer_num_list = open(loc + 'peer_num', 'a') peers = [] # get .txt if os.path.exists(rib_location+'.txt.gz'): # .xx.txt.gz file exists subprocess.call('gunzip '+rib_location+'.txt.gz', shell=True) # unpack elif os.path.exists(rib_location): # .bz2/.gz file exists cmlib.parse_mrt(rib_location, rib_location+'.txt') os.remove(rib_location) # then remove .bz2/.gz # read .txt with open(rib_location+'.txt', 'r') as f: # get peers from RIB for line in f: try: addr = line.split('|')[3] if addr not in peers: peers.append(addr) except: pass f.close() # compress RIB into .gz if not os.path.exists(rib_location+'.txt.gz'): cmlib.pack_gz(rib_location+'.txt') peer_num = len(peers) peer_num_list.write(clctr + ' ' + ym + ' ' + str(peer_num) + '\n') return peers
def parse_updates(sdate, cl_name): flist = open(hdname+'metadata/'+sdate+'/updt_filelist_'+cl_name, 'r') # .xx.txt.gz file name for line in flist: line = line.replace('\n', '') if not os.path.exists(line): # xx.txt.gz not exists, .bz2/.gz exists print line cmlib.parse_mrt(line.replace('.txt.gz', ''), line.replace('txt.gz', 'txt')) cmlib.pack_gz(line.replace('txt.gz', 'txt')) os.remove(line.replace('.txt.gz', '')) # remove .bz2/.gz update files else: # xx.txt.gz exists pass flist.close()
def parse_update_files(listfile):  # all update files from one collectors/list
    """Parse and re-pack every update file named in *listfile*.

    Each line of *listfile* is '<name>.txt.gz|<size>'. For entries whose
    parsed .txt.gz does not yet exist under datadir, the raw MRT dump is
    parsed into .txt and then gzipped. Always returns 0.

    NOTE(review): this function is redefined later in this file with an
    identical (auto-formatted) body; this earlier definition is shadowed
    at import time — one of the two copies should be removed.
    """
    flist = open(listfile, 'r')
    for line in flist:
        line = line.rstrip('\n')
        # field 1 is the claimed file size, passed to the MRT parser
        fsize = float(line.split('|')[1])
        print 'fsize=',fsize
        line = line.split('|')[0].replace('.txt.gz', '')  # get the original .bz2/gz file name
        if not os.path.exists(datadir+line+'.txt.gz'):
            cmlib.parse_mrt(datadir+line, datadir+line+'.txt', fsize)  # .bz2/gz => .bz2/gz.txt
            cmlib.pack_gz(datadir+line+'.txt')  # .bz2/gz.txt => .bz2/gz.txt.gz
            #os.remove(datadir+line)  # remove the original .bz2/.gz file
        else:
            print 'Parsed file exists'
            print datadir+line+'.txt.gz'
            pass
    flist.close()
    return 0
def parse_update_files(listfile): # all update files from one collectors/list flist = open(listfile, 'r') for line in flist: line = line.rstrip('\n') fsize = float(line.split('|')[1]) print 'fsize=', fsize line = line.split('|')[0].replace( '.txt.gz', '') # get the original .bz2/gz file name if not os.path.exists(datadir + line + '.txt.gz'): cmlib.parse_mrt(datadir + line, datadir + line + '.txt', fsize) # .bz2/gz => .bz2/gz.txt cmlib.pack_gz(datadir + line + '.txt') # .bz2/gz.txt => .bz2/gz.txt.gz #os.remove(datadir+line) # remove the original .bz2/.gz file else: print 'Parsed file exists' print datadir + line + '.txt.gz' pass flist.close() return 0
def get_pfx2as_trie(self):
    """Build a patricia trie mapping binary prefixes to origin AS numbers.

    For self.sdate >= 20050509 the mapping comes from a CAIDA-style
    pfx2as file under datadir/support/<sdate>/ (downloaded via
    self.get_pfx2as_file()); for earlier dates it is extracted from a
    route-views2 RIB of the same month. Origins that cannot be parsed
    as int are stored as -1.
    """
    print 'Calculating prefix to AS number trie...'
    pfx2as = patricia.trie(None)
    if int(self.sdate) >= 20050509:
        self.get_pfx2as_file()
        # Locate the downloaded pfx2as file in the support directory.
        pfx2as_file = ''
        tmp = os.listdir(datadir+'support/'+self.sdate+'/')
        for line in tmp:
            if 'pfx2as' in line:
                pfx2as_file = line
                break
        f = open(datadir+'support/'+self.sdate+'/'+pfx2as_file)
        for line in f:
            line = line.rstrip('\n')
            attr = line.split()  # expected: prefix, length, origin AS
            # skip AS-set ('_') and multi-origin (',') entries
            if '_' in attr[2] or ',' in attr[2]:
                continue
            pfx = cmlib.ip_to_binary(attr[0]+'/'+attr[1], '0.0.0.0')
            try:
                pfx2as[pfx] = int(attr[2])  # pfx: origin AS
            except:  # When will this happen?
                pfx2as[pfx] = -1
        f.close()
    else:
        # Extract info from RIB of the monitor route-views2
        mydate = self.sdate[0:4] + '.' + self.sdate[4:6]
        rib_location = datadir+'routeviews.org/bgpdata/'+mydate+'/RIBS/'
        dir_list = os.listdir(datadir+'routeviews.org/bgpdata/'+mydate+'/RIBS/')
        # take the first non-hidden file in the month directory
        for f in dir_list:
            if not f.startswith('.'):
                rib_location = rib_location + f  # if RIB is of the same month. That's OK.
                break
        # Normalize the RIB to an unpacked .txt file.
        if rib_location.endswith('txt.gz'):
            subprocess.call('gunzip '+rib_location, shell=True)  # unpack
            rib_location = rib_location.replace('.txt.gz', '.txt')
        elif not rib_location.endswith('txt'):  # .bz2/.gz file exists
            cmlib.parse_mrt(rib_location, rib_location+'.txt')
            os.remove(rib_location)  # then remove .bz2/.gz
            rib_location = rib_location + '.txt'
        # now rib file definitely ends with .txt, let's rock and roll
        with open(rib_location, 'r') as f:
            for line in f:
                try:
                    tmp = line.split('|')[5]   # field 5: prefix
                    pfx = cmlib.ip_to_binary(tmp, '0.0.0.0')
                    ASlist = line.split('|')[6]  # field 6: AS path
                    originAS = ASlist.split()[-1]  # last hop = origin
                    try:
                        pfx2as[pfx] = int(originAS)
                    except:  # e.g. AS set notation: record as unknown
                        pfx2as[pfx] = -1
                except:  # malformed/short line: best-effort skip
                    pass
            f.close()
        # compress RIB into .gz
        if not os.path.exists(rib_location+'.gz'):
            cmlib.pack_gz(rib_location)
    return pfx2as
def download_one_rib(self, my_date): tmp_month = my_date[0:4] + '.' + my_date[4:6] if self.co.startswith('rrc'): web_location = rrc_root + self.co + '/' + tmp_month + '/' else: web_location = rv_root + self.co + '/bgpdata/' + tmp_month + '/RIBS/' web_location = web_location.replace('//', '/') webraw = cmlib.get_weblist('http://' + web_location) cmlib.make_dir(datadir+web_location) #---------------------------------------------------------------- # select a RIB file with reasonable (not strange) file size rib_list = webraw.split('\n') filter(lambda a: a != '', rib_list) filter(lambda a: a != '\n', rib_list) rib_list = [item for item in rib_list if 'rib' in item or 'bview' in item] sizelist = list() for line in rib_list: size = line.split()[-1] fsize = cmlib.parse_size(size) sizelist.append(fsize) avg = np.mean(sizelist) target_line = None # stores the RIB file for downloading largest_line = None max = -1 closest = 99999 for line in rib_list: fdate = line.split()[0].split('.')[-3] size = line.split()[-1] fsize = cmlib.parse_size(size) if fsize > max: max = fsize largest_line = line diff = abs(int(fdate)-int(my_date)) # >0 # XXX logic here not clear (but seems effective) if diff <= closest and fsize > 0.9 * avg and fsize < 1.1 * avg: target_line = line closest = diff if target_line is None: assert largest_line is not None print 'Failed. Resort to downloading the largest RIB...' 
target_line = largest_line # work-around for a special case print 'Selected RIB:', target_line size = target_line.split()[-1] # claimed RIB file size fsize = cmlib.parse_size(size) filename = target_line.split()[0] full_loc = datadir + web_location + filename # .bz2/.gz if os.path.exists(full_loc+'.txt'): # only for clearer logic os.remove(full_loc+'.txt') #------------------------------------------------------------------ # Download the RIB if os.path.exists(full_loc+'.txt.gz'): print 'existed size & original size:',os.path.getsize(full_loc+'.txt.gz'),fsize if os.path.getsize(full_loc+'.txt.gz') > 0.6 * fsize: # 0.6 is good enough return full_loc+'.txt.gz' # Do not download else: os.remove(full_loc+'.txt.gz') # too small to be complete if os.path.exists(full_loc): if os.path.getsize(full_loc) <= 0.95 * fsize: os.remove(full_loc) else: # Good! cmlib.parse_mrt(full_loc, full_loc+'.txt', fsize) cmlib.pack_gz(full_loc+'.txt') return full_loc+'.txt.gz' cmlib.force_download_file('http://'+web_location, datadir+web_location, filename) cmlib.parse_mrt(full_loc, full_loc+'.txt', fsize) cmlib.pack_gz(full_loc+'.txt') os.remove(full_loc) # remove the original file return full_loc+'.txt.gz'
def download_one_rib_before_unix(self, my_date, unix): # my_date for deciding month tmp_month = my_date[0:4] + '.' + my_date[4:6] if self.co.startswith('rrc'): web_location = rrc_root + self.co + '/' + tmp_month + '/' else: web_location = rv_root + self.co + '/bgpdata/' + tmp_month + '/RIBS/' web_location = web_location.replace('//', '/') try: webraw = cmlib.get_weblist('http://' + web_location) print 'Getting list from ' + 'http://' + web_location except: return -1 cmlib.make_dir(datadir+web_location) #---------------------------------------------------------------- # select a RIB file right before the unix and with reasonable (not strange) file size rib_list = webraw.split('\n') filter(lambda a: a != '', rib_list) filter(lambda a: a != '\n', rib_list) rib_list = [item for item in rib_list if 'rib' in item or 'bview' in item] sizelist = list() for line in rib_list: size = line.split()[-1] fsize = cmlib.parse_size(size) sizelist.append(fsize) avg = np.mean(sizelist) ok_rib_list = list() # RIBs whose size is OK for line in rib_list: fsize = cmlib.parse_size(line.split()[-1]) if fsize > 0.9 * avg: ok_rib_list.append(line) target_line = None # the RIB closest to unix min = 9999999999 for line in ok_rib_list: fdate = line.split()[0].split('.')[-3] ftime = line.split()[0].split('.')[-2] dtstr = fdate+ftime objdt = datetime.datetime.strptime(dtstr, '%Y%m%d%H%M') runix = time_lib.mktime(objdt.timetuple()) + 8*60*60 # F**k! Time zone! 
print objdt, runix, unix if runix <= unix and unix-runix < min: min = unix-runix print 'min changed to ', min target_line = line print 'Selected RIB:', target_line if target_line == None: return -1 size = target_line.split()[-1] # claimed RIB file size fsize = cmlib.parse_size(size) filename = target_line.split()[0] full_loc = datadir + web_location + filename # .bz2/.gz if os.path.exists(full_loc+'.txt'): # only for clearer logic os.remove(full_loc+'.txt') #------------------------------------------------------------------ # Download the RIB if os.path.exists(full_loc+'.txt.gz'): print 'existed!!!!!!!!!!!!' return full_loc+'.txt.gz' # Do not download if os.path.exists(full_loc): cmlib.parse_mrt(full_loc, full_loc+'.txt', fsize) cmlib.pack_gz(full_loc+'.txt') return full_loc+'.txt.gz' cmlib.force_download_file('http://'+web_location, datadir+web_location, filename) cmlib.parse_mrt(full_loc, full_loc+'.txt', fsize) cmlib.pack_gz(full_loc+'.txt') os.remove(full_loc) # remove the original file return full_loc+'.txt.gz'
def download_one_rib(self, my_date): tmp_month = my_date[0:4] + '.' + my_date[4:6] if self.co.startswith('rrc'): web_location = rrc_root + self.co + '/' + tmp_month + '/' else: web_location = rv_root + self.co + '/bgpdata/' + tmp_month + '/RIBS/' web_location = web_location.replace('//', '/') webraw = cmlib.get_weblist('http://' + web_location) cmlib.make_dir(datadir + web_location) #---------------------------------------------------------------- # select a RIB file with reasonable (not strange) file size rib_list = webraw.split('\n') filter(lambda a: a != '', rib_list) filter(lambda a: a != '\n', rib_list) rib_list = [ item for item in rib_list if 'rib' in item or 'bview' in item ] sizelist = list() for line in rib_list: size = line.split()[-1] fsize = cmlib.parse_size(size) sizelist.append(fsize) avg = np.mean(sizelist) target_line = None # stores the RIB file for downloading largest_line = None max = -1 closest = 99999 for line in rib_list: fdate = line.split()[0].split('.')[-3] size = line.split()[-1] fsize = cmlib.parse_size(size) if fsize > max: max = fsize largest_line = line diff = abs(int(fdate) - int(my_date)) # >0 # XXX logic here not clear (but seems effective) if diff <= closest and fsize > 0.9 * avg and fsize < 1.1 * avg: target_line = line closest = diff if target_line is None: assert largest_line is not None print 'Failed. Resort to downloading the largest RIB...' 
target_line = largest_line # work-around for a special case print 'Selected RIB:', target_line size = target_line.split()[-1] # claimed RIB file size fsize = cmlib.parse_size(size) filename = target_line.split()[0] full_loc = datadir + web_location + filename # .bz2/.gz if os.path.exists(full_loc + '.txt'): # only for clearer logic os.remove(full_loc + '.txt') #------------------------------------------------------------------ # Download the RIB if os.path.exists(full_loc + '.txt.gz'): print 'existed size & original size:', os.path.getsize( full_loc + '.txt.gz'), fsize if os.path.getsize(full_loc + '.txt.gz') > 0.6 * fsize: # 0.6 is good enough return full_loc + '.txt.gz' # Do not download else: os.remove(full_loc + '.txt.gz') # too small to be complete if os.path.exists(full_loc): if os.path.getsize(full_loc) <= 0.95 * fsize: os.remove(full_loc) else: # Good! cmlib.parse_mrt(full_loc, full_loc + '.txt', fsize) cmlib.pack_gz(full_loc + '.txt') return full_loc + '.txt.gz' cmlib.force_download_file('http://' + web_location, datadir + web_location, filename) cmlib.parse_mrt(full_loc, full_loc + '.txt', fsize) cmlib.pack_gz(full_loc + '.txt') os.remove(full_loc) # remove the original file return full_loc + '.txt.gz'
def download_one_rib_before_unix(self, my_date, unix): # my_date for deciding month tmp_month = my_date[0:4] + '.' + my_date[4:6] if self.co.startswith('rrc'): web_location = rrc_root + self.co + '/' + tmp_month + '/' else: web_location = rv_root + self.co + '/bgpdata/' + tmp_month + '/RIBS/' web_location = web_location.replace('//', '/') try: webraw = cmlib.get_weblist('http://' + web_location) print 'Getting list from ' + 'http://' + web_location except: return -1 cmlib.make_dir(datadir + web_location) #---------------------------------------------------------------- # select a RIB file right before the unix and with reasonable (not strange) file size rib_list = webraw.split('\n') filter(lambda a: a != '', rib_list) filter(lambda a: a != '\n', rib_list) rib_list = [ item for item in rib_list if 'rib' in item or 'bview' in item ] sizelist = list() for line in rib_list: size = line.split()[-1] fsize = cmlib.parse_size(size) sizelist.append(fsize) avg = np.mean(sizelist) ok_rib_list = list() # RIBs whose size is OK for line in rib_list: fsize = cmlib.parse_size(line.split()[-1]) if fsize > 0.9 * avg: ok_rib_list.append(line) target_line = None # the RIB closest to unix min = 9999999999 for line in ok_rib_list: fdate = line.split()[0].split('.')[-3] ftime = line.split()[0].split('.')[-2] dtstr = fdate + ftime objdt = datetime.datetime.strptime(dtstr, '%Y%m%d%H%M') runix = time_lib.mktime( objdt.timetuple()) + 8 * 60 * 60 # F**k! Time zone! 
print objdt, runix, unix if runix <= unix and unix - runix < min: min = unix - runix print 'min changed to ', min target_line = line print 'Selected RIB:', target_line if target_line == None: return -1 size = target_line.split()[-1] # claimed RIB file size fsize = cmlib.parse_size(size) filename = target_line.split()[0] full_loc = datadir + web_location + filename # .bz2/.gz if os.path.exists(full_loc + '.txt'): # only for clearer logic os.remove(full_loc + '.txt') #------------------------------------------------------------------ # Download the RIB if os.path.exists(full_loc + '.txt.gz'): print 'existed!!!!!!!!!!!!' return full_loc + '.txt.gz' # Do not download if os.path.exists(full_loc): cmlib.parse_mrt(full_loc, full_loc + '.txt', fsize) cmlib.pack_gz(full_loc + '.txt') return full_loc + '.txt.gz' cmlib.force_download_file('http://' + web_location, datadir + web_location, filename) cmlib.parse_mrt(full_loc, full_loc + '.txt', fsize) cmlib.pack_gz(full_loc + '.txt') os.remove(full_loc) # remove the original file return full_loc + '.txt.gz'
def get_pfx2as(self): self.get_pfx2as_file() print 'Calculating prefix to AS number trie...' pfx2as = dict() if int(self.sdate) >= 20050509: self.get_pfx2as_file() pfx2as_file = '' tmp = os.listdir(self.spt_dir) for line in tmp: if 'pfx2as' in line: pfx2as_file = line break f = open(self.spt_dir+pfx2as_file) for line in f: line = line.rstrip('\n') attr = line.split() if '_' in attr[2] or ',' in attr[2]: continue pfx = attr[0]+'/'+attr[1] try: pfx2as[pfx] = int(attr[2]) # pfx: origin AS except: # When will this happen? pfx2as[pfx] = -1 f.close() else: # Extract info from RIB of the monitor route-views2 and XXX mydate = self.sdate[0:4] + '.' + self.sdate[4:6] rib_location = datadir+'archive.routeviews.org/bgpdata/'+mydate+'/RIBS/' dir_list = os.listdir(datadir+'archive.routeviews.org/bgpdata/'+mydate+'/RIBS/') for f in dir_list: if not f.startswith('.'): rib_location = rib_location + f # if RIB is of the same month. That's OK. break if rib_location.endswith('txt.gz'): subprocess.call('gunzip '+rib_location, shell=True) # unpack rib_location = rib_location.replace('.txt.gz', '.txt') elif not rib_location.endswith('txt'): # .bz2/.gz file exists cmlib.parse_mrt(rib_location, rib_location+'.txt') os.remove(rib_location) # then remove .bz2/.gz rib_location = rib_location + '.txt' # now rib file definitely ends with .txt, let's rock and roll with open(rib_location, 'r') as f: for line in f: try: tmp = line.split('|')[5] pfx = tmp ASlist = line.split('|')[6] originAS = ASlist.split()[-1] try: pfx2as[pfx] = int(originAS) except: pfx2as[pfx] = -1 except: pass f.close() # compress RIB into .gz if not os.path.exists(rib_location+'.gz'): cmlib.pack_gz(rib_location) return pfx2as