def get_as2cc_file(self): # AS to customer cone sptfiles = os.listdir(self.spt_dir) for line in sptfiles: if 'ppdc' in line: return 0 # already have a file target_line = None yearmonth = self.sdate[:6] # YYYYMM print 'Downloading AS to customer cone file ...' theurl = 'http://data.caida.org/datasets/2013-asrank-data-supplement/data/' webraw = cmlib.get_weblist(theurl) for line in webraw.split('\n'): if yearmonth in line and 'ppdc' in line: target_line = line break assert target_line != None fname = target_line.split()[0] cmlib.force_download_file(theurl, self.spt_dir, fname) if int(yearmonth) <= 201311: # unpack .gz (only before 201311 (include)) subprocess.call('gunzip '+self.spt_dir+fname, shell=True) else: # unpack .bz2 (only after 201406 (include)) subprocess.call('bunzip2 -d '+self.spt_dir+fname, shell=True) return 0
def get_as2cc_file(self): # AS to customer cone sptfiles = os.listdir(self.spt_dir) for line in sptfiles: if 'ppdc' in line: return 0 # already have a file target_line = None yearmonth = self.sdate[:6] # YYYYMM print 'Downloading AS to customer cone file ...' theurl = 'http://data.caida.org/datasets/2013-asrank-data-supplement/data/' webraw = cmlib.get_weblist(theurl) for line in webraw.split('\n'): if yearmonth in line and 'ppdc' in line: target_line = line break assert target_line != None fname = target_line.split()[0] cmlib.force_download_file(theurl, self.spt_dir, fname) if int(yearmonth) <= 201311: # unpack .gz (only before 201311 (include)) subprocess.call('gunzip ' + self.spt_dir + fname, shell=True) else: # unpack .bz2 (only after 201406 (include)) subprocess.call('bunzip2 -d ' + self.spt_dir + fname, shell=True) return 0
def download_updates(self):
    """Download every update archive named in ``self.listfile``.

    Each line of the list file has the form ``<url>|<size>`` (with an
    optional ``.txt.gz`` suffix that is stripped to recover the original
    .bz2/.gz name). Files already on disk with a plausible size are kept;
    undersized leftovers are deleted and re-downloaded. Returns 0.
    """
    # `with` guarantees the list file is closed even when a download
    # raises (the original open()/close() pair leaked the handle then).
    with open(self.listfile, 'r') as f:
        for line in f:
            line = line.replace('\n', '').replace('.txt.gz', '')  # original .bz2/.gz name
            tmp = line.split('|')[0]
            filename = tmp.split('/')[-1]
            web_location = tmp.replace(filename, '')
            fsize = float(line.split('|')[1])  # claimed size from the list file
            full_path = datadir + web_location + filename
            # Goal: only XXX.bz2/.gz or XXX.bz2/gz.txt.gz exists on disk.
            # Remove any stray intermediate .txt file first.
            if os.path.exists(full_path + '.txt'):
                os.remove(full_path + '.txt')
            if os.path.exists(full_path + '.txt.gz'):  # parsed file exists
                if os.path.getsize(full_path + '.txt.gz') > 0.8 * fsize:  # size OK
                    logging.info('file exists:%s', full_path + '.txt.gz')
                    if os.path.exists(full_path):  # .bz2/.gz useless anymore
                        os.remove(full_path)
                    continue
                else:
                    os.remove(full_path + '.txt.gz')
            if os.path.exists(full_path):  # original file exists
                now_size = os.path.getsize(full_path)
                if now_size > 0.95 * fsize:  # size OK
                    logging.info('file exists:%s', full_path)
                    continue
                else:
                    os.remove(full_path)
            cmlib.force_download_file('http://' + web_location,
                                      datadir + web_location, filename)
    return 0
def download_updates(self):
    """Download the update archives listed in self.listfile, skipping
    files already present with a plausible size; returns 0."""
    listfd = open(self.listfile, 'r')
    for record in listfd:
        # Strip the newline and any '.txt.gz' suffix to recover the
        # original .bz2/.gz archive name.
        record = record.replace('\n', '').replace('.txt.gz', '')
        url_part = record.split('|')[0]
        fname = url_part.split('/')[-1]
        web_location = url_part.replace(fname, '')
        claimed = float(record.split('|')[1])
        local = datadir + web_location + fname
        # Keep only the archive or its parsed .txt.gz — never a bare .txt.
        if os.path.exists(local + '.txt'):
            os.remove(local + '.txt')
        parsed = local + '.txt.gz'
        if os.path.exists(parsed):  # parsed file exists
            if os.path.getsize(parsed) > 0.8 * claimed:  # size OK
                logging.info('file exists:%s', parsed)
                if os.path.exists(local):  # archive no longer needed
                    os.remove(local)
                continue
            os.remove(parsed)
        if os.path.exists(local):  # original archive exists
            if os.path.getsize(local) > 0.95 * claimed:  # size OK
                logging.info('file exists:%s', local)
                continue
            os.remove(local)
        cmlib.force_download_file('http://' + web_location,
                                  datadir + web_location, fname)
    listfd.close()
    return 0
def download_one_rib(self, my_date): tmp_month = my_date[0:4] + '.' + my_date[4:6] if self.co.startswith('rrc'): web_location = rrc_root + self.co + '/' + tmp_month + '/' else: web_location = rv_root + self.co + '/bgpdata/' + tmp_month + '/RIBS/' web_location = web_location.replace('//', '/') webraw = cmlib.get_weblist('http://' + web_location) cmlib.make_dir(datadir+web_location) #---------------------------------------------------------------- # select a RIB file with reasonable (not strange) file size rib_list = webraw.split('\n') filter(lambda a: a != '', rib_list) filter(lambda a: a != '\n', rib_list) rib_list = [item for item in rib_list if 'rib' in item or 'bview' in item] sizelist = list() for line in rib_list: size = line.split()[-1] fsize = cmlib.parse_size(size) sizelist.append(fsize) avg = np.mean(sizelist) target_line = None # stores the RIB file for downloading largest_line = None max = -1 closest = 99999 for line in rib_list: fdate = line.split()[0].split('.')[-3] size = line.split()[-1] fsize = cmlib.parse_size(size) if fsize > max: max = fsize largest_line = line diff = abs(int(fdate)-int(my_date)) # >0 # XXX logic here not clear (but seems effective) if diff <= closest and fsize > 0.9 * avg and fsize < 1.1 * avg: target_line = line closest = diff if target_line is None: assert largest_line is not None print 'Failed. Resort to downloading the largest RIB...' 
target_line = largest_line # work-around for a special case print 'Selected RIB:', target_line size = target_line.split()[-1] # claimed RIB file size fsize = cmlib.parse_size(size) filename = target_line.split()[0] full_loc = datadir + web_location + filename # .bz2/.gz if os.path.exists(full_loc+'.txt'): # only for clearer logic os.remove(full_loc+'.txt') #------------------------------------------------------------------ # Download the RIB if os.path.exists(full_loc+'.txt.gz'): print 'existed size & original size:',os.path.getsize(full_loc+'.txt.gz'),fsize if os.path.getsize(full_loc+'.txt.gz') > 0.6 * fsize: # 0.6 is good enough return full_loc+'.txt.gz' # Do not download else: os.remove(full_loc+'.txt.gz') # too small to be complete if os.path.exists(full_loc): if os.path.getsize(full_loc) <= 0.95 * fsize: os.remove(full_loc) else: # Good! cmlib.parse_mrt(full_loc, full_loc+'.txt', fsize) cmlib.pack_gz(full_loc+'.txt') return full_loc+'.txt.gz' cmlib.force_download_file('http://'+web_location, datadir+web_location, filename) cmlib.parse_mrt(full_loc, full_loc+'.txt', fsize) cmlib.pack_gz(full_loc+'.txt') os.remove(full_loc) # remove the original file return full_loc+'.txt.gz'
def download_one_rib_before_unix(self, my_date, unix): # my_date for deciding month tmp_month = my_date[0:4] + '.' + my_date[4:6] if self.co.startswith('rrc'): web_location = rrc_root + self.co + '/' + tmp_month + '/' else: web_location = rv_root + self.co + '/bgpdata/' + tmp_month + '/RIBS/' web_location = web_location.replace('//', '/') try: webraw = cmlib.get_weblist('http://' + web_location) print 'Getting list from ' + 'http://' + web_location except: return -1 cmlib.make_dir(datadir+web_location) #---------------------------------------------------------------- # select a RIB file right before the unix and with reasonable (not strange) file size rib_list = webraw.split('\n') filter(lambda a: a != '', rib_list) filter(lambda a: a != '\n', rib_list) rib_list = [item for item in rib_list if 'rib' in item or 'bview' in item] sizelist = list() for line in rib_list: size = line.split()[-1] fsize = cmlib.parse_size(size) sizelist.append(fsize) avg = np.mean(sizelist) ok_rib_list = list() # RIBs whose size is OK for line in rib_list: fsize = cmlib.parse_size(line.split()[-1]) if fsize > 0.9 * avg: ok_rib_list.append(line) target_line = None # the RIB closest to unix min = 9999999999 for line in ok_rib_list: fdate = line.split()[0].split('.')[-3] ftime = line.split()[0].split('.')[-2] dtstr = fdate+ftime objdt = datetime.datetime.strptime(dtstr, '%Y%m%d%H%M') runix = time_lib.mktime(objdt.timetuple()) + 8*60*60 # F**k! Time zone! 
print objdt, runix, unix if runix <= unix and unix-runix < min: min = unix-runix print 'min changed to ', min target_line = line print 'Selected RIB:', target_line if target_line == None: return -1 size = target_line.split()[-1] # claimed RIB file size fsize = cmlib.parse_size(size) filename = target_line.split()[0] full_loc = datadir + web_location + filename # .bz2/.gz if os.path.exists(full_loc+'.txt'): # only for clearer logic os.remove(full_loc+'.txt') #------------------------------------------------------------------ # Download the RIB if os.path.exists(full_loc+'.txt.gz'): print 'existed!!!!!!!!!!!!' return full_loc+'.txt.gz' # Do not download if os.path.exists(full_loc): cmlib.parse_mrt(full_loc, full_loc+'.txt', fsize) cmlib.pack_gz(full_loc+'.txt') return full_loc+'.txt.gz' cmlib.force_download_file('http://'+web_location, datadir+web_location, filename) cmlib.parse_mrt(full_loc, full_loc+'.txt', fsize) cmlib.pack_gz(full_loc+'.txt') os.remove(full_loc) # remove the original file return full_loc+'.txt.gz'
def get_AS_num_file(self):
    """Fetch Potaroo's AS-count statistics file into the public support dir."""
    cmlib.force_download_file('http://bgp.potaroo.net/as2.0/',
                              pub_spt_dir, 'bgp-as-count.txt')
    return 0
def get_as2nn_file(self):
    """Fetch Potaroo's AS-to-name page (autnums.html) into the public support dir."""
    source = 'http://bgp.potaroo.net/cidr/'
    cmlib.force_download_file(source, pub_spt_dir, 'autnums.html')
def get_fib_size_file(self):
    """Fetch Potaroo's active-prefix (FIB size) statistics into the public support dir."""
    cmlib.force_download_file('http://bgp.potaroo.net/as2.0/',
                              pub_spt_dir, 'bgp-active.txt')
    return 0
def get_file():
    """Top-level driver: for every collector and every target month,
    download update archives and RIBs from RouteViews, parse them, and
    delete updates caused by table transfers (session resets) per peer.

    Relies on module globals: collectors, yearmonth, hdname, homedir,
    plus helpers cmlib, parse_updates, get_peers, process_as_path_rib,
    del_tabletran_updates.
    """
    for clctr in collectors:
        cl_name = clctr
        # NOTE(review): hdname_detail is computed but never used below.
        hdname_detail = hdname + 'archive.routeviews.org/' + cl_name +\
                '/bgpdata/'
        hdname_detail = hdname_detail.replace('//', '/')  # happens when cl = ''
        # only for downloading updates, not RIBs
        for ym in yearmonth:
            # Event window: 1st through 7th of the month, as YYYYMMDD.
            sdate = ym.split('.')[0] + ym.split('.')[1] + '01'
            edate = ym.split('.')[0] + ym.split('.')[1] + '07'
            filelocation = ''
            filelocation = 'archive.routeviews.org/' + cl_name + '/bgpdata/' + ym + '/UPDATES/'
            filelocation = filelocation.replace('//', '/')  # when name is ''
            webraw = cmlib.get_weblist('http://' + filelocation)
            print filelocation
            cmlib.make_dir(hdname+'metadata/'+ym)
            # flist records the expected .txt.gz name of every update file.
            flist = open(hdname+'metadata/'+ym+'/updt_filelist_'+cl_name, 'w')
            cmlib.make_dir(hdname+filelocation)
            for line in webraw.split('\n'):
                if not 'updates' in line or line == '' or line == '\n':
                    continue
                # Listing sizes may carry a unit suffix; convert to bytes.
                size = line.split()[-1]
                if size.isdigit():
                    fsize = float(size)
                else:
                    fsize = float(size[:-1]) * cmlib.size_u2v(size[-1])
                filename = line.split()[0]  # omit uninteresting info
                filedate = filename.split('.')[-3]
                # check whether its datetime in our range
                if int(filedate) < int(sdate) or int(filedate) > int(edate):
                    continue
                print filename
                origin_floc = hdname + filelocation + filename  # original file loc&name
                flist.write(origin_floc+'.txt.gz\n')  # .xx.txt.gz file list
                # remove existing xx.txt file to make things clearer
                try:
                    os.remove(origin_floc+'.txt')
                except:
                    pass
                # Reuse a parsed .txt.gz that looks complete (>10% of the
                # claimed archive size); otherwise discard and refetch.
                if os.path.exists(origin_floc+'.txt.gz'):
                    if os.path.getsize(origin_floc+'.txt.gz') > 0.1 * fsize:
                        if os.path.exists(origin_floc):  # .bz2/.gz useless anymore
                            os.remove(origin_floc)
                        continue
                    else:
                        os.remove(origin_floc+'.txt.gz')
                if os.path.exists(origin_floc):
                    if os.path.getsize(origin_floc) > 0.9 * fsize:
                        continue
                    else:
                        os.remove(origin_floc)
                cmlib.force_download_file('http://'+filelocation, hdname+filelocation, filename)
            # file that stores update list
            flist.close()
            # --- one RIB on sdate for this collector/month ---
            filelocation = 'archive.routeviews.org/' + cl_name + '/bgpdata/' + ym + '/RIBS/'
            filelocation = filelocation.replace('//', '/')  # when name is ''
            webraw = cmlib.get_weblist('http://' + filelocation)
            print filelocation
            cmlib.make_dir(hdname+filelocation)
            # for each event, we only download one RIB (on the sdate)
            rib_fname = ''
            for line in webraw.split('\n'):
                if not 'rib' in line and not 'bview' in line:
                    continue
                if line == '' or line == '\n':
                    continue
                size = line.split()[-1]
                if size.isdigit():
                    fsize = float(size)
                else:
                    fsize = float(size[:-1]) * cmlib.size_u2v(size[-1])
                filename = line.split()[0]
                print filename
                if not int(filename.split('.')[-3]) == int(sdate):
                    continue
                print filename
                origin_floc = hdname + filelocation + filename  # original file loc&name
                try:
                    os.remove(origin_floc+'.txt')
                except:
                    pass
                rib_fname = filelocation + filename
                if os.path.exists(origin_floc+'.txt.gz'):
                    if os.path.getsize(origin_floc+'.txt.gz') > 0.1 * fsize:
                        if os.path.exists(origin_floc):  # .bz2/.gz useless anymore
                            os.remove(origin_floc)
                        break
                    else:
                        os.remove(origin_floc+'.txt.gz')
                if os.path.exists(origin_floc):
                    if os.path.getsize(origin_floc) > 0.9 * fsize:
                        break
                    else:
                        os.remove(origin_floc)
                cmlib.force_download_file('http://'+filelocation, hdname+filelocation, filename)
                break
            # download one rib to intial as_path: take the day before sdate
            sdate_datetime = datetime.datetime(int(sdate[0:4]), int(sdate[4:6]),int(sdate[6:8]))
            as_path_date = sdate_datetime - datetime.timedelta(days=1)
            as_path_date = as_path_date.strftime('%Y%m%d')
            as_path_ym = as_path_date[0:4] + '.' + as_path_date[4:6]
            filelocation = 'archive.routeviews.org/' + cl_name + '/bgpdata/' + as_path_ym + '/RIBS/'
            filelocation = filelocation.replace('//', '/')  # when name is ''
            webraw = cmlib.get_weblist('http://' + filelocation)
            print filelocation
            cmlib.make_dir(hdname+filelocation)
            asrib_fname = ''
            # reversed(): scan the listing bottom-up to hit the last RIB of
            # as_path_date first.
            for line in reversed(webraw.split('\n')):
                print line
                if not 'rib' in line and not 'bview' in line:
                    continue
                if line == '' or line == '\n':
                    continue
                size = line.split()[-1]
                if size.isdigit():
                    fsize = float(size)
                else:
                    fsize = float(size[:-1]) * cmlib.size_u2v(size[-1])
                filename = line.split()[0]
                print filename
                if not int(filename.split('.')[-3]) == int(as_path_date):
                    continue
                print filename
                origin_floc = hdname + filelocation + filename  # original file loc&name
                try:
                    os.remove(origin_floc+'.txt')
                except:
                    pass
                asrib_fname = filelocation + filename
                if os.path.exists(origin_floc+'.txt.gz'):
                    if os.path.getsize(origin_floc+'.txt.gz') > 0.1 * fsize:
                        if os.path.exists(origin_floc):  # .bz2/.gz useless anymore
                            os.remove(origin_floc)
                        break
                    else:
                        os.remove(origin_floc+'.txt.gz')
                if os.path.exists(origin_floc):
                    if os.path.getsize(origin_floc) > 0.9 * fsize:
                        break
                    else:
                        os.remove(origin_floc)
                cmlib.force_download_file('http://'+filelocation, hdname+filelocation, filename)
                break
            ## now for update and RIB files, their formats are either .bz2/gz or
            ## .xx.txt.gz!!!
            print 'parsing updates...'
            parse_updates(ym, cl_name)
            print 'parsing RIB and getting peers...'
            rib_location = hdname + rib_fname  # .bz2/.gz
            #print rib_location,'dd'
            peers = get_peers(clctr,ym,rib_location)
            print 'peers: ', peers
            as_path_rib_location = hdname + asrib_fname  # .bz2/.gz
            process_as_path_rib(clctr,as_path_ym,as_path_rib_location)
            print 'determining table transfers start and end time for each peer...'
            for peer in peers:  # must process each peer one by one
                peer = peer.rstrip()
                print 'processing ',peer,'...'
                # External perl tool writes per-peer table-transfer windows
                # into hdname/tmp/<peer>_result.txt.
                subprocess.call('perl '+homedir+'tool/bgpmct.pl -rf '+rib_location+'.txt.gz'+' -ul '+\
                        hdname+'metadata/'+ym+'/updt_filelist_'+cl_name+' -p '+peer+' > '+\
                        hdname+'tmp/'+peer+'_result.txt', shell=True)
            print 'delete updates caused by session reset for each peer...'
            for peer in peers:
                # No reset from this peer, so nothing in the file
                try:
                    if os.path.getsize(hdname+'tmp/'+peer+'_result.txt') == 0:
                        continue
                except:  # cannot find file
                    continue
                print '\nculprit now: ', peer
                del_tabletran_updates(peer, ym, cl_name)
            # delete all rubbish in the end
            subprocess.call('rm '+hdname+'tmp/*', shell=True)
    return
def download_one_rib(self, my_date): tmp_month = my_date[0:4] + '.' + my_date[4:6] if self.co.startswith('rrc'): web_location = rrc_root + self.co + '/' + tmp_month + '/' else: web_location = rv_root + self.co + '/bgpdata/' + tmp_month + '/RIBS/' web_location = web_location.replace('//', '/') webraw = cmlib.get_weblist('http://' + web_location) cmlib.make_dir(datadir + web_location) #---------------------------------------------------------------- # select a RIB file with reasonable (not strange) file size rib_list = webraw.split('\n') filter(lambda a: a != '', rib_list) filter(lambda a: a != '\n', rib_list) rib_list = [ item for item in rib_list if 'rib' in item or 'bview' in item ] sizelist = list() for line in rib_list: size = line.split()[-1] fsize = cmlib.parse_size(size) sizelist.append(fsize) avg = np.mean(sizelist) target_line = None # stores the RIB file for downloading largest_line = None max = -1 closest = 99999 for line in rib_list: fdate = line.split()[0].split('.')[-3] size = line.split()[-1] fsize = cmlib.parse_size(size) if fsize > max: max = fsize largest_line = line diff = abs(int(fdate) - int(my_date)) # >0 # XXX logic here not clear (but seems effective) if diff <= closest and fsize > 0.9 * avg and fsize < 1.1 * avg: target_line = line closest = diff if target_line is None: assert largest_line is not None print 'Failed. Resort to downloading the largest RIB...' 
target_line = largest_line # work-around for a special case print 'Selected RIB:', target_line size = target_line.split()[-1] # claimed RIB file size fsize = cmlib.parse_size(size) filename = target_line.split()[0] full_loc = datadir + web_location + filename # .bz2/.gz if os.path.exists(full_loc + '.txt'): # only for clearer logic os.remove(full_loc + '.txt') #------------------------------------------------------------------ # Download the RIB if os.path.exists(full_loc + '.txt.gz'): print 'existed size & original size:', os.path.getsize( full_loc + '.txt.gz'), fsize if os.path.getsize(full_loc + '.txt.gz') > 0.6 * fsize: # 0.6 is good enough return full_loc + '.txt.gz' # Do not download else: os.remove(full_loc + '.txt.gz') # too small to be complete if os.path.exists(full_loc): if os.path.getsize(full_loc) <= 0.95 * fsize: os.remove(full_loc) else: # Good! cmlib.parse_mrt(full_loc, full_loc + '.txt', fsize) cmlib.pack_gz(full_loc + '.txt') return full_loc + '.txt.gz' cmlib.force_download_file('http://' + web_location, datadir + web_location, filename) cmlib.parse_mrt(full_loc, full_loc + '.txt', fsize) cmlib.pack_gz(full_loc + '.txt') os.remove(full_loc) # remove the original file return full_loc + '.txt.gz'
def download_one_rib_before_unix(self, my_date, unix): # my_date for deciding month tmp_month = my_date[0:4] + '.' + my_date[4:6] if self.co.startswith('rrc'): web_location = rrc_root + self.co + '/' + tmp_month + '/' else: web_location = rv_root + self.co + '/bgpdata/' + tmp_month + '/RIBS/' web_location = web_location.replace('//', '/') try: webraw = cmlib.get_weblist('http://' + web_location) print 'Getting list from ' + 'http://' + web_location except: return -1 cmlib.make_dir(datadir + web_location) #---------------------------------------------------------------- # select a RIB file right before the unix and with reasonable (not strange) file size rib_list = webraw.split('\n') filter(lambda a: a != '', rib_list) filter(lambda a: a != '\n', rib_list) rib_list = [ item for item in rib_list if 'rib' in item or 'bview' in item ] sizelist = list() for line in rib_list: size = line.split()[-1] fsize = cmlib.parse_size(size) sizelist.append(fsize) avg = np.mean(sizelist) ok_rib_list = list() # RIBs whose size is OK for line in rib_list: fsize = cmlib.parse_size(line.split()[-1]) if fsize > 0.9 * avg: ok_rib_list.append(line) target_line = None # the RIB closest to unix min = 9999999999 for line in ok_rib_list: fdate = line.split()[0].split('.')[-3] ftime = line.split()[0].split('.')[-2] dtstr = fdate + ftime objdt = datetime.datetime.strptime(dtstr, '%Y%m%d%H%M') runix = time_lib.mktime( objdt.timetuple()) + 8 * 60 * 60 # F**k! Time zone! 
print objdt, runix, unix if runix <= unix and unix - runix < min: min = unix - runix print 'min changed to ', min target_line = line print 'Selected RIB:', target_line if target_line == None: return -1 size = target_line.split()[-1] # claimed RIB file size fsize = cmlib.parse_size(size) filename = target_line.split()[0] full_loc = datadir + web_location + filename # .bz2/.gz if os.path.exists(full_loc + '.txt'): # only for clearer logic os.remove(full_loc + '.txt') #------------------------------------------------------------------ # Download the RIB if os.path.exists(full_loc + '.txt.gz'): print 'existed!!!!!!!!!!!!' return full_loc + '.txt.gz' # Do not download if os.path.exists(full_loc): cmlib.parse_mrt(full_loc, full_loc + '.txt', fsize) cmlib.pack_gz(full_loc + '.txt') return full_loc + '.txt.gz' cmlib.force_download_file('http://' + web_location, datadir + web_location, filename) cmlib.parse_mrt(full_loc, full_loc + '.txt', fsize) cmlib.pack_gz(full_loc + '.txt') os.remove(full_loc) # remove the original file return full_loc + '.txt.gz'