def get_pfx2as_file(self):
    location = self.spt_dir
    cmlib.make_dir(location)
    tmp = os.listdir(self.spt_dir)
    for line in tmp:
        if 'pfx2as' in line:
            return 0  # we already have a prefix2as file

    print 'Downloading prefix to AS file ...'
    year, month = self.sdate[:4], self.sdate[4:6]  # YYYY, MM
    webloc = 'http://data.caida.org/datasets/routing/routeviews-prefix2as' +\
            '/' + year + '/' + month + '/'
    webraw = cmlib.get_weblist(webloc)

    target_line = ''
    for line in webraw.split('\n'):
        if self.sdate in line:
            target_line = line
            break
    if target_line == '':
        print 'Downloading prefix to AS file fails: no such date!'
        return 0

    fname = target_line.split()[0]
    urllib.urlretrieve(webloc+fname, location+fname)
    subprocess.call('gunzip -c '+location+fname+' > '+\
            location+fname.replace('.gz', ''), shell=True)
    os.remove(location+fname)
    return 0
def time_series_plot(granu, my_dict, describe):
    value = []
    dt = my_dict.keys()
    dt.sort()
    for key in dt:
        value.append(my_dict[key])
    dt = [datetime.datetime.fromtimestamp(ts) for ts in dt]  # int to obj. required!

    fig = plt.figure(figsize=(16, 10))
    ax = fig.add_subplot(111)
    ax.plot(dt, value, 'k-')
    ax.set_ylabel(describe)
    ax.set_xlabel('Datetime')
    myFmt = mpldates.DateFormatter('%Y-%m-%d %H%M')
    ax.xaxis.set_major_formatter(myFmt)
    plt.xticks(rotation=45)

    # make a dir according to datetime, granularity and h threshold
    sdate = describe.split('_')[0]
    cmlib.make_dir(datadir + 'output/' + sdate + '_' + str(granu) + '/')
    plt.savefig(datadir + 'output/' + sdate + '_' + str(granu) + '/' + describe + '.pdf')
    plt.close()

    # Record plot data in a separate file for future use
    f = open(datadir+'output/'+sdate+'_'+str(granu)+'/'+\
            describe+'.txt', 'w')
    for i in xrange(0, len(dt)):
        f.write(str(dt[i]) + ',' + str(value[i]) + '\n')
    f.close()

    return 0
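# A minimal usage sketch for time_series_plot (assuming `datadir` already
# points to a writable directory).  Keys of the dict are unix timestamps; the
# timestamps and values below are made up purely for illustration.  `describe`
# has to start with the date, because the output directory name is derived
# from describe.split('_')[0].
example_series = {1369094400: 120, 1369095300: 340, 1369096200: 95}
time_series_plot(900, example_series, '20130521_update_count')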
def __init__(self, index):
    self.index = index
    self.sdate = daterange[index][0]
    self.edate = daterange[index][1]
    self.sdatetime_obj = datetime.datetime.strptime(self.sdate, '%Y%m%d')
    self.edatetime_obj = datetime.datetime.strptime(self.edate, '%Y%m%d') +\
            datetime.timedelta(days=1)

    # location to store supporting files
    self.spt_dir = spt_dir + self.sdate + '_' + self.edate + '/'
    cmlib.make_dir(self.spt_dir)

    # Store the rib information of every collector (Note: do not change this!)
    self.rib_info_file = rib_info_dir + self.sdate + '_' + self.edate + '.txt'

    self.co_mo = dict()  # collector: monitor list (does not store empty list)
    self.mo_asn = dict()
    self.mo_cc = dict()
    self.mo_tier = dict()
    self.as2nation = dict()
    self.as2name = dict()

    # Note: Occasionally run to get the latest data. (Now up to 20141225)
    #self.get_fib_size_file()
    #self.get_AS_num_file()

    self.dt_anchor1 = datetime.datetime(2003, 2, 3, 19, 0)  # up to now, never used data prior to this
    self.dt_anchor2 = datetime.datetime(2006, 2, 1, 21, 0)
def get_update_list(self):
    tmp_dir = self.get_listfile_dir()
    cmlib.make_dir(tmp_dir)
    flist = open(self.listfile, 'w')

    month_list = self.get_month_list_dot()
    for month in month_list:
        web_location = ''
        if self.co.startswith('rrc'):
            web_location = rrc_root + self.co + '/' + month + '/'
        else:
            web_location = rv_root + self.co + '/bgpdata/' + month + '/UPDATES/'
        web_location = web_location.replace('//', '/')  # when name is ''

        webraw = cmlib.get_weblist('http://' + web_location)
        cmlib.make_dir(datadir+web_location)

        for line in webraw.split('\n'):
            if 'updates' not in line or line == '' or line == '\n':
                continue

            size = line.split()[-1]
            fsize = cmlib.parse_size(size)

            filename = line.split()[0]  # omit uninteresting info
            filedate = filename.split('.')[-3]

            # check whether its date is in our range
            if int(filedate) < int(self.sdate) or int(filedate) > int(self.edate):
                continue

            # note: storing the original .bz2/.gz file name makes the logic clearer
            flist.write(web_location+filename+'.txt.gz|'+str(fsize)+'\n')
            logging.info('record file name: '+web_location+filename+'.txt.gz|'+str(fsize))

    return 0
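# cmlib.parse_size() above converts the size column of the HTML listing
# (e.g. '527', '89K', '1.2M') into a number of bytes.  Its implementation is
# not shown here; a minimal sketch, consistent with the size_u2v-style suffix
# handling used elsewhere in this code, could look like this:
def parse_size_sketch(size):
    units = {'K': 1024, 'M': 1024 ** 2, 'G': 1024 ** 3}
    if size.isdigit():
        return float(size)
    return float(size[:-1]) * units.get(size[-1].upper(), 1)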
def get_pfx2as_file(self):
    location = datadir + 'support/' + self.sdate + '/'
    cmlib.make_dir(location)
    tmp = os.listdir(datadir+'support/'+self.sdate+'/')
    for line in tmp:
        if 'pfx2as' in line:
            return 0  # we already have a prefix2as file

    print 'Downloading prefix to AS file ...'
    year, month = self.sdate[:4], self.sdate[4:6]  # YYYY, MM
    webloc = 'http://data.caida.org/datasets/routing/routeviews-prefix2as' +\
            '/' + year + '/' + month + '/'
    webraw = cmlib.get_weblist(webloc)

    target_line = ''
    for line in webraw.split('\n'):
        if self.sdate in line:
            target_line = line
            break
    if target_line == '':
        print 'Downloading prefix to AS file fails: no such date!'
        return 0

    fname = target_line.split()[0]
    urllib.urlretrieve(webloc+fname, location+fname)
    subprocess.call('gunzip -c '+location+fname+' > '+\
            location+fname.replace('.gz', ''), shell=True)
    os.remove(location+fname)
    return 0
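# The routeviews-prefix2as file fetched above is a tab-separated list of
# (network, prefix length, origin AS), which get_pfx2as_trie() presumably
# loads into a trie.  A minimal dict-based loader sketch, assuming that
# format (multi-origin entries such as '3356_1299' are kept as strings):
def load_pfx2as_sketch(path):
    pfx2as = dict()
    f = open(path, 'r')
    for line in f:
        net, plen, asn = line.rstrip('\n').split('\t')
        pfx2as[net + '/' + plen] = asn
    f.close()
    return pfx2as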
def cdf_plot(granu, my_dict, describe):  # my_dict: DV value -> exist time
    xlist = [0]
    ylist = [0]
    for key in sorted(my_dict):  # must sort by key
        xlist.append(key)
        ylist.append(my_dict[key])
    xmax = max(xlist)
    ymax = max(ylist)

    fig = plt.figure(figsize=(16, 10))
    ax = fig.add_subplot(111)
    ax.plot(xlist, ylist, 'k-')
    ax.set_ylim([-0.1 * ymax, 1.1 * ymax])
    ax.set_xlim([-0.1 * xmax, 1.1 * xmax])
    ax.set_ylabel('y')
    ax.set_xlabel('x')

    # make a dir according to datetime, granularity and h threshold
    sdate = describe.split('_')[0]
    cmlib.make_dir(datadir + 'output/' + sdate + '_' + str(granu) + '/')
    plt.savefig(datadir + 'output/' + sdate + '_' + str(granu) + '/' + describe + '.pdf')
    plt.close()

    # Record plot data in a separate file for future use
    f = open(datadir+'output/'+sdate+'_'+str(granu)+'/'+\
            describe+'.txt', 'w')
    for i in xrange(0, len(xlist)):
        f.write(str(xlist[i]) + ',' + str(ylist[i]) + '\n')
    f.close()

    return 0
def box_plot_grouped(granu, my_dict, describe):
    data_lists = []
    for k in my_dict.keys():  # dv ranges
        tmp_list = []
        for k2 in my_dict[k].keys():
            for i in xrange(0, len(my_dict[k][k2])):
                tmp_list.append(k2)
        data_lists.append(tmp_list)

    #plot_lists = []
    #large = 0  # the number of sub lists
    #for list in data_lists:
    #    if len(list) > large:
    #        large = len(list)
    #for i in xrange(0, large):
    #    for j in xrange(0, len(data_lists)):
    #        tmp_list = []
    #        try:
    #            tmp_list.append(data_lists[j][i])
    #        except:
    #            tmp_list.append(0)
    #    plot_lists.append(tmp_list)

    #my_labels = my_dict.keys()
    #fig = plt.figure(figsize=(16, 10))
    #ax = fig.add_subplot(111)
    #ax.boxplot(data_lists)

    # make a dir according to datetime, granularity and h threshold
    sdate = describe.split('_')[0]
    cmlib.make_dir(datadir + 'output/' + sdate + '_' + str(granu) + '/')
    plt.savefig(datadir+'output/'+sdate+'_'+str(granu)+'/'+describe+'.pdf',\
            bbox_inches='tight')
    plt.close()

    # Record plot data in a separate file for future use
    f = open(datadir+'output/'+sdate+'_'+str(granu)+'/'+\
            describe+'.txt', 'w')
    for k in my_dict.keys():
        f.write(str(k) + ':')
        for k2 in my_dict[k].keys():
            f.write(str(k2) + '|')
            f.write(str(my_dict[k][k2]))
            f.write(',')
        f.write('\n')
    f.close()

    return 0
def apfx_metrics_fpath(self):
    dir = metrics_output_root + str(self.granu) + '/' + self.sdate + '_' + self.edate + '/'
    cmlib.make_dir(dir)
    return dir + 'active_pfx_metrics.txt'
order = 286
unix = cluster1_2[0]
sdate = daterange[order][0]
edate = daterange[order][1]

rib_files = list()
for co in all_collectors.keys():
    dl = Downloader(sdate, edate, co)
    rfilepath = dl.download_one_rib_before_unix(sdate, unix)  # download RIB
    if rfilepath != -1:  # -1 means the RIB could not be obtained
        rib_files.append(rfilepath)

# output the rib file-list
dir = final_output_root + 'additional_rib_list/'
cmlib.make_dir(dir)
ofpath = dir + str(order) + '_' + str(unix) + '.txt'
f = open(ofpath, 'w')
for rpath in rib_files:
    f.write(rpath + '\n')
f.close()

#----------------------------------------------------------------------------
# The main function
if __name__ == '__main__' and 1 == 2:
    order_list = [303]

    # we select all collectors that have appropriate start dates
    collector_list = dict()
    for i in order_list:
        collector_list[i] = list()
        for co in all_collectors.keys():
def get_num_feature_actmon(self):
    # Get the average of each feature
    total_f2avg = dict()
    total_f2vlist = dict()
    for i in range(feature_num):
        total_f2vlist[i] = list()
    for uds in self.uds_list:
        for slot in uds.dtobj_list:
            print '*************Getting total feature values for slot ', slot
            sdt_unix = calendar.timegm(slot[0].utctimetuple())
            rpath = uds.numf_distr_output_dir() + str(sdt_unix) + '.txt'
            f = open(rpath, 'r')
            for line in f:
                line = line.rstrip('\n')
                name = line.split(':')[0]
                mydict = line.replace(name+':', '')
                mydict = ast.literal_eval(mydict)
                if name == 'T':
                    for fea in mydict:
                        total_f2vlist[fea].append(mydict[fea])
            f.close()
    for fea in total_f2vlist:
        total_f2avg[fea] = float(sum(total_f2vlist[fea])) / float(len(total_f2vlist[fea]))

    # Simply set the threshold for active monitors to average/N
    f2thre = dict()
    for i in range(feature_num):
        f2thre[i] = total_f2avg[i]/10.0

    print 'Get the set of active monitors for each slot and each feature'
    # To save memory, we map monitor ip to an integer
    mon2id = dict()
    count = 0
    for uds in self.uds_list:
        for mon in uds.monitors:
            try:
                test = mon2id[mon]
            except:
                mon2id[mon] = count
                count += 1

    unix2fea2monset = dict()
    for uds in self.uds_list:
        for slot in uds.dtobj_list:
            print '*************Getting highly active monitors for slot ', slot
            sdt_unix = calendar.timegm(slot[0].utctimetuple())
            unix2fea2monset[sdt_unix] = dict()
            rpath = uds.numf_distr_output_dir() + str(sdt_unix) + '.txt'
            f = open(rpath, 'r')
            for line in f:
                line = line.rstrip('\n')
                name = line.split(':')[0]
                mydict = line.replace(name+':', '')
                mydict = ast.literal_eval(mydict)
                if name != 'T':
                    id = mon2id[name]
                    for fea in mydict:
                        if mydict[fea] >= f2thre[fea]:
                            try:
                                unix2fea2monset[sdt_unix][fea].add(id)
                            except:
                                unix2fea2monset[sdt_unix][fea] = set([id])
            f.close()

    # store the info in a middle file. One file for one feature
    filedict = dict()
    dir = metrics_output_root + str(self.uds_list[0].granu) + '/actmon/'
    cmlib.make_dir(dir)
    for i in range(feature_num):
        filedict[i] = open(dir+str(i)+'.txt', 'w')
    for unix in unix2fea2monset:
        for fea in unix2fea2monset[unix]:
            filedict[fea].write(str(unix)+':'+str(unix2fea2monset[unix][fea])+'\n')
    for i in range(feature_num):
        filedict[i].close()

    f = open(dir+'mon2id.txt', 'w')
    f.write(str(mon2id))
    f.close()
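# Each line of the per-slot file read above is '<monitor or T>:<dict>', where
# the 'T' line carries the slot totals per feature and the other lines carry
# per-monitor feature counts, e.g. (monitor and values made up):
#   T:{0: 1234, 1: 56, 2: 7}
#   187.16.218.175:{0: 40, 1: 2}
# ast.literal_eval() turns the dict part back into a Python dict.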
def download_one_rib_before_unix(self, my_date, unix):  # my_date for deciding month
    tmp_month = my_date[0:4] + '.' + my_date[4:6]
    if self.co.startswith('rrc'):
        web_location = rrc_root + self.co + '/' + tmp_month + '/'
    else:
        web_location = rv_root + self.co + '/bgpdata/' + tmp_month + '/RIBS/'
    web_location = web_location.replace('//', '/')

    try:
        webraw = cmlib.get_weblist('http://' + web_location)
        print 'Getting list from ' + 'http://' + web_location
    except:
        return -1

    cmlib.make_dir(datadir+web_location)

    #----------------------------------------------------------------
    # select a RIB file right before the unix and with reasonable (not strange) file size
    rib_list = webraw.split('\n')
    rib_list = [item for item in rib_list if item != '' and item != '\n']
    rib_list = [item for item in rib_list if 'rib' in item or 'bview' in item]

    sizelist = list()
    for line in rib_list:
        size = line.split()[-1]
        fsize = cmlib.parse_size(size)
        sizelist.append(fsize)
    avg = np.mean(sizelist)

    ok_rib_list = list()  # RIBs whose size is OK
    for line in rib_list:
        fsize = cmlib.parse_size(line.split()[-1])
        if fsize > 0.9 * avg:
            ok_rib_list.append(line)

    target_line = None  # the RIB closest to unix
    min_gap = 9999999999
    for line in ok_rib_list:
        fdate = line.split()[0].split('.')[-3]
        ftime = line.split()[0].split('.')[-2]
        dtstr = fdate + ftime
        objdt = datetime.datetime.strptime(dtstr, '%Y%m%d%H%M')
        runix = time_lib.mktime(objdt.timetuple()) + 8*60*60  # compensate for the local (UTC+8) time zone
        print objdt, runix, unix
        if runix <= unix and unix - runix < min_gap:
            min_gap = unix - runix
            print 'min changed to ', min_gap
            target_line = line

    print 'Selected RIB:', target_line
    if target_line is None:
        return -1

    size = target_line.split()[-1]  # claimed RIB file size
    fsize = cmlib.parse_size(size)
    filename = target_line.split()[0]
    full_loc = datadir + web_location + filename  # .bz2/.gz

    if os.path.exists(full_loc + '.txt'):  # only for clearer logic
        os.remove(full_loc + '.txt')

    #------------------------------------------------------------------
    # Download the RIB
    if os.path.exists(full_loc + '.txt.gz'):
        print 'existed!!!!!!!!!!!!'
        return full_loc + '.txt.gz'  # Do not download

    if os.path.exists(full_loc):
        cmlib.parse_mrt(full_loc, full_loc + '.txt', fsize)
        cmlib.pack_gz(full_loc + '.txt')
        return full_loc + '.txt.gz'

    cmlib.force_download_file('http://' + web_location, datadir + web_location, filename)
    cmlib.parse_mrt(full_loc, full_loc + '.txt', fsize)
    cmlib.pack_gz(full_loc + '.txt')
    os.remove(full_loc)  # remove the original file
    return full_loc + '.txt.gz'
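# The +8*60*60 above exists because time.mktime() interprets the RIB file
# timestamp in the machine's local timezone (apparently UTC+8 here).  A
# timezone-independent alternative, in line with the calendar.timegm() calls
# used elsewhere in this code base, would be:
#   runix = calendar.timegm(objdt.timetuple())   # objdt is already in UTC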
def numf_metrics_fpath(self):
    dir = metrics_output_root + str(self.granu) + '/' + self.sdate + '_' + self.edate + '/'
    cmlib.make_dir(dir)
    return dir + 'num_fea_metrics.txt'
def rm_reset_one_list(self, rib_full_loc, tmp_full_listfile):
    ## record reset info into a temp file
    reset_info_file = datadir + 'peer_resets.txt'
    print self.co, ' obtaining BGP session reset start-end period...'
    subprocess.call('perl '+projectdir+'tool/bgpmct.pl -rf '+rib_full_loc+' -ul '+\
            tmp_full_listfile + ' > '+reset_info_file, shell=True)

    if os.path.exists(reset_info_file):
        if os.path.getsize(reset_info_file) == 0:
            print 'no reset at all!'
            return
    else:
        print 'no reset at all!'
        return

    peer_resettime = dict()  # peer: list of [reset start, reset end]
    resetf = open(reset_info_file, 'r')
    for line in resetf:
        if line.startswith('run') or line.startswith('/') or ('#' in line):
            continue
        if ':' in line:
            now_peer = line.rstrip(':\n')
            continue
        stime_unix, endtime_unix = int(line.split(',')[0]), int(line.split(',')[1])
        try:
            peer_resettime[now_peer].append([stime_unix, endtime_unix])
        except:
            peer_resettime[now_peer] = [[stime_unix, endtime_unix],]
    resetf.close()

    # write the reset info into a file
    # TODO deal with gap > 32 days
    cmlib.make_dir(reset_info_dir)
    f = open(self.reset_info, 'a')
    f.write(self.co+':\n')
    for p in peer_resettime:
        f.write(p+'@\n')
        for rs in peer_resettime[p]:
            f.write(str(rs)+'\n')
    f.close()

    '''
    # XXX only for once start (continue after the program stopped because of memo issue)
    # FIXME Giant bug in this code. In future, re-download the affected collectors
    this_co_peers = []
    peer_file = cmlib.peer_path_by_rib_path(rib_full_loc)
    fff = open(peer_file, 'r')
    for line in fff:
        peer = line.split('@')[0]
        this_co_peers.append(peer)
    fff.close()

    peer_resettime = dict()
    record = False
    f = open(self.reset_info, 'r')
    for line in f:
        line = line.rstrip('@\n')
        if ':' in line:
            record = False
            continue
        if line[0].isdigit():
            record = True
            p = line
            peer_resettime[p] = list()
        elif record is True:
            thelist = ast.literal_eval(line)
            peer_resettime[p].append(thelist)
        else:
            assert 1 == 0
    f.close()
    # XXX only for once end
    '''

    # different collectors in the same file
    for p in peer_resettime:
        if ':' in p:  # We do not really delete IPv6 updates
            continue
        #if p not in this_co_peers:  # XXX used with the previously commented-out code
        #    continue
        if p not in self.global_peers:  # We ignore non-global peers to save time
            continue
        for l in peer_resettime[p]:
            print 'deleting reset for ', p
            self.delete_reset_updates(p, l[0], l[1], tmp_full_listfile)
            #h = hpy()
            #print h.heap()

    os.remove(reset_info_file)  # XXX comment this out when 'doing it once'...
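# The reset file appended to above therefore contains one collector header
# line, then one '@'-terminated line per peer followed by that peer's reset
# intervals, e.g. (collector, peer and timestamps made up):
#   route-views2:
#   187.16.218.175@
#   [1369094400, 1369095300]
# This is also the layout the commented-out "only for once" reader expects.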
for line in f:
    line = line.rstrip('\n')
    pfxset.add(line)
f.close()
my_period.pfx2as_LPM(pfxset)

#mf = Micro_fighter(reaper)
#mf.analyze_slot(1369102200)

#------------------------------------------------------------
# plot matrices of every middle file
if action['plot_matrix']:
    mdir = my_period.get_middle_dir()
    plotdir = mdir + 'matrix/'
    cmlib.make_dir(plotdir)
    mfiles = [mf for mf in os.listdir(mdir) if os.path.isfile(mdir+mf)]
    for mf in mfiles:
        print 'Plotting matrix:', mdir+mf
        plot_matrix(mdir+mf, plotdir+mf.split('.')[0]+'.pdf')  # TODO specify a range?

reaperlist.append(reaper)

#------------------------------------------------------------------
# combined analysis of all reapers
if action['MR']:
def pfx_metrics_CDF_met2total(self):
    uds = self.uds_list[0]  # XXX note: in the ISCC paper we only analyze one period!
    metrics = ['UQ', 'PMR', 'GC', 'CR1', 'CR4', 'CR8', 'CR0.1', 'CR0.2', 'CR0.3']
    met2list = dict()
    for met in metrics:
        met2list[met] = list()

    fname = uds.apfx_metrics_fpath()
    f = open(fname, 'r')
    for line in f:
        line = line.rstrip('\n')
        attr = line.split('|')
        # get the metrics. Hard-coding is bad. But we save time here.
        met2list['UQ'].append(int(attr[1]))
        met2list['PMR'].append(float(attr[2]))
        met2list['GC'].append(float(attr[3]))
        met2list['CR1'].append(float(attr[4]))
        met2list['CR4'].append(float(attr[5]))
        met2list['CR8'].append(float(attr[6]))
        met2list['CR0.1'].append(float(attr[7]))
        met2list['CR0.2'].append(float(attr[8]))
        met2list['CR0.3'].append(float(attr[9]))
    f.close()

    fig = plt.figure(figsize=(20, 13))
    ax = fig.add_subplot(111)
    count = 0
    for mtype in met2list:
        if mtype == 'UQ':
            continue
        v2count = dict()
        for v in met2list[mtype]:
            try:
                v2count[v] += 1
            except:
                v2count[v] = 1
        mycdf = cmlib.value_count2cdf(v2count)
        xlist = list()
        ylist = list()
        for key in sorted(mycdf):
            xlist.append(key)
            ylist.append(mycdf[key])
        ax.plot(xlist, ylist,
                color=colors[count], marker=styles[count], markersize=25,
                markevery=(len(xlist)/2, len(xlist)), label=mtype, lw=6, alpha=0.8)
        #ax.plot(xlist, ylist, color=colors[count], label=mtype, lw=6, alpha=0.8)
        count += 1

    ax.set_ylabel('Quantity of time slot')
    ax.set_xlabel(' Metric value')
    legend = ax.legend(loc='lower right', shadow=False)
    ax.tick_params(axis='y', pad=10)
    ax.tick_params(axis='x', pad=10)
    plt.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))

    cmlib.make_dir(env.metric_plot_dir)
    output_loc = env.metric_plot_dir + 'pfx_metrics_CDF.pdf'
    plt.savefig(output_loc, bbox_inches='tight')
    plt.clf()  # clear the figure
    plt.close()
    '''
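# The parsing above implies that each line of active_pfx_metrics.txt is a
# '|'-separated record laid out as
#   <slot>|UQ|PMR|GC|CR1|CR4|CR8|CR0.1|CR0.2|CR0.3
# (the first field is skipped here and is presumably the slot's unix time,
# matching how num_fea_metrics.txt is keyed).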
def num_features_metrics_CDF(self):
    met2unix2fea2v = dict()  # Note: different to fea2unix2met2v

    # initialize the huge dict
    print 'Initializing ...'
    tmppath = self.uds_list[0].numf_metrics_fpath()
    f = open(tmppath, 'r')
    count = 0
    for line in f:
        mtype = line.split('|')[1]
        met2unix2fea2v[mtype] = dict()
        count += 1
        if count == 15:  # XXX Note: we assume at most 15 metrics
            break
    f.close()

    unix_list = list()
    for uds in self.uds_list:
        for dtobj in uds.dtobj_list:
            unix = calendar.timegm(dtobj[0].utctimetuple())
            unix_list.append(unix)
    for m in met2unix2fea2v.keys():
        for unix in unix_list:
            met2unix2fea2v[m][unix] = dict()

    # read output file and store information
    for uds in self.uds_list:
        mf_path = uds.numf_metrics_fpath()
        print 'Reading ', mf_path
        f = open(mf_path, 'r')
        for line in f:
            line = line.rstrip('\n')
            splitted = line.split('|')
            unix = int(splitted[0])
            mtype = splitted[1]
            thedict = ast.literal_eval(splitted[2])
            for fea in thedict:
                value = thedict[fea]
                met2unix2fea2v[mtype][unix][fea] = value
        f.close()

    # Plot M figures for M metrics. In each figure, N curves for N features.
    for mtype in met2unix2fea2v:
        print 'Plotting metric ', mtype
        fea2vlist = dict()
        for unix in met2unix2fea2v[mtype]:
            for fea in met2unix2fea2v[mtype][unix]:
                value = met2unix2fea2v[mtype][unix][fea]
                try:
                    fea2vlist[fea].append(value)
                except:
                    fea2vlist[fea] = [value]

        fea2xlist = dict()
        fea2ylist = dict()
        for fea in fea2vlist:
            not_applicable = 0
            v2count = dict()
            for v in fea2vlist[fea]:
                if v == -1:
                    not_applicable += 1
                    continue
                try:
                    v2count[v] += 1
                except:
                    v2count[v] = 1
            mycdf = cmlib.value_count2cdf(v2count)
            for key in sorted(mycdf):
                try:
                    fea2xlist[fea].append(key)
                    fea2ylist[fea].append(mycdf[key])
                except:
                    fea2xlist[fea] = [key]
                    fea2ylist[fea] = [mycdf[key]]
            if len(mycdf.keys()) == 2:  # highest disparity
                fea2xlist[fea].append(mycdf.keys()[-1])
                tmp = fea2ylist[fea][-1]
                fea2ylist[fea][-1] = 0
                fea2ylist[fea].append(tmp)
            if not_applicable > 0:
                print 'fea:', fea, '. mtype:', mtype, '. not_applicable:', not_applicable

        # for showing statistics in paper
        if mtype == 'DV':
            #print 'WW:', fea2xlist[3][-1]
            #print 'AADup1', fea2xlist[4][-1]
            print 'WADup:', fea2xlist[8][1]
            print 'AADup2', fea2xlist[5][1]

        # Start plotting now!
        if mtype == 'TOTAL':
            fig = plt.figure(figsize=(20, 13))
        else:
            fig = plt.figure(figsize=(17, 13))
        ax = fig.add_subplot(111)
        count = -1
        for fea in fea2xlist:
            if fea == 7:
                continue
            count += 1
            #ax.plot(fea2xlist[fea], fea2ylist[fea], linestyles[count%4],
            #        color=colors[count], label=feature_num2name[fea], lw=9, alpha=0.8)
            ax.plot(fea2xlist[fea], fea2ylist[fea],
                    color=colors[count], marker=styles[count], markersize=25,
                    markevery=(len(fea2xlist[fea])/2, len(fea2xlist[fea])),
                    label=feature_num2name[fea], lw=6, alpha=0.8)

        ax.set_ylabel('Quantity of time slot')
        if mtype == 'TOTAL':
            ax.set_xlabel('Quantity of feature')
            ax.set_xscale('log')
            ax.set_ylim([-500, 13500])
            ax.tick_params(axis='x', pad=10)
        else:
            ax.set_xlabel(' Metric value')
            ax.set_xlim([-0.1, 1.1])
            ax.set_ylim([-500, 13500])
        if mtype == 'GINI' or mtype == 'TOTAL':
            legend = ax.legend(loc='best', shadow=False)
        ax.tick_params(axis='y', pad=10)
        ax.tick_params(axis='x', pad=10)
        plt.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))

        cmlib.make_dir(env.metric_plot_dir)
        output_loc = env.metric_plot_dir + 'CDF_' + str(mtype) + '.pdf'
        plt.savefig(output_loc, bbox_inches='tight')
        plt.clf()  # clear the figure
        plt.close()
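# cmlib.value_count2cdf() is used here and in pfx_metrics_CDF_met2total to
# turn a value->count histogram into a cumulative count keyed by value (hence
# the 'Quantity of time slot' y-axis).  A minimal sketch of that behaviour
# (the real cmlib routine may differ):
def value_count2cdf_sketch(v2count):
    cdf = dict()
    total = 0
    for v in sorted(v2count):
        total += v2count[v]
        cdf[v] = total
    return cdf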
def download_one_rib(self, my_date):
    tmp_month = my_date[0:4] + '.' + my_date[4:6]
    if self.co.startswith('rrc'):
        web_location = rrc_root + self.co + '/' + tmp_month + '/'
    else:
        web_location = rv_root + self.co + '/bgpdata/' + tmp_month + '/RIBS/'
    web_location = web_location.replace('//', '/')

    webraw = cmlib.get_weblist('http://' + web_location)
    cmlib.make_dir(datadir+web_location)

    #----------------------------------------------------------------
    # select a RIB file with reasonable (not strange) file size
    rib_list = webraw.split('\n')
    rib_list = [item for item in rib_list if item != '' and item != '\n']
    rib_list = [item for item in rib_list if 'rib' in item or 'bview' in item]

    sizelist = list()
    for line in rib_list:
        size = line.split()[-1]
        fsize = cmlib.parse_size(size)
        sizelist.append(fsize)
    avg = np.mean(sizelist)

    target_line = None  # stores the RIB file for downloading
    largest_line = None
    max_size = -1
    closest = 99999
    for line in rib_list:
        fdate = line.split()[0].split('.')[-3]
        size = line.split()[-1]
        fsize = cmlib.parse_size(size)
        if fsize > max_size:
            max_size = fsize
            largest_line = line
        diff = abs(int(fdate) - int(my_date))  # >0
        # XXX logic here not clear (but seems effective)
        if diff <= closest and fsize > 0.9 * avg and fsize < 1.1 * avg:
            target_line = line
            closest = diff

    if target_line is None:
        assert largest_line is not None
        print 'Failed. Resort to downloading the largest RIB...'
        target_line = largest_line  # work-around for a special case
    print 'Selected RIB:', target_line

    size = target_line.split()[-1]  # claimed RIB file size
    fsize = cmlib.parse_size(size)
    filename = target_line.split()[0]
    full_loc = datadir + web_location + filename  # .bz2/.gz

    if os.path.exists(full_loc+'.txt'):  # only for clearer logic
        os.remove(full_loc+'.txt')

    #------------------------------------------------------------------
    # Download the RIB
    if os.path.exists(full_loc+'.txt.gz'):
        print 'existed size & original size:', os.path.getsize(full_loc+'.txt.gz'), fsize
        if os.path.getsize(full_loc+'.txt.gz') > 0.6 * fsize:  # 0.6 is good enough
            return full_loc+'.txt.gz'  # Do not download
        else:
            os.remove(full_loc+'.txt.gz')  # too small to be complete

    if os.path.exists(full_loc):
        if os.path.getsize(full_loc) <= 0.95 * fsize:
            os.remove(full_loc)
        else:  # Good!
            cmlib.parse_mrt(full_loc, full_loc+'.txt', fsize)
            cmlib.pack_gz(full_loc+'.txt')
            return full_loc+'.txt.gz'

    cmlib.force_download_file('http://'+web_location, datadir+web_location, filename)
    cmlib.parse_mrt(full_loc, full_loc+'.txt', fsize)
    cmlib.pack_gz(full_loc+'.txt')
    os.remove(full_loc)  # remove the original file
    return full_loc+'.txt.gz'
def numf_distr_output_dir(self):
    dir = metrics_output_root + str(self.granu) + '/' + self.sdate + '_' + self.edate + '/'
    cmlib.make_dir(dir)
    return dir
def __init__(self, sdate):
    self.sdate = sdate
    cmlib.make_dir(datadir+'support/')
def __init__(self, period, granu):
    self.filelist = period.get_filelist()
    self.sdate = period.sdate
    self.edate = period.edate
    self.granu = granu
    self.cl_list = cl_list  # XXX
    self.max_dt = -1

    #-------------------------------------------------------------
    # dir for final and middle output files
    self.output_dir = datadir + 'output/' + self.sdate + '_' + self.edate + '/'
    cmlib.make_dir(self.output_dir)

    self.monitors = []
    tmp_co_mo = period.co_mo
    for co in tmp_co_mo.keys():
        self.monitors.extend(tmp_co_mo[co])
    self.mcount = len(self.monitors)

    self.mo2index = {}  # map monitor ip to an index
    index = 0
    for mo in self.monitors:
        self.mo2index[mo] = index
        index += 1

    self.no_prefixes = period.no_prefixes  # a trie  # FIXME not here! in period class

    '''
    self.m_as_m = dict()  # AS number: monitor count
    self.m_nation_as = dict()  # nation: AS (of monitors) count
    for m in self.monitors.keys():
        asn = self.monitors[m]
        try:
            self.m_as_m[asn] += 1
        except:
            self.m_as_m[asn] = 1
    for asn in self.m_as_m.keys():
        nation = self.as_to_nation(asn)
        if nation == -1:
            continue
        try:
            self.m_nation_as[nation] += 1
        except:
            self.m_nation_as[nation] = 1

    self.m_ascount = len(self.m_as_m.keys())
    print 'monitor AS count:', self.m_ascount
    self.m_nationcount = len(self.m_nation_as.keys())
    print 'monitor nation count:', self.m_nationcount
    print 'monitor nations:', self.m_nation_as.keys()
    '''

    #-----------------------------------------------------
    # For synchronization among collectors and conducting timely aggregation
    # Note: assume all collectors will exist after self.sdate + 1 hour
    self.cl_dt = {}  # The current datetime of every collector, for getting ceiling
    for cl in self.cl_list:
        self.cl_dt[cl] = 0

    tmp_dt = datetime.datetime(int(self.sdate[0:4]),\
            int(self.sdate[4:6]), int(self.sdate[6:8]), 0, 0)
    # do not fill up the hour to allow for the edge value being analyzed
    tmp_dt = tmp_dt + datetime.timedelta(minutes=58)
    tmp_dt = time_lib.mktime(tmp_dt.timetuple())
    # floor is only for ignoring anything before self.sdate + 1 hour
    self.floor = tmp_dt  # we output everything below ceiling and above floor
    self.ceiling = self.floor

    tmp_dt = datetime.datetime(int(self.edate[0:4]),\
            int(self.edate[4:6]), int(self.edate[6:8]), 23, 59)
    tmp_dt = tmp_dt + datetime.timedelta(minutes=-58)
    tmp_dt = time_lib.mktime(tmp_dt.timetuple())  # Change into seconds int
    self.top_ceiling = tmp_dt  # self.ceiling cannot exceed this value

    #------------------------------------------------------
    # Basic values assignment
    self.pfx_trie = dict()  # every dt has a corresponding trie, deleted periodically
    #self.dt_list = list()  # the list of datetime
    #self.peerlist = dict()  # dt: monitor list XXX no need any more
    #self.ucount = dict()  # dt: update count
    #self.acount = dict()  # dt: announcement count
    #self.wcount = dict()  # dt: withdrawal count
    #self.wpctg = dict()  # dt: withdrawal percentage XXX get this only when analyzing output

    # FIXME put the download of support files in period class
    # Take special care when the duration is long
    spt = Supporter(self.sdate)
    self.pfx2as = spt.get_pfx2as_trie()  # all prefixes mapping to AS
    self.as2nation = spt.get_as2nation_dict()  # all ASes mapping to nation (latest info)
    self.all_ascount = cmlib.get_all_ascount(self.sdate)  # Get total AS quantity
    self.all_pcount = cmlib.get_all_pcount(self.sdate)  # Get total prefix quantity
    self.all_pcount_lzero = 0  # quantity of prefixes having DV > 0
    self.as2cc = spt.get_as2cc_dict()  # all ASes mapped to sizes of customer cones  # XXX no need any more

    self.as2rank = dict()  # All ASes mapped to rank (according to customer cone size)
    pre_value = 999999
    rank = 0  # number (of ASes whose CC is larger) + 1
    buffer = 0  # number (of ASes having the same CC size) - 1
    for item in sorted(self.as2cc.iteritems(), key=operator.itemgetter(1), reverse=True):
        if item[1] < pre_value:
            rank = rank + buffer + 1
            pre_value = item[1]
            self.as2rank[item[0]] = rank
            buffer = 0
        else:  # item[1] (cc size) == pre_value
            buffer += 1
            self.as2rank[item[0]] = rank

    #---------------------------------------------------------------------
    # FIXME
    # For each dt create a middle file that stores prefix|update quantity|DV value|(DV list)
    # Analyze these middle files in the end
    self.dv_level = [0, 0.05, 0.1, 0.15, 0.2]  # Coarser DV values
    self.dvrange_dt_pfx = dict()  # DV level range: dt: pfx count
    self.dvrange_len_pfx = dict()  # DV level range: prefix length: existence
    self.dv_dt_asn_pfx = dict()  # DV levels: dt: AS: prefix count
    self.pfxcount = dict()  # dv: dt: prefix (in updates) count
    self.pfxcount_range = dict()  # dv range: dt: prefix (in updates) count
    self.dv_dt_hdvp = dict()  # DV levels: dt: hdvp count
    for dl in self.dv_level:
        self.dvrange_dt_pfx[dl] = dict()
        self.dvrange_len_pfx[dl] = dict()
        self.dv_dt_asn_pfx[dl] = dict()
        self.pfxcount[dl] = dict()
        self.pfxcount_range[dl] = dict()
        self.dv_dt_hdvp[dl] = dict()  # only record DV > 0.15

    # XXX delete
    self.dup_trie = patricia.trie(None)  # TODO Enough memory for this?

    # DV distribution in every time slot
    self.dv_distribution = dict()  # dt: DV: count
    self.dv_cdf = dict()  # dt: DV: cumulative count

    #----------------------------------------------------------------------
    # CDFs for the slot before and after the cdfbound (HDVP peak)
    # FIXME do this when analyzing the middle files
    # TODO obtain cdfbound
    self.compare = False
    if cdfbound != None:
        self.compare = True
        self.cdfbfr = dict()
        self.cdfaft = dict()
        self.as_bfr = dict()
        self.as_aft = dict()
        self.cdfbound = datetime.datetime.strptime(cdfbound, '%Y-%m-%d %H:%M:%S')
        self.bfr_start = time_lib.mktime((self.cdfbound +\
                datetime.timedelta(minutes=-(self.granu*2))).timetuple())
        self.cdfbound = time_lib.mktime(self.cdfbound.timetuple())
        for dl in self.dv_level:
            self.as_bfr[dl] = dict()  # dv: ASN: count
            self.as_aft[dl] = dict()
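# The as2rank loop in the __init__ above implements competition ranking on
# customer-cone size: ASes with equal cone sizes share a rank, and the next
# smaller cone size gets rank (number of ASes with a larger cone) + 1.  For
# example, cone sizes {A: 50, B: 50, C: 10} yield ranks {A: 1, B: 1, C: 3}.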
def __init__(self, period, granu):
    # not using dict, to allow for multiple existence
    self.blank_co = list()  # list of lists: [collector, start unix dt, end unix dt]
    self.ignore_co = list()  # we are now ignoring some co which are in a blank period

    self.filelist = period.get_filelist()
    self.sdate = period.sdate
    self.edate = period.edate
    self.granu = granu
    self.middle_dir = period.get_middle_dir()
    cmlib.make_dir(self.middle_dir)
    self.blank_dir = period.get_blank_dir()
    cmlib.make_dir(self.blank_dir)
    self.period = period
    self.all_co_list = period.co_mo.keys()  # collector list

    self.monitors = []
    for co in period.co_mo.keys():
        self.monitors.extend(period.co_mo[co])
    self.mcount = len(self.monitors)

    # Sort the monitor list first so that this mapping is consistent across multiple runs
    tmp_list = sorted(self.monitors, key=cmlib.ip_to_integer)
    self.mo2index = {}  # map monitor ip to an index
    index = 0
    for mo in tmp_list:
        self.mo2index[mo] = index
        index += 1

    # write this mapping to a file for future microscopic analysis
    self.mo2index_file = self.period.get_mon2index_file_path()
    f = open(self.mo2index_file, 'w')
    for mo in self.mo2index:
        f.write(mo + ':' + str(self.mo2index[mo]) + '\n')
    f.close()

    ###self.pfx_radix = dict()  # every dt has a corresponding trie, deleted periodically
    self.pfx_tree = radix.Radix()  # XXX test
    self.dt_list = dict()  # unix dt => True  # XXX test

    #-----------------------------------------------------
    # For synchronization among collectors and conducting timely aggregation
    # Note: assume all collectors will exist after self.sdate + 1 hour
    # XXX commented out when dealing with 2013 whole year data
    self.co_unix_dt = {}  # The current datetime of every collector, for getting ceiling
    for cl in self.all_co_list:
        self.co_unix_dt[cl] = 0

    tmp_dt = datetime.datetime(int(self.sdate[0:4]),\
            int(self.sdate[4:6]), int(self.sdate[6:8]), 0, 0)  # is UTC
    # do not fill up the hour to allow for the edge value being analyzed
    #XXX tmp_dt = tmp_dt + datetime.timedelta(minutes=58)  # is UTC
    tmp_dt = calendar.timegm(tmp_dt.timetuple())  # is UTC
    # floor is only for ignoring anything before self.sdate + 1 hour
    self.floor = tmp_dt  # we output everything below ceiling and above floor
    #self.ceiling = self.floor
    self.ceiling = tmp_dt

    tmp_dt = datetime.datetime(int(self.edate[0:4]),\
            int(self.edate[4:6]), int(self.edate[6:8]), 23, 59, 59)
    #XXX tmp_dt = tmp_dt + datetime.timedelta(minutes=-58)
    tmp_dt = calendar.timegm(tmp_dt.timetuple())
    self.top_ceiling = tmp_dt  # self.ceiling cannot exceed this value
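# Monitors are sorted with cmlib.ip_to_integer above so that the ip->index
# mapping stays stable across runs.  A minimal sketch for IPv4 monitor
# addresses (the real helper presumably also handles the IPv6 peers that
# appear elsewhere in this code):
def ip_to_integer_sketch(ip):
    a, b, c, d = [int(x) for x in ip.split('.')]
    return (a << 24) | (b << 16) | (c << 8) | d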
def get_file():
    for clctr in collectors:
        cl_name = clctr
        hdname_detail = hdname + 'archive.routeviews.org/' + cl_name +\
                '/bgpdata/'
        hdname_detail = hdname_detail.replace('//', '/')  # happens when cl = ''

        # only for downloading updates, not RIBs
        for ym in yearmonth:
            sdate = ym.split('.')[0] + ym.split('.')[1] + '01'
            edate = ym.split('.')[0] + ym.split('.')[1] + '07'

            filelocation = ''
            filelocation = 'archive.routeviews.org/' + cl_name + '/bgpdata/' + ym + '/UPDATES/'
            filelocation = filelocation.replace('//', '/')  # when name is ''
            webraw = cmlib.get_weblist('http://' + filelocation)
            print filelocation

            cmlib.make_dir(hdname+'metadata/'+ym)
            flist = open(hdname+'metadata/'+ym+'/updt_filelist_'+cl_name, 'w')
            cmlib.make_dir(hdname+filelocation)

            for line in webraw.split('\n'):
                if not 'updates' in line or line == '' or line == '\n':
                    continue

                size = line.split()[-1]
                if size.isdigit():
                    fsize = float(size)
                else:
                    fsize = float(size[:-1]) * cmlib.size_u2v(size[-1])

                filename = line.split()[0]  # omit uninteresting info
                filedate = filename.split('.')[-3]

                # check whether its datetime is in our range
                if int(filedate) < int(sdate) or int(filedate) > int(edate):
                    continue
                print filename

                origin_floc = hdname + filelocation + filename  # original file loc&name
                flist.write(origin_floc+'.txt.gz\n')  # .xx.txt.gz file list

                # remove existing xx.txt file to make things clearer
                try:
                    os.remove(origin_floc+'.txt')
                except:
                    pass

                if os.path.exists(origin_floc+'.txt.gz'):
                    if os.path.getsize(origin_floc+'.txt.gz') > 0.1 * fsize:
                        if os.path.exists(origin_floc):  # .bz2/.gz useless anymore
                            os.remove(origin_floc)
                        continue
                    else:
                        os.remove(origin_floc+'.txt.gz')

                if os.path.exists(origin_floc):
                    if os.path.getsize(origin_floc) > 0.9 * fsize:
                        continue
                    else:
                        os.remove(origin_floc)

                cmlib.force_download_file('http://'+filelocation, hdname+filelocation, filename)

            # file that stores the update list
            flist.close()

            filelocation = 'archive.routeviews.org/' + cl_name + '/bgpdata/' + ym + '/RIBS/'
            filelocation = filelocation.replace('//', '/')  # when name is ''
            webraw = cmlib.get_weblist('http://' + filelocation)
            print filelocation
            cmlib.make_dir(hdname+filelocation)

            # for each event, we only download one RIB (on the sdate)
            rib_fname = ''
            for line in webraw.split('\n'):
                if not 'rib' in line and not 'bview' in line:
                    continue
                if line == '' or line == '\n':
                    continue

                size = line.split()[-1]
                if size.isdigit():
                    fsize = float(size)
                else:
                    fsize = float(size[:-1]) * cmlib.size_u2v(size[-1])

                filename = line.split()[0]
                print filename
                if not int(filename.split('.')[-3]) == int(sdate):
                    continue
                print filename

                origin_floc = hdname + filelocation + filename  # original file loc&name
                try:
                    os.remove(origin_floc+'.txt')
                except:
                    pass

                rib_fname = filelocation + filename

                if os.path.exists(origin_floc+'.txt.gz'):
                    if os.path.getsize(origin_floc+'.txt.gz') > 0.1 * fsize:
                        if os.path.exists(origin_floc):  # .bz2/.gz useless anymore
                            os.remove(origin_floc)
                        break
                    else:
                        os.remove(origin_floc+'.txt.gz')

                if os.path.exists(origin_floc):
                    if os.path.getsize(origin_floc) > 0.9 * fsize:
                        break
                    else:
                        os.remove(origin_floc)

                cmlib.force_download_file('http://'+filelocation, hdname+filelocation, filename)
                break

            # download one RIB of the previous day to initialize as_path
            sdate_datetime = datetime.datetime(int(sdate[0:4]), int(sdate[4:6]), int(sdate[6:8]))
            as_path_date = sdate_datetime - datetime.timedelta(days=1)
            as_path_date = as_path_date.strftime('%Y%m%d')
            as_path_ym = as_path_date[0:4] + '.' + as_path_date[4:6]

            filelocation = 'archive.routeviews.org/' + cl_name + '/bgpdata/' + as_path_ym + '/RIBS/'
            filelocation = filelocation.replace('//', '/')  # when name is ''
            webraw = cmlib.get_weblist('http://' + filelocation)
            print filelocation
            cmlib.make_dir(hdname+filelocation)

            asrib_fname = ''
            for line in reversed(webraw.split('\n')):
                print line
                if not 'rib' in line and not 'bview' in line:
                    continue
                if line == '' or line == '\n':
                    continue

                size = line.split()[-1]
                if size.isdigit():
                    fsize = float(size)
                else:
                    fsize = float(size[:-1]) * cmlib.size_u2v(size[-1])

                filename = line.split()[0]
                print filename
                if not int(filename.split('.')[-3]) == int(as_path_date):
                    continue
                print filename

                origin_floc = hdname + filelocation + filename  # original file loc&name
                try:
                    os.remove(origin_floc+'.txt')
                except:
                    pass

                asrib_fname = filelocation + filename

                if os.path.exists(origin_floc+'.txt.gz'):
                    if os.path.getsize(origin_floc+'.txt.gz') > 0.1 * fsize:
                        if os.path.exists(origin_floc):  # .bz2/.gz useless anymore
                            os.remove(origin_floc)
                        break
                    else:
                        os.remove(origin_floc+'.txt.gz')

                if os.path.exists(origin_floc):
                    if os.path.getsize(origin_floc) > 0.9 * fsize:
                        break
                    else:
                        os.remove(origin_floc)

                cmlib.force_download_file('http://'+filelocation, hdname+filelocation, filename)
                break

            ## now for update and RIB files, their formats are either .bz2/.gz or
            ## .xx.txt.gz!!!
            print 'parsing updates...'
            parse_updates(ym, cl_name)

            print 'parsing RIB and getting peers...'
            rib_location = hdname + rib_fname  # .bz2/.gz
            #print rib_location, 'dd'
            peers = get_peers(clctr, ym, rib_location)
            print 'peers: ', peers

            as_path_rib_location = hdname + asrib_fname  # .bz2/.gz
            process_as_path_rib(clctr, as_path_ym, as_path_rib_location)

            print 'determining table transfers start and end time for each peer...'
            for peer in peers:  # must process each peer one by one
                peer = peer.rstrip()
                print 'processing ', peer, '...'
                subprocess.call('perl '+homedir+'tool/bgpmct.pl -rf '+rib_location+'.txt.gz'+' -ul '+\
                        hdname+'metadata/'+ym+'/updt_filelist_'+cl_name+' -p '+peer+' > '+\
                        hdname+'tmp/'+peer+'_result.txt', shell=True)

            print 'delete updates caused by session reset for each peer...'
            for peer in peers:
                # No reset from this peer, so nothing in the file
                try:
                    if os.path.getsize(hdname+'tmp/'+peer+'_result.txt') == 0:
                        continue
                except:  # cannot find file
                    continue
                print '\nculprit now: ', peer
                del_tabletran_updates(peer, ym, cl_name)

            # delete all rubbish in the end
            subprocess.call('rm '+hdname+'tmp/*', shell=True)

    return