def _extract_owd_pktloss(test_id='', out_dir='', replot_only='0', source_filter='',
                         ts_correct='1', burst_sep='0.0', sburst='1', eburst='0',
                         seek_window='16000', log_loss='0', anchor_map='',
                         owd_midpoint='0'):
    "Extract OWD or PKTLOSS of flows"

    ifile_ext = '.dmp.gz'

    if log_loss == '0':
        ofile_ext = '.owds2.gz'
    else:
        ofile_ext = '.loss2.gz'

    already_done = {}
    out_files = {}
    out_groups = {}

    test_id_arr = test_id.split(';')
    if len(test_id_arr) == 0 or test_id_arr[0] == '':
        abort('Must specify test_id parameter')

    if ts_correct == '0' and log_loss == '0':
        abort('Must use ts_correct=1 when calculating OWD')

    # Initialise source filter data structure
    sfil = SourceFilter(source_filter)

    # EXPERIMENTAL: anchor_map="<srcip1>:<dstip1>;<srcip2>:<dstip2>;..."
    # Normally a packet's OWD is logged as having occurred at the time the
    # packet is seen in the pcap file associated with <srcip> (src_extname).
    # An 'anchor_map' entry allows you to specify that a packet's OWD be
    # logged as having occurred at the time the packet from <srcip> was
    # seen in the pcap file associated with <dstip> (dst_extname).
    # This only operates on packets between <srcip> and <dstip>; other
    # flows in the same test ID are unaffected.
    anchor_map_list = {}
    if anchor_map != '':
        if replot_only == '1':
            abort("Must specify replot_only=0 in conjunction with anchor_map")
        entries = anchor_map.split(';')
        for entry in entries:
            k, v = entry.split(':')
            anchor_map_list[k] = v

    group = 1

    for test_id in test_id_arr:

        # first process tcpdump files (ignore router and ctl interface tcpdumps)
        tcpdump_files = get_testid_file_list('', test_id, ifile_ext,
                                             'grep -v "router.dmp.gz" | grep -v "ctl.dmp.gz"')

        for tcpdump_file in tcpdump_files:
            # get input directory name and create result directory if necessary
            out_dirname = get_out_dir(tcpdump_file, out_dir)
            dir_name = os.path.dirname(tcpdump_file)

            # get unique flows
            flows = lookup_flow_cache(tcpdump_file)
            if flows is None:
                # If not previously found in flow_cache,
                # extract and identify the tcp and udp flows contained in tcpdump_file
                flows = _list(local('zcat %s | tcpdump -nr - "tcp" | '
                                    'awk \'{ if ( $2 == "IP" ) { print $3 " " $5 " tcp" } }\' | '
                                    'sed "s/://" | '
                                    'sed "s/\.\([0-9]*\) /,\\1 /g" | sed "s/ /,/g" | '
                                    'LC_ALL=C sort -u' %
                                    tcpdump_file, capture=True))
                flows += _list(local('zcat %s | tcpdump -nr - "udp" | '
                                     'awk \'{ if ( $2 == "IP" ) { print $3 " " $5 " udp" } }\' | '
                                     'sed "s/://" | '
                                     'sed "s/\.\([0-9]*\) /,\\1 /g" | sed "s/ /,/g" | '
                                     'LC_ALL=C sort -u' %
                                     tcpdump_file, capture=True))

                # Add them to the flow cache
                append_flow_cache(tcpdump_file, flows)

            # Walk through and process the flows identified in this current tcpdump_file
            for flow in flows:

                # First extract src & dst as IP addr on experiment networks
                src, src_port, dst, dst_port, proto = flow.split(',')

                # Map src & dst (IP addr) to testbed-specific external (control network)
                # host names from config.py ({src,dst}_extname) and internal
                # (experiment network) hostnames/addresses from config.py ({src,dst}_internal)
                src_extname, src_internal = get_address_pair_analysis(test_id, src, do_abort='0')
                dst_extname, dst_internal = get_address_pair_analysis(test_id, dst, do_abort='0')

                # Skip cases of random (broadcast) traffic involving an IP address for
                # which no experimental network NIC (and hence control network hostname)
                # is directly related
                if src_extname == '' or dst_extname == '':
                    continue

                # flow name
                name = src_internal + '_' + src_port + '_' + dst_internal + '_' + dst_port

                # test id plus flow name
                if len(test_id_arr) > 1:
                    long_name = test_id + '_' + name
                else:
                    long_name = name

                if long_name not in already_done:

                    # Construct filenames for files containing final <time> <owd|loss> pairs
                    out_final = out_dirname + test_id + '_' + name + ofile_ext

                    # Only embark on actual filtering/extraction if we're asked to regenerate
                    # the intermediate OWD values, or for some reason the intermediate OWD
                    # file is missing...
                    if replot_only == '0' or not os.path.isfile(out_final):

                        # Per flow/host:
                        # Create intermediate file of timestamps + uniqString from pcap files,
                        # THEN call adjust_timestamps to construct a version with timestamps
                        # adjusted relative to a single reference host in the testbed,
                        # THEN use the adjusted timestamps in the subsequent owd/loss calculations.

                        # To extract packets in the FORWARD direction from both src and dst pcap
                        # files, construct a dpkt flow filter in the form
                        # <src_ip>:<src_port>:<dst_ip>:<dst_port>
                        # (and more specifically, from src_internal:src_port to dst_internal:dst_port).
                        filter_dpkt = src_internal + ':' + src_port + ':' + dst_internal + ':' + dst_port
                        src_port_int = int(src_port)
                        dst_port_int = int(dst_port)

                        # Loop across the src and dst dmp files
                        tmp_fwd_out_adj = {}
                        for dmpfile_host, dirsuffix in ([src_extname, "src"], [dst_extname, "dst"]):

                            # Construct the file name of the dump file that contains this
                            # flow's packets: 'src' captured at (near) the flow's source, and
                            # 'dst' captured at (near) the flow's destination.
                            dmp_file = dir_name + '/' + test_id + '_' + dmpfile_host + ifile_ext
                            print "Extracting packets for " + name + " from: " + dmp_file

                            # Construct filename for intermediate "<time> <uniqueString>" output
                            # files whose timestamps will be adjusted by adjust_timestamps()
                            # before being used for owd calculations
                            # (NOTE: Due to adjust_timestamps making assumptions about the out_dir
                            # parameter, we currently can't place these tmp files under /tmp)
                            tmp_fwd_out = tempfile.mktemp(
                                suffix=test_id + '_' + name + '_fwd_out_' + dirsuffix + ".gz",
                                dir=out_dirname)

                            # Extract packet id info
                            if dmp_file.endswith('.gz'):
                                f_dmp_file = gzip.open(dmp_file)
                            else:
                                f_dmp_file = open(dmp_file)
                            pcap_reader = dpkt.pcap.Reader(f_dmp_file)
                            pcap_reader.setfilter(filter_dpkt)
                            #pcap_reader.setfilter('')

                            # Create a compressed temporary intermediate file
                            f_tmp_fwd_out = gzip.open(tmp_fwd_out, 'wb', 1)

                            # Walk across every packet in this pcap file
                            for ts, pkt in pcap_reader:
                                # get pointer to ethernet layer and check that we have IP
                                eth = dpkt.ethernet.Ethernet(pkt)
                                if eth.type != dpkt.ethernet.ETH_TYPE_IP:
                                    continue

                                # get pointer to IP layer
                                ip_pkt = eth.data

                                # ignore if src or dst IP not the ones specified in filter
                                if socket.inet_ntoa(ip_pkt.src) != src_internal or \
                                        socket.inet_ntoa(ip_pkt.dst) != dst_internal:
                                    continue

                                # ignore if UDP/TCP src or dst ports not the ones specified
                                # in filter; get pointer to payload
                                if type(ip_pkt.data) == dpkt.udp.UDP:
                                    udp_frame = ip_pkt.data
                                    if udp_frame.sport != src_port_int or udp_frame.dport != dst_port_int:
                                        continue
                                    # Add IP ID field to the payload to ensure
                                    # at least something semi-unique is hashed
                                    # if UDP payload is invariant
                                    payload = str(ip_pkt.id) + udp_frame.data
                                elif type(ip_pkt.data) == dpkt.tcp.TCP:
                                    tcp_frame = ip_pkt.data
                                    if tcp_frame.sport != src_port_int or tcp_frame.dport != dst_port_int:
                                        continue
                                    # Use IP ID field, TCP sequence number and ACK number to
                                    # construct a mostly unique string within the context of this flow
                                    payload = str(ip_pkt.id) + str(tcp_frame.seq) + str(tcp_frame.ack)
                                else:
                                    continue

                                # Write <timestamp> <crc32 hash of uniqueString bytes>
                                # (hashing here eliminates any later problems parsing payloads
                                # containing null bytes)
                                f_tmp_fwd_out.write("%f %s\n" % (ts, zlib.crc32(payload)))

                            f_tmp_fwd_out.close()
                            f_dmp_file.close()

                            # Apply timestamp corrections to the data thus extracted, prior to
                            # calculating OWDs. Correction is MANDATORY, otherwise the
                            # 'calculated' OWDs are essentially useless.
                            tmp_fwd_out_adj[dirsuffix] = adjust_timestamps(test_id, tmp_fwd_out,
                                                                           dmpfile_host, ' ', out_dir)

                            # Remove pre-adjustment files
                            os.remove(tmp_fwd_out)

                        # Now we have unique packet hashes seen at both src and dst locations,
                        # and timestamps have been adjusted for clock offsets.

                        # Begin calculating OWD or identifying when packet losses occurred

                        # Read into memory the <adjusted_timestamp> <uniqString> dataset captured
                        # at dst (the 2nd place the packet is seen, "destination"). The src is
                        # the 1st place the packet is seen ("source").
                        dst_data_time = list()
                        dst_data_uniqString = list()
                        for line in gzip.open(tmp_fwd_out_adj["dst"]).read().splitlines():
                            sline = line.split(" ")
                            dst_data_time.append(float(sline[0]))
                            dst_data_uniqString.append(sline[1])

                        # Walk through tmp_fwd_out_adj["src"] looking for matches to packets
                        # in dst_data_uniqString, and write <time> <owd|loss> pairs in plain
                        # ASCII to out_final.

                        # To limit potential duplicate matches to packets received forward
                        # in time from the previous match in dst_data_uniqString, maintain
                        # index next_j pointing to the next row in dst_data_uniqString at which
                        # to start matching the next packet from tmp_fwd_out_adj["src"]
                        next_j = 0
                        last_j = len(dst_data_uniqString) - 1

                        # As a speed-up hack, assume the match in dst_data_uniqString is
                        # within sk_window entries of next_j (saves searching all the
                        # way to the end of dst_data_uniqString when seeking a lost packet).
                        # Keeping seek_window in the low 1000s also minimises the chance of
                        # duplicate matches.
                        if seek_window != '':
                            sk_window = int(seek_window)
                        else:
                            sk_window = last_j

                        # Create gzipped output file (rough experiments showed a reduction
                        # in on-disk file size from easily 100s of KB down to 10s of KB).
                        # R automagically reads gzipped data files, so no changes are required
                        # to the subsequent analyse_* plotting scripts.
                        f = gzip.open(out_final, 'w')

                        cumulative_loss = 0

                        # Decide whether to use the timestamp at src or dst for OWD
                        # (default to src, unless anchor_map indicates using dst for
                        # this particular traffic pattern)
                        anchor = 0  # Default: print timestamp at src
                        if log_loss == '0':  # Only relevant for OWD calculations
                            if src_internal in anchor_map_list.keys():
                                #print "*** Found " + src_internal + " in anchor_map"
                                if anchor_map_list[src_internal] == dst_internal:
                                    # Only relevant if the map relates to both src_internal
                                    # and dst_internal
                                    #print "*** " + src_internal + " points to " + anchor_map_list[src_internal] + " in anchor_map"
                                    anchor = 1  # Print timestamp at dst

                        for line in gzip.open(tmp_fwd_out_adj["src"]).read().splitlines():
                            i = line.split(" ")
                            try:
                                # The following search will raise a 'ValueError' exception
                                # if i[1] does not occur in dst_data_uniqString[next_j:]
                                j = dst_data_uniqString[next_j:min((next_j + sk_window), last_j + 1)].index(i[1])

                                if log_loss == '0':
                                    # OWD is the diff between i[0] and dst_data_time[next_j+j]
                                    ts = float(i[0])
                                    owd = dst_data_time[next_j + j] - float(i[0])
                                    # If required, print the event as occurring at the dst
                                    # timestamp rather than the src timestamp
                                    if anchor:
                                        ts = dst_data_time[next_j + j]
                                    # If we want to imply the OWD "existed" at some mid-point
                                    # between the pkt being seen at src and seen at dst
                                    if owd_midpoint == '1':
                                        ts += owd / 2
                                    f.write('%f %f\n' % (ts, owd))

                                if log_loss == '1':
                                    # No lost packet, emit "0"
                                    f.write('%s 0\n' % (i[0]))
                                if log_loss == '2':
                                    # No lost packet, emit previous cumulative count
                                    f.write('%s %i\n' % (i[0], cumulative_loss))

                                next_j = min(next_j + j + 1, last_j)

                            except ValueError:
                                # No match means a packet loss
                                if log_loss == '1':
                                    # Single loss event, emit "1"
                                    f.write('%s 1\n' % (i[0]))
                                if log_loss == '2':
                                    # Single loss event, increment cumulative count,
                                    # emit cumulative count
                                    cumulative_loss += 1
                                    f.write('%s %i\n' % (i[0], cumulative_loss))
                                pass

                        f.close()

                        dst_data_time = []
                        dst_data_uniqString = []

                        # Clean up temporary post-adjustment files
                        os.remove(tmp_fwd_out_adj["src"])
                        os.remove(tmp_fwd_out_adj["dst"])

                    already_done[long_name] = 1

                    if sfil.is_in(name):
                        (out_files, out_groups) = select_bursts(long_name, group, out_final,
                                                                burst_sep, sburst, eburst,
                                                                out_files, out_groups)

        group += 1

    return (test_id_arr, out_files, out_groups)
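
# A minimal usage sketch (not part of the original module; the test ID and
# source_filter value below are hypothetical and only illustrate the calling
# convention). Arguments are passed as strings, as the string defaults in the
# signature suggest. log_loss='0' extracts OWDs (ts_correct='1' is mandatory
# for OWDs); log_loss='1' or '2' logs per-packet or cumulative loss instead.
#
#   # OWDs for all flows of one experiment, written next to the input files:
#   _extract_owd_pktloss(test_id='20131206-102931_experiment_tcp',
#                        ts_correct='1', log_loss='0', seek_window='16000')
#
#   # Cumulative loss counts, restricted to one (hypothetical) source filter:
#   _extract_owd_pktloss(test_id='20131206-102931_experiment_tcp',
#                        source_filter='S_172.16.10.2_5000',
#                        ts_correct='1', log_loss='2')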
def get_testid_file_list(file_list_fname='', test_id='', file_ext='',
                         pipe_cmd='', search_dir='.', no_abort=False):

    file_list = []

    # if search dir is not specified try to find it in cache
    if search_dir == '.':
        search_dir = lookup_dir_cache(test_id)

    if file_list_fname == '':
        # read from test_id list specified, this always overrules list in file if
        # also specified
        test_id_arr = test_id.split(';')
        if len(test_id_arr) == 0 or test_id_arr[0] == '':
            abort('Must specify test_id parameter')

        if pipe_cmd != '':
            pipe_cmd = ' | ' + pipe_cmd

        for test_id in test_id_arr:
            _files = _list(
                local(
                    'find -L %s -name "%s*%s" -print | sed -e "s/^\.\///"%s' %
                    (search_dir, test_id, file_ext, pipe_cmd),
                    capture=True))
            _files = filter_duplicates(_files)
            if search_dir == '.' and len(_files) > 0:
                append_dir_cache(test_id, os.path.dirname(_files[0]))
            file_list += _files
    else:
        # read list of test ids from file
        try:
            lines = []
            with open(file_list_fname) as f:
                lines = f.readlines()
            for fname in lines:
                fname = fname.rstrip()
                _files = _list(
                    local(
                        'find -L %s -name "%s" -print | sed -e "s/^\.\///"' %
                        (search_dir, fname),
                        capture=True))
                _files = filter_duplicates(_files)
                if search_dir == '.' and len(_files) > 0:
                    append_dir_cache(test_id, os.path.dirname(_files[0]))
                file_list += _files
        except IOError:
            abort('Cannot open experiment list file %s' % file_list_fname)

    if not no_abort and len(file_list) == 0:
        abort('Cannot find any matching data files.\n'
              'Remove outdated teacup_dir_cache.txt if files were moved.')

    return file_list
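
# A minimal usage sketch (the test ID and file names below are hypothetical).
# Either pass one or more test IDs (separated by ';') together with a file
# extension, optionally post-filtering via pipe_cmd, or pass file_list_fname
# naming a file whose lines are used as find(1) name patterns:
#
#   dmp_files = get_testid_file_list(test_id='20131206-102931_experiment_tcp',
#                                    file_ext='.dmp.gz',
#                                    pipe_cmd='grep -v "router.dmp.gz"')
#
#   listed_files = get_testid_file_list(file_list_fname='experiments_completed.txt')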