import csv

import matplotlib.pyplot as plt

# make_stream_id, get_time_limits, GaplessList, mock_tso and make_storage_ticks
# are helper utilities assumed to be defined elsewhere in this repository.


def generate_chunk_data(input_file, home_ip, website, protocols=None, threshold=0.11):
    """Group incoming packets of each stream into 'chunks': bursts of packets
    separated by gaps longer than `threshold` seconds."""
    ignore_init = 25  # skip the first 25 seconds of the capture
    if protocols is None:
        protocols = ['TCP']
    chunks = {}
    first_timestamp, duration = get_time_limits(input_file, website, home_ip, protocols)
    with open(input_file, 'rb') as csv_file:
        data_reader = csv.DictReader(csv_file, delimiter=',')
        for row in data_reader:
            if website in row['website'] and row['protocol'] in protocols:
                if row['dst'] == home_ip:
                    timestamp = float(row['timestamp']) - first_timestamp
                    if timestamp < ignore_init:
                        continue
                    stream_id = make_stream_id(row)
                    if stream_id not in chunks:
                        chunks[stream_id] = {'chunks': [], 'current_chunk': 0, 'current_time': timestamp}
                    if timestamp < chunks[stream_id]['current_time'] + threshold:
                        # packet belongs to the current burst
                        chunks[stream_id]['current_time'] = timestamp
                        chunks[stream_id]['current_chunk'] += int(row['len'])
                    else:
                        # gap exceeded the threshold: close the chunk and start a new one
                        chunk = chunks[stream_id]['current_chunk']
                        chunks[stream_id]['chunks'].append((chunk, timestamp))
                        chunks[stream_id]['current_chunk'] = 0
                        chunks[stream_id]['current_time'] = timestamp
    return chunks
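# Hedged usage sketch (not part of the original module): the CSV path and the
# home IP below are hypothetical. Summarises the chunks detected by
# generate_chunk_data() above.
def _demo_chunk_summary():
    chunks = generate_chunk_data('capture.csv', '192.168.0.10', 'youtube.com')
    for stream_id, info in chunks.items():
        sizes = [size for size, _ in info['chunks']]
        if sizes:
            print('%s: %d chunks, avg %.0f bytes'
                  % (stream_id, len(sizes), sum(sizes) / float(len(sizes))))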
def get_packet_length_data(input_file, home_ip, website, is_incoming=True,
                           ignore_acks=True, tso=False, need_ids=False, protocols=None):
    """Collect the lengths of all matching packets, optionally together with
    the stream id of each packet."""
    if protocols is None:
        protocols = ['TCP']
    lengths = []
    ids = [] if need_ids else None
    with open(input_file, 'rb') as csv_file:
        data_reader = csv.DictReader(csv_file, delimiter=',')
        for row in data_reader:
            if ignore_acks and row['is_ack'] == 'True':
                continue
            if website in row['website'] and row['protocol'] in protocols:
                if row['dst' if is_incoming else 'src'] == home_ip:
                    lengths.append(int(row['len']))
                    if need_ids:
                        ids.append(make_stream_id(row))
    if tso:
        # hand the data to the repo's mock_tso helper when TSO emulation is requested
        lengths, ids = mock_tso(lengths, ids)
    return sorted(lengths), ids
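# Hedged usage sketch (hypothetical inputs): compute the median incoming
# packet length from get_packet_length_data() above. The returned list is
# already sorted, so the median is just the middle element.
def _demo_median_packet_length():
    lengths, _ = get_packet_length_data('capture.csv', '192.168.0.10', 'youtube.com')
    if lengths:
        print('median packet length: %d bytes' % lengths[len(lengths) // 2])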
def generate_data(input_file, home_ip, website='youtube.com', protocols=None):
    """Build cumulative byte counts per stream, sampled once per interval."""
    if protocols is None:
        protocols = ['TCP']
    data_sums = {'all_streams': {'time': [], 'total': 0}}
    first_timestamp, duration = get_time_limits(input_file, website, home_ip, protocols)
    interval = 1  # seconds
    current_time = 0
    with open(input_file, 'rb') as csv_file:
        data_reader = csv.DictReader(csv_file, delimiter=',')
        for row in data_reader:
            if website in row['website'] and row['protocol'] in protocols:
                if row['dst'] == home_ip:
                    this_stream_id = make_stream_id(row)
                    if this_stream_id not in data_sums:
                        # back-fill zeros so every stream's series starts at t=0
                        data_sums[this_stream_id] = {'time': [(x, 0) for x in range(0, current_time)],
                                                     'total': 0}
                    timestamp = float(row['timestamp']) - first_timestamp
                    pkt_size = int(row['len'])
                    if timestamp >= current_time + interval:
                        # flush the finished interval(s) before counting this packet
                        # (the original dropped the packet that triggered the rollover)
                        for stream_id in data_sums.keys():
                            data_sums[stream_id]['time'].append((current_time, data_sums[stream_id]['total']))
                        current_time += interval
                        while timestamp > current_time:
                            for stream_id in data_sums.keys():
                                data_sums[stream_id]['time'].append((current_time, data_sums[stream_id]['total']))
                            current_time += interval
                    data_sums[this_stream_id]['total'] += pkt_size
                    data_sums['all_streams']['total'] += pkt_size
    # flush the last, possibly partial, interval
    for stream_id in data_sums.keys():
        data_sums[stream_id]['time'].append((current_time, data_sums[stream_id]['total']))
    return data_sums
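# Hedged usage sketch. Note that this listing contains two functions named
# generate_data (presumably from different modules of the repository); this
# demo refers to the per-interval version directly above. Inputs are
# hypothetical.
def _demo_cumulative_totals():
    data_sums = generate_data('capture.csv', '192.168.0.10')
    for stream_id, series in data_sums.items():
        print('%s: %d bytes total over %d intervals'
              % (stream_id, series['total'], len(series['time'])))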
def generate_flow_plot_data(input_file, home_ip, website='youtube.com', is_incoming=True):
    """Track how many TCP streams are simultaneously open at each second."""
    stream_set = set()   # streams currently considered open
    closed_set = set()   # streams that have seen a FIN or RST
    start_timestamp = None
    connection_list = GaplessList()
    with open(input_file, 'rb') as csv_file:
        data_reader = csv.DictReader(csv_file, delimiter=',')
        for row in data_reader:
            # remember the very first timestamp in the capture
            if start_timestamp is None:
                start_timestamp = float(row['timestamp'])
            if website in row['website'] and row['protocol'] == 'TCP':
                if row['dst' if is_incoming else 'src'] == home_ip:
                    stream_id = make_stream_id(row)
                    if row['is_rst'] == 'False' and row['is_fin'] == 'False' and stream_id not in closed_set:
                        stream_set.add(stream_id)
                    else:
                        stream_set.discard(stream_id)
                        closed_set.add(stream_id)
                    packet_time = int(float(row['timestamp']) - start_timestamp)
                    connection_list.set_element(packet_time, len(stream_set))
    return connection_list.get_list()
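# Hedged usage sketch (hypothetical inputs): plot the number of concurrently
# open TCP streams per second, as produced by generate_flow_plot_data() above.
def _demo_flow_plot():
    counts = generate_flow_plot_data('capture.csv', '192.168.0.10')
    plt.plot(counts)
    plt.xlabel('time, s')
    plt.ylabel('# of open streams')
    plt.show()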
def generate_data(input_file, home_ip, website='youtube.com', is_incoming=True,
                  num_values=5, protocols=None):
    """Sum up the bytes of each stream and return the largest streams,
    folding everything past the top `num_values - 1` into a 'MIX' entry."""
    if protocols is None:
        protocols = ['TCP']
    packet_sums = {}
    with open(input_file, 'rb') as csv_file:
        data_reader = csv.DictReader(csv_file, delimiter=',')
        for row in data_reader:
            if website in row['website'] and row['protocol'] in protocols:
                if row['dst' if is_incoming else 'src'] == home_ip:
                    stream_id = make_stream_id(row)
                    try:
                        packet_sums[stream_id] += int(row['len'])
                    except KeyError:
                        packet_sums[stream_id] = int(row['len'])
    # sort streams by size, largest first
    values = sorted(packet_sums.items(), key=lambda item: item[1], reverse=True)
    # keep only the last component of each stream id as the label
    values = [(stream_id.split('-')[-1], size) for stream_id, size in values]
    shortened = False
    if num_values < len(values):
        values = values[0:num_values - 1] + [('MIX', sum(size for _, size in values[num_values - 1:]))]
        shortened = True
    return values, shortened
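# Hedged usage sketch for the top-streams variant of generate_data directly
# above (hypothetical inputs). Prints the largest streams and whether the
# tail was folded into 'MIX'.
def _demo_top_streams():
    values, shortened = generate_data('capture.csv', '192.168.0.10', num_values=5)
    for label, size in values:
        print('%s: %d bytes' % (label, size))
    if shortened:
        print('(remaining streams folded into MIX)')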
def get_stream_sizes(input_file, home_ip, stream_ids, is_incoming=False):
    sizes = [0] * len(stream_ids)
    all_sizes = 0
    with open(input_file, 'rb') as csv_file:
        data_reader = csv.DictReader(csv_file, delimiter=',')
        for row in data_reader:
            if row['is_ack'] == 'True':
                continue
            if row['protocol'] != 'TCP':
                continue
            if row['dst' if is_incoming else 'src'] == home_ip:
                all_sizes += int(row['len'])
                this_stream_id = make_stream_id(row)
                for i, stream_id in enumerate(stream_ids):
                    if stream_id == this_stream_id:
                        sizes[i] += int(row['len'])
                        break
    return sizes, all_sizes
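# Hedged usage sketch (hypothetical inputs): what fraction of outgoing TCP
# traffic belongs to a given set of streams, via get_stream_sizes() above.
# The stream-id format is whatever make_stream_id() produces.
def _demo_stream_share(stream_ids):
    sizes, all_sizes = get_stream_sizes('capture.csv', '192.168.0.10', stream_ids)
    if all_sizes:
        print('selected streams carry %.1f%% of the traffic'
              % (100.0 * sum(sizes) / all_sizes))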
def get_ad_streams(input_file, home_ip, website='youtube.com', is_incoming=False):
    """Map each stream id to the list of (ad) URLs seen on that stream."""
    ad_dict = {}
    with open(input_file, 'rb') as csv_file:
        data_reader = csv.DictReader(csv_file, delimiter=',')
        for row in data_reader:
            if row['is_ack'] == 'True':
                continue
            if row['protocol'] != 'TCP':
                continue
            if website in row['website']:
                if row['dst' if is_incoming else 'src'] == home_ip:
                    if row['url'] != '':
                        stream_id = make_stream_id(row)
                        try:
                            ad_dict[stream_id].append(row['url'])
                        except KeyError:
                            ad_dict[stream_id] = [row['url']]
    return ad_dict
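# Hedged usage sketch (hypothetical inputs): list the streams that carried ad
# URLs according to get_ad_streams() above.
def _demo_ad_streams():
    ad_dict = get_ad_streams('capture.csv', '192.168.0.10')
    for stream_id, urls in ad_dict.items():
        print('%s: %d ad URLs' % (stream_id, len(urls)))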
def plot_thing(input_file, home_ip, plot_all=True, plot_first=False, plot_second=False,
               plot_rest=False, website='youtube.com', is_incoming=True, colors=None,
               chart=None, names=None, sizes=True, protocols=None, window=1):
    """Plot traffic over time: all streams combined, the largest stream, the
    second-largest stream and/or the remaining streams, selected via flags."""
    if protocols is None:
        protocols = ['TCP']
    if names is None:
        names = ['All streams', 'Top stream', 'Rest']
    if colors is None:
        colors = ['r', 'g', 'b']
    streams = {}
    with open(input_file, 'rb') as csv_file:
        data_reader = csv.DictReader(csv_file, delimiter=',')
        for row in data_reader:
            if website in row['website'] and row['protocol'] in protocols:
                if row['dst' if is_incoming else 'src'] == home_ip and row['is_ack'] == 'False':
                    stream_id = make_stream_id(row)
                    try:
                        streams[stream_id].append((float(row['timestamp']), int(row['len'])))
                    except KeyError:
                        streams[stream_id] = [(float(row['timestamp']), int(row['len']))]

    # sort the list of streams by the timestamp of their first packet
    sorted_streams = sorted(streams.values(), key=lambda tuples: tuples[0][0])
    first_timestamp = sorted_streams[0][0][0]
    # make all timestamps relative to the start of the capture
    sorted_streams = [[(ts - first_timestamp, length) for ts, length in stream]
                      for stream in sorted_streams]
    # merge the per-stream lists into one flat list
    all_streams = reduce(lambda x, y: x + y, sorted_streams, [])

    def put_in_bins(time_stamps, size=True):
        """Put packets into time bins of `window` seconds.

        :param time_stamps: list of (time_stamp, packet_length) tuples
        :param size: (optional) if True, sums up lengths, otherwise number of packets
        :return: list of per-bin values
        """
        time_bins = []
        for time_stamp, length in time_stamps:
            bin_index = int(time_stamp / window)
            try:
                time_bins[bin_index] += length if size else 1
            except IndexError:
                while len(time_bins) != bin_index:
                    time_bins.append(0)
                time_bins.append(length if size else 1)
        return time_bins

    plot_line_list = []
    plot_name_list = []
    if chart is None:
        fig, ax = plt.subplots()
        ax.set_ylabel('size in KB' if sizes else '# of packets')
        ax.set_xlabel('time, s')
    else:
        fig, ax, plot_line_list, plot_name_list = chart

    # rank streams by packet count, largest first; guard against captures with
    # a single stream (the original indexed [1] unconditionally)
    ranked_streams = sorted(sorted_streams, key=len, reverse=True)
    top_stream = ranked_streams[0]
    second_stream = ranked_streams[1] if len(ranked_streams) > 1 else []
    rest_start_index = 2 if plot_second else 1
    rest_streams = reduce(lambda x, y: x + y, ranked_streams[rest_start_index:], [])

    i = 0
    all_values = []
    for enabled, data in [(plot_all, all_streams), (plot_first, top_stream),
                          (plot_second, second_stream), (plot_rest, rest_streams)]:
        if not enabled:
            continue
        values = put_in_bins(data, sizes)
        all_values.extend(values)
        line = plt.plot(values, colors[i] + '-')
        plot_line_list.append(line[0])
        plot_name_list.append(names[i])
        i += 1

    plt.legend(plot_line_list, plot_name_list)
    if sizes:
        fn, tix = make_storage_ticks(all_values)
        ax.yaxis.set_major_formatter(fn)
        plt.yticks(tix)
    ax.set_xticklabels([int(x * window) for x in ax.get_xticks()])
    return fig, ax, plot_line_list, plot_name_list
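# Hedged usage sketch (hypothetical inputs): draw all streams plus the top
# stream with plot_thing() above, then reuse the returned chart tuple to
# overlay a second capture on the same axes.
def _demo_plot_thing():
    chart = plot_thing('capture_a.csv', '192.168.0.10', plot_all=True, plot_first=True,
                       names=['A: all', 'A: top'], colors=['r', 'g'])
    plot_thing('capture_b.csv', '192.168.0.10', plot_all=True, plot_first=True,
               chart=chart, names=['B: all', 'B: top'], colors=['b', 'm'])
    plt.show()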