Ejemplo n.º 1
0
def generate_chunk_data(input_file, home_ip, website, protocols=None, threshold=0.11):
    """Split the traffic received by ``home_ip`` into per-stream chunks.

    Packets of one stream stay in the same chunk as long as each of them
    arrives less than ``threshold`` after the previous packet of that stream.
    When a larger gap is seen, the accumulated chunk is stored as
    ``(size, timestamp)``, where ``timestamp`` is the arrival time of the
    packet that starts the next chunk, and that packet opens the new chunk.

    The chunk that is still being accumulated when the file ends is not
    appended to ``'chunks'``; its size stays available in ``'current_chunk'``.

    :param input_file: path of the CSV capture; the columns ``website``,
        ``protocol``, ``dst``, ``timestamp`` and ``len`` are read here
    :param home_ip: only rows whose ``dst`` equals this value are used
    :param website: substring that must occur in the row's ``website`` column
    :param protocols: accepted ``protocol`` values, defaults to ``['TCP']``
    :param threshold: largest gap between two packets of the same chunk, in
        the unit of the ``timestamp`` column (presumably seconds)
    :return: dict mapping each stream id to a dict with the keys ``'chunks'``
        (list of ``(size, timestamp)``), ``'current_chunk'`` (size of the
        chunk in progress) and ``'current_time'`` (time of the last packet)
    """
    # Rows earlier than this offset from the first timestamp are skipped.
    ignore_init = 25
    if protocols is None:
        protocols = ['TCP']
    chunks = {}
    # Only the start of the capture is needed; the duration is not used here.
    first_timestamp, _ = get_time_limits(input_file, website, home_ip, protocols)
    with open(input_file, 'rb') as csv_file:
        for row in csv.DictReader(csv_file, delimiter=','):
            if website not in row['website'] or row['protocol'] not in protocols:
                continue
            if row['dst'] != home_ip:
                continue
            timestamp = float(row['timestamp']) - first_timestamp
            if timestamp < ignore_init:
                continue
            stream_id = make_stream_id(row)
            if stream_id not in chunks:
                chunks[stream_id] = {'chunks': [], 'current_chunk': 0, 'current_time': timestamp}
            state = chunks[stream_id]
            if timestamp >= state['current_time'] + threshold:
                # Gap detected: close the running chunk before counting this packet.
                state['chunks'].append((state['current_chunk'], timestamp))
                state['current_chunk'] = 0
            # Every packet is counted, including the one that opens a new
            # chunk (previously its length was lost when the counter was reset).
            state['current_chunk'] += int(row['len'])
            state['current_time'] = timestamp
    return chunks
Ejemplo n.º 2
0
def get_packet_length_data(input_file,
                           home_ip,
                           website,
                           is_incoming=True,
                           ignore_acks=True,
                           tso=False,
                           need_ids=False,
                           protocols=None):
    """Collect the ``len`` values of the packets matching the given filters.

    :param input_file: path of the CSV capture
    :param home_ip: value the ``dst`` column (``src`` when ``is_incoming`` is
        false) has to equal
    :param website: substring that must occur in the ``website`` column
    :param is_incoming: selects whether ``dst`` or ``src`` is compared
    :param ignore_acks: skip rows whose ``is_ack`` column is ``'True'``
    :param tso: pass the collected lengths and ids through ``mock_tso``
    :param need_ids: also collect the stream id of every kept packet
    :param protocols: accepted ``protocol`` values, defaults to ``['TCP']``
    :return: tuple ``(sorted lengths, ids)``; ``ids`` is ``None`` unless
        ``need_ids`` is set and is returned in collection order, not sorted
    """
    accepted_protocols = ['TCP'] if protocols is None else protocols
    address_column = 'dst' if is_incoming else 'src'
    lengths = []
    ids = [] if need_ids else None

    with open(input_file, 'rb') as csv_file:
        for row in csv.DictReader(csv_file, delimiter=','):
            if ignore_acks and row['is_ack'] == 'True':
                continue
            if website not in row['website']:
                continue
            if row['protocol'] not in accepted_protocols:
                continue
            if row[address_column] != home_ip:
                continue
            lengths.append(int(row['len']))
            if need_ids:
                ids.append(make_stream_id(row))

    if tso:
        lengths, ids = mock_tso(lengths, ids)

    return sorted(lengths), ids
Ejemplo n.º 3
0
def generate_data(input_file, home_ip, website="youtube.com", protocols=None):
    """Build cumulative byte counters per stream, sampled once per interval.

    Every row whose ``website`` contains ``website``, whose ``protocol`` is in
    ``protocols`` and whose ``dst`` equals ``home_ip`` adds its ``len`` to the
    running total of its stream and to the ``"all_streams"`` total.  When a
    packet arrives at or after the end of the current interval, a
    ``(time, total)`` sample is appended for every known stream.  A stream
    that shows up later is pre-filled with ``(time, 0)`` samples for the
    sample times that have already passed.

    :param input_file: path of the CSV capture
    :param home_ip: only rows whose ``dst`` equals this value are used
    :param website: substring that must occur in the ``website`` column
    :param protocols: accepted ``protocol`` values, defaults to ``["TCP"]``
    :return: dict mapping each stream id, plus the key ``"all_streams"``, to
        ``{"time": [(time, cumulative_total), ...], "total": cumulative_total}``
    """
    if protocols is None:
        protocols = ["TCP"]
    data_sums = {"all_streams": {"time": [], "total": 0}}
    # Only the start of the capture is needed; the duration is not used here.
    first_timestamp, _ = get_time_limits(input_file, website, home_ip, protocols)
    interval = 1  # second
    current_time = 0

    def record_samples(sample_time):
        """Append the running total of every known stream at ``sample_time``."""
        for sums in data_sums.values():
            sums["time"].append((sample_time, sums["total"]))

    with open(input_file, "rb") as csv_file:
        for row in csv.DictReader(csv_file, delimiter=","):
            if website not in row["website"] or row["protocol"] not in protocols:
                continue
            if row["dst"] != home_ip:
                continue
            this_stream_id = make_stream_id(row)
            if this_stream_id not in data_sums:
                # A real list is required because samples are appended later.
                data_sums[this_stream_id] = {
                    "time": [(past_time, 0) for past_time in range(current_time)],
                    "total": 0,
                }
            timestamp = float(row["timestamp"]) - first_timestamp
            pkt_size = int(row["len"])
            if timestamp >= current_time + interval:
                record_samples(current_time)
                current_time += interval
                # NOTE(review): this loop runs until current_time >= timestamp,
                # so a sample may be recorded for the interval the packet
                # itself falls into.  Kept unchanged so the number of samples
                # stays the same; confirm whether
                # `timestamp >= current_time + interval` was intended.
                while timestamp > current_time:
                    record_samples(current_time)
                    current_time += interval
            # Count every packet, including the one that triggered the
            # rollover above (its size used to be silently dropped).
            data_sums[this_stream_id]["total"] += pkt_size
            data_sums["all_streams"]["total"] += pkt_size
    return data_sums
Ejemplo n.º 4
0
def generate_flow_plot_data(input_file,
                            home_ip,
                            website='youtube.com',
                            is_incoming=True):
    """Track how many streams are open over the course of a capture.

    A stream counts as open from its first matching packet until a packet of
    that stream has ``is_rst`` or ``is_fin`` different from ``'False'``; a
    stream closed once is never reopened.  After every matching packet the
    current number of open streams is stored in a ``GaplessList`` at the
    packet's offset from the first row of the file, truncated to an int.

    :param input_file: path of the CSV capture
    :param home_ip: value the ``dst`` column (``src`` when ``is_incoming`` is
        false) has to equal
    :param website: substring that must occur in the ``website`` column
    :param is_incoming: selects whether ``dst`` or ``src`` is compared
    :return: result of ``GaplessList.get_list()``
    """
    address_column = 'dst' if is_incoming else 'src'
    open_streams = set()
    finished_streams = set()
    origin = None
    timeline = GaplessList()

    with open(input_file, 'rb') as csv_file:
        for row in csv.DictReader(csv_file, delimiter=','):
            # The very first row of the file defines time zero, matching or not.
            if origin is None:
                origin = float(row['timestamp'])
            if website not in row['website'] or row['protocol'] != 'TCP':
                continue
            if row[address_column] != home_ip:
                continue
            stream_id = make_stream_id(row)
            is_closing = row['is_rst'] != 'False' or row['is_fin'] != 'False'
            if is_closing or stream_id in finished_streams:
                open_streams.discard(stream_id)
                finished_streams.add(stream_id)
            else:
                open_streams.add(stream_id)
            offset = int(float(row['timestamp']) - origin)
            timeline.set_element(offset, len(open_streams))
    return timeline.get_list()
Ejemplo n.º 5
0
def generate_data(input_file,
                  home_ip,
                  website='youtube.com',
                  is_incoming=True,
                  num_values=5,
                  protocols=None):
    """Sum the packet lengths per stream and return the largest streams.

    :param input_file: path of the CSV capture
    :param home_ip: value the ``dst`` column (``src`` when ``is_incoming`` is
        false) has to equal
    :param website: substring that must occur in the ``website`` column
    :param is_incoming: selects whether ``dst`` or ``src`` is compared
    :param num_values: maximum number of entries returned; when there are
        more streams, the last entry is ``('MIX', total of the remainder)``
    :param protocols: accepted ``protocol`` values, defaults to ``['TCP']``
    :return: tuple ``(values, shortened)`` where ``values`` is a list of
        ``(label, total_bytes)`` sorted by descending total, ``label`` being
        the part of the stream id after its last ``'-'``, and ``shortened``
        tells whether a ``'MIX'`` entry was created
    """
    if protocols is None:
        protocols = ['TCP']
    address_column = 'dst' if is_incoming else 'src'
    packet_sums = {}
    with open(input_file, 'rb') as csv_file:
        for row in csv.DictReader(csv_file, delimiter=','):
            if website in row['website'] and (row['protocol'] in protocols):
                if row[address_column] == home_ip:
                    stream_id = make_stream_id(row)
                    packet_sums[stream_id] = (
                        packet_sums.get(stream_id, 0) + int(row['len']))

    # A key function sorts exactly like the former subtraction-based cmp
    # (stable, descending) but cannot fail when totals exceed the native int
    # range, and it also exists on Python 3.
    ranked = sorted(packet_sums.items(),
                    key=lambda item: item[1],
                    reverse=True)
    # Real lists are needed below for len() and slicing.
    values = [(stream_id.split('-')[-1], total) for stream_id, total in ranked]

    shortened = False
    if num_values < len(values):
        remainder = sum(total for _, total in values[num_values - 1:])
        values = values[0:num_values - 1] + [('MIX', remainder)]
        shortened = True
    return values, shortened
Ejemplo n.º 6
0
def generate_data(input_file, home_ip, website='youtube.com', protocols=None):
    """Build cumulative byte counters per stream, sampled once per interval.

    Every row whose ``website`` contains ``website``, whose ``protocol`` is in
    ``protocols`` and whose ``dst`` equals ``home_ip`` adds its ``len`` to the
    running total of its stream and to the ``'all_streams'`` total.  When a
    packet arrives at or after the end of the current interval, a
    ``(time, total)`` sample is appended for every known stream.  A stream
    that shows up later is pre-filled with ``(time, 0)`` samples for the
    sample times that have already passed.

    :param input_file: path of the CSV capture
    :param home_ip: only rows whose ``dst`` equals this value are used
    :param website: substring that must occur in the ``website`` column
    :param protocols: accepted ``protocol`` values, defaults to ``['TCP']``
    :return: dict mapping each stream id, plus the key ``'all_streams'``, to
        ``{'time': [(time, cumulative_total), ...], 'total': cumulative_total}``
    """
    if protocols is None:
        protocols = ['TCP']
    data_sums = {'all_streams': {'time': [], 'total': 0}}
    # Only the start of the capture is needed; the duration is not used here.
    first_timestamp, _ = get_time_limits(input_file, website, home_ip, protocols)
    interval = 1  # second
    current_time = 0

    def record_samples(sample_time):
        """Append the running total of every known stream at ``sample_time``."""
        for sums in data_sums.values():
            sums['time'].append((sample_time, sums['total']))

    with open(input_file, 'rb') as csv_file:
        for row in csv.DictReader(csv_file, delimiter=','):
            if website not in row['website'] or row['protocol'] not in protocols:
                continue
            if row['dst'] != home_ip:
                continue
            this_stream_id = make_stream_id(row)
            if this_stream_id not in data_sums:
                # A real list is required because samples are appended later.
                data_sums[this_stream_id] = {
                    'time': [(past_time, 0) for past_time in range(current_time)],
                    'total': 0,
                }
            timestamp = float(row['timestamp']) - first_timestamp
            pkt_size = int(row['len'])
            if timestamp >= current_time + interval:
                record_samples(current_time)
                current_time += interval
                # NOTE(review): this loop runs until current_time >= timestamp,
                # so a sample may be recorded for the interval the packet
                # itself falls into.  Kept unchanged so the number of samples
                # stays the same; confirm whether
                # `timestamp >= current_time + interval` was intended.
                while timestamp > current_time:
                    record_samples(current_time)
                    current_time += interval
            # Count every packet, including the one that triggered the
            # rollover above (its size used to be silently dropped).
            data_sums[this_stream_id]['total'] += pkt_size
            data_sums['all_streams']['total'] += pkt_size
    return data_sums
Ejemplo n.º 7
0
def get_stream_sizes(input_file, home_ip, stream_ids, is_incoming=False):
    """Sum the bytes of selected streams and of all matching traffic.

    Rows with ``is_ack`` equal to ``'True'`` or a ``protocol`` other than
    ``'TCP'`` are skipped.

    :param input_file: path of the CSV capture
    :param home_ip: value the ``dst`` column (``src`` when ``is_incoming`` is
        false) has to equal
    :param stream_ids: stream ids whose sizes are wanted
    :param is_incoming: selects whether ``dst`` or ``src`` is compared
    :return: tuple ``(sizes, all_sizes)``; ``sizes[i]`` is the byte total of
        ``stream_ids[i]`` (a packet is credited to the first equal entry
        only) and ``all_sizes`` the byte total of every kept packet
    """
    address_column = 'dst' if is_incoming else 'src'
    sizes = [0] * len(stream_ids)
    all_sizes = 0
    with open(input_file, 'rb') as csv_file:
        for row in csv.DictReader(csv_file, delimiter=','):
            if row['is_ack'] == 'True' or row['protocol'] != 'TCP':
                continue
            if row[address_column] != home_ip:
                continue
            packet_length = int(row['len'])
            all_sizes += packet_length
            this_stream_id = make_stream_id(row)
            for position, wanted_id in enumerate(stream_ids):
                if wanted_id == this_stream_id:
                    sizes[position] += packet_length
                    break
    return sizes, all_sizes
Ejemplo n.º 8
0
def get_stream_sizes(input_file, home_ip, stream_ids, is_incoming=False):
    """Return the byte totals of the given streams and of all kept packets.

    Only non-ACK TCP rows (``is_ack`` not ``"True"``, ``protocol`` equal to
    ``"TCP"``) whose ``dst`` -- or ``src`` when ``is_incoming`` is false --
    equals ``home_ip`` are counted.

    :param input_file: path of the CSV capture
    :param home_ip: address the selected column has to equal
    :param stream_ids: stream ids whose sizes are wanted
    :param is_incoming: selects whether ``dst`` or ``src`` is compared
    :return: tuple ``(sizes, all_sizes)`` with one total per entry of
        ``stream_ids`` (only the first equal entry is credited) and the
        total over every kept packet
    """
    sizes = [0] * len(stream_ids)
    all_sizes = 0
    endpoint = "dst" if is_incoming else "src"
    with open(input_file, "rb") as csv_file:
        reader = csv.DictReader(csv_file, delimiter=",")
        for row in reader:
            if row["is_ack"] == "True":
                continue
            if row["protocol"] != "TCP":
                continue
            if row[endpoint] != home_ip:
                continue
            length = int(row["len"])
            all_sizes += length
            current_id = make_stream_id(row)
            # Index of the first requested id equal to this packet's stream.
            slot = next(
                (index for index, candidate in enumerate(stream_ids)
                 if candidate == current_id),
                None)
            if slot is not None:
                sizes[slot] += length
    return sizes, all_sizes
Ejemplo n.º 9
0
def get_ad_streams(input_file, home_ip, website="youtube.com", is_incoming=False):
    """Collect the non-empty ``url`` values seen on each stream.

    Rows with ``is_ack`` equal to ``"True"`` or a ``protocol`` other than
    ``"TCP"`` are skipped.

    :param input_file: path of the CSV capture
    :param home_ip: value the ``dst`` column (``src`` when ``is_incoming`` is
        false) has to equal
    :param website: substring that must occur in the ``website`` column
    :param is_incoming: selects whether ``dst`` or ``src`` is compared
    :return: dict mapping stream id to the list of its urls in file order
    """
    address_column = "dst" if is_incoming else "src"
    ad_dict = {}
    with open(input_file, "rb") as csv_file:
        for row in csv.DictReader(csv_file, delimiter=","):
            if row["is_ack"] == "True":
                continue
            if row["protocol"] != "TCP":
                continue
            if website not in row["website"]:
                continue
            if row[address_column] != home_ip:
                continue
            if row["url"] == "":
                continue
            stream_id = make_stream_id(row)
            ad_dict.setdefault(stream_id, []).append(row["url"])
    return ad_dict
Ejemplo n.º 10
0
def get_ad_streams(input_file,
                   home_ip,
                   website='youtube.com',
                   is_incoming=False):
    """Group the non-empty ``url`` values of a capture by stream.

    Only non-ACK TCP rows (``is_ack`` not ``'True'``, ``protocol`` equal to
    ``'TCP'``) are considered.

    :param input_file: path of the CSV capture
    :param home_ip: value the ``dst`` column (``src`` when ``is_incoming`` is
        false) has to equal
    :param website: substring that must occur in the ``website`` column
    :param is_incoming: selects whether ``dst`` or ``src`` is compared
    :return: dict mapping stream id to the list of its urls in file order
    """
    endpoint = 'dst' if is_incoming else 'src'
    ad_dict = {}
    with open(input_file, 'rb') as csv_file:
        reader = csv.DictReader(csv_file, delimiter=',')
        for row in reader:
            if row['is_ack'] == 'True' or row['protocol'] != 'TCP':
                continue
            wanted = website in row['website'] and row[endpoint] == home_ip
            if not wanted or row['url'] == '':
                continue
            stream_id = make_stream_id(row)
            if stream_id not in ad_dict:
                ad_dict[stream_id] = []
            ad_dict[stream_id].append(row['url'])
    return ad_dict
Ejemplo n.º 11
0
def generate_flow_plot_data(input_file, home_ip, website="youtube.com", is_incoming=True):
    """Record the number of simultaneously open streams over time.

    A stream is open from its first matching packet until one of its packets
    has ``is_rst`` or ``is_fin`` different from ``"False"``; once closed it is
    never counted again.  After each matching packet the number of open
    streams is written to a ``GaplessList`` at the packet's offset from the
    first row of the file, truncated to an int.

    :param input_file: path of the CSV capture
    :param home_ip: value the ``dst`` column (``src`` when ``is_incoming`` is
        false) has to equal
    :param website: substring that must occur in the ``website`` column
    :param is_incoming: selects whether ``dst`` or ``src`` is compared
    :return: result of ``GaplessList.get_list()``
    """
    endpoint = "dst" if is_incoming else "src"
    live = set()
    closed = set()
    start_timestamp = None
    connection_list = GaplessList()

    with open(input_file, "rb") as csv_file:
        reader = csv.DictReader(csv_file, delimiter=",")
        for row in reader:
            # Time zero is taken from the first row, whether it matches or not.
            if start_timestamp is None:
                start_timestamp = float(row["timestamp"])
            matches = (website in row["website"]
                       and row["protocol"] == "TCP"
                       and row[endpoint] == home_ip)
            if not matches:
                continue
            stream_id = make_stream_id(row)
            still_open = (row["is_rst"] == "False"
                          and row["is_fin"] == "False"
                          and stream_id not in closed)
            if still_open:
                live.add(stream_id)
            else:
                live.discard(stream_id)
                closed.add(stream_id)
            elapsed = int(float(row["timestamp"]) - start_timestamp)
            connection_list.set_element(elapsed, len(live))
    return connection_list.get_list()
def get_packet_length_data(input_file, home_ip, website, is_incoming=True, ignore_acks=True, tso=False, need_ids=False, protocols=None):
    """Return the sorted packet lengths of the rows matching the filters.

    :param input_file: path of the CSV capture
    :param home_ip: value the ``dst`` column (``src`` when ``is_incoming`` is
        false) has to equal
    :param website: substring that must occur in the ``website`` column
    :param is_incoming: selects whether ``dst`` or ``src`` is compared
    :param ignore_acks: skip rows whose ``is_ack`` column is ``'True'``
    :param tso: pass the collected lengths and ids through ``mock_tso``
    :param need_ids: also collect the stream id of every kept packet
    :param protocols: accepted ``protocol`` values, defaults to ``['TCP']``
    :return: tuple ``(sorted lengths, ids)``; ``ids`` is ``None`` unless
        ``need_ids`` is set and keeps collection order (it is not sorted)
    """
    if protocols is None:
        protocols = ['TCP']
    endpoint = 'dst' if is_incoming else 'src'
    ids = [] if need_ids else None
    lengths = []

    with open(input_file, 'rb') as csv_file:
        reader = csv.DictReader(csv_file, delimiter=',')
        for row in reader:
            if ignore_acks and row['is_ack'] == 'True':
                continue
            selected = (website in row['website']
                        and row['protocol'] in protocols
                        and row[endpoint] == home_ip)
            if not selected:
                continue
            lengths.append(int(row['len']))
            if need_ids:
                ids.append(make_stream_id(row))

    if tso:
        lengths, ids = mock_tso(lengths, ids)

    ordered_lengths = sorted(lengths)
    return ordered_lengths, ids
Ejemplo n.º 13
0
def generate_data(input_file, home_ip, website='youtube.com', is_incoming=True, num_values=5, protocols=None):
    """Sum the packet lengths per stream and return the largest streams.

    :param input_file: path of the CSV capture
    :param home_ip: value the ``dst`` column (``src`` when ``is_incoming`` is
        false) has to equal
    :param website: substring that must occur in the ``website`` column
    :param is_incoming: selects whether ``dst`` or ``src`` is compared
    :param num_values: maximum number of entries returned; when there are
        more streams, the last entry is ``('MIX', total of the remainder)``
    :param protocols: accepted ``protocol`` values, defaults to ``['TCP']``
    :return: tuple ``(values, shortened)`` where ``values`` is a list of
        ``(label, total_bytes)`` sorted by descending total, ``label`` being
        the part of the stream id after its last ``'-'``, and ``shortened``
        tells whether a ``'MIX'`` entry was created
    """
    if protocols is None:
        protocols = ['TCP']
    address_column = 'dst' if is_incoming else 'src'
    packet_sums = {}
    with open(input_file, 'rb') as csv_file:
        for row in csv.DictReader(csv_file, delimiter=','):
            if website in row['website'] and (row['protocol'] in protocols):
                if row[address_column] == home_ip:
                    stream_id = make_stream_id(row)
                    packet_sums[stream_id] = packet_sums.get(stream_id, 0) + int(row['len'])

    # A key function sorts exactly like the former subtraction-based cmp
    # (stable, descending) but cannot fail when totals exceed the native int
    # range, and it also exists on Python 3.
    ranked = sorted(packet_sums.items(), key=lambda item: item[1], reverse=True)
    # Real lists are needed below for len() and slicing.
    values = [(stream_id.split('-')[-1], total) for stream_id, total in ranked]

    shortened = False
    if num_values < len(values):
        remainder = sum(total for _, total in values[num_values - 1:])
        values = values[0:num_values - 1] + [('MIX', remainder)]
        shortened = True
    return values, shortened
Ejemplo n.º 14
0
def generate_chunk_data(input_file,
                        home_ip,
                        website,
                        protocols=None,
                        threshold=0.11):
    """Split the traffic received by ``home_ip`` into per-stream chunks.

    Packets of one stream stay in the same chunk as long as each of them
    arrives less than ``threshold`` after the previous packet of that stream.
    When a larger gap is seen, the accumulated chunk is stored as
    ``(size, timestamp)``, where ``timestamp`` is the arrival time of the
    packet that starts the next chunk, and that packet opens the new chunk.

    The chunk that is still being accumulated when the file ends is not
    appended to ``'chunks'``; its size stays available in ``'current_chunk'``.

    :param input_file: path of the CSV capture; the columns ``website``,
        ``protocol``, ``dst``, ``timestamp`` and ``len`` are read here
    :param home_ip: only rows whose ``dst`` equals this value are used
    :param website: substring that must occur in the row's ``website`` column
    :param protocols: accepted ``protocol`` values, defaults to ``['TCP']``
    :param threshold: largest gap between two packets of the same chunk, in
        the unit of the ``timestamp`` column (presumably seconds)
    :return: dict mapping each stream id to a dict with the keys ``'chunks'``
        (list of ``(size, timestamp)``), ``'current_chunk'`` (size of the
        chunk in progress) and ``'current_time'`` (time of the last packet)
    """
    # Rows earlier than this offset from the first timestamp are skipped.
    ignore_init = 25
    if protocols is None:
        protocols = ['TCP']
    chunks = {}
    # Only the start of the capture is needed; the duration is not used here.
    first_timestamp, _ = get_time_limits(input_file, website, home_ip,
                                         protocols)
    with open(input_file, 'rb') as csv_file:
        for row in csv.DictReader(csv_file, delimiter=','):
            if website not in row['website']:
                continue
            if row['protocol'] not in protocols:
                continue
            if row['dst'] != home_ip:
                continue
            timestamp = float(row['timestamp']) - first_timestamp
            if timestamp < ignore_init:
                continue
            stream_id = make_stream_id(row)
            if stream_id not in chunks:
                chunks[stream_id] = {
                    'chunks': [],
                    'current_chunk': 0,
                    'current_time': timestamp
                }
            state = chunks[stream_id]
            if timestamp >= state['current_time'] + threshold:
                # Gap detected: close the running chunk before counting
                # this packet.
                state['chunks'].append((state['current_chunk'], timestamp))
                state['current_chunk'] = 0
            # Every packet is counted, including the one that opens a new
            # chunk (previously its length was lost when the counter was
            # reset).
            state['current_chunk'] += int(row['len'])
            state['current_time'] = timestamp
    return chunks
Ejemplo n.º 15
0
def plot_thing(input_file,
               home_ip,
               plot_all=True,
               plot_first=False,
               plot_second=False,
               plot_rest=False,
               website='youtube.com',
               is_incoming=True,
               colors=None,
               chart=None,
               names=None,
               sizes=True,
               protocols=None,
               window=1):
    """Plot the traffic per time bin for selected streams of a capture.

    Rows are kept when their ``website`` contains ``website``, their
    ``protocol`` is in ``protocols``, their ``dst`` (``src`` when
    ``is_incoming`` is false) equals ``home_ip`` and their ``is_ack`` is
    ``'False'``.  Packets are grouped by stream, timestamps are made relative
    to the earliest first packet of any stream, and each selected series is
    binned into slots of ``window`` and drawn with ``plt.plot``.

    :param input_file: path of the CSV capture
    :param home_ip: address the selected column has to equal
    :param plot_all: draw all streams merged into one series
    :param plot_first: draw the stream with the most packets
    :param plot_second: draw the stream with the second most packets; an
        empty series is drawn when there is only one stream
    :param plot_rest: draw all remaining streams merged (everything but the
        top stream, and but the second one too when ``plot_second`` is set)
    :param website: substring that must occur in the ``website`` column
    :param is_incoming: selects whether ``dst`` or ``src`` is compared
    :param colors: one matplotlib colour code per drawn series, in drawing
        order; defaults to ``['r', 'g', 'b']``
    :param chart: optional ``(fig, ax, plot_line_list, plot_name_list)``
        tuple, as returned by this function, to add the series to
    :param names: one legend label per drawn series, in drawing order;
        defaults to ``['All streams', 'Top stream', 'Rest']``
    :param sizes: sum packet lengths when true, count packets otherwise
    :param protocols: accepted ``protocol`` values, defaults to ``['TCP']``
    :param window: width of a time bin in the unit of the ``timestamp``
        column (presumably seconds)
    :return: tuple ``(fig, ax, plot_line_list, plot_name_list)``
    :raises IndexError: when no packet matches the filters, or when more
        series are selected than ``colors`` or ``names`` have entries
    """
    if protocols is None:
        protocols = ['TCP']
    if names is None:
        names = ['All streams', 'Top stream', 'Rest']
    if colors is None:
        colors = ['r', 'g', 'b']

    address_column = 'dst' if is_incoming else 'src'
    streams = {}

    with open(input_file, 'rb') as csv_file:
        for row in csv.DictReader(csv_file, delimiter=','):
            if website in row['website'] and (row['protocol'] in protocols):
                if row[address_column] == home_ip and row[
                        'is_ack'] == 'False':
                    stream_id = make_stream_id(row)
                    packet = (float(row['timestamp']), int(row['len']))
                    streams.setdefault(stream_id, []).append(packet)

    if not streams:
        # Same exception type as the former bare index lookup, with context.
        raise IndexError(
            'no packets matched the given filters in %r' % (input_file,))

    # order the streams by the timestamp of their first packet
    sorted_streams = sorted(streams.values(), key=lambda tuples: tuples[0][0])
    first_timestamp = sorted_streams[0][0][0]

    # subtract first timestamp from all
    sorted_streams = [[(time_stamp - first_timestamp, length)
                       for time_stamp, length in stream]
                      for stream in sorted_streams]

    # merge list of lists (linear, unlike repeated list concatenation)
    all_streams = [packet for stream in sorted_streams for packet in stream]

    def put_in_bins(time_stamps, size=True):
        """
        Puts packets into time bins (e. g. packets of same second)
        :param time_stamps: list of tuples (time_stamp, packet_length)
        :param size: (optional) if True, sums up lengths, otherwise number of packets
        :return: list with one summed length (or packet count) per time bin
        """
        time_bins = []
        for time_stamp, length in time_stamps:
            bin_index = int(time_stamp / window)
            try:
                time_bins[bin_index] += length if size else 1
            except IndexError:
                # pad the bins that saw no packet before opening the new one
                while len(time_bins) != bin_index:
                    time_bins.append(0)
                time_bins.append(length if size else 1)
        return time_bins

    plot_line_list = []
    plot_name_list = []
    if chart is None:
        fig, ax = plt.subplots()
        if sizes:
            ax.set_ylabel('size in KB')
        else:
            ax.set_ylabel('# of packets')
        ax.set_xlabel('time, s')
    else:
        fig, ax, plot_line_list, plot_name_list = chart

    # Rank the streams by packet count once instead of once per selection.
    by_packet_count = sorted(sorted_streams, key=len, reverse=True)
    top_stream = by_packet_count[0]
    # A capture with a single stream must not fail when the second stream
    # was not even requested, so fall back to an empty series.
    second_stream = by_packet_count[1] if len(by_packet_count) > 1 else []
    rest_start_index = 2 if plot_second else 1
    rest_streams = [packet
                    for stream in by_packet_count[rest_start_index:]
                    for packet in stream]

    selections = [
        (plot_all, all_streams),
        (plot_first, top_stream),
        (plot_second, second_stream),
        (plot_rest, rest_streams),
    ]
    all_values = []
    # Colours and names are consumed in drawing order, skipping disabled series.
    enabled_series = [packets for enabled, packets in selections if enabled]
    for i, packets in enumerate(enabled_series):
        values = put_in_bins(packets, sizes)
        all_values.extend(values)
        line = plt.plot(values, colors[i] + '-')
        plot_line_list.append(line[0])
        plot_name_list.append(names[i])

    plt.legend(plot_line_list, plot_name_list)
    if sizes:
        fn, tix = make_storage_ticks(all_values)
        ax.yaxis.set_major_formatter(fn)
        plt.yticks(tix)
    # Tick positions are bin indices; label them with the time they stand for.
    ax.set_xticklabels([int(x * window) for x in ax.get_xticks()])
    return fig, ax, plot_line_list, plot_name_list
Ejemplo n.º 16
0
def plot_thing(input_file, home_ip, plot_all=True, plot_first=False, plot_second=False, plot_rest=False, website='youtube.com', is_incoming=True, colors=None, chart=None, names=None, sizes=True, protocols=None, window=1):
    """Plot the traffic per time bin for selected streams of a capture.

    Rows are kept when their ``website`` contains ``website``, their
    ``protocol`` is in ``protocols``, their ``dst`` (``src`` when
    ``is_incoming`` is false) equals ``home_ip`` and their ``is_ack`` is
    ``'False'``.  Packets are grouped by stream, timestamps are made relative
    to the earliest first packet of any stream, and each selected series is
    binned into slots of ``window`` and drawn with ``plt.plot``.

    :param input_file: path of the CSV capture
    :param home_ip: address the selected column has to equal
    :param plot_all: draw all streams merged into one series
    :param plot_first: draw the stream with the most packets
    :param plot_second: draw the stream with the second most packets; an
        empty series is drawn when there is only one stream
    :param plot_rest: draw all remaining streams merged (everything but the
        top stream, and but the second one too when ``plot_second`` is set)
    :param website: substring that must occur in the ``website`` column
    :param is_incoming: selects whether ``dst`` or ``src`` is compared
    :param colors: one matplotlib colour code per drawn series, in drawing
        order; defaults to ``['r', 'g', 'b']``
    :param chart: optional ``(fig, ax, plot_line_list, plot_name_list)``
        tuple, as returned by this function, to add the series to
    :param names: one legend label per drawn series, in drawing order;
        defaults to ``['All streams', 'Top stream', 'Rest']``
    :param sizes: sum packet lengths when true, count packets otherwise
    :param protocols: accepted ``protocol`` values, defaults to ``['TCP']``
    :param window: width of a time bin in the unit of the ``timestamp``
        column (presumably seconds)
    :return: tuple ``(fig, ax, plot_line_list, plot_name_list)``
    :raises IndexError: when no packet matches the filters, or when more
        series are selected than ``colors`` or ``names`` have entries
    """
    if protocols is None:
        protocols = ['TCP']
    if names is None:
        names = ['All streams', 'Top stream', 'Rest']
    if colors is None:
        colors = ['r', 'g', 'b']

    address_column = 'dst' if is_incoming else 'src'
    streams = {}

    with open(input_file, 'rb') as csv_file:
        for row in csv.DictReader(csv_file, delimiter=','):
            if website in row['website'] and (row['protocol'] in protocols):
                if row[address_column] == home_ip and row['is_ack'] == 'False':
                    stream_id = make_stream_id(row)
                    packet = (float(row['timestamp']), int(row['len']))
                    streams.setdefault(stream_id, []).append(packet)

    if not streams:
        # Same exception type as the former bare index lookup, with context.
        raise IndexError('no packets matched the given filters in %r' % (input_file,))

    # order the streams by the timestamp of their first packet
    sorted_streams = sorted(streams.values(), key=lambda tuples: tuples[0][0])
    first_timestamp = sorted_streams[0][0][0]

    # subtract first timestamp from all
    sorted_streams = [[(time_stamp - first_timestamp, length) for time_stamp, length in stream]
                      for stream in sorted_streams]

    # merge list of lists (linear, unlike repeated list concatenation)
    all_streams = [packet for stream in sorted_streams for packet in stream]

    def put_in_bins(time_stamps, size=True):
        """
        Puts packets into time bins (e. g. packets of same second)
        :param time_stamps: list of tuples (time_stamp, packet_length)
        :param size: (optional) if True, sums up lengths, otherwise number of packets
        :return: list with one summed length (or packet count) per time bin
        """
        time_bins = []
        for time_stamp, length in time_stamps:
            bin_index = int(time_stamp / window)
            try:
                time_bins[bin_index] += length if size else 1
            except IndexError:
                # pad the bins that saw no packet before opening the new one
                while len(time_bins) != bin_index:
                    time_bins.append(0)
                time_bins.append(length if size else 1)
        return time_bins

    plot_line_list = []
    plot_name_list = []
    if chart is None:
        fig, ax = plt.subplots()
        if sizes:
            ax.set_ylabel('size in KB')
        else:
            ax.set_ylabel('# of packets')
        ax.set_xlabel('time, s')
    else:
        fig, ax, plot_line_list, plot_name_list = chart

    # Rank the streams by packet count once instead of once per selection.
    by_packet_count = sorted(sorted_streams, key=len, reverse=True)
    top_stream = by_packet_count[0]
    # A capture with a single stream must not fail when the second stream
    # was not even requested, so fall back to an empty series.
    second_stream = by_packet_count[1] if len(by_packet_count) > 1 else []
    rest_start_index = 2 if plot_second else 1
    rest_streams = [packet for stream in by_packet_count[rest_start_index:] for packet in stream]

    selections = [
        (plot_all, all_streams),
        (plot_first, top_stream),
        (plot_second, second_stream),
        (plot_rest, rest_streams),
    ]
    all_values = []
    # Colours and names are consumed in drawing order, skipping disabled series.
    enabled_series = [packets for enabled, packets in selections if enabled]
    for i, packets in enumerate(enabled_series):
        values = put_in_bins(packets, sizes)
        all_values.extend(values)
        line = plt.plot(values, colors[i] + '-')
        plot_line_list.append(line[0])
        plot_name_list.append(names[i])

    plt.legend(plot_line_list, plot_name_list)
    if sizes:
        fn, tix = make_storage_ticks(all_values)
        ax.yaxis.set_major_formatter(fn)
        plt.yticks(tix)
    # Tick positions are bin indices; label them with the time they stand for.
    ax.set_xticklabels([int(x * window) for x in ax.get_xticks()])
    return fig, ax, plot_line_list, plot_name_list