Example #1
def main(flow_pickle_file, model_name):
    # load flow data
    with open(flow_pickle_file, "rb") as raw_pickle:
        print('Opening pickle...')
        pickle_data = pickle.load(raw_pickle)
        print('Loaded!')

        flows = pickle_data

        for flow_tuple in flows:
            iterations_length = len(flows[flow_tuple]['iteration_bins'])
            break

    # prepare plot
    plt.title(
        'CDF of flow sizes in {} iterations of {} training\nDump file:{}'.
        format(iterations_length, model_name, flow_pickle_file))
    plt.ylabel('% of flow sizes per iteration')
    plt.xlabel('flow size per iteration [MB per iteration]')

    # extract information from flow data
    useful_flow_count = 0
    flow_strs = []
    for flow_tuple in flows:
        # flow identifiers, size, and packets
        src_tuple, dst_tuple = flow_tuple
        src_address, src_port = src_tuple
        dst_address, dst_port = dst_tuple
        flow_size = flows[flow_tuple]['flow_size_bytes']
        iteration_bins = flows[flow_tuple]['iteration_bins']

        # flow size must be nontrivial
        # if flow_size < MIN_FLOW_SIZE_THRESHOLD_BYTES:
        #     continue

        # formatted flow string
        flow_str = 'Flow {}: {} ~~> {} sent {} GB'.format(
            useful_flow_count + 1, src_address[-2:], dst_address[-2:],
            flow_size / 10**9)
        flow_strs.append(flow_str)
        print(flow_str)

        bin_sum = sum(iteration_bins)
        print('Disparity: {} bytes'.format(flow_size - bin_sum))

        # scale iteration_bins sizes to MB
        MB_iteration_bins = []
        for size in iteration_bins:
            size_MB = size / (10**6)
            MB_iteration_bins.append(size_MB)

        add_cdf_to_plot(MB_iteration_bins, plt)
        useful_flow_count += 1

    plt.legend(tuple(flow_strs))
    plt.show()
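
# Note: add_cdf_to_plot() is used throughout these examples but never shown.
# A minimal sketch consistent with the call sites above (returns the sorted
# samples plus cumulative percentages, and optionally draws them on a
# matplotlib axes or on the plt module itself); the real helper may differ.
import numpy as np

def add_cdf_to_plot(values, ax=None):
    # sort the samples and compute the cumulative percentage at each one
    items = np.sort(np.asarray(values, dtype=float))
    counts = np.arange(1, len(items) + 1) / len(items) * 100
    if ax is not None:
        ax.plot(items, counts)
    return items, counts
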
def main(flows_pickle_file, output_csv_file):
    # load flows
    with open(flows_pickle_file, "rb") as raw_pickle:
        flows = pickle.load(raw_pickle)

    # keep flows with a large flow_size_bytes
    flows = {k: v for k, v in flows.items() if v['flow_size_bytes'] > 10**9}

    # aggregate all flows' size per iteration
    all_sizes_per_iteration = []
    for flow_tuple in flows:
        all_sizes_per_iteration.extend(flows[flow_tuple]['iteration_bins'])

    # calculate cdf
    all_sizes_per_iteration = sorted(s for s in all_sizes_per_iteration
                                     if s != 0)
    items, counts = add_cdf_to_plot(all_sizes_per_iteration)

    # write cdf to output csv file
    with open(output_csv_file, 'w') as csv_file:

        # open csv and write header
        fieldnames = ['bytes_per_iteration', 'cdf']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

        # write the flow size value
        for i, c in zip(items, counts):
            writer.writerow({'bytes_per_iteration': i, 'cdf': c / 100})

    plt.plot(items, counts)
    plt.show()
def do_for_model(model_name):
    print(model_name)
    model_dir = os.path.join(CERBERUS_PATH, model_name)

    # populate iteration durations and flow sizes
    durations = []
    flow_sizes = []
    summary_file = os.path.join(
        model_dir, f'{model_name}_summary_of_iteration_and_flow_size.csv')
    with open(summary_file) as csvfile:
        # skip header
        next(csvfile, None)
        reader = csv.reader(csvfile, delimiter=',')
        for _, iteration_duration_seconds, flow_size_bytes_in_iteration in reader:
            durations.append(float(iteration_duration_seconds))
            flow_sizes.append(float(flow_size_bytes_in_iteration))

    # return CDFs for iteration and flow
    dItems, dCounts = add_cdf_to_plot(durations)
    fItems, fCounts = add_cdf_to_plot(flow_sizes)
    return (dItems, dCounts), (fItems, fCounts)
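
# Hypothetical usage of do_for_model(): overlay the iteration-duration CDFs of
# several models on one plot. The model names below are placeholders.
for example_model in ['model_a', 'model_b']:
    (d_items, d_counts), _ = do_for_model(example_model)
    plt.plot(d_items, [c / 100 for c in d_counts], label=example_model)
plt.legend()
plt.show()
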
Example #4
def main(iteration_logs, model_name, output_time_cdf):
    # parse file into iterations
    with open(iteration_logs, 'r') as iter_file:
        raw_text = iter_file.read()
        iterations = get_iterations_list(raw_text)

    # fetch length of iterations
    numbers = list(map(lambda it: it.number, iterations))
    lengths = list(map(lambda it: it.end_time - it.start_time, iterations))

    # calculate stats
    avg_length = np.average(lengths)
    avg_text = 'Average duration of iterations is {} seconds'.format(
        avg_length)

    # plot iteration times
    _, ax1 = plt.subplots()
    ax1.plot(numbers, lengths)
    ax1.set_ylabel('Iteration duration in [second]')
    ax1.set_xlabel('Iteration identifier [0-based index]')
    ax1.set_title('Durations of {} iterations of {} training\n{}'.format(
        len(numbers), model_name, avg_text))

    # plot CDF of iteration durations
    _, ax2 = plt.subplots()
    ax2.set_title(
        'CDF of iterations from {} iterations of {} training\n'.format(
            len(iterations), model_name),
        fontdict={'fontsize': 30})
    ax2.set_ylabel('Percent of Iterations', fontsize=30)
    ax2.set_xlabel('Iteration Duration (s)', fontsize=30)
    ax2.tick_params(axis='x', labelsize=30)
    ax2.tick_params(axis='y', labelsize=30)
    ax2.set_xlim(left=0.3)
    items, counts = add_cdf_to_plot(lengths, ax2)

    # write csv for iteration durations
    with open(output_time_cdf, 'w') as csv_file:

        # open csv and write header
        fieldnames = ['iteration_duration_seconds', 'cdf']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

        # write one row per iteration duration
        for i, c in zip(items, counts):
            writer.writerow({'iteration_duration_seconds': i, 'cdf': c / 100})

    plt.show()
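
# get_iterations_list() is not included in these snippets. A minimal sketch,
# assuming (hypothetically) that each iteration is logged as
# 'iteration <n> start <timestamp>' and 'iteration <n> end <timestamp>';
# the real log format may well differ.
import re
from collections import namedtuple

Iteration = namedtuple('Iteration', ['number', 'start_time', 'end_time'])

def get_iterations_list(raw_text):
    starts, ends = {}, {}
    for match in re.finditer(r'iteration (\d+) (start|end) ([\d.]+)', raw_text):
        number, kind, ts = int(match.group(1)), match.group(2), float(match.group(3))
        (starts if kind == 'start' else ends)[number] = ts
    # keep only iterations that have both a start and an end time
    return [Iteration(n, starts[n], ends[n]) for n in sorted(starts) if n in ends]
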
def main(summary_file, model_name):
    iteration_durations = []
    flow_sizes_in_iteration = []
    num_iters = float('-inf')

    # populate iteration duration and flow size information
    with open(summary_file, newline='') as csvfile:
        # skip header
        next(csvfile, None)
        reader = csv.reader(csvfile, delimiter=',')
        for iteration_number, iteration_duration_seconds, flow_size_bytes_in_iteration in reader:
            num_iters = max(num_iters, int(iteration_number))
            iteration_durations.append(float(iteration_duration_seconds))
            flow_sizes_in_iteration.append(float(flow_size_bytes_in_iteration) / (10**9))

    # plot CDF of iteration durations
    # plt.subplot(211)
    # items, counts = add_cdf_to_plot(iteration_durations, plt)
    # plt.plot(items, counts)
    # plt.title('CDF of iteration durations from {} iterations of {} training\n'.format(num_iters, model_name))
    # plt.ylabel('% of iteration durations')
    # plt.xlabel('iteration duration in [second]')

    # plot CDF of flow sizes
    # plt.subplot(212)
    items, counts = add_cdf_to_plot(flow_sizes_in_iteration, plt)
    for i, c in zip(items, counts):
        print((i, c))
    plt.plot(items, counts)
    plt.title('CDF of flow size per iteration from {} iterations of {} training\n'.format(num_iters, model_name), fontdict={'fontsize':30})
    plt.ylabel('% of flow sizes per iteration', fontsize=30)
    plt.xlabel('flow size per iteration in [GB]', fontsize=30)
    plt.tick_params(axis='x', labelsize=30)
    plt.tick_params(axis='y', labelsize=30)

    plt.show()
def main(iteration_logs, model_name, output_time_cdf):
    # parse file into iterations
    with open(iteration_logs, 'r') as iter_file:
        raw_text = iter_file.read()
        iterations = get_iterations_list(raw_text)

    # fetch length of iterations
    numbers = list(map(lambda it: it.number, iterations))
    lengths = list(map(lambda it: it.end_time - it.start_time, iterations))

    # calculate stats
    avg_length = np.average(lengths)
    avg_text = 'Average duration of iterations is {} seconds'.format(avg_length)
    
    # plot iteration times
    plt.plot(numbers, lengths)
    plt.ylabel('Iteration duration in [second]')
    plt.xlabel('Iteration identifier [0-based index]')
    plt.title('Durations of {} iterations of {} training\n{}'.format(len(numbers), model_name, avg_text))

    # write csv for iteration durations
    items, counts = add_cdf_to_plot(lengths)
    with open(output_time_cdf, 'w') as csv_file:

        # open csv and write header
        fieldnames = ['iteration_duration_seconds', 'cdf']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

        # write one row per iteration duration
        for i, c in zip(items, counts):
            writer.writerow({'iteration_duration_seconds': i, 'cdf': c / 100})

    plt.show()
def make_iterations_cdf_csv(model):
    iterfilename = f'hvd_out_{model}'
    model_dir = os.path.join(path, model)
    iteration_duration_outfile = f'{model}_iterations_cdf.csv'

    # parse file into iterations
    with open(os.path.join(model_dir, iterfilename), 'r') as iter_file:
        raw_text = iter_file.read()
        iterations = get_iterations_list(raw_text)
        lengths = list(map(lambda it: it.end_time - it.start_time, iterations))

    # write csv for iteration durations
    with open(os.path.join(model_dir, iteration_duration_outfile), 'w') as csv_file:
        items, counts = add_cdf_to_plot(lengths)
        # open csv and write header
        fieldnames = ['iteration_duration_seconds', 'cdf']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        # write one row per iteration duration
        for i, c in zip(items, counts):
            writer.writerow({'iteration_duration_seconds': i, 'cdf': c / 100})

    return iterations
def make_flows_cdf_csv(model):
    model_dir = os.path.join(path, model)
    flow_cdf_csv_outfile = f'{model}_flow_size_per_iteration_cdf.csv'
    all_flows = []
    
    # add all relevant flows
    flow_size_per_iterations = []
    pickle_dir = os.path.join(model_dir, 'flow_pickles')
    slice_dir = os.path.join(pickle_dir, 'slice')
    for pickle_file_name in sorted(os.listdir(slice_dir)):
        # construct path to the flow pickle file
        flow_pickle_file_path = os.path.join(slice_dir, pickle_file_name)
        # fetch flows from pickle
        flows = flows_from_pickle(flow_pickle_file_path)
        for ft in flows:
            # if flows[ft]['flow_size_bytes'] >= 10**9:
            #     all_flows.append(flows[ft])
            #     flow_size_per_iterations.extend(flows[ft]['iteration_bins'])
            all_flows.append(flows[ft])
            # flow_size_per_iterations.extend(flows[ft]['iteration_bins'])
            for fspi in flows[ft]['iteration_bins']:
                if fspi > FLOW_SIZE_THRESH_BYTES:
                    flow_size_per_iterations.append(fspi)
    
    # write csv of flow sizes
    with open(os.path.join(model_dir, flow_cdf_csv_outfile), 'w') as csv_file:
        items, counts = add_cdf_to_plot(flow_size_per_iterations)
        # open csv and write header
        fieldnames = ['bytes_per_iteration', 'cdf']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        # write the flow size value
        for i, c in zip(items, counts):
            writer.writerow({'bytes_per_iteration': i, 'cdf': c / 100})

    return all_flows
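
# flows_from_pickle() is not defined in these snippets. Judging from the
# loading code in Example #1, it most likely just unpickles a dict keyed by
# ((src_ip, src_port), (dst_ip, dst_port)) whose values carry
# 'flow_size_bytes' and 'iteration_bins'; a minimal sketch under that assumption:
import pickle

def flows_from_pickle(flow_pickle_file_path):
    with open(flow_pickle_file_path, 'rb') as raw_pickle:
        return pickle.load(raw_pickle)
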
        # fetch flows from pickle
        flows = flows_from_pickle(flow_pickle_file_path)

        # add each flow's iteration bins to the total
        for flow_tuple in tqdm(flows,
                               total=len(flows),
                               desc=f"Accumulating {model_name} flows..."):

            # record total flow size
            flow_size = flows[flow_tuple]['flow_size_bytes']
            flow_sizes.append(flow_size)

            # record flow sizes per iteration
            iteration_bins_bytes = flows[flow_tuple]['iteration_bins']
            all_iteration_bins.extend(iteration_bins_bytes)

    # calculate flow total bytes sent cdf
    flow_sizes = sorted(s for s in flow_sizes if s != 0)
    items, counts = add_cdf_to_plot(flow_sizes)

    # write cdf of flow's bytes sent
    write_flow_bytes_sent_csv(model_name, items, counts)

    # calculate flow size per iteration cdf
    all_iteration_bins = sorted(s for s in all_iteration_bins if s != 0)
    items, counts = add_cdf_to_plot(all_iteration_bins)

    # write cdf of flow size per iteration
    write_flow_size_per_iteration_bytes_csv(model_name, items, counts)
    for flow_pickle_filename in os.listdir(slice_flow_directory):

        # construct path to the flow pickle file
        flow_pickle_file_path = os.path.join(slice_flow_directory,
                                             flow_pickle_filename)

        # fetch flows from pickle
        flows = flows_from_pickle(flow_pickle_file_path)

        # add each flow's iteration bins to the total
        for flow_tuple in tqdm(flows,
                               total=len(flows),
                               desc=f"Accumulating {model_name} flows..."):
            # scale iteration_bins flow sizes to GB
            gigabyte_flow_iteration_bins = map(
                lambda s: s / 10**9, flows[flow_tuple]['iteration_bins'])
            all_iteration_bins.extend(gigabyte_flow_iteration_bins)

    # calculate cdf
    all_iteration_bins = sorted(s for s in all_iteration_bins if s != 0)
    items, counts = add_cdf_to_plot(all_iteration_bins)

    # normalize counts to CDF [0, 1]
    counts = [c / 100 for c in counts]

    # plot the flows' cdf for this model
    plt.plot(items, counts, linewidth=10.0, label=f"{model_name} flows")

# show legend and plot
plt.legend()
plt.show()
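
# write_flow_bytes_sent_csv() and write_flow_size_per_iteration_bytes_csv()
# are not shown above. A minimal sketch of the first, following the same
# csv.DictWriter pattern used in the other examples; the output file name and
# field name are assumptions, not the project's actual choices.
import csv

def write_flow_bytes_sent_csv(model_name, items, counts):
    with open(f'{model_name}_flow_bytes_sent_cdf.csv', 'w') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=['flow_size_bytes', 'cdf'])
        writer.writeheader()
        # one row per CDF point, scaling percent down to [0, 1]
        for i, c in zip(items, counts):
            writer.writerow({'flow_size_bytes': i, 'cdf': c / 100})
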
Example #11
    with open(flow_size_file_path) as flow_size_file:
        model_flows = json.load(flow_size_file)

    data_flows_counter = 0
    for flow in model_flows:
        flow_src = flow['flow_src_ip']
        flow_dst = flow['flow_dst_ip']
        flow_size_GB = flow['flow_size_bytes'] / 10**9
        flow_size_per_iteration = flow['flow_size_per_iteration']
        flow_size_GB_per_iteration = [
            s / 10**9 for s in flow_size_per_iteration
        ]

        print(flow_size_GB, sum(flow_size_GB_per_iteration))

        # keep only nontrivial data flows (size thresholds currently disabled)
        # if flow_size_GB <= 4 * 10**-8:
        # if flow_size_GB <= 1:
        #     continue

        data_flows_counter += 1
        flow_str = '{} flow #{}: {} ~~> {} sent {} GB'.format(
            model_name, data_flows_counter, flow_src, flow_dst, flow_size_GB)
        flow_strs.append(flow_str)
        print(flow_str)

        add_cdf_to_plot(flow_size_GB_per_iteration, plt)

plt.legend(flow_strs)
plt.show()
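
# The JSON file loaded in Example #11 is not included. Based on the fields
# accessed above, each record presumably has this shape (the values here are
# purely illustrative, not real measurements):
example_flow_record = {
    'flow_src_ip': '10.0.0.1',
    'flow_dst_ip': '10.0.0.2',
    'flow_size_bytes': 2 * 10**9,
    'flow_size_per_iteration': [2 * 10**7] * 100,
}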