Example #1
def main():
    print("Loading...")
    fullset = common.load_data(FULLSET_PATH, sep=',')

    types = get_types(fullset)

    print("Predicting...")
    uncertain_mask = (types == UNCERTAIN_LABEL)
    uncertainset = fullset[uncertain_mask]
    probs = get_probs_for_uncertain(uncertainset)
    linenum_to_probs = {
        idx: prob
        for idx, prob in zip(np.nonzero(uncertain_mask)[0], probs)
    }

    print("Deciding...")
    probs_and_predictions = []
    for i, (row, type_) in enumerate(zip(fullset, types)):
        if type_ == UNCERTAIN_LABEL:
            probs = linenum_to_probs[i].tolist()
            prediction, order = check_and_decide(row[:common.N_DISASTER],
                                                 probs)
            probs_and_predictions.append(probs + [prediction] + [order + 1])
        elif type_ == -99:
            probs_and_predictions.append([-99] * (common.N_CLASS + 2))
        else:
            probs = [0.0] * common.N_CLASS
            probs[type_] = 1.0
            probs_and_predictions.append(probs + [type_] + [0])

    print("Saving...")
    common.save_data(
        np.concatenate((types[:, np.newaxis], probs_and_predictions), axis=1),
        OUTPUT_PATH)
Example #2
def main():
    matrices = [correct_nodata(common.load_data(path, skip=6)) for path in INPUT_PATHS]

    matrix = np.dstack(matrices)
    rows = matrix.reshape(-1, len(MAP_NAMES))

    common.save_data(rows, OUTPUT_PATH)
Example #3
def get_fires():
    """!
    Load fire data
    @return None
    """
    stmt_select = r'''SELECT
    f.FIREID,
    c.ORGUNITCODE || TO_CHAR(f.FIRESEQ, '000') AS FIRENAME,
    f.LATITUDE,
    f.LONGITUDE
FROM
    (SELECT *
    FROM FIRE f2
    WHERE f2.FIREID NOT IN 
        (SELECT DISTINCT FIREID
        FROM FIRESTATUS s
        WHERE CONDITION = 'OUT')) f
    LEFT JOIN CT_ORGUNIT c ON c.ORGUNIT=f.ORGUNIT
'''
    local_table = r'[ACTIVEFIRE]'
    index = ['FIREID']
    delete_all = True
    # need to fix longitude because they're all positive even though they're in the west
    result = read_remote(stmt_select)
    if result is None:
        return
    df = result.set_index(index)
    df['LONGITUDE'] = df['LONGITUDE'].apply(lambda x: -abs(x))
    common.save_data(local_table, df, delete_all, DBNAME)
Example #4
    async def _shutdown(self, ctx):
        """Shut down the bot."""
        user = ctx.message.author
        if str(user) in common.admins:
            common.save_data()
            await self.bot.say("Goodbye!\nVocamon has shut down.")
            import sys
            sys.exit(0)
Example #5
def main():
    print('Loading...')
    fullset = common.load_data(FULLSET_PATH, sep=',')

    print('Processing...')
    trainset = get_trainset(fullset, upward=True)

    print('Saving...')
    common.save_data(trainset, OUTPUT_PATH)

    print('Done!')
Example #6
def run_convbase_on_images(src_folder, dst_file, image_shape, force_all=False):
    """
    Run the imported convolutional base on the cropped images.
    Save the result in a file.

    Parameters
    ----------
    src_folder (str):
        The folder of the images which have been cropped.
    dst_file (str):
        The destination file of images which have been processes in the convolutional base.
    force_all (bool), default False.
        If false, process only images which are present in the src_folder, but not the dst_folder.
        If True, delete everything in the dst_folder and process everything in the src_folder.
    Returns
    -------
        None
    """

    if force_all:
        xd = XData()
    else:
        try:
            with open(dst_file, "rb") as f:
                xd = pickle.load(f)
        except FileNotFoundError:
            xd = XData()

    src_images = os.listdir(src_folder)
    dst_images = xd.get_stored_filenames()
    missing_in_dst = list(set(src_images).difference(set(dst_images)))
    convbase = get_VGG16_convbase(image_shape)

    for i, filename in enumerate(missing_in_dst):
        print(f"{i + 1}/{len(missing_in_dst)}: Processing {filename}")
        src_path = os.path.join(src_folder, filename)
        im = cv2.imread(src_path)
        im = normalize_image(im)

        single_image_batch = np.array([im])
        after_convbase_single = convbase.predict(single_image_batch)
        after_convbase_single = after_convbase_single.flatten()

        xd.add_image(filename, after_convbase_single, assert_dimensions=1)

        if i % 100 == 0:
            print("Saving to avoid losing work if interrupted...")
            save_data(xd, dst_file)

    save_data(xd, dst_file)

    return xd
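
A minimal usage sketch for run_convbase_on_images; the folder, destination file, and image shape below are illustrative assumptions, not values taken from the original project:

if __name__ == "__main__":
    # Hypothetical paths and input shape; force_all=False only processes new images.
    xdata = run_convbase_on_images(src_folder="data/cropped",
                                   dst_file="data/convbase_features.pkl",
                                   image_shape=(150, 150, 3),
                                   force_all=False)
    print(f"{len(xdata.get_stored_filenames())} images stored")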
Example #7
    async def _restart(self, ctx):
        """Restart the bot."""
        user = ctx.message.author
        if str(user) in common.admins:
            print("Restart command received. Restarting...")
            await self.bot.say("Bot restarting...")
            common.save_data()

            import sys
            python = sys.executable
            sys.stdout.flush()
            # os.execl expects the executable path first, followed by its argv
            os.execl(python, python, "bot.py")
        else:
            await self.bot.say("{0} is not an administrator".format(user))
Example #8
def copy_table(stmt_select, local_table, index, delete_all=False):
    """!
    Copy contents of remote table to local database
    @param stmt_select SQL statement for reading data
    @param local_table Table in local database to insert into
    @param index Index for data that gets read
    @param delete_all Whether or not to clear entire table before adding data
    @return None
    """
    # insert_data(read_remote(stmt_select), local_table)
    # need to set index or it screws up and tries to insert 'Index' column
    result = read_remote(stmt_select)
    if result is not None:
        common.save_data(SCHEMA + '.' + local_table, result.set_index(index),
                         delete_all, DBNAME)
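
A hedged usage sketch of copy_table, following the same pattern as the CT_WSTN helper further down; the table, statement, and index names here are hypothetical:

def copy_ct_agency():
    """!
    Copy CT_AGENCY (hypothetical code table) via the generic helper
    @return None
    """
    stmt_select = r'select * from ct_agency'
    local_table = r'[CT_AGENCY]'
    index = ['AGENCYCODE']
    # clear the local table before inserting the fresh copy
    copy_table(stmt_select, local_table, index, delete_all=True)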
Example #9
def main():
    fullset = common.load_data(FULLSET_PATH, sep=',')

    clust_samples = get_clust_samples(fullset)

    km = KModes(n_clusters=N_CLUST, n_init=N_INIT, init='Huang', verbose=True)
    clust_labels = km.fit_predict(clust_samples)

    label_to_codes = get_label_to_codes(clust_samples, clust_labels)

    with open(JSON_PATH, 'w') as f:
        json.dump(label_to_codes, f, sort_keys=True)

    common.save_data([[km.cost_]] + km.cluster_centroids_.tolist(),
                     RESULT_PATH)
Example #10
def copy_ct_wstn():
    """!
    Copy CT_WSTN (code table for weather stations)
    @return None
    """
    stmt_select = r'select * from ct_wstn'
    local_table = r'[CT_WSTN]'
    index = ['WSTNCODE']
    delete_all = True
    # copy_table(stmt_select, local_table, index, True)
    # need to fix longitude because they're all positive even though they're in the west
    result = read_remote(stmt_select)
    if result is None:
        return
    df = result.set_index(index)
    df['LONGITUDE'] = df['LONGITUDE'].apply(lambda x: -abs(x))
    common.save_data(SCHEMA + '.' + local_table, df, delete_all, DBNAME)
Example #11
    def run(self):
        cm.create_all_directories(
            [os.path.join(cm.checkpoint_path, 'feature_creation')])
        df = self.read_data()
        df = self.create_features(df)
        cm.save_data(
            df,
            os.path.join(
                cm.cleaned_data_path, 'regression',
                'features_extractor_sample_rate_' +
                str(self.sample_size).replace('.', '') + '_version_' +
                str(self.version) + '.csv'))
        pd.DataFrame().to_csv(
            os.path.join(
                cm.checkpoint_path, 'feature_creation',
                'success_sample_rate_' +
                str(self.sample_size).replace('.', '') + '_version_' +
                str(self.version) + '.csv'))
Example #12
def process_trace(pcap_filepath,
                  graph_dir_exp,
                  stat_dir_exp,
                  failed_conns_dir_exp,
                  acksize_tcp_dir_exp,
                  tcpcsm,
                  mptcp_connections=None,
                  print_out=sys.stdout):
    """ Process a tcp pcap file and generate stats of its connections """
    cmd = ['tstat', '-s', os.path.basename(pcap_filepath[:-5]), pcap_filepath]

    try:
        connections = process_tstat_cmd(cmd,
                                        pcap_filepath,
                                        keep_log=True,
                                        graph_dir_exp=graph_dir_exp)
    except TstatError as e:
        print(str(e) + ": skip process", file=sys.stderr)
        return

    # Dictionary of all TCP connections that tried to become MPTCP subflows but failed
    failed_conns = {}

    if tcpcsm:
        retransmissions_tcpcsm(pcap_filepath, connections)

    inverse_conns = create_inverse_tcp_dictionary(connections)

    acksize_all = compute_tcp_acks_retrans(pcap_filepath, connections,
                                           inverse_conns)
    acksize_all_mptcp = {co.C2S: {}, co.S2C: {}}

    if mptcp_connections:
        for flow_id in connections:
            # Copy info to mptcp connections
            copy_info_to_mptcp_connections(connections, mptcp_connections,
                                           failed_conns, acksize_all,
                                           acksize_all_mptcp, flow_id)

    # Save connections info
    if mptcp_connections:
        co.save_data(pcap_filepath, acksize_tcp_dir_exp, acksize_all_mptcp)
        # Also save TCP connections that failed to be MPTCP subflows
        co.save_data(pcap_filepath, failed_conns_dir_exp, failed_conns)
    else:
        co.save_data(pcap_filepath, acksize_tcp_dir_exp, acksize_all)
        co.save_data(pcap_filepath, stat_dir_exp, connections)
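
A hedged call sketch for the TCP process_trace above; the pcap path and output directories are placeholders, and tcpcsm is disabled so the retransmission pass is skipped:

if __name__ == "__main__":
    # Hypothetical file and directories; when mptcp_connections is None the
    # per-connection stats and TCP ack sizes are saved directly.
    process_trace("traces/client_00.pcap",
                  graph_dir_exp="graphs",
                  stat_dir_exp="stats",
                  failed_conns_dir_exp="failed_conns",
                  acksize_tcp_dir_exp="acksize_tcp",
                  tcpcsm=False)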
Example #13
def update_file(childs, last_upd):
    """Update data file and log file.

    The log file creates an history to be used in the future.
    """
    if common.DATA_FORMAT == common.JSON:
        last_upd_str = str(last_upd)  # ISO format
        common.save_data([childs, last_upd_str])
    elif common.DATA_FORMAT == common.PKL:
        common.save_data([childs, last_upd])
    else:
        # error
        pass

    if common.LOG:
        if common.DATA_FORMAT == common.JSON:
            last_upd_str = str(last_upd)
            common.save_data([childs, last_upd_str], common.LOG_FILE)
        elif common.DATA_FORMAT == common.PKL:
            common.save_data([childs, last_upd], common.LOG_FILE)
        else:
            # error
            pass
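
A small sketch of how update_file might be called, assuming childs is the list of records to persist and the timestamp comes from datetime.datetime.now():

import datetime

# Hypothetical records; with common.DATA_FORMAT == common.JSON the timestamp
# is stored as its ISO-format string.
childs = [{'name': 'Alice', 'status': 'ok'}]
update_file(childs, datetime.datetime.now())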
Example #14
def create_setup():
    """Copy files from template and update them with user input."""
    global app_name, app_version, app_license, app_author, app_email, \
        app_url, app_keywords, DEFAULT_AUTHOR, DEFAULT_EMAIL, \
        DEFAULT_LICENSE, DEFAULT_URL, DEFAULT_VERSION

    data_lst = common.load_data()
    if data_lst:
        (DEFAULT_AUTHOR, DEFAULT_EMAIL, DEFAULT_LICENSE, DEFAULT_URL,
         DEFAULT_VERSION) = data_lst

    while not app_name:
        app_name = input(lcl.Q_APP_NAME).decode(lcl.INPUT_ENC)

    app_version = input(lcl.Q_APP_VERSION + '[' + DEFAULT_VERSION +
                        '] ').decode(lcl.INPUT_ENC)
    if not app_version:
        app_version = DEFAULT_VERSION

    app_license = input(lcl.Q_APP_LICENSE + '[' + DEFAULT_LICENSE +
                        '] ').decode(lcl.INPUT_ENC)
    if not app_license:
        app_license = DEFAULT_LICENSE

    app_author = input(lcl.Q_APP_AUTHOR + '[' + DEFAULT_AUTHOR +
                       '] ').decode(lcl.INPUT_ENC)
    if not app_author:
        app_author = DEFAULT_AUTHOR

    app_email = input(lcl.Q_APP_EMAIL + '[' + DEFAULT_EMAIL +
                      '] ').decode(lcl.INPUT_ENC)
    if not app_email:
        app_email = DEFAULT_EMAIL

    app_url = input(lcl.Q_APP_URL + '[' + DEFAULT_URL +
                    '] ').decode(lcl.INPUT_ENC)
    if not app_url:
        app_url = DEFAULT_URL

    app_keywords = input(lcl.Q_APP_KEYWORDS).decode(lcl.INPUT_ENC)
    if not app_keywords:
        app_keywords = app_name

    data_lst = [app_author, app_email, app_license, app_url, app_version]
    common.save_data(data_lst)

    app_url += app_name

    # backup existing files
    backup = False
    filenames = glob.glob('*')
    filenames += glob.glob('.*')
    if filenames:
        backup = True
        os.mkdir(BAK_DIR)
        for filename in filenames:
            dest = BAK_DIR + '/' + filename.split(os.sep)[-1]
            shu.move(filename, dest)

    filenames = glob.glob(common.DATA_PATH + 'template/*')
    filenames += glob.glob(common.DATA_PATH + 'template/.*')
    # remove doc dir
    filenames = [filename for filename in filenames
                 if 'template' + os.sep + 'doc' not in filename]

    # copy files and dirs
    for filename in filenames:
        if os.path.isfile(filename):
            shu.copyfile(filename, filename.split(os.sep)[-1])
        else:
            shu.copytree(filename, filename.split(os.sep)[-1])

    common.sleep(2)

    os.rename('APPLICATION_NAME', app_name)  # rename application dir

    # collect all filenames, including from 1st level subdirs
    filenames = glob.glob('*')
    filenames = [filename for filename in filenames if BAK_DIR not in filename]
    filenames += glob.glob('.*')
    new_filenames = []
    for filename in filenames:
        if os.path.isdir(filename):
            new_filenames += glob.glob(filename + '/*')
    filenames += new_filenames

    exceptions = ['__init__.py', 'build.cmd', 'requirements.txt',
                  'requirements-dev.txt', 'setup.py', 'setup_py2exe.py',
                  'setup_utils.py']

    # delete .pyc files and update files
    for filename in filenames:
        if os.path.isfile(filename):
            if '.pyc' in filename:
                os.remove(filename)
            else:
                if filename.split(os.sep)[-1] not in exceptions:
                    update_file(filename)

    create_redir2rtd_zip()

    if backup:
        os.remove(app_name + APPLICATION_TEMPLATE_FILE)  # remove app template
        # restore files from backup, but only if they don't already exist
        filenames = glob.glob(BAK_DIR + '/*')
        for filename in filenames:
            dest = app_name + '/' + filename.split(os.sep)[-1]
            if not os.path.isfile(dest):
                shu.copyfile(filename, dest)
    else:
        os.rename(app_name + APPLICATION_TEMPLATE_FILE,
                  app_name + '/' + app_name + '.py')  # rename app template

    print(lcl.REMINDERS)
Example #15
import os
import csv
import numpy as np

from a3_1 import run_kmeans
from a3_2 import run_mog
from common import save_data



if __name__ == '__main__':
    data = np.load('data100D.npy')

    # Run 2.2.3
    for k in [5, 10, 15, 20, 30]:
        result = run_kmeans(k, data, epochs=1000, tol=1e-6)
        save_data(result, '2.2.3', 'kmeans-%s' % str(k))
    
    for k in [5, 10, 15, 20, 30]:
        result = run_mog(k, data, epochs=1000, tol=1e-8)
        save_data(result, '2.2.3', 'mog-%s' % str(k))
Example #16
            #     index = (CD.data[:, D.RT_fast[PLAYER]] is True) | (CD.data[:, D.RT_slow[PLAYER]] is True)
            #     CD.cursor_xy2[:, :, index, PLAYER] = np.nan
            #     CD.gaze[:, :, index, :] = np.NaN

            return CD

        CD_results[SESSION] = [None] * common.Number.control

        for CONTROL in range(0, common.Number.control):
            CD_results[SESSION][CONTROL] = limit_to_control_condition(
                CONTROL, data, cursor_xy2, target_position, gaze[:, :, :, :,
                                                                 SESSION])

    ## save results

    common.save_data(CD_results, output_directory + "CD_results.bin")
    test = common.load_data(output_directory + "CD_results.bin")

else:
    test = common.load_data(output_directory + "CD_results.bin")

## psychopy setup

is_play_movies = True

if is_play_movies:

    import psychopy.visual
    #import psychopy.event

    win = psychopy.visual.Window(size=[1920, 1080],
Example #17
def main():
    fullset = common.load_data(FULLSET_PATH, sep=',')
    codes = get_codes(fullset)
    uncertain_mask = (codes == common.N_CLASS)

    uncertain_set = fullset[uncertain_mask]
    uncertain_features = common.onehot_encode(
        uncertain_set[:, common.N_DISASTER:], 0)

    trainset = common.load_data(TRAINSET_PATH, sep=',')
    trainset = common.onehot_encode(trainset, 0)

    prob_sum = np.zeros((uncertain_features.shape[0], common.N_CLASS))
    for i in range(N_MODEL):
        x_train, _, _, _ = common.split(trainset, i)
        _, normalized_features = common.normalize(x_train, uncertain_features)
        prob_sum += tf.keras.models.load_model(common.numbering(
            MODEL_PATH, i)).predict(normalized_features)
        print(i, ' is done.')
    probs = prob_sum / N_MODEL
    linenum_to_prob = {
        idx: prob
        for idx, prob in zip(np.nonzero(uncertain_mask)[0], probs)
    }

    # unpredicted map
    common.save_map(codes.reshape(common.N_ROWS, -1), UNPRED_OUTPUT_PATH)

    # predicted map
    counter = [0] * common.N_CLASS
    predicted_map = codes.copy()
    for i, (row, code) in enumerate(zip(fullset, codes)):
        if code == common.N_CLASS:
            predicted_map[i], order = check_and_decide(row[:common.N_DISASTER],
                                                       linenum_to_prob[i])
            counter[order] += 1
    common.save_map(predicted_map.reshape(common.N_ROWS, -1), PRED_OUTPUT_PATH)
    print(counter)

    # full_probs
    encoded_codes = get_encoded_codes(fullset)
    certain_mask = (codes < common.N_CLASS) & (codes >= 0)

    certain_set = codes[certain_mask]
    certain_probs = get_hunnit_prob_vecs(certain_set)
    linenum_to_certain_prob = {
        idx: prob
        for idx, prob in zip(np.nonzero(certain_mask)[0], certain_probs)
    }

    full_probs = []
    for i, code in enumerate(encoded_codes):
        if i in linenum_to_prob:
            _, order = check_and_decide(fullset[i][:common.N_DISASTER],
                                        linenum_to_prob[i])
            full_probs.append([code] + linenum_to_prob[i].tolist() +
                              [order + 1])
        elif i in linenum_to_certain_prob:
            full_probs.append([code] + linenum_to_certain_prob[i].tolist() +
                              [0])
        else:
            full_probs.append([0])

    cur_id = 0
    reversed_full_probs = []
    for row in np.flipud(np.array(full_probs).reshape(common.N_ROWS,
                                                      -1)).reshape(-1):
        if len(row) == 1:
            continue
        reversed_full_probs.append([cur_id] + row)
        cur_id += 1
    common.save_data(reversed_full_probs, PROBS_OUTPUT_PATH)
    # print question_action
    # print replacement_action_conjugated
    # print editable_question

    data = {}
    data['image_file'] = image_file
    data['original_question'] = question
    data['question'] = editable_question
    data['answer'] = question_action
    # data['replacement_action'] = replacement_action_conjugated
    data['relevant'] = 0
    data['image_id'] = row['image_id']
    data['qa_id'] = -1 * row['qa_id']
    data['image_actions'] = ','.join(image_actions)
    editable_questions.append(data)

    # noedit_data = {}
    # noedit_data['image_file'] = image_file
    # noedit_data['original_question'] = question
    # noedit_data['question'] = question
    # noedit_data['answer'] = question_action
    # noedit_data['replacement_action'] = question_action
    # noedit_data['relevant'] = 1
    # noedit_data['image_id'] = row['image_id']
    # noedit_data['qa_id'] = row['qa_id']
    # noedit_data['image_actions'] = row['image_actions']
    # editable_questions.append(noedit_data)

editable_df = save_data(editable_questions, editable_dataset_output_file)
# print editable_df
def plot(connections, multiflow_connections, sums_dir_exp):
    threshold_handover = 1.0
    syn_first_additional_sf = []
    syn_additional_sfs = []
    time_handover = []
    time_handover_conn = []
    time_handover_conn_info = []
    react_handover = []
    handover_conns = {}
    second_sf_handover = []
    log_file = sys.stdout
    less_200ms = 0
    less_1s = 0
    more_60s = 0
    more_3600s = 0
    less_200ms_second = 0
    less_1s_second = 0
    more_60s_second = 0
    more_3600s_second = 0
    # Look only at multiple subflows connections
    for fname, conns in multiflow_connections.iteritems():
        handover_conns[fname] = {}
        for conn_id, conn in conns.iteritems():
            # First find initial subflow timestamp
            initial_sf_ts = float('inf')
            initial_sf_id = None
            last_acks = []
            min_time_last_ack = float('inf')
            for flow_id, flow in conn.flows.iteritems():
                if co.START not in flow.attr or flow.attr[
                        co.SADDR] in co.IP_PROXY:
                    continue

                if (flow.attr[co.START] -
                        conn.attr[co.START]).total_seconds() < -30:
                    continue

                if flow.attr[co.START].total_seconds() < initial_sf_ts:
                    initial_sf_ts = flow.attr[co.START].total_seconds()
                    initial_sf_id = flow_id
                flow_bytes = 0
                for direction in co.DIRECTIONS:
                    flow_bytes += flow.attr[direction].get(co.BYTES_DATA, 0)
                if flow_bytes > 0 and co.TIME_LAST_ACK_TCP in flow.attr[
                        co.S2C] and flow.attr[co.S2C][
                            co.TIME_LAST_ACK_TCP].total_seconds(
                            ) > 0.0 and co.TIME_LAST_ACK_TCP in flow.attr[
                                co.C2S] and flow.attr[co.C2S][
                                    co.TIME_LAST_ACK_TCP].total_seconds(
                                    ) > 0.0:
                    last_acks.append(flow.attr[co.S2C][
                        co.TIME_LAST_ACK_TCP].total_seconds())
                    min_time_last_ack = min(
                        min_time_last_ack, flow.attr[co.S2C][
                            co.TIME_LAST_ACK_TCP].total_seconds())

            if initial_sf_ts == float('inf'):
                continue

            # Now store the delta and record connections with handover
            handover_detected = False
            count_flows = 0
            min_delta = float('inf')
            flow_id_min_delta = None
            for flow_id, flow in conn.flows.iteritems():
                if co.START not in flow.attr or flow.attr[
                        co.SADDR] in co.IP_PROXY:
                    continue

                if co.TIME_LAST_ACK_TCP not in flow.attr[co.S2C] or flow.attr[
                        co.S2C][co.TIME_LAST_ACK_TCP].total_seconds(
                        ) == 0 or co.TIME_LAST_ACK_TCP not in flow.attr[
                            co.C2S] or flow.attr[co.C2S][
                                co.TIME_LAST_ACK_TCP].total_seconds() == 0:
                    # RST, don't consider as valid MP_JOIN
                    continue

                if (flow.attr[co.START] -
                        conn.attr[co.START]).total_seconds() < -30:
                    continue

                if (flow.attr[co.START] - conn.attr[co.START]
                    ).total_seconds() > conn.attr[co.DURATION]:
                    # This subflow is maybe wrongly attributed
                    continue

                delta = flow.attr[co.START].total_seconds() - initial_sf_ts
                min_last_acks = float('inf')
                if len(last_acks) >= 1:
                    min_last_acks = min(last_acks)

                max_last_payload = 0 - float('inf')
                if flow.attr[co.C2S].get(co.BYTES,
                                         0) > 0 or flow.attr[co.S2C].get(
                                             co.BYTES, 0) > 0:
                    max_last_payload = max([
                        flow.attr[direction][co.TIME_LAST_PAYLD]
                        for direction in co.DIRECTIONS
                    ])
                handover_delta = flow.attr[co.START].total_seconds(
                ) + max_last_payload - min_last_acks
                if delta > 0.0:
                    min_delta = min(min_delta, delta)
                    if min_delta == delta:
                        flow_id_min_delta = flow_id
                    if delta < 0.01:
                        print(fname, conn_id, flow_id, delta)
                    syn_additional_sfs.append(delta)

                    if handover_delta > 0.0:
                        # A subflow is established after the last ack of the client seen --> Handover
                        time_handover.append(min_last_acks - initial_sf_ts)
                        react_handover.append(handover_delta)
                        last_acks.remove(min_last_acks)
                        if not handover_detected:
                            handover_detected = True
                            time_handover_conn.append(delta)
                            time_handover_conn_info.append(
                                (min_last_acks - initial_sf_ts, delta, fname,
                                 conn_id))
                            handover_conns[fname][conn_id] = conn
                    if delta >= 50000:
                        print("HUGE DELTA",
                              fname,
                              conn_id,
                              flow_id,
                              delta,
                              file=log_file)

                    if delta <= 0.2:
                        less_200ms += 1
                    if delta <= 1:
                        less_1s += 1
                    if delta >= 60:
                        more_60s += 1
                    if delta >= 3600:
                        more_3600s += 1

            if flow_id_min_delta:
                syn_first_additional_sf.append(min_delta)
                if conn.flows[initial_sf_id].attr[co.S2C][
                        co.TIME_LAST_ACK_TCP].total_seconds() < conn.flows[
                            flow_id_min_delta].attr[co.START].total_seconds():
                    # Handover between initial and second subflow
                    second_sf_handover.append(min_delta)
                if delta <= 0.2:
                    less_200ms_second += 1
                if delta <= 1:
                    less_1s_second += 1
                if delta >= 60:
                    more_60s_second += 1
                if delta >= 3600:
                    more_3600s_second += 1

    # Do a first CDF plot of the delta between initial SYN and additional ones
    base_graph_path = os.path.join(sums_dir_exp, 'cdf_delta_addtitional_syns')
    color = 'red'
    graph_fname = os.path.splitext(base_graph_path)[0] + "_cdf.pdf"
    graph_fname_log = os.path.splitext(base_graph_path)[0] + "_cdf_log.pdf"
    sample = np.array(sorted(syn_additional_sfs))
    sorted_array = np.sort(sample)
    yvals = np.arange(len(sorted_array)) / float(len(sorted_array))
    sample_2 = np.array(sorted(syn_first_additional_sf))
    sorted_array_2 = np.sort(sample_2)
    yvals_2 = np.arange(len(sorted_array_2)) / float(len(sorted_array_2))
    if len(sorted_array) > 0:
        # Add a last point
        sorted_array = np.append(sorted_array, sorted_array[-1])
        yvals = np.append(yvals, 1.0)

        sorted_array_2 = np.append(sorted_array_2, sorted_array_2[-1])
        yvals_2 = np.append(yvals_2, 1.0)

        # Log plot
        plt.figure()
        plt.clf()
        fig, ax = plt.subplots()
        ax.plot(sorted_array,
                yvals,
                color=color,
                linewidth=2,
                label="Additional subflows")
        ax.plot(sorted_array_2,
                yvals_2,
                color='blue',
                linestyle='--',
                linewidth=2,
                label="Second subflows")

        # Shrink current axis's height by 10% on the top
        # box = ax.get_position()
        # ax.set_position([box.x0, box.y0,
        #                  box.width, box.height * 0.9])
        ax.set_xscale('log')

        # Put a legend above current axis
        # ax.legend(loc='lower center', bbox_to_anchor=(0.5, 1.05), fancybox=True, shadow=True, ncol=ncol)
        ax.legend(loc='lower right')

        plt.xlim(xmin=0.01)
        plt.xlabel('Time between MP_JOIN and MP_CAP [s]',
                   fontsize=24,
                   labelpad=-2)
        plt.ylabel("CDF", fontsize=24)
        plt.savefig(graph_fname_log)
        plt.close('all')

    #     # Normal plot
    #     plt.figure()
    #     plt.clf()
    #     fig, ax = plt.subplots()
    #     ax.plot(sorted_array, yvals, color=color, linewidth=2, label="MP_JOIN - MP_CAP")
    #
    #     # Shrink current axis's height by 10% on the top
    #     # box = ax.get_position()
    #     # ax.set_position([box.x0, box.y0,
    #     #                  box.width, box.height * 0.9])
    #     # ax.set_xscale('log')
    #
    #     # Put a legend above current axis
    #     # ax.legend(loc='lower center', bbox_to_anchor=(0.5, 1.05), fancybox=True, shadow=True, ncol=ncol)
    #     ax.legend(loc='lower right')
    #
    #     plt.xlabel('Time [s]', fontsize=18)
    #     plt.ylabel("CDF", fontsize=18)
    #     plt.savefig(graph_fname)
    #     plt.close('all')

    # Now quantify in handover connections the amount of data not on the initial subflows
    bytes_init_sf = 0.0
    bytes_init_sfs = 0.0
    bytes_total = 0.0
    for fname, conns in handover_conns.iteritems():
        for conn_id, conn in conns.iteritems():
            # First find initial subflow timestamp
            initial_sf_ts = float('inf')
            for flow_id, flow in conn.flows.iteritems():
                if co.START not in flow.attr:
                    continue
                if flow.attr[co.START].total_seconds() < initial_sf_ts:
                    initial_sf_ts = flow.attr[co.START].total_seconds()

            min_delta = float('inf')
            for flow_id, flow in conn.flows.iteritems():
                if co.START not in flow.attr:
                    continue
                delta = flow.attr[co.START].total_seconds() - initial_sf_ts
                if delta > 0.0:
                    min_delta = min(min_delta, delta)

            # Now collect the amount of data on all subflows
            for flow_id, flow in conn.flows.iteritems():
                if co.START not in flow.attr:
                    continue
                delta = flow.attr[co.START].total_seconds() - initial_sf_ts
                for direction in co.DIRECTIONS:
                    bytes_total += flow.attr[direction].get(co.BYTES, 0)
                    if flow.attr[direction].get(co.BYTES, 0) >= 1000000000:
                        print("WARNING!!!",
                              fname,
                              conn_id,
                              flow_id,
                              bytes_total,
                              file=log_file)
                    if delta <= min_delta:
                        # Initial subflows
                        bytes_init_sfs += flow.attr[direction].get(co.BYTES, 0)
                        if delta == 0.0:
                            # Initial subflow
                            bytes_init_sf += flow.attr[direction].get(
                                co.BYTES, 0)

    # Log those values in the log file
    print("DELTA HANDOVER IN FILE delta_handover")
    co.save_data("delta_handover", sums_dir_exp, time_handover)
    print("REACT HANDOVER IN FILE react_handover")
    co.save_data("react_handover", sums_dir_exp, react_handover)
    print("REACT HANDOVER IN FILE time_handover_conn")
    co.save_data("time_handover_conn", sums_dir_exp, time_handover_conn)
    print("REACT HANDOVER IN FILE time_handover_conn_info")
    co.save_data("time_handover_conn_info", sums_dir_exp,
                 time_handover_conn_info)
    print("SECOND SF HANDOVER IN FILE second_sf_handover")
    co.save_data("second_sf_handover", sums_dir_exp, second_sf_handover)
    print("QUANTIFY HANDOVER", file=log_file)
    print(bytes_init_sf,
          "BYTES ON INIT SF",
          bytes_init_sf * 100 / bytes_total,
          "%",
          file=log_file)
    print(bytes_init_sfs,
          "BYTES ON INIT SFS",
          bytes_init_sfs * 100 / bytes_total,
          "%",
          file=log_file)
    print("TOTAL BYTES", bytes_total, file=log_file)

    print("<= 200ms", less_200ms, less_200ms * 100.0 / len(syn_additional_sfs),
          "%")
    print("<= 1s", less_1s, less_1s * 100.0 / len(syn_additional_sfs), "%")
    print(">= 60s", more_60s, more_60s * 100.0 / len(syn_additional_sfs), "%")
    print(">= 3600s", more_3600s, more_3600s * 100.0 / len(syn_additional_sfs),
          "%")

    print("<= 200ms second", less_200ms_second,
          less_200ms_second * 100.0 / len(syn_first_additional_sf), "%")
    print("<= 1s second", less_1s_second,
          less_1s_second * 100.0 / len(syn_first_additional_sf), "%")
    print(">= 60s second", more_60s_second,
          more_60s_second * 100.0 / len(syn_first_additional_sf), "%")
    print(">= 3600s second", more_3600s_second,
          more_3600s_second * 100.0 / len(syn_first_additional_sf), "%")
def get_ip_address(cmd):
    return str(ord(cmd[1])) + '.' + str(ord(cmd[2])) + '.' + str(ord(cmd[3])) + '.' + str(ord(cmd[4]))


def process_pcap(pcap_filepath, ports):
    # condition = "tcp.len==7"
    # tshark_filter(condition, pcap_filepath, pcap_filtered_filepath)
    file = open(pcap_filepath, 'rb')  # dpkt's pcap.Reader expects a binary file object
    try:
        pcap = dpkt.pcap.Reader(file)
        for ts, data in pcap:
            eth = dpkt.ethernet.Ethernet(data)
            ip = eth.data
            tcp = ip.data
            if len(tcp.data) == 7:
                crypted_socks_cmd = tcp.data
                decrypted_socks_cmd = decode(crypted_socks_cmd)
                if decrypted_socks_cmd[0] == b'\x01': # Connect
                    add_port(decrypted_socks_cmd, ports)
    except Exception as e:
        print(e)

    file.close()

if __name__ == "__main__":
    for pcap_filepath in pcap_list:
        ports = {}
        process_pcap(pcap_filepath, ports)
        co.save_data(pcap_filepath, ports_dir_exp, ports)
Example #22
                'X': X_valid,
                'y': y_valid,
                'Mu': c_valid,
                'loss': valid_loss,
            },
        })
    
    return result


if __name__ == '__main__':

    data = np.load('data2D.npy')

    # Remove comment to run each problem

    # Run 1.1
    result = run_kmeans(3, data)
    save_data(result, '1.1')

    # Run 1.2
    for k in range(1, 6):
        result = run_kmeans(k, data)
        save_data(result, '1.2', str(k))

    # Run 1.3
    for k in range(1, 6):
        result = run_kmeans(k, data, with_valid=True)
        save_data(result, '1.3', str(k))

def process_trace(pcap_filepath,
                  graph_dir_exp,
                  stat_dir_exp,
                  aggl_dir_exp,
                  rtt_dir_exp,
                  rtt_subflow_dir_exp,
                  failed_conns_dir_exp,
                  acksize_dir_exp,
                  acksize_tcp_dir_exp,
                  plot_cwin,
                  tcpcsm,
                  min_bytes=0,
                  light=False,
                  return_dict=False):
    """ Process a mptcp pcap file and generate graphs of its subflows
        Notice that we can't change dir per thread, we should use processes
    """
    # if not check_mptcp_joins(pcap_filepath):
    #     print("WARNING: no mptcp joins on " + pcap_filepath, file=sys.stderr)
    csv_tmp_dir = tempfile.mkdtemp(dir=os.getcwd())
    connections = None
    do_tcp_processing = False
    try:
        with co.cd(csv_tmp_dir):
            # If segmentation faults, remove the -S option
            # cmd = ['mptcptrace', '-f', pcap_filepath, '-s', '-S', '-t', '5000', '-w', '0']
            # if not light:
            #     cmd += ['-G', '250', '-r', '2', '-F', '3', '-a']
            # connections = process_mptcptrace_cmd(cmd, pcap_filepath)
            #
            # # Useful to count the number of reinjected bytes
            # cmd = ['mptcptrace', '-f', pcap_filepath, '-s', '-a', '-t', '5000', '-w', '2']
            # if not light:
            #     cmd += ['-G', '250', '-r', '2', '-F', '3']
            # devnull = open(os.devnull, 'w')
            # if subprocess.call(cmd, stdout=devnull) != 0:
            #     raise MPTCPTraceError("Error of mptcptrace with " + pcap_filepath)
            # devnull.close()
            #
            # cmd = ['mptcptrace', '-f', pcap_filepath, '-r', '2', '-t', '5000', '-w', '2']
            # if not light:
            #     cmd += ['-G', '250', '-r', '2', '-F', '3']
            # devnull = open(os.devnull, 'w')
            # if subprocess.call(cmd, stdout=devnull) != 0:
            #     raise MPTCPTraceError("Error of mptcptrace with " + pcap_filepath)
            # devnull.close()

            cmd = [
                'mptcptrace', '-f', pcap_filepath, '-s', '-S', '-a', '-A',
                '-R', '-r', '2', '-t', '5000', '-w', '2'
            ]
            connections = process_mptcptrace_cmd(cmd, pcap_filepath)

            # The mptcptrace call will generate .xpl files to cope with
            # First see all xpl files, to detect the relative 0 of all connections
            # Also, compute the duration and number of bytes of the MPTCP connection
            first_pass_on_files(connections)
            rtt_all = {co.C2S: {}, co.S2C: {}}
            acksize_all = {co.C2S: {}, co.S2C: {}}

            # Then really process xpl files
            if return_dict:
                for xpl_fname in glob.glob(os.path.join('*.xpl')):
                    try:
                        os.remove(xpl_fname)
                    except IOError as e:
                        print(str(e), file=sys.stderr)
            else:
                for xpl_fname in glob.glob(os.path.join('*.xpl')):
                    try:
                        directory = co.DEF_RTT_DIR if MPTCP_RTT_FNAME in xpl_fname else co.TSG_THGPT_DIR
                        shutil.move(
                            xpl_fname,
                            os.path.join(
                                graph_dir_exp, directory,
                                os.path.basename(pcap_filepath[:-5]) + "_" +
                                os.path.basename(xpl_fname)))
                    except IOError as e:
                        print(str(e), file=sys.stderr)

            # And by default, save only seq csv files
            for csv_fname in glob.glob(os.path.join('*.csv')):
                if not light:
                    if MPTCP_GPUT_FNAME in os.path.basename(csv_fname):
                        process_gput_csv(csv_fname, connections)
                try:
                    if os.path.basename(csv_fname).startswith(
                            MPTCP_ADDADDR_FNAME):
                        conn_id = get_connection_id(
                            os.path.basename(csv_fname))
                        if conn_id not in connections:
                            # Not a real connection; skip it
                            continue

                        process_add_addr_csv(csv_fname, connections, conn_id)
                        os.remove(csv_fname)

                    elif os.path.basename(csv_fname).startswith(
                            MPTCP_RMADDR_FNAME):
                        conn_id = get_connection_id(
                            os.path.basename(csv_fname))
                        if conn_id not in connections:
                            # Not a real connection; skip it
                            continue

                        process_rm_addr_csv(csv_fname, connections, conn_id)
                        os.remove(csv_fname)

                    elif MPTCP_RTT_FNAME in os.path.basename(csv_fname):
                        conn_id = get_connection_id(
                            os.path.basename(csv_fname))
                        if conn_id not in connections:
                            # Not a real connection; skip it
                            continue

                        is_reversed = is_reverse_connection(
                            os.path.basename(csv_fname))
                        process_rtt_csv(csv_fname, rtt_all, connections,
                                        conn_id, is_reversed)
                        os.remove(csv_fname)
                        # co.move_file(csv_fname, os.path.join(
                        #    graph_dir_exp, co.DEF_RTT_DIR, os.path.basename(pcap_filepath[:-5]) + "_" + csv_fname))
                    elif MPTCP_SEQ_FNAME in os.path.basename(csv_fname):
                        conn_id = get_connection_id(
                            os.path.basename(csv_fname))
                        if conn_id not in connections:
                            # Not a real connection; skip it
                            continue

                        is_reversed = is_reverse_connection(
                            os.path.basename(csv_fname))
                        process_csv(csv_fname, connections, conn_id,
                                    is_reversed)
                        if return_dict:
                            try:
                                os.remove(csv_fname)
                            except Exception:
                                pass
                        else:
                            co.move_file(
                                csv_fname,
                                os.path.join(
                                    graph_dir_exp, co.TSG_THGPT_DIR,
                                    os.path.basename(pcap_filepath[:-5]) +
                                    "_" + os.path.basename(csv_fname)))
                    elif MPTCP_ACKSIZE_FNAME in os.path.basename(csv_fname):
                        collect_acksize_csv(csv_fname, connections,
                                            acksize_all)
                        os.remove(csv_fname)
                    else:
                        if not light and not return_dict:
                            co.move_file(
                                csv_fname,
                                os.path.join(
                                    graph_dir_exp, co.TSG_THGPT_DIR,
                                    os.path.basename(pcap_filepath[:-5]) +
                                    "_" + os.path.basename(csv_fname)))
                        else:
                            os.remove(csv_fname)
                except IOError as e:
                    print(str(e), file=sys.stderr)

            do_tcp_processing = True

    except MPTCPTraceError as e:
        print(str(e) + "; skip mptcp process", file=sys.stderr)

    shutil.rmtree(csv_tmp_dir)

    # This will save the mptcp connections
    if connections and do_tcp_processing:
        dicts = tcp.process_trace(pcap_filepath,
                                  graph_dir_exp,
                                  stat_dir_exp,
                                  failed_conns_dir_exp,
                                  acksize_tcp_dir_exp,
                                  tcpcsm,
                                  mptcp_connections=connections,
                                  light=light,
                                  return_dict=return_dict)
        if return_dict:
            tcp_connections, acksize_all_tcp = dicts
            return connections, tcp_connections, rtt_all, acksize_all, acksize_all_tcp
        else:
            co.save_data(pcap_filepath, acksize_dir_exp, acksize_all)
            co.save_data(pcap_filepath, rtt_dir_exp, rtt_all)
            co.save_data(pcap_filepath, stat_dir_exp, connections)
Example #24
            data['original_question'] = original_question
            data['question'] = question
            data['original_answer'] = original_answer
            data['answer'] = answer
            data['image_file'] = url
            data['qa_id'] = q['qa_id']
            data['question_type'] = question_type
            data['image_id'] = image_id
            data['question_action'] = o[0]
            data['matched_image_action'] = o[1]
            data['original_question_action'] = verb_lemma_to_full_verb[o[1]]
            data['image_actions'] = ','.join(image_actions)
            question_actions_data.append(data)

print "Finished questions"

# print add_actions

question_actions_df = save_data(question_actions_data, questions_output_file)
print "Question actions: [%d]" % (len(question_actions_df))

if not os.path.exists(output_actions_file):
    print 'Adding in new actions...'
    add_df = pd.DataFrame(add_actions)
    add_df = add_df.loc[:, ~add_df.columns.str.contains('^Unnamed')]
    # print add_df
    # print actions_df
    # DataFrame.append was removed in pandas 2.x; concatenating keeps the same behaviour
    actions_df = pd.concat([actions_df, add_df])
actions_df = actions_df.loc[:, ~actions_df.columns.str.contains('^Unnamed')]
actions_df = actions_df[actions_df['image_id'].isin(found_image_ids)]
actions_df.to_csv(output_actions_file)
Beispiel #25
0
	# # print relevant

	irrelevant = {}
	irrelevant.update(relevant)
	irrelevant['relevant'] = 0
	irrelevant['original_answer'] = 'no ' + ' or '.join(question_actions) + ' found'
	irrelevant['answer'] = irrelevant['original_answer']
	irrelevant['image_file'] = random_image_without_actions
	irrelevant['qa_id'] = -1 * irrelevant['qa_id']
	new_questions.append(irrelevant)

	# print irrelevant

print('Finished adding questions')
new_questions_df = save_data(new_questions, dataset_output_file)
# print new_questions_df
print(len(new_questions_df))

if filter_infrequent:
	print('Filtering dataset to remove infrequent answers...')
	irrelevant_df = new_questions_df[new_questions_df['relevant'] == 0]
	grouped_df = irrelevant_df.groupby('answer', as_index=False).count().sort_values(['image_file'])
	# print grouped_df
	# print grouped_df[grouped_df['image_file'] < 30]
	remove_answers = grouped_df[grouped_df['image_file'] < 5]['answer'].tolist()
	remove_qa_ids = new_questions_df[new_questions_df['answer'].isin(remove_answers)]['qa_id'].tolist()
	remove_qa_ids += [(-1 * qa) for qa in remove_qa_ids]
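	# The irrelevant copies created earlier carry qa_id = -1 * the original qa_id, so
	# adding the negated ids here drops each infrequent irrelevant question together
	# with its paired original.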
	remove_qa_ids = set(remove_qa_ids)

	new_questions_df = new_questions_df[~new_questions_df['qa_id'].isin(remove_qa_ids)]
Beispiel #26
0
import numpy as np

from a3_1 import run_kmeans
from a3_2 import run_mog
from common import save_data


if __name__ == '__main__':
    data = np.load('data100D.npy')

    # Run 2.2.3
    for i in range(10):
        for k in [5, 10, 15, 20, 30]:
            result = run_kmeans(k, data, epochs=1000, tol=1e-6)
            save_data(result, '2.2.3-silhouette-%s' % i, 'kmeans-%s' % str(k))

        for k in [5, 10, 15, 20, 30]:
            result = run_mog(k, data, epochs=1000, tol=1e-8)
            save_data(result, '2.2.3-silhouette-%s' % i, 'mog-%s' % str(k))
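
# A follow-up sketch, not part of the original script: the runs above are tagged
# '2.2.3-silhouette', so the saved results are presumably scored with a silhouette
# metric afterwards.  The keys 'data' and 'labels' are assumptions about what
# run_kmeans / run_mog return; the source does not show their structure.
from sklearn.metrics import silhouette_score


def silhouette_for(result):
    """Mean silhouette coefficient over all points of one clustering run (sketch)."""
    return silhouette_score(result['data'], result['labels'])
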
def mnist_classifier_tanh():
    # paths
    path = dict()
    path['project'] = os.path.dirname(os.path.abspath(__file__))
    path['state'] = os.path.join(path['project'], 'epoch')
    path['dataset'] = os.path.join(path['project'], 'dataset')
    path['graph'] = os.path.join(path['project'], 'graph')
    path['array'] = os.path.join(path['project'], 'array')
    for key, value in path.items():
        if not os.path.exists(path[key]):
            os.mkdir(path[key])

    # parameters
    batch_size = 1000
    number_of_epochs = 20
    learning_rate = 1e-3
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    mean = 0.1307
    std = 0.3081
    loss = nn.CrossEntropyLoss()
    train_info_per_batch = 6
    validation_info_per_batch = 3
    test_info_per_batch = 5
    validation_ratio = 0.1

    # transform
    transform = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=(mean, ), std=(std, ))
    ])
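    # mean=0.1307 and std=0.3081 are the commonly quoted per-pixel statistics of the
    # MNIST training images, so Normalize() leaves the inputs roughly zero-mean and
    # unit-variance.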

    # dataset
    train_dataset = torchvision.datasets.MNIST(root=path['dataset'],
                                               train=True,
                                               transform=transform,
                                               download=True)
    test_dataset = torchvision.datasets.MNIST(root=path['dataset'],
                                              train=False,
                                              transform=transform,
                                              download=True)

    # validation dataset
    validation_limit = int((1 - validation_ratio) * len(train_dataset))
    index_list = list(range(len(train_dataset)))
    train_indexes = index_list[:validation_limit]
    validation_indexes = index_list[validation_limit:]
    train_sampler = SubsetRandomSampler(train_indexes)
    # NOTE: SequentialSampler(validation_indexes) would yield positions 0..N-1 rather
    # than the held-out indexes themselves, so the validation loader would re-read the
    # first training samples.  Sample the held-out indexes explicitly instead.
    validation_sampler = SubsetRandomSampler(validation_indexes)

    # dataset loaders
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                    batch_size=batch_size,
                                                    sampler=validation_sampler)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=batch_size)

    # model
    model = MnistClassifierTanh().to(device)

    # optimizer
    optimizer = optim.SGD(params=model.parameters(), lr=learning_rate)

    epochs = np.arange(start=1, stop=(number_of_epochs + 1), step=1, dtype=int)

    print('Mnist Classifier Tanh')
    train_losses = []
    train_accuracies = []
    validation_losses = []
    validation_accuracies = []
    test_losses = []
    test_accuracies = []
    for epoch in epochs:
        info = 'Epoch {epoch_index}/{number_of_epochs}'
        print(info.format(epoch_index=epoch,
                          number_of_epochs=number_of_epochs))

        # train
        train_loss, train_accuracy = train(model=model,
                                           device=device,
                                           loader=train_loader,
                                           optimizer=optimizer,
                                           loss=loss,
                                           info_per_batch=train_info_per_batch)
        info = 'Train: Average Loss: {train_loss:.5f}, Accuracy: % {train_accuracy:.2f}'
        print(
            info.format(train_loss=train_loss,
                        train_accuracy=(100 * train_accuracy)))
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)

        # validation
        validation_loss, validation_accuracy = test(
            model=model,
            loader=validation_loader,
            device=device,
            loss=loss,
            info_per_batch=validation_info_per_batch,
            info_name='Validation')
        info = 'Validation: Average Loss: {validation_loss:.5f}, Accuracy: % {validation_accuracy:.2f}'
        print(
            info.format(validation_loss=validation_loss,
                        validation_accuracy=(100 * validation_accuracy)))
        validation_losses.append(validation_loss)
        validation_accuracies.append(validation_accuracy)

        # test
        test_loss, test_accuracy = test(model=model,
                                        loader=test_loader,
                                        device=device,
                                        loss=loss,
                                        info_per_batch=test_info_per_batch,
                                        info_name='Test')
        info = 'Test: Average Loss: {test_loss:.5f}, Accuracy: % {test_accuracy:.2f}'
        print(
            info.format(test_loss=test_loss,
                        test_accuracy=(100 * test_accuracy)))
        test_losses.append(test_loss)
        test_accuracies.append(test_accuracy)

        # epoch state
        state_file_name = 'mnist_classifier_tanh_epoch_{epoch_index}.pkl'.format(
            epoch_index=epoch)
        save_state(model=model,
                   directory=path['state'],
                   file_name=state_file_name)

    # train loss
    save_data(array=train_losses,
              directory=path['array'],
              file_name='mnist_classifier_tanh_train_loss.npy')
    draw_line_graph(x=epochs,
                    y=train_losses,
                    x_label='Epoch',
                    y_label='Loss',
                    title='Mnist Classifier Tanh Train Loss',
                    directory=path['graph'],
                    file_name='mnist_classifier_tanh_train_loss.png')

    # train accuracy
    save_data(array=train_accuracies,
              directory=path['array'],
              file_name='mnist_classifier_tanh_train_accuracy.npy')
    draw_line_graph(x=epochs,
                    y=train_accuracies,
                    x_label='Epoch',
                    y_label='Accuracy',
                    title='Mnist Classifier Tanh Train Accuracy',
                    directory=path['graph'],
                    file_name='mnist_classifier_tanh_train_accuracy.png')

    # validation loss
    save_data(array=validation_losses,
              directory=path['array'],
              file_name='mnist_classifier_tanh_validation_loss.npy')
    draw_line_graph(x=epochs,
                    y=validation_losses,
                    x_label='Epoch',
                    y_label='Loss',
                    title='Mnist Classifier Tanh Validation Loss',
                    directory=path['graph'],
                    file_name='mnist_classifier_tanh_validation_loss.png')

    # validation accuracy
    save_data(array=validation_accuracies,
              directory=path['array'],
              file_name='mnist_classifier_tanh_validation_accuracy.npy')
    draw_line_graph(x=epochs,
                    y=validation_accuracies,
                    x_label='Epoch',
                    y_label='Accuracy',
                    title='Mnist Classifier Tanh Validation Accuracy',
                    directory=path['graph'],
                    file_name='mnist_classifier_tanh_validation_accuracy.png')

    # test loss
    save_data(array=test_losses,
              directory=path['array'],
              file_name='mnist_classifier_tanh_test_loss.npy')
    draw_line_graph(x=epochs,
                    y=test_losses,
                    x_label='Epoch',
                    y_label='Loss',
                    title='Mnist Classifier Tanh Test Loss',
                    directory=path['graph'],
                    file_name='mnist_classifier_tanh_test_loss.png')

    # test accuracy
    save_data(array=test_accuracies,
              directory=path['array'],
              file_name='mnist_classifier_tanh_test_accuracy.npy')
    draw_line_graph(x=epochs,
                    y=test_accuracies,
                    x_label='Epoch',
                    y_label='Accuracy',
                    title='Mnist Classifier Tanh Test Accuracy',
                    directory=path['graph'],
                    file_name='mnist_classifier_tanh_test_accuracy.png')

    # loss
    draw_multi_lines_graph(lines=[
        dict(label='Train', data=dict(x=epochs, y=train_losses)),
        dict(label='Validation', data=dict(x=epochs, y=validation_losses)),
        dict(label='Test', data=dict(x=epochs, y=test_losses))
    ],
                           x_label='Epoch',
                           y_label='Loss',
                           title='Mnist Classifier Tanh Loss',
                           directory=path['graph'],
                           file_name='mnist_classifier_tanh_loss.png')

    # accuracy
    draw_multi_lines_graph(lines=[
        dict(label='Train', data=dict(x=epochs, y=train_accuracies)),
        dict(label='Validation', data=dict(x=epochs, y=validation_accuracies)),
        dict(label='Test', data=dict(x=epochs, y=test_accuracies))
    ],
                           x_label='Epoch',
                           y_label='Accuracy',
                           title='Mnist Classifier Tanh Accuracy',
                           directory=path['graph'],
                           file_name='mnist_classifier_tanh_accuracy.png')
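# A minimal driver sketch for the function above; the module-level imports are inferred
# from the names it uses, and the project-specific helpers (MnistClassifierTanh, train,
# test, save_state, save_data, draw_line_graph, draw_multi_lines_graph) are assumed to
# be defined elsewhere in the repository:
#
#     import os
#     import numpy as np
#     import torch
#     import torch.nn as nn
#     import torch.optim as optim
#     import torchvision
#     from torch.utils.data import SubsetRandomSampler
#
#     if __name__ == '__main__':
#         mnist_classifier_tanh()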
def process_trace(
    pcap_filepath,
    graph_dir_exp,
    stat_dir_exp,
    aggl_dir_exp,
    rtt_dir_exp,
    rtt_subflow_dir_exp,
    failed_conns_dir_exp,
    acksize_dir_exp,
    acksize_tcp_dir_exp,
    plot_cwin,
    tcpcsm,
    min_bytes=0,
    light=False,
    return_dict=False,
):
    """ Process a mptcp pcap file and generate graphs of its subflows
        Notice that we can't change dir per thread, we should use processes
    """
    # if not check_mptcp_joins(pcap_filepath):
    #     print("WARNING: no mptcp joins on " + pcap_filepath, file=sys.stderr)
    csv_tmp_dir = tempfile.mkdtemp(dir=os.getcwd())
    connections = None
    do_tcp_processing = False
    try:
        with co.cd(csv_tmp_dir):
            # If segmentation faults, remove the -S option
            # cmd = ['mptcptrace', '-f', pcap_filepath, '-s', '-S', '-t', '5000', '-w', '0']
            # if not light:
            #     cmd += ['-G', '250', '-r', '2', '-F', '3', '-a']
            # connections = process_mptcptrace_cmd(cmd, pcap_filepath)
            #
            # # Useful to count the number of reinjected bytes
            # cmd = ['mptcptrace', '-f', pcap_filepath, '-s', '-a', '-t', '5000', '-w', '2']
            # if not light:
            #     cmd += ['-G', '250', '-r', '2', '-F', '3']
            # devnull = open(os.devnull, 'w')
            # if subprocess.call(cmd, stdout=devnull) != 0:
            #     raise MPTCPTraceError("Error of mptcptrace with " + pcap_filepath)
            # devnull.close()
            #
            # cmd = ['mptcptrace', '-f', pcap_filepath, '-r', '2', '-t', '5000', '-w', '2']
            # if not light:
            #     cmd += ['-G', '250', '-r', '2', '-F', '3']
            # devnull = open(os.devnull, 'w')
            # if subprocess.call(cmd, stdout=devnull) != 0:
            #     raise MPTCPTraceError("Error of mptcptrace with " + pcap_filepath)
            # devnull.close()

            cmd = ["mptcptrace", "-f", pcap_filepath, "-s", "-S", "-a", "-A", "-R", "-r", "2", "-t", "5000", "-w", "2"]
            connections = process_mptcptrace_cmd(cmd, pcap_filepath)

            # The mptcptrace call will generate .xpl files that we need to cope with
            # First see all xpl files, to detect the relative 0 of all connections
            # Also, compute the duration and number of bytes of the MPTCP connection
            first_pass_on_files(connections)
            rtt_all = {co.C2S: {}, co.S2C: {}}
            acksize_all = {co.C2S: {}, co.S2C: {}}
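            # Both dicts are keyed by direction (co.C2S / co.S2C) and are filled in per
            # connection by process_rtt_csv and collect_acksize_csv in the loop below.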

            # Then really process xpl files
            if return_dict:
                for xpl_fname in glob.glob("*.xpl"):
                    try:
                        os.remove(xpl_fname)
                    except IOError as e:
                        print(str(e), file=sys.stderr)
            else:
                for xpl_fname in glob.glob("*.xpl"):
                    try:
                        directory = co.DEF_RTT_DIR if MPTCP_RTT_FNAME in xpl_fname else co.TSG_THGPT_DIR
                        shutil.move(
                            xpl_fname,
                            os.path.join(
                                graph_dir_exp,
                                directory,
                                os.path.basename(pcap_filepath[:-5]) + "_" + os.path.basename(xpl_fname),
                            ),
                        )
                    except IOError as e:
                        print(str(e), file=sys.stderr)

            # And by default, save only seq csv files
            for csv_fname in glob.glob("*.csv"):
                if not light:
                    if MPTCP_GPUT_FNAME in os.path.basename(csv_fname):
                        process_gput_csv(csv_fname, connections)
                try:
                    if os.path.basename(csv_fname).startswith(MPTCP_ADDADDR_FNAME):
                        conn_id = get_connection_id(os.path.basename(csv_fname))
                        if conn_id not in connections:
                            # Not a real connection; skip it
                            continue

                        process_add_addr_csv(csv_fname, connections, conn_id)
                        os.remove(csv_fname)

                    elif os.path.basename(csv_fname).startswith(MPTCP_RMADDR_FNAME):
                        conn_id = get_connection_id(os.path.basename(csv_fname))
                        if conn_id not in connections:
                            # Not a real connection; skip it
                            continue

                        process_rm_addr_csv(csv_fname, connections, conn_id)
                        os.remove(csv_fname)

                    elif MPTCP_RTT_FNAME in os.path.basename(csv_fname):
                        conn_id = get_connection_id(os.path.basename(csv_fname))
                        if conn_id not in connections:
                            # Not a real connection; skip it
                            continue

                        is_reversed = is_reverse_connection(os.path.basename(csv_fname))
                        process_rtt_csv(csv_fname, rtt_all, connections, conn_id, is_reversed)
                        os.remove(csv_fname)
                        # co.move_file(csv_fname, os.path.join(
                        #    graph_dir_exp, co.DEF_RTT_DIR, os.path.basename(pcap_filepath[:-5]) + "_" + csv_fname))
                    elif MPTCP_SEQ_FNAME in os.path.basename(csv_fname):
                        conn_id = get_connection_id(os.path.basename(csv_fname))
                        if conn_id not in connections:
                            # Not a real connection; skip it
                            continue

                        is_reversed = is_reverse_connection(os.path.basename(csv_fname))
                        process_csv(csv_fname, connections, conn_id, is_reversed)
                        if return_dict:
                            try:
                                os.remove(csv_fname)
                            except Exception:
                                pass
                        else:
                            co.move_file(
                                csv_fname,
                                os.path.join(
                                    graph_dir_exp,
                                    co.TSG_THGPT_DIR,
                                    os.path.basename(pcap_filepath[:-5]) + "_" + os.path.basename(csv_fname),
                                ),
                            )
                    elif MPTCP_ACKSIZE_FNAME in os.path.basename(csv_fname):
                        collect_acksize_csv(csv_fname, connections, acksize_all)
                        os.remove(csv_fname)
                    else:
                        if not light and not return_dict:
                            co.move_file(
                                csv_fname,
                                os.path.join(
                                    graph_dir_exp,
                                    co.TSG_THGPT_DIR,
                                    os.path.basename(pcap_filepath[:-5]) + "_" + os.path.basename(csv_fname),
                                ),
                            )
                        else:
                            os.remove(csv_fname)
                except IOError as e:
                    print(str(e), file=sys.stderr)

            do_tcp_processing = True

    except MPTCPTraceError as e:
        print(str(e) + "; skip mptcp process", file=sys.stderr)

    shutil.rmtree(csv_tmp_dir)

    # This will save the mptcp connections
    if connections and do_tcp_processing:
        dicts = tcp.process_trace(
            pcap_filepath,
            graph_dir_exp,
            stat_dir_exp,
            failed_conns_dir_exp,
            acksize_tcp_dir_exp,
            tcpcsm,
            mptcp_connections=connections,
            light=light,
            return_dict=return_dict,
        )
        if return_dict:
            tcp_connections, acksize_all_tcp = dicts
            return connections, tcp_connections, rtt_all, acksize_all, acksize_all_tcp
        else:
            co.save_data(pcap_filepath, acksize_dir_exp, acksize_all)
            co.save_data(pcap_filepath, rtt_dir_exp, rtt_all)
            co.save_data(pcap_filepath, stat_dir_exp, connections)
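            # co.save_data above is the project's own helper (not part of this excerpt);
            # the three calls presumably pickle each structure into the corresponding
            # *_dir_exp directory under a name derived from the pcap file.  A purely
            # illustrative sketch of such a helper:
            #
            #     def save_data(pcap_filepath, dir_exp, data):
            #         base = os.path.basename(pcap_filepath[:-5])  # strip ".pcap"
            #         with open(os.path.join(dir_exp, base), 'wb') as output_file:
            #             pickle.dump(data, output_file)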