Example #1
def train_randr_model(log, in_files, out_files, start, interval_width,
                      hidden_dim, feature_extractor):

    log.info("train_rand_model")
    dataset = load_dataset_vendor(in_files, start, interval_width)

    model = RefexRolx(hidden_dim, feature_extractor)
    model.initialize_embeddings(dataset.get(0))
    with open(out_files['trained_model'].path, 'wb') as f:
        pickle.dump(model, f)
Example #2
def create_lanl_dataset(tp, in_files, out_files, *op_args, **op_kwargs):
    print("create_lanl_dataset")

    dataset = load_dataset_vendor(out_files, tp['start'], tp['interval_width'])

    df = DatasetLANL(in_files["raw_file"].path).element("auth")

    if tp['feature_extractor'] == 'degree':
        extractor = DegreeExtractor()
    elif tp['feature_extractor'] == 'boston':
        extractor = BostonExtractor()
    else:
        raise ValueError(
            f"Unsupported feature_extractor: {tp['feature_extractor']}")

    process_intervals(df, dataset, tp['start'], tp['end'],
                      tp['interval_width'], tp['graph_representation'],
                      extractor)
Example #3
def infer_graph_model(log, in_files, out_files, start, end, interval_width,
                      predicator_name, hidden_dim, nodes_of_interest,
                      tensorboard_writer):

    log.info(f"infer_graph_model, start:{start}, end={end}")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    trained_model_path = in_files['trained_model'].path
    # Copy before deleting so the caller's in_files dict is not mutated.
    dataset_files_paths = dict(in_files)
    del dataset_files_paths['trained_model']

    dataset = load_dataset_vendor(dataset_files_paths, start, interval_width)

    if len(nodes_of_interest) == 0:
        nodes_of_interest = get_all_nodes_from_dataset(dataset)

    if predicator_name == 'infomax':
        input_dim = len(dataset.get(0).x[0])
        trained_model = ModelWrapper(
            model=Infomax(input_dim, hidden_dim).to(device)
        ).load(trained_model_path)
        predictor = InfomaxUNBPredict(trained_model)

    elif predicator_name == 'randr':
        with open(trained_model_path, 'rb') as f:
            trained_model = pickle.load(f)
        predictor = RandRUNBPredict(trained_model)
    else:
        raise AirflowException('predicator_name unsupported')

    node_embeddings = index_embeddings(dataset=dataset,
                                       predictor=predictor,
                                       device=device,
                                       tensorboard_writer=tensorboard_writer,
                                       nodes_of_interest=nodes_of_interest,
                                       metric_normalization=None)

    print(f"Saving node_embeddings to: {out_files['node_embeddings'].path}")
    output = InferenceOutput(node_embeddings=node_embeddings,
                             filename=out_files['node_embeddings'].path)

    output.save()
    return 'succeeded'
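The InferenceOutput object used above (and restored by the later tasks) is not shown in these examples; only its interface is visible: it is constructed with node_embeddings and/or a filename, save() persists it, and restore() reloads node_embeddings from filename. A minimal pickle-based sketch consistent with that interface (the name of the storage format and all implementation details here are assumptions, not the project's actual code):

import pickle


class InferenceOutput:
    """Hypothetical sketch of the container used to persist node embeddings."""

    def __init__(self, node_embeddings=None, filename=None):
        self.node_embeddings = node_embeddings
        self.filename = filename

    def save(self):
        # Write the embeddings produced by infer_graph_model to disk.
        with open(self.filename, 'wb') as f:
            pickle.dump(self.node_embeddings, f)

    def restore(self):
        # Reload embeddings saved by an earlier task.
        with open(self.filename, 'rb') as f:
            self.node_embeddings = pickle.load(f)
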
Example #4
def post_analysis(tp, in_files, out_files, *op_args, **op_kwargs):

    # todo: add as config params...
    svm_kernel = 'rbf'
    svm_gamma = 'scale'

    start = tp['start']
    end = tp['end']

    with open(out_files['metrics_summary_file'].path, "w") as summary_file:

        summary_file.write(f"\nMetrics data {tp['experiment_name']}\n\n")

        output_path = osp.join(Variable.get(tp['airflow_vars']['out_dir']),
                               Variable.get(tp['airflow_vars']['hash']))

        inf_output = InferenceOutput(filename=in_files['node_embeddings'].path)
        inf_output.restore()
        inference_dataset = load_dataset_vendor(in_files, start,
                                                tp['interval_width'])

        training_intervals_count = Variable.get(
            tp['airflow_vars']['training_intervals_count'], default_var=0)

        # Generating metrics for selected nodes
        metrics_df = create_metrics_df(inf_output.node_embeddings,
                                       inference_dataset,
                                       tp['nodes_of_interest'],
                                       training_intervals_count,
                                       generate_cosine_only=True)

        df = metrics_df
        df['index'] = df['timestamp_adjusted']
        df.set_index('index', inplace=True)

        # Preparing reference_embeddings
        embeds_192_168_10_50, _ = get_labeled_embeddings(
            df, '192.168.10.50', start, end)
        file_path_ref_embeds_192_168_10_50 = os.path.join(
            output_path, "ref_embeds_192_168_10_50.p")
        os.makedirs(output_path, exist_ok=True)
        with open(file_path_ref_embeds_192_168_10_50, "wb") as f:
            pickle.dump(embeds_192_168_10_50['embeddings'], f)

        inferred_reference_embeddings = []
        for node in tp['reference_nodes']:
            labeled_data, _ = get_labeled_embeddings(df, node, start, end)
            inferred_reference_embeddings.extend(labeled_data['embeddings'])
        file_path_ref_nodes_embeds = os.path.join(output_path,
                                                  "ref_nodes_embeds.p")
        with open(file_path_ref_nodes_embeds, "wb") as f:
            pickle.dump(inferred_reference_embeddings, f)

        target_names = ['class MALICIOUS',
                        'class BENIGN']  # Confirm class ordering
        target_names_report = [
            'class MALICIOUS', 'class BENIGN', 'weighted avg'
        ]  # Confirm class ordering

        nu_values = []
        roc_values = []
        for node in tp['nodes_of_interest']:

            # Get test set's embeddings
            x_testing_embeddings, _ = get_labeled_embeddings(
                df, node, start, end)
            x_test = x_testing_embeddings["embeddings"]
            y_true_test = x_testing_embeddings["class_label_int"]

            # Determine the node label for the entire interval: more than one
            # unique value in y_true_test means both benign and malicious
            # intervals exist, so the node is globally malicious.
            node_global_label = 'malicious' if len(
                set(y_true_test)) > 1 else 'benign'
            summary_file.write(
                f"NODE: {node}\t Global Label: {node_global_label}\n")

            # Generate a few reference embeddings
            reference_embeddings_configs = []
            # 1- Embeddings from the node itself only (works well on victim nodes)
            ref_embeds_1 = x_test
            reference_embeddings_configs.append(
                ('ref1', ref_embeds_1))  # itself only

            # 2- Embeddings from the node itself + a victim node
            with open(file_path_ref_embeds_192_168_10_50, "rb") as f:
                ref_embeds_2 = pickle.load(f)
            ref_embeds_2.extend(x_test)
            reference_embeddings_configs.append(
                ('ref2', ref_embeds_2))  # _embeds_2_itself_plus_1victim

            # 3- Embeddings from the node + all_victims + a few non_victims
            with open(file_path_ref_nodes_embeds, "rb") as f:
                ref_embeds_3 = pickle.load(f)
            reference_embeddings_configs.append(
                ('ref3', ref_embeds_3))  # _embeds_3_ref_nodes_list

            for ref_embeds_cfg in reference_embeddings_configs:
                fpr = []
                tpr = []
                nu = []

                skip_roc_plot = False
                try:
                    for svm_nu in frange(0.05, 1.0, 0.05):
                        nu.append(svm_nu)
                        clf = svm.OneClassSVM(nu=svm_nu,
                                              kernel=svm_kernel,
                                              gamma=svm_gamma)

                        x_ref = ref_embeds_cfg[1]
                        clf.fit(x_ref)

                        y_pred_test = clf.predict(x_test)

                        plot_node_analysis(node=node,
                                           np_embeddings=np.array(x_test),
                                           interval_count=len(x_test),
                                           y_true_test=y_true_test,
                                           y_pred_test=y_pred_test,
                                           output_path=output_path,
                                           svm_nu=svm_nu,
                                           ref_embed_config=ref_embeds_cfg[0])

                        tn, fp, fn, tp = confusion_matrix(
                            y_true_test, y_pred_test).ravel()
                        tn = float(tn)
                        fp = float(fp)
                        fn = float(fn)
                        tp = float(tp)

                        fpr_sample = fp / (fp + tn) if fp != 0 else 0.0
                        tpr_sample = tp / (tp + fn) if tp != 0 else 0.0

                        fpr.append(fpr_sample)
                        tpr.append(tpr_sample)

                        report_txt = classification_report(
                            y_true_test,
                            y_pred_test,
                            target_names=target_names,
                            output_dict=False)

                        # Tolerance check: frange's float accumulation means
                        # svm_nu may never equal 0.15 exactly.
                        if abs(svm_nu - 0.15) < 1e-9:
                            summary_file.write(
                                f"=== svm_nu: {svm_nu}  SVM training setup:{ref_embeds_cfg[0]}===\n"
                            )
                            summary_file.write(
                                f"classification_report:\n{report_txt}\n")

                        report = classification_report(
                            y_true_test,
                            y_pred_test,
                            target_names=target_names,
                            output_dict=True)

                        for target_name in target_names_report:
                            nu_values.append({
                                'node': node,
                                'node_global_label': node_global_label,
                                'classifier': 'OneClassSVM',
                                'hyperparam1': ref_embeds_cfg[0],
                                'hyperparam2': svm_nu,
                                'target': target_name,
                                'precision': report[target_name]['precision'],
                                'recall': report[target_name]['recall'],
                                'f1-score': report[target_name]['f1-score'],
                                'support': report[target_name]['support']
                            })
                except Exception as e:
                    summary_file.write(
                        "=== OneClassSVM Failed (bad fit dimensions?) ===\n\n")
                    skip_roc_plot = True
                    print(e)

                print(f"Node:{node}\t Node global label: {node_global_label}")
                if not skip_roc_plot:
                    try:
                        roc_auc = auc(fpr, tpr)
                        plot_roc(node, fpr, tpr, roc_auc, output_path,
                                 ref_embeds_cfg[0])

                        roc_values.append({
                            'node': node,
                            'node_global_label': node_global_label,
                            'classifier': 'OneClassSVM',
                            'hyperparam1': ref_embeds_cfg[0],
                            'hyperparam2': -1,
                            'roc_auc': roc_auc,
                        })
                    except Exception as e:
                        summary_file.write(
                            "---> FAILED TO GENERATE ROC CURVE FROM VALUES\n\n")
                        print(e)

        df_detailed_classifier_data = pd.DataFrame(nu_values)
        df_roc_classifier_data = pd.DataFrame(roc_values)

        metrics_df.to_hdf(out_files['df_metrics'].path,
                          key='metrics_df',
                          mode='w')
        df_detailed_classifier_data.to_hdf(
            out_files['df_detailed_classifier_data'].path, key='df', mode='w')
        df_roc_classifier_data.to_hdf(out_files['df_roc_classifier_data'].path,
                                      key='df',
                                      mode='w')

        summary_file.write('Done')

        return 'succeeded'
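post_analysis sweeps svm_nu with frange, which is not a Python builtin; it is presumably a small project helper that yields evenly spaced floats. A minimal sketch of what it might look like (the real helper is not shown):

def frange(start, stop, step):
    """Hypothetical helper: yield floats from start (inclusive) to stop (exclusive)."""
    value = start
    while value < stop:
        yield value
        # Repeated float addition accumulates rounding error, which is why the
        # svm_nu == 0.15 check above is done with a tolerance instead.
        value += step

With frange(0.05, 1.0, 0.05) this yields roughly 0.05 through 0.95 in steps of 0.05.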
Example #5
def predict(log, in_files, out_files, start, end, interval_width,
            svm_training_technique, nodes_of_interest, reference_nodes,
            reference_victim_node, airflow_vars):

    log.info("predict")
    print(f"svm_training_technique: {svm_training_technique}")
    svm_kernel = 'rbf'
    svm_gamma = 'scale'
    svm_nu = 0.15

    output_path = os.path.dirname(out_files['df_metrics'].path)
    print(f"Predict saving to {output_path}")

    inf_output = InferenceOutput(filename=in_files['node_embeddings'].path)
    inf_output.restore()

    # Copy before deleting so the caller's in_files dict is not mutated.
    dataset_input_files = dict(in_files)
    del dataset_input_files['node_embeddings']
    del dataset_input_files['trained_model']

    inference_dataset = load_dataset_vendor(dataset_input_files, start,
                                            interval_width)

    training_intervals_count = Variable.get(
        airflow_vars['training_intervals_count'], default_var=0)

    if len(nodes_of_interest) == 0:
        log.warning(
            "Node list not specified: running prediction on entire node list (might be slow!!)"
        )
        nodes_of_interest = get_all_nodes_from_dataset(inference_dataset)

    # Generating metrics for selected nodes
    df = create_metrics_df(inf_output.node_embeddings,
                           inference_dataset,
                           nodes_of_interest,
                           training_intervals_count,
                           generate_cosine_only=True)

    df['index'] = df['timestamp']
    df.set_index('index', inplace=True)
    df.to_hdf(out_files['df_metrics'].path, key='df', mode='w')

    print(f"start: {start}")
    print(f"end: {end}")

    embeds_ref_node = prepare_embeddings_ref_node(df, reference_victim_node,
                                                  start, end, output_path)
    embeds_ref_node_list = prepare_embeddings_ref_node_list(
        df, reference_nodes, start, end, output_path)

    node_truth = {}
    node_predictions = {}

    interval_len = 0

    for node in nodes_of_interest:
        print(f"processing node: {node}")
        # Get test set's embeddings
        x_testing_embeddings, _ = get_labeled_embeddings(df, node, start, end)
        x_test = x_testing_embeddings["embeddings"]

        # Reuse y tensor here?
        # 1 for inliers, -1 for outliers, 0 for unlabelled
        y_true_test = x_testing_embeddings["class_label_int"]

        print(f"interval count: {len(y_true_test)}")
        assert interval_len == 0 or len(
            y_true_test) == interval_len, "Inference length mismatch!"
        interval_len = len(y_true_test)
        node_truth[node] = y_true_test

        # Determine the node label for the entire interval
        node_global_label = 'malicious' if len(
            set(y_true_test)) > 1 else 'benign'
        # len > 1 means the set contains more than 1 unique value, thus benign and malicious exist.

        # Generate a single reference embeddings
        if svm_training_technique == 'self':
            x_ref = x_test  # 'ref1'
        elif svm_training_technique == 'self_plus_victim':
            # 2- Embeddings from the node itself + a victim node
            x_ref = copy.deepcopy(embeds_ref_node['embeddings'])
            x_ref.extend(x_test)
        elif svm_training_technique == 'self_plus_reference_nodes':
            # 3- Embeddings from the node + all_victims + a few non_victims
            x_ref = copy.deepcopy(embeds_ref_node_list)
        else:
            raise ValueError(
                f'invalid svm_training_technique: "{svm_training_technique}", supported values: "self", '
                '"self_plus_victim", "self_plus_reference_nodes"')

        # Need to drop N/A intervals here (either from x_ref or x_test)
        x_ref_incomplete_idx = []
        for i, j in enumerate(x_ref):
            if isinstance(j, str):  # str for "n/a", array otherwise
                x_ref_incomplete_idx.append(i)

        x_test_incomplete_idx = []
        for i, j in enumerate(x_test):
            if isinstance(j, str):
                x_test_incomplete_idx.append(i)

        # 2 is to allow a minimum of 2 intervals for training the OC-SVM (add as a config param?)
        if (len(x_ref) >= len(x_ref_incomplete_idx) + 2
                and len(x_test) > len(x_test_incomplete_idx)):
            if len(x_ref_incomplete_idx) > 0:
                print(
                    f"Dropping intervals because of incomplete reference data for node: {node}"
                )
                print(f"\tIntervals: {x_ref_incomplete_idx}")
            if len(x_test_incomplete_idx) > 0:
                print(
                    f"Dropping intervals because of incomplete embeddings for node: {node}"
                )
                print(f"\tIntervals: {x_test_incomplete_idx}")

            # Remove incomplete intervals, starting from the end of each list
            # so that earlier indexes stay valid.
            for i in reversed(x_ref_incomplete_idx):
                del x_ref[i]
            for i in reversed(x_test_incomplete_idx):
                del x_test[i]

            try:
                print("Fitting OC-SVM")
                clf = svm.OneClassSVM(nu=svm_nu,
                                      kernel=svm_kernel,
                                      gamma=svm_gamma)
                clf.fit(x_ref)
                y_pred_test = clf.predict(
                    x_test)  # 1 for inliers, -1 for outliers.
                node_predictions[node] = y_pred_test

                # Put back missing intervals at the right place.
                for i in x_test_incomplete_idx:
                    if i < node_predictions[node].size:
                        node_predictions[node] = np.insert(
                            node_predictions[node], i, [NODE_ABSENT])
                    else:
                        node_predictions[node] = np.append(
                            node_predictions[node], [NODE_ABSENT])

            except Exception:
                # Re-raise instead of hiding the failure behind an assert
                # (asserts are stripped when Python runs with -O).
                raise
        else:
            print(f"All intervals dropped for node: {node}")
            node_predictions[node] = [NODE_ABSENT] * interval_len

        print(f"---> Node:{node}\t Node global label: {node_global_label}")

    predictions = []

    for k, interval_data in tqdm(enumerate(inference_dataset)):
        interval_start = inference_dataset.start + k * inference_dataset.interval_width
        interval_end = inference_dataset.start + (
            k + 1) * inference_dataset.interval_width

        print(f"Getting results for interval: {k}")
        for node in nodes_of_interest:
            print(f"Node: {node}")

            pred_label = node_predictions[node][k]
            truth_label = node_truth[node][k]

            if (truth_label == UNLABELLED
                    or truth_label == 'n/a') and pred_label == MALICIOUS:
                result = P
            elif (truth_label == UNLABELLED
                  or truth_label == 'n/a') and pred_label == BENIGN:
                result = N
            elif truth_label == MALICIOUS and pred_label == MALICIOUS:
                result = TP
            elif truth_label == BENIGN and pred_label == BENIGN:
                result = TN
            elif truth_label == MALICIOUS and pred_label == BENIGN:
                result = FN
            elif truth_label == BENIGN and pred_label == MALICIOUS:
                result = FP
            else:
                result = NODE_ABSENT

            pred = {
                'interval_id': k,
                'start_timestamp': interval_start,
                'end_timestamp': interval_end,
                'ip': node,
                'pred_label': pred_label,
                'truth_label': truth_label,
                'result': result
            }
            predictions.append(pred)
            print(f"Appending to predictions: {pred}")

    print("Saving prediction dataframe - DONE")
    prediction_df = pd.DataFrame(predictions)

    # Pandas dataframe  visualizing metrics into a grid:
    #     | interval_id | start_timestamp | end_timestamp | ip | pred_label | truth_label | /
    #     | result (TP=0, TN=1, FP=2, FN=3, Node_absent=4, POSITIVE=5, NEGATIVE=6) |
    #
    prediction_df.to_hdf(out_files['prediction_df'].path, key='df', mode='w')

    return 'succeeded'
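predict compares truth and prediction values against several constants (UNLABELLED, BENIGN, MALICIOUS, NODE_ABSENT, TP, TN, FP, FN, P, N) that come from elsewhere in the project. A plausible definition, inferred only from the OneClassSVM convention (1 = inlier, -1 = outlier) and the result-encoding comment at the end of predict(); the actual constants module is not shown:

# Ground-truth / prediction labels (OneClassSVM convention).
BENIGN = 1        # inlier
MALICIOUS = -1    # outlier
UNLABELLED = 0    # no ground truth for the interval

# Per-interval result codes, matching the grid comment in predict().
TP = 0            # true positive
TN = 1            # true negative
FP = 2            # false positive
FN = 3            # false negative
NODE_ABSENT = 4   # node had no usable data in the interval
P = 5             # positive prediction on an unlabelled interval
N = 6             # negative prediction on an unlabelled interval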
Example #6
def train_deep_graph_embeddings_model(log, in_files, out_files, start,
                                      interval_width, hidden_dim,
                                      training_epochs, tensorboard_writer,
                                      patience_epochs, learning_rate,
                                      **kwargs):

    log.info("train_deep_graph_embeddings_model")
    verbose = kwargs.get('verbose', True)

    dataset = load_dataset_vendor(in_files, start, interval_width)

    input_dim = len(dataset.get(0).x[0])
    print("Desired Backend: %s" %
          ('cuda' if torch.cuda.is_available() else 'cpu'))
    device = torch.device('cpu')

    infomax_model = Infomax(input_dim, hidden_dim).to(device)
    infomax_optimizer = torch.optim.Adam(infomax_model.parameters(),
                                         lr=learning_rate)

    for step, interval_data in enumerate(dataset):
        data = interval_data.to(device)

        if verbose:
            interval_start = dataset.start + step * dataset.interval_width
            interval_end = dataset.start + (step + 1) * dataset.interval_width
            print(f"Generated a data training interval: "
                  f"START {interval_start} END {interval_end}")
            print(f"Nodes: {data.num_nodes}")
            print(f"Edges: {data.num_edges}")
            print(f"Feat.: {data.num_features}")

    if verbose:
        print(f"Starting Training ({len(dataset)} intervals, "
              f"{training_epochs} epochs)")

    output = ModelWrapper(model=infomax_model,
                          filename=out_files['trained_model'].path)
    output.save()

    global_count = 0
    lower_loss = float('inf')  # lowest loss observed so far
    epochs_stagnation_count = 0
    for epoch in range(1, training_epochs + 1):
        epochs_stagnation_count = epochs_stagnation_count + 1
        if epochs_stagnation_count == patience_epochs:
            if verbose:
                print(
                    f"Early stop, patience={patience_epochs} exceeded. Lower loss reached: {lower_loss}"
                )
            return True
        for step, data in enumerate(dataset):
            loss = train_infomax(infomax_model, infomax_optimizer, data, epoch)
            global_count = global_count + 1
            if loss < lower_loss:
                lower_loss = loss
                epochs_stagnation_count = 0
                output = ModelWrapper(model=infomax_model,
                                      filename=out_files['trained_model'].path)
                output.save()

                if verbose:
                    print(f"Saved model with loss: {loss}")

            if tensorboard_writer:
                tensorboard_writer.add_scalar('data/loss', float(loss),
                                              global_count)

            if verbose:
                print('Epoch: {:03d}, Step: {:03d}, Loss: {:.7f}'.format(
                    epoch, step, loss))
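Examples #3 and #6 persist and restore the Infomax model through a ModelWrapper helper (ModelWrapper(model=..., filename=...).save() and ModelWrapper(model=...).load(path)). Its implementation is not included; a minimal sketch assuming it simply wraps torch.save/torch.load of the model's state_dict:

import torch


class ModelWrapper:
    """Hypothetical sketch of the persistence helper used in Examples #3 and #6."""

    def __init__(self, model, filename=None):
        self.model = model
        self.filename = filename

    def save(self):
        # Persist only the weights; the caller rebuilds the architecture
        # (e.g. Infomax(input_dim, hidden_dim)) before loading.
        torch.save(self.model.state_dict(), self.filename)

    def load(self, path):
        self.model.load_state_dict(torch.load(path, map_location='cpu'))
        self.model.eval()
        return self.model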
Example #7
def create_dataset(log, in_files, out_files, start, end, interval_width,
                   interval_overlap, graph_representation, feature_extractor):

    log.info("create_dataset")
    dataset = load_dataset_vendor(out_files, start, interval_width)

    df = pd.read_hdf(in_files['raw_file'].path, mode='r')

    # TODO: fix this in raw unb df file.
    if 'unb' in in_files['raw_file'].path:
        print("UNB workaround: modifying the index")
        df['index'] = df['Adjusted Time']  # Set "Adjusted Time" as index
        df = df.sort_values('Adjusted Time')
        df.set_index('index', inplace=True)
    else:
        df = df.sort_values('Timestamp')
        df.reset_index(inplace=True)

    # To be modified if other graph representations are being used!
    assert graph_representation in ("shallow_simplified_edges", "boston"), \
        f"Node creation needs to be adapted for the '{graph_representation}' representation"
    assert interval_overlap == 0, "Current implementation doesn't support window overlapping"

    for file_index, file_path in enumerate(dataset.files):
        log.info(f"Checking path: {file_path}...")
        if osp.exists(file_path):
            continue

        print(f"Preparing {file_path}...")

        data = FullData()
        interval_start = start + file_index * interval_width
        interval_end = start + (file_index + 1) * interval_width
        interval_end = min(interval_end, end)

        print(
            f"\nfile_index: {file_index} \n start: {interval_start} \n end: {interval_end}"
        )

        if 'unb' in in_files['raw_file'].path:
            df_interval = df.loc[
                pd.Timestamp(int(interval_start), unit='s'):
                pd.Timestamp(int(interval_end), unit='s')]
        else:
            serie = df[df['Timestamp'] >= interval_start]
            start_idx = serie.index[0] if len(serie) > 0 else None
            serie = df[(df['Timestamp'] >= interval_start)
                       & (df['Timestamp'] < interval_end)]
            end_idx = serie.index[-1] if len(serie) > 0 else None

            if start_idx is None or end_idx is None:
                print(
                    f"Can't find matching df entries for interval delimiters "
                    f"(file_path: {file_path}, start_idx: {start_idx}, end_idx: {end_idx})")
                torch.save(data, file_path)
                return "succeeded"
            # end_idx is the label of the last matching row; +1 keeps it in the slice.
            df_interval = df.iloc[start_idx:end_idx + 1]

        if len(df_interval) == 0:
            print(f"Empty interval, saving empty data file : {file_path}")
            torch.save(data, file_path)
            return "succeeded"

        if graph_representation == "shallow_simplified_edges":
            G, interval_info = create_shallow_simplified_edges_graph(
                df_interval, interval_start, interval_end)
        elif graph_representation == "boston":
            G, interval_info = create_boston_graph(df_interval, interval_start,
                                                   interval_end)

        if len(list(G.nodes)) == 0:
            print(f"Empty graph, saving empty data file : {file_path}")
            torch.save(data, file_path)
            return "succeeded"

        # convert to tensors
        # networkx < 3.0 API; newer versions use nx.to_scipy_sparse_array.
        adj = nx.to_scipy_sparse_matrix(G).tocoo()
        row = torch.from_numpy(adj.row).to(torch.long)
        col = torch.from_numpy(adj.col).to(torch.long)
        edge_index = torch.stack([row, col], dim=0)

        if feature_extractor == 'degree':
            extractor = DegreeExtractor()
        elif feature_extractor == 'boston':
            extractor = BostonExtractor()
        else:
            raise ValueError(
                f"Unsupported feature_extractor: {feature_extractor}")

        # Map each node id to its row index in the feature/label tensors.
        node_indexes_in_tensors = {
            node_id: i for i, node_id in enumerate(G.nodes)
        }

        node_ground_truths = list(
            nx.get_node_attributes(G, 'interval_ground_truth').values())

        data = FullData(x=torch.Tensor(extractor(G).tolist()),
                        y=torch.tensor(node_ground_truths),
                        edge_index=edge_index,
                        node_indexes_in_tensors=node_indexes_in_tensors)

        torch.save(data, file_path)
    return "succeeded"