Ejemplo n.º 1
0
    def fit(self, X=None):
        """
        Build an Isolation Forest model and fit it to the feature matrix.

        INPUT:
            X       Optional n x p features variable. When supplied it
                    replaces self.X; otherwise the values already held in
                    self.X (from the .load_files() step) are used.
        OUTPUT: None

        Side effects: the fitted model is stored in self.iFmodel. When
        self.show_calc_time is true, a progress line and the elapsed
        wall-clock time are printed.
        """
        # Identity test on purpose: if X is an array, `X != None` compares
        # element-wise and has no single truth value.
        if X is not None:
            self.X = X

        self.iFmodel = isof.iForest(n_estimators=self.n_estimators,
                                    max_depth=self.max_depth)

        # Read the flag once so `start` is guaranteed to exist whenever the
        # elapsed time is reported.
        timed = self.show_calc_time
        if timed:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("Constructing iTrees ...")
            start = time.time()

        self.iFmodel.fit(self.X)

        if timed:
            totsecs = time.time() - start
            mins = int(totsecs / 60)
            secs = totsecs - 60.0 * mins
            print("Elapsed time = {0} minutes, {1} seconds".format(
                mins, secs))
Ejemplo n.º 2
0
    def fit(self, X=None):
        """
        Build an Isolation Forest model and fit it to the feature matrix.

        INPUT:
            X       Optional n x p features variable. When supplied it
                    replaces self.X; otherwise the values already held in
                    self.X (from the .load_files() step) are used.
        OUTPUT: None

        Side effects: the fitted model is stored in self.iFmodel. When
        self.show_calc_time is true, a progress line and the elapsed
        wall-clock time are printed.
        """
        # Identity test on purpose: if X is an array, `X != None` compares
        # element-wise and has no single truth value.
        if X is not None:
            self.X = X

        self.iFmodel = isof.iForest(n_estimators=self.n_estimators,
                                    max_depth=self.max_depth)

        # Read the flag once so `start` is guaranteed to exist whenever the
        # elapsed time is reported.
        timed = self.show_calc_time
        if timed:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("Constructing iTrees ...")
            start = time.time()

        self.iFmodel.fit(self.X)

        if timed:
            totsecs = time.time() - start
            mins = int(totsecs / 60)
            secs = totsecs - 60.0 * mins
            print("Elapsed time = {0} minutes, {1} seconds".format(
                mins, secs))
Ejemplo n.º 3
0
 def __init__(self, signal):
     """
     Extract features from `signal` and build an iForest over them.

     The extracted features are kept in self.data; the forest built from
     them is kept in self.iforest.
     """
     features = self.feautre_extraction(signal)
     self.data = features

     # Per-column extremes, each scaled by 2, stacked as [upper, lower] and
     # transposed so that each row holds one column's (upper, lower) pair.
     # NOTE(review): doubling moves a bound *inward* when the extreme has
     # the "wrong" sign (positive minimum / negative maximum) - confirm
     # this is the intended widening.
     upper = np.amax(features, axis=0) * 2
     lower = np.amin(features, axis=0) * 2
     bounds = np.array([upper, lower], dtype=np.float64).T

     # The meaning of the third argument (100) is defined by iForest, which
     # is not visible here - presumably the number of trees; TODO confirm.
     self.iforest = iForest(features, bounds, 100)
Ejemplo n.º 4
0
def scatter_plot(X, Y, IDs, yname, xname, title, val):
    """
    Score the points of one (X, Y) feature pair with the selected algorithm.

    Which algorithm runs is decided by the module-level flags `algo_oddball`
    and `algo_iForests` (checked in that order). The module-level counter
    `plot_num` is incremented on every call.

    INPUT:
        X, Y    Equal-length sequences holding the two feature values.
        IDs     Identifiers of the points, passed through to the scorer.
        yname, xname, title
                Unused while the figure set-up below is commented out.
        val     Forwarded to parametric_min / parametric_max.
    OUTPUT:
        (scores, construction_time, scoring_time). Both times stay 0 for
        the oddball branch; the iForest branch returns its own values.
    """
    # Main Plot
    global plot_num
    plot_num = plot_num + 1
    # fig = plt.figure(plot_num)
    # ax1 = fig.add_subplot(111)
    # ax1.set_xlabel(xname)
    # ax1.set_ylabel(yname)
    # plt.title(title)
    #
    # plt.ylim([min(Y) / 2.0, ceil(max(Y) * 2.0)])
    # plt.xlim([min(X) / 2.0, ceil(max(X) * 2.0)])
    # plt.loglog(X, Y, 'k.')
    construction_time, scoring_time = 0, 0
    if algo_oddball:
        # Interpolate the median line
        minX = parametric_min(X, val)
        maxX = parametric_max(X, val)
        binedges = np.logspace(log10(minX), log10(maxX), 10)
        # Bin membership does not depend on the loop index, so digitize
        # once instead of once per bin.
        bin_of_point = np.digitize(X, binedges)

        # First anchor: median of Y over the points sitting exactly at minX.
        median_points_X = [minX]
        median_points_Y = [
            get_median([Y[j] for j, x in enumerate(X) if x == minX])
        ]

        # One anchor per interior bin, placed at the (truncated) geometric
        # mean of the bin's edges.
        for i in range(1, 10):
            mid_x = int(geometric_mean(binedges[i], binedges[i - 1]))
            mid_y = get_median(
                [Y[j] for j, b in enumerate(bin_of_point) if b == i])
            # Bins whose median comes back as NaN contribute no anchor.
            if not isnan(float(mid_y)):
                median_points_X.append(mid_x)
                median_points_Y.append(mid_y)

        # Last anchor: median of Y over the points sitting exactly at maxX.
        median_points_X.append(maxX)
        median_points_Y.append(
            get_median([Y[j] for j, x in enumerate(X) if x == maxX]))
        # With the figure creation above disabled, this draws on whatever
        # pyplot figure is current.
        plt.plot(median_points_X, median_points_Y, 'ro-')

        # Calculate Oddball Scores
        scores = oddball.get_scores(median_points_X, median_points_Y, X, Y,
                                    IDs)

    elif algo_iForests:
        features = combine_features([IDs, X, Y])
        scores, construction_time, scoring_time = iForest(features)
    else:
        # NOTE(review): `scores` is never bound on this path, so unless
        # print_fail terminates the program the return below raises
        # UnboundLocalError - confirm print_fail's behaviour.
        print_fail("Scoring Algorithm not Chosen")

    # Write rank and scores to outputfile
    return scores, construction_time, scoring_time
Ejemplo n.º 5
0
def scatter_plot(feature_X, feature_Y, make_plot=False):
    """
    Score a pair of features with iForest and optionally scatter-plot them.

    The two features are merged with combine_features and the resulting
    ids/data are handed to iForest. A new pyplot figure is always created;
    it is only drawn into when `make_plot` is true.

    INPUT:
        feature_X, feature_Y
                    Feature objects; get_data(), get_description() and
                    get_log() are called on them.
        make_plot   When true, draw Y against X as black dots, using a log
                    scale (and limits of [min / 2, ceil(max * 2)]) on each
                    axis whose feature reports get_log() as true.
    OUTPUT:
        (fig, scores)
    """
    ids, data = combine_features([feature_X, feature_Y])
    scores = iForest(ids, data)

    fig = plt.figure()
    if not make_plot:
        # Caller only wants the scores; hand back the empty figure.
        return fig, scores

    xs = feature_X.get_data()
    ys = feature_Y.get_data()

    axes = fig.add_subplot(111)
    axes.set_xlabel(feature_X.get_description(), fontsize=SIZES['label'])
    axes.set_ylabel(feature_Y.get_description(), fontsize=SIZES['label'])
    axes.xaxis.set_tick_params(labelsize=SIZES['tick'])
    axes.yaxis.set_tick_params(labelsize=SIZES['tick'])
    # Leave room for the axis labels.
    plt.gcf().subplots_adjust(bottom=0.18, left=0.18)

    if feature_X.get_log():
        axes.set_xscale('log')
        x_low = min(xs) / 2.0
        x_high = ceil(max(xs) * 2.0)
        plt.xlim([x_low, x_high])

    if feature_Y.get_log():
        axes.set_yscale('log')
        y_low = min(ys) / 2.0
        y_high = ceil(max(ys) * 2.0)
        plt.ylim([y_low, y_high])

    plt.plot(xs, ys, 'k.')
    return fig, scores
Ejemplo n.º 6
0
                           discription[Y] + ' vs ' + discription[X],
                           compare_value[X])
        pp.savefig(fig)
        update_progress(i + 1, len(feature_pairs))
    pp.close()
    scatter_plots = len(feature_pairs)
    print_ok('Scatter Plots Generated')
""" PlotSPOT Algorithm """
# Get outlier scores if using iForests: combine every configured feature
# and run iForest over the combined set.
if generate_iForest:
    # NOTE(review): this status text is the same one used for graph
    # generation further down - possibly a copy/paste leftover; confirm.
    cprint("Generating Graph File")
    # Each entry of the three feature lists is a string that eval() turns
    # into an object.
    # NOTE(review): eval() executes arbitrary expressions; this is only safe
    # while the feature-name lists come from trusted code/configuration.
    features = combine_features([
        eval(F)
        for F in identity_features + continuous_features + discrete_features
    ])
    iForest(features)
    print_ok("iForest Generation Complete")

# Log file for the runs below, opened for writing (truncates existing file).
# NOTE(review): the name `file` shadows the Python 2 builtin, and no close()
# is visible in this part of the script - confirm it is closed later.
file = open(filefolder + "Log.txt", 'w')
# Values of N to sweep; each one is passed to ranklist.generate_graph.
N_list = [10, 20, 50, 75, 100]
for N_val in N_list:
    # Create graph between outliers and plots
    cprint("Generating Graph File")
    ranklist.generate_graph(P_val, N_val)
    print_ok("Graph File Generated")
    # Run plotSpot to get selected graphs
    # Every budget value is combined with every selection algorithm below.
    Budget = [1, 2, 3, 4, 5, 6]
    for B in Budget:
        for algo in ["SpellOut", "Greedy", "G_Norm"]:
            # Python 2 print statement: echoes the current configuration.
            print "N_val = ", N_val, " Budget = ", B, " ALGO = ", algo
            # Wall-clock start for timing this configuration.
            start_time = time.time()
Ejemplo n.º 7
0
def scatter_plot(X, Y, IDs, yname, xname, title, val):
    """
    Draw a log-log scatter plot of one feature pair and score its points.

    A new figure numbered by the module-level counter `plot_num` (which is
    incremented on every call) is created, labelled and filled with the
    points. The points are then scored by the algorithm selected through
    the module-level flags `algo_oddball` / `algo_iForests` (checked in
    that order), and the scores are written with write_to_file.

    INPUT:
        X, Y    Equal-length sequences holding the two feature values.
        IDs     Identifiers of the points, passed through to the scorer.
        yname, xname
                Axis captions, placed as figure-level text.
        title   Plot title.
        val     Forwarded to parametric_min / parametric_max.
    OUTPUT:
        The matplotlib figure that was drawn.
    """
    # Main Plot
    global plot_num
    plot_num = plot_num + 1
    fig = plt.figure(plot_num)
    fig.text(0.5, 0.04, xname, ha='center')
    fig.text(0.04, 0.5, yname, va='center', rotation='vertical')
    # ax1 is otherwise only referenced by the disabled heat-map code below.
    ax1 = fig.add_subplot(111)
    plt.title(title)

    # Limits run from half the minimum to (rounded-up) twice the maximum.
    plt.ylim([min(Y) / 2.0, ceil(max(Y) * 2.0)])
    plt.xlim([min(X) / 2.0, ceil(max(X) * 2.0)])
    plt.loglog(X, Y, 'k.')

    if algo_oddball:
        # Interpolate the median line
        minX = parametric_min(X, val)
        maxX = parametric_max(X, val)
        binedges = np.logspace(log10(minX), log10(maxX), 10)
        # Bin membership does not depend on the loop index, so digitize
        # once instead of once per bin.
        bin_of_point = np.digitize(X, binedges)

        # First anchor: median of Y over the points sitting exactly at minX.
        median_points_X = [minX]
        median_points_Y = [
            get_median([Y[j] for j, x in enumerate(X) if x == minX])
        ]

        # One anchor per interior bin, placed at the (truncated) geometric
        # mean of the bin's edges.
        for i in range(1, 10):
            mid_x = int(geometric_mean(binedges[i], binedges[i - 1]))
            mid_y = get_median(
                [Y[j] for j, b in enumerate(bin_of_point) if b == i])
            # Bins whose median comes back as NaN contribute no anchor.
            if not isnan(float(mid_y)):
                median_points_X.append(mid_x)
                median_points_Y.append(mid_y)

        # Last anchor: median of Y over the points sitting exactly at maxX.
        median_points_X.append(maxX)
        median_points_Y.append(
            get_median([Y[j] for j, x in enumerate(X) if x == maxX]))
        plt.plot(median_points_X, median_points_Y, 'ro-')

        # Calculate Oddball Scores
        scores = oddball.get_scores(median_points_X, median_points_Y, X, Y,
                                    IDs)

    elif algo_iForests:
        features = combine_features([IDs, X, Y])
        scores = iForest(features)
    else:
        # NOTE(review): `scores` is never bound on this path, so unless
        # print_fail terminates the program the write_to_file call below
        # raises UnboundLocalError - confirm print_fail's behaviour.
        print_fail("Scoring Algorithm not Chosen")

    # Write rank and scores to outputfile
    write_to_file(scores, plot_num)

    # # Heat Map
    # heatmap, xedges ,yedges = np.histogram2d(X, Y, bins=(np.logspace(0, ceil(log10(max(X))), 400), np.logspace(0, ceil(log10(max(Y))), 250)))
    # extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
    # ax2 = fig.add_subplot(212, sharex = ax1, sharey = ax1)
    # # plt.xscale('log'); plt.yscale('log')
    # cmap = plt.cm.jet
    # cmap.set_under(color = 'white')
    # plt.imshow(heatmap.T, extent = extent, origin='lower', aspect='auto', cmap = cmap, vmin = 0.01)
    # plt.ylim([min(Y),max(Y)])
    # plt.xlim([min(X),max(X)])
    return fig
Ejemplo n.º 8
0
def main():
    """
    Benchmark several classifiers / outlier detectors on .mat datasets.

    For every dataset the features are loaded with load_from_mat and
    standardised with stats.zscore. Each algorithm is then evaluated over
    `iteration` splits produced by cross_validation(X, y, ratio), and the
    mean ROC AUC, mean accuracy and mean training time are printed.

    Notes:
        * Unsupervised detectors are fitted on the training rows labelled 1
          only; the supervised ones receive features and labels.
        * Only the fit call is timed, using CPU time (time.process_time).
        * Predictions of -1 are mapped to 0 before being compared with the
          test labels (assumes labels are 0/1 - TODO confirm).
        * As the printed banner says, accuracy for the project's iForest
          variants is not considered reliable.
    """
    # Detectors that are fitted without labels.
    unsupervised = {
        'IsolationForest', 'OneClassSVM', 'LOF', 'iForest_n',
        'iForest_create', 'iForest_paper',
    }
    # Methods with a decision_function, for which a ROC curve is computed.
    has_decision_function = {
        'SVM', 'SVM_linear', 'LOF', 'OneClassSVM', 'IsolationForest',
        'iForest_n', 'iForest_create', 'iForest_paper',
    }
    # Variants whose decision scores are negated before the ROC computation.
    negated_score = {'iForest_n', 'iForest_create'}

    # datasets = {'cardio.mat','shuttle.mat'}
    # datasets = {'thyroid.mat'}
    datasets = {'mnist.mat'}
    print('================================================================')
    print('> Loading Data ...')
    for mat_fname in datasets:
        X, y = load_from_mat(mat_fname)
        X = stats.zscore(X)
        print('> dataset: ', mat_fname)
        print(
            '================================================================')
        # NOTE(review): IsolationForest's `behaviour` argument is only
        # accepted by some scikit-learn releases - confirm the pinned
        # version before upgrading.
        algorithms = {
            # 'KNN': neighbors.KNeighborsClassifier(),
            'SVM':
            svm.SVC(gamma='auto'),
            # 'SVM_linear': svm.OneClassSVM(kernel='linear'),
            # 'RandomForest': ensemble.RandomForestClassifier(),
            # 'ExtraTrees': ensemble.ExtraTreesClassifier(n_estimators=100),
            # 'DBSCAN': cluster.DBSCAN(eps=1, min_samples=5),
            # 'iForest_create': iForest(with_replacement=True),
            'OneClassSVM':
            svm.OneClassSVM(gamma='auto'),
            'iForest_n':
            iForest(with_replacement=True, project_flag=True),
            'iForest_create':
            iForest(with_replacement=True, project_flag=False),
            # 'iForest_paper': iForest(with_replacement=False),
            'IsolationForest':
            ensemble.IsolationForest(contamination=0.2, behaviour='new'),
            'LOF':
            neighbors.LocalOutlierFactor(contamination=0.2, novelty=True),
        }
        print('> Algorithms: {}'.format(list(algorithms)))
        print('> Accuracy for iForest is not correct')
        for name, clf in algorithms.items():
            print('---------------------------------------------')
            print('> {}'.format(name))
            sum_accuracy = 0
            auc_value = []
            total_time = 0
            ratio = 0.7
            iteration = 5

            for _ in range(iteration):
                data = cross_validation(X, y, ratio)
                t0 = time.process_time()
                if name in unsupervised:
                    # Fit on the rows labelled 1 only.
                    X_train = np.array(data['X_train'])
                    y_train = np.array(data['y_train'])
                    X_train = X_train[np.where(y_train == 1)]
                    clf.fit(X_train)
                else:
                    # supervised method.
                    clf.fit(data['X_train'], data['y_train'])

                t1 = time.process_time()
                y_pred = clf.predict(data['X_test'])

                y_real = np.array(data['y_test'])
                # Detectors report outliers as -1; map them to label 0.
                y_pred[np.where(y_pred == -1)] = 0
                # Accumulate the fraction of matching labels; without this
                # the reported accuracy is always 0.
                sum_accuracy += np.mean(
                    np.ravel(y_pred) == np.ravel(y_real))

                if name in has_decision_function:
                    y_score = np.asarray(clf.decision_function(data['X_test']))
                    if name in negated_score:
                        y_score = -1 * y_score
                    fpr, tpr, thresholds = metrics.roc_curve(data['y_test'],
                                                             y_score,
                                                             pos_label=1)
                    roc_auc = metrics.auc(fpr, tpr)
                    auc_value.append(roc_auc)
                total_time += (t1 - t0)
            print('\nAUC value is: ', np.mean(np.asarray(auc_value)))
            print('Accuracy: {}'.format(float(sum_accuracy) / iteration))
            print('Training Time: {}'.format(float(total_time) / iteration))
        print('---------------------------------------------')
        print('==============================================================')