Example #1
def test_lof_precomputed(random_state=42):
    """Tests LOF with a distance matrix."""
    # Note: smaller samples may result in spurious test success
    rng = np.random.RandomState(random_state)
    X = rng.random_sample((10, 4))
    Y = rng.random_sample((3, 4))
    DXX = metrics.pairwise_distances(X, metric='euclidean')
    DYX = metrics.pairwise_distances(Y, X, metric='euclidean')
    # As a feature matrix (n_samples by n_features)
    lof_X = neighbors.LocalOutlierFactor(n_neighbors=3, novelty=True)
    lof_X.fit(X)
    pred_X_X = lof_X._predict()
    pred_X_Y = lof_X.predict(Y)

    # As a dense distance matrix (n_samples by n_samples)
    lof_D = neighbors.LocalOutlierFactor(n_neighbors=3,
                                         algorithm='brute',
                                         metric='precomputed',
                                         novelty=True)
    lof_D.fit(DXX)
    pred_D_X = lof_D._predict()
    pred_D_Y = lof_D.predict(DYX)

    assert_array_almost_equal(pred_X_X, pred_D_X)
    assert_array_almost_equal(pred_X_Y, pred_D_Y)
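A quick note on the shape contract this equivalence relies on: with metric='precomputed', fit takes the square train-to-train matrix while predict takes test-to-train distances, one row per query point. A self-contained check:

import numpy as np
from sklearn import metrics

rng = np.random.RandomState(42)
X, Y = rng.random_sample((10, 4)), rng.random_sample((3, 4))
assert metrics.pairwise_distances(X).shape == (10, 10)     # passed to fit
assert metrics.pairwise_distances(Y, X).shape == (3, 10)   # passed to predict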
Example #2
def test_n_neighbors_attribute():
    X = iris.data
    clf = neighbors.LocalOutlierFactor(n_neighbors=500).fit(X)
    assert clf.n_neighbors_ == X.shape[0] - 1

    clf = neighbors.LocalOutlierFactor(n_neighbors=500)
    assert_warns_message(UserWarning,
                         "n_neighbors will be set to (n_samples - 1)", clf.fit,
                         X)
    assert clf.n_neighbors_ == X.shape[0] - 1
Example #3
def test_n_neighbors_attribute():
    X = iris.data
    clf = neighbors.LocalOutlierFactor(n_neighbors=500).fit(X)
    assert clf.n_neighbors_ == X.shape[0] - 1

    clf = neighbors.LocalOutlierFactor(n_neighbors=500)
    msg = "n_neighbors will be set to (n_samples - 1)"
    with pytest.warns(UserWarning, match=re.escape(msg)):
        clf.fit(X)
    assert clf.n_neighbors_ == X.shape[0] - 1
Example #4
def test_score_samples():
    X_train = [[1, 1], [1, 2], [2, 1]]
    clf1 = neighbors.LocalOutlierFactor(n_neighbors=2,
                                        contamination=0.1).fit(X_train)
    clf2 = neighbors.LocalOutlierFactor(n_neighbors=2).fit(X_train)
    assert_array_equal(clf1._score_samples([[2., 2.]]),
                       clf1._decision_function([[2., 2.]]) + clf1.offset_)
    assert_array_equal(clf2._score_samples([[2., 2.]]),
                       clf2._decision_function([[2., 2.]]) + clf2.offset_)
    assert_array_equal(clf1._score_samples([[2., 2.]]),
                       clf2._score_samples([[2., 2.]]))
Example #5
def test_novelty_errors():
    X = iris.data

    # check errors for novelty=False
    clf = neighbors.LocalOutlierFactor()
    clf.fit(X)
    # predict, decision_function and score_samples raise AttributeError
    for method in ['predict', 'decision_function', 'score_samples']:
        msg = ('{} is not available when novelty=False'.format(method))
        assert_raises_regex(AttributeError, msg, getattr, clf, method)

    # check errors for novelty=True
    clf = neighbors.LocalOutlierFactor(novelty=True)
    msg = 'fit_predict is not available when novelty=True'
    assert_raises_regex(AttributeError, msg, getattr, clf, 'fit_predict')
Example #6
def choose_models():

    isolFor = {
        'name': 'Isolation Forest',
        'class': ensemble.IsolationForest(),
        'parameters': {
            'n_estimators': [5, 10, 20, 50, 100, 150, 200]
        }
    }

    locOutFac = {
        'name': 'Local Outlier Factor',
        'class': neighbors.LocalOutlierFactor(novelty=True),
        'parameters': {
            'n_neighbors': range(5, 50, 5)
        }
    }
    # ocSVM = {'name': 'One Class SVM',
    #          'class': svm.OneClassSVM(),
    #          'parameters': {
    #              'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    #              'nu': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
    #          }
    #          }

    elEnv = {
        'name': 'Elliptic Envelope',
        'class': covariance.EllipticEnvelope(),
        'parameters': {
            'contamination': np.linspace(0.05, 0.45, 9)
        }
    }

    return [isolFor, locOutFac, elEnv]
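A minimal consumption sketch for these specs, assuming the downstream scoring step is defined elsewhere; ParameterGrid and clone are standard scikit-learn utilities.

from sklearn.base import clone
from sklearn.model_selection import ParameterGrid

# Enumerate every candidate configuration each spec describes.
for spec in choose_models():
    for params in ParameterGrid(spec['parameters']):
        model = clone(spec['class']).set_params(**params)
        print(spec['name'], params)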
Example #7
def remove_outliers_and_normalize(data_dirty, n_neighbors=20):
    clf = neighbors.LocalOutlierFactor(n_neighbors=n_neighbors)
    norm = preprocessing.Normalizer()
    data_map = clf.fit_predict(data_dirty)
    data_clean = data_dirty[data_map > 0]
    data_normalized = norm.fit_transform(data_clean)
    return data_normalized, data_map
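A hedged usage sketch for this helper; the input array and sizes are illustrative only.

import numpy as np

rng = np.random.RandomState(0)
dirty = np.r_[rng.randn(50, 3), [[8.0, 8.0, 8.0]]]  # one obvious outlier row
clean, mask = remove_outliers_and_normalize(dirty, n_neighbors=10)
print(dirty.shape, clean.shape)  # rows flagged -1 in mask are dropped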
Example #8
def test_novelty_training_scores():
    # check that the scores of the training samples are still accessible
    # when novelty=True through the negative_outlier_factor_ attribute
    X = iris.data

    # fit with novelty=False
    clf_1 = neighbors.LocalOutlierFactor()
    clf_1.fit(X)
    scores_1 = clf_1.negative_outlier_factor_

    # fit with novelty=True
    clf_2 = neighbors.LocalOutlierFactor(novelty=True)
    clf_2.fit(X)
    scores_2 = clf_2.negative_outlier_factor_

    assert_array_almost_equal(scores_1, scores_2)
Example #9
def test_lof():
    # Toy sample (the last two samples are outliers):
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [5, 3], [-4, 2]]

    # Test LocalOutlierFactor:
    clf = neighbors.LocalOutlierFactor(n_neighbors=5)
    score = clf.fit(X).negative_outlier_factor_
    assert_array_equal(clf._fit_X, X)

    # Assert largest outlier score is smaller than smallest inlier score:
    assert_greater(np.min(score[:-2]), np.max(score[-2:]))

    # Assert predict() works:
    clf = neighbors.LocalOutlierFactor(contamination=0.25,
                                       n_neighbors=5).fit(X)
    assert_array_equal(clf._predict(), 6 * [1] + 2 * [-1])
Example #10
def test_novelty_errors():
    X = iris.data

    # check errors for novelty=False
    clf = neighbors.LocalOutlierFactor()
    clf.fit(X)
    # predict, decision_function and score_samples raise AttributeError
    for method in ["predict", "decision_function", "score_samples"]:
        msg = "{} is not available when novelty=False".format(method)
        with pytest.raises(AttributeError, match=msg):
            getattr(clf, method)

    # check errors for novelty=True
    clf = neighbors.LocalOutlierFactor(novelty=True)
    msg = "fit_predict is not available when novelty=True"
    with pytest.raises(AttributeError, match=msg):
        getattr(clf, "fit_predict")
Example #11
def test_score_samples():
    X_train = [[1, 1], [1, 2], [2, 1]]
    clf1 = neighbors.LocalOutlierFactor(n_neighbors=2,
                                        contamination=0.1,
                                        novelty=True).fit(X_train)
    clf2 = neighbors.LocalOutlierFactor(n_neighbors=2,
                                        novelty=True).fit(X_train)
    assert_array_equal(
        clf1.score_samples([[2.0, 2.0]]),
        clf1.decision_function([[2.0, 2.0]]) + clf1.offset_,
    )
    assert_array_equal(
        clf2.score_samples([[2.0, 2.0]]),
        clf2.decision_function([[2.0, 2.0]]) + clf2.offset_,
    )
    assert_array_equal(clf1.score_samples([[2.0, 2.0]]),
                       clf2.score_samples([[2.0, 2.0]]))
Example #12
def test_lof_values():
    # toy samples:
    X_train = [[1, 1], [1, 2], [2, 1]]
    clf1 = neighbors.LocalOutlierFactor(n_neighbors=2,
                                        contamination=0.1).fit(X_train)
    clf2 = neighbors.LocalOutlierFactor(n_neighbors=2).fit(X_train)
    s_0 = 2. * sqrt(2.) / (1. + sqrt(2.))
    s_1 = (1. + sqrt(2)) * (1. / (4. * sqrt(2.)) + 1. / (2. + 2. * sqrt(2)))
    # check predict()
    assert_array_almost_equal(-clf1.negative_outlier_factor_, [s_0, s_1, s_1])
    assert_array_almost_equal(-clf2.negative_outlier_factor_, [s_0, s_1, s_1])
    # check predict(one sample not in train)
    assert_array_almost_equal(-clf1._score_samples([[2., 2.]]), [s_0])
    assert_array_almost_equal(-clf2._score_samples([[2., 2.]]), [s_0])
    # check predict(one sample already in train)
    assert_array_almost_equal(-clf1._score_samples([[1., 1.]]), [s_1])
    assert_array_almost_equal(-clf2._score_samples([[1., 1.]]), [s_1])
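For reference, s_0 and s_1 above follow from the standard LOF definition with k=2 on this triangle; a hand-derivation sketch:

from math import sqrt

# Pairwise distances in X_train = [[1, 1], [1, 2], [2, 1]]:
#   d(a, b) = d(a, c) = 1 and d(b, c) = sqrt(2), so the 2-distances are
#   k_dist(a) = 1 and k_dist(b) = k_dist(c) = sqrt(2).
# reach_dist(p, o) = max(k_dist(o), d(p, o)); lrd(p) = 1 / mean(reach_dist)
lrd_a = 1. / sqrt(2.)            # both reachability distances are sqrt(2)
lrd_b = 2. / (1. + sqrt(2.))     # reachability distances 1 and sqrt(2)
lrd_c = lrd_b                    # c is symmetric to b
s_0 = (lrd_b + lrd_c) / 2. / lrd_a   # LOF(a) = 2*sqrt(2) / (1 + sqrt(2))
s_1 = (lrd_a + lrd_c) / 2. / lrd_b   # LOF(b) = LOF(c)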
Example #13
def test_hasattr_prediction():
    # check availability of prediction methods depending on novelty value.
    X = [[1, 1], [1, 2], [2, 1]]

    # when novelty=True
    clf = neighbors.LocalOutlierFactor(novelty=True)
    clf.fit(X)
    assert hasattr(clf, 'predict')
    assert hasattr(clf, 'decision_function')
    assert hasattr(clf, 'score_samples')
    assert not hasattr(clf, 'fit_predict')

    # when novelty=False
    clf = neighbors.LocalOutlierFactor(novelty=False)
    clf.fit(X)
    assert hasattr(clf, 'fit_predict')
    assert not hasattr(clf, 'predict')
    assert not hasattr(clf, 'decision_function')
    assert not hasattr(clf, 'score_samples')
Example #14
def sk_check(X_train, X_test, y_test, o_list):
    f_f = [neighbors.LocalOutlierFactor(n_neighbors=5),
           neighbors.LocalOutlierFactor(n_neighbors=10),
           neighbors.LocalOutlierFactor(n_neighbors=35),
           IsolationForest(max_samples='auto')]
    f_name = ['LOF5', 'LOF10', 'LOF35', 'i-forest']

    columns = ['method'] + ['AUC', 'MCC', 'BRU']
    n_row = 2
    index = np.arange(n_row)  # array of numbers for the number of samples
    df = pd.DataFrame(columns=columns, index=index)

    # Boolean mask of the true outliers: labels that appear in o_list
    T_o = np.isin(y_test, o_list)

    auc_max = -1
    for i in range(3):
        lof = f_f[i]
        lof.fit(X_test)
        outliers = -lof.negative_outlier_factor_

        auc_test = roc_auc_score(T_o, outliers)
        if auc_test > auc_max:
            auc_max = auc_test
            df.loc[0, 'method'] = f_name[i]
            df.loc[0, 'MCC'] = mce.MCC(T_o, outliers)
            df.loc[0, 'AUC'] = auc_max
            df.loc[0, 'BRU'] = mce.bru_score(T_o, outliers)

    df.loc[1, 'method'] = f_name[3]
    isof = f_f[3]
    isof.fit(X_train)
    scores_pred = isof.decision_function(X_test)
    outliers = scores_pred.max() - scores_pred
    df.loc[1, 'MCC'] = mce.MCC(T_o, outliers)
    df.loc[1, 'AUC'] = roc_auc_score(T_o, outliers)
    df.loc[1, 'BRU'] = mce.bru_score(T_o, outliers)

    return df
Example #15
def test_predicted_outlier_number(expected_outliers):
    # the number of predicted outliers should be equal to the number of
    # expected outliers unless there are ties in the abnormality scores.
    X = iris.data
    n_samples = X.shape[0]
    contamination = float(expected_outliers) / n_samples

    clf = neighbors.LocalOutlierFactor(contamination=contamination)
    y_pred = clf.fit_predict(X)

    num_outliers = np.sum(y_pred != 1)
    if num_outliers != expected_outliers:
        y_dec = clf.negative_outlier_factor_
        check_outlier_corruption(num_outliers, expected_outliers, y_dec)
Example #16
    def __train(self,
                train_data,
                columns,
                label,
                n_neighbors=None,
                distance_metric=None):
        if n_neighbors is None:
            n_neighbors = self.__n_neighbors_list[0]
        if distance_metric is None:
            distance_metric = self.__distMetricsList[0]

        self.__model = neighbors.LocalOutlierFactor(n_neighbors=n_neighbors,
                                                    algorithm='auto',
                                                    metric=distance_metric)
        self.__model.fit_predict(train_data[columns])
Example #17
def train_and_save(training_data, outloc):
    """
    Trains a LOF algorithm for the purposes of novelty detection and pickles it
    Standardizes the data first (transforms each column by subtracting the mean
    and then dividing by the stddev)
    

    Parameters
    ----------
    training_data : TYPE
        a pandas DataFrame of the training data.
    out_loc : TYPE
        name of the pickled object to save, which is a tuple with length 2, where
        the first entry is the model. The second is a list of lists, where the first
        list is the list of means used to transform the data and the second is the list
        of the stddevs used to transform the data

    Returns
    -------
     a tuple with length 2, where
        the first entry is the model. The second is a list of lists, where the first
        list is the list of means used to transform the data and the second is the list
        of the stddevs used to transform the data

    """
    means = []
    stddevs = []
    for col in training_data.columns:
        means.append(training_data[col].mean())
        stddevs.append(training_data[col].std())

    standard_data = standardize_data(training_data, (means, stddevs))

    lof = neighbors.LocalOutlierFactor(novelty=True)
    lof.fit(standard_data)
    
    out_obj = (lof, (means, stddevs))
    with open(outloc, "wb") as f:
        pickle.dump(out_obj, f)
    
    return out_obj
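A hedged companion sketch for consuming the pickle written above; load_and_score is hypothetical, and standardize_data is assumed to be the same helper train_and_save uses.

import pickle

def load_and_score(model_loc, new_data):
    # Load the (model, (means, stddevs)) tuple written by train_and_save.
    with open(model_loc, "rb") as f:
        lof, (means, stddevs) = pickle.load(f)
    # Apply the training-time standardization before scoring.
    standardized = standardize_data(new_data, (means, stddevs))
    return lof.predict(standardized)  # +1 inlier, -1 novelty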
Example #18
def test_lof_performance():
    # Generate train/test data
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2)
    X_train = X[:100]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X[100:], X_outliers]
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model for novelty detection
    clf = neighbors.LocalOutlierFactor(novelty=True).fit(X_train)

    # predict scores (the lower, the more normal)
    y_pred = -clf.decision_function(X_test)

    # check that roc_auc is good
    assert roc_auc_score(y_test, y_pred) > .99
Example #19
def registerGroundTruth(truth):
    global model
    global modelOneClasses
    global histograms
    global kernel
    model = svm.NuSVC(kernel=kernel)
    for obj in truth:
        modelOneClasses[obj] = neighbors.LocalOutlierFactor(novelty=True)
    histograms = truth
    data = []
    labels = []
    for obj in truth:
        objData = []
        for hist in truth[obj]:
            data.append(hist[0])
            objData.append(hist[0])
            labels.append(obj)
        modelOneClasses[obj].fit(objData)
    print('Fitting model to data')
    model.fit(data, labels)
Example #20
def AdewoyinSavgolFilter(out_col,
                         dir_model_country,
                         windowlength=13,
                         polyorder=4,
                         types=['C', 'L'],
                         save_xlsx=False,
                         _auto_filter_date=False,
                         re_turn_graph=False):

    for _out_col in out_col:

        path = dirs_excel[dir_model_country] + _out_col
        data = pd.read_excel(path + '.xlsx', sheet_name='Data')

        data = dateFilter(data, auto=_auto_filter_date)

        for _type in types:
            # Create a copy of the original data excluding outliers. The
            # series holds roughly 600 stable values, so assume at most 40
            # of them (the numerator below) are outliers.
            _contamination = (40 / data[_type].shape[0])
            outlier_clf = neighbors.LocalOutlierFactor(
                n_neighbors=20, contamination=_contamination, n_jobs=1)
            data[_type + " inliers"] = outlier_clf.fit_predict(
                data.loc[:, _type].to_numpy().reshape(-1, 1))

            inlier_max = data[(data[_type + " inliers"] == 1)][_type].max()
            inlier_min = data[(data[_type + " inliers"] == 1)][_type].min()
            absolute_inlier_max = np.maximum(np.absolute(inlier_max),
                                             np.absolute(inlier_min))

            data[_type + " ex. outliers"] = [
                (np.sign(x) * absolute_inlier_max)
                if np.absolute(x) > absolute_inlier_max else x
                for x in data[_type]
            ]

            ##making savgol filter and savgol period on period change
            savgol_signal = pd.DataFrame(
                signal.savgol_filter(data.loc[:, _type],
                                     window_length=windowlength,
                                     polyorder=polyorder))
            data["{}_Savgol_Filtered".format(_type)] = savgol_signal
            signal_change_per_period = savgol_signal.diff()
            signal_change_per_period_sign = [
                1 if val > 0 else -1 if val < 0 else 0
                for val in signal_change_per_period.iloc[:, 0]
            ]

            #AdeSavGol output
            data[_type + " above 0"] = data[_type + " ex. outliers"] - (
                data[_type + " ex. outliers"].min())
            data[_out_col +
                 _type] = data[_type +
                               " above 0"] * (signal_change_per_period_sign)

        if save_xlsx:
            writer = pd.ExcelWriter(path + '_hardcoded.xlsx')
            data.to_excel(writer, 'Data', index=False)
            writer.save()  # persist the workbook to disk

        if re_turn_graph:
            #x= np.arange(len(savgol_signal)-1)
            trace0 = go.Scatter(x=np.asarray(data['Date']),
                                y=data[_type].to_numpy().flatten(),
                                name="Original Data")
            trace1 = go.Scatter(x=np.asarray(data['Date']),
                                y=savgol_signal.to_numpy().flatten(),
                                name="Savgol Filter")
            trace2 = go.Scatter(
                x=np.asarray(data['Date']),
                y=np.asarray(signal_change_per_period_sign).flatten(),
                name="Savgol Filter Change")

            trace3 = go.Scatter(x=np.asarray(data['Date']),
                                y=data[_type + " ex. outliers"].to_numpy().flatten(),
                                name="L^ex-out")
            trace4 = go.Scatter(x=np.asarray(data['Date']),
                                y=data[_type + " above 0"].to_numpy().flatten(),
                                name="L^S2")
            trace5 = go.Scatter(x=np.asarray(data['Date']),
                                y=data[_out_col + _type].to_numpy().flatten(),
                                name='L_asg')

            trace_set = [trace0, trace1, trace2, trace3, trace4, trace5]
            layout = go.Layout(title=_out_col + _type)
            fig = go.Figure(data=trace_set, layout=layout)

            py.offline.iplot(fig,
                             image='png',
                             filename="{}-{}".format(_out_col, _type))
Example #21
def test_contamination():
    X = [[1, 1], [1, 0]]
    clf = neighbors.LocalOutlierFactor(contamination=0.6)
    with pytest.raises(ValueError):
        clf.fit(X)
Example #22
def test_novelty_true_common_tests():

    # the common tests are run for the default LOF (novelty=False).
    # here we run these common tests for LOF when novelty=True
    check_estimator(neighbors.LocalOutlierFactor(novelty=True))
Example #23
def test_hasattr_prediction():
    # check availability of prediction methods depending on novelty value.
    X = [[1, 1], [1, 2], [2, 1]]

    # when novelty=True
    clf = neighbors.LocalOutlierFactor(novelty=True)
    clf.fit(X)
    assert hasattr(clf, "predict")
    assert hasattr(clf, "decision_function")
    assert hasattr(clf, "score_samples")
    assert not hasattr(clf, "fit_predict")

    # when novelty=False
    clf = neighbors.LocalOutlierFactor(novelty=False)
    clf.fit(X)
    assert hasattr(clf, "fit_predict")
    assert not hasattr(clf, "predict")
    assert not hasattr(clf, "decision_function")
    assert not hasattr(clf, "score_samples")


@parametrize_with_checks([neighbors.LocalOutlierFactor(novelty=True)])
def test_novelty_true_common_tests(estimator, check):
    # the common tests are run for the default LOF (novelty=False).
    # here we run these common tests for LOF when novelty=True
    check(estimator)


@pytest.mark.parametrize("expected_outliers", [30, 53])
def test_predicted_outlier_number(expected_outliers):
    # the number of predicted outliers should be equal to the number of
    # expected outliers unless there are ties in the abnormality scores.
    X = iris.data
    n_samples = X.shape[0]
    contamination = float(expected_outliers) / n_samples

    clf = neighbors.LocalOutlierFactor(contamination=contamination)
    y_pred = clf.fit_predict(X)

    num_outliers = np.sum(y_pred != 1)
    if num_outliers != expected_outliers:
        y_dec = clf.negative_outlier_factor_
        check_outlier_corruption(num_outliers, expected_outliers, y_dec)
Example #24
# Show a bar chart of the variance of each gene (to preview what the
# flat-pattern filter will do)
plt.bar(np.arange(len(variancias)), variancias, width=30)
plt.title("Variance of each gene")
plt.xlabel('Genes')
plt.ylabel("Variance")
plt.show()

# Flat-pattern filter (drops genes with little variability)
model_flat = VarianceThreshold(threshold=var_media * 2)
input_filtrado1 = model_flat.fit_transform(input_data)
print("Initial size: ", input_data.shape,
      "\nSize after the flat-pattern filter: ", input_filtrado1.shape)

# Outlier removal (samples that lie far outside the norm)
outlier_model = neighbors.LocalOutlierFactor(n_neighbors=20, contamination=0.1)
remover = outlier_model.fit_predict(input_filtrado1.transpose())
# fit_predict returns +1/-1 labels, so map the -1s to column indices first
input_filtrado2 = np.delete(input_filtrado1, np.where(remover == -1)[0], axis=1)
print("Size after the outlier filter: ", input_filtrado2.shape)

# Data normalization
scaled_input = preprocessing.scale(input_filtrado2)
print("Mean: ", scaled_input.mean())
print("Standard deviation: ", scaled_input.std())

labels_doenca = meta.values[:, 1]  # insulin sensitive, insulin resistant, diabetic
labels_tratamento = meta.values[:, 2]  # insulin-treated vs. untreated

print('\n\n----------------------------  Multivariate Statistical Analysis  ----------------------------------------\n')
Example #25
def test_contamination():
    X = [[1, 1], [1, 0]]
    clf = neighbors.LocalOutlierFactor(contamination=0.6)
    assert_raises(ValueError, clf.fit, X)
Example #26
def test_contamination_future_warning():
    X = [[1, 1], [1, 2], [2, 1]]
    assert_warns_message(
        FutureWarning, 'default contamination parameter 0.1 will change '
        'in version 0.22 to "auto"',
        neighbors.LocalOutlierFactor().fit, X)
Example #27
def LocalOutlierFactorOutlier(data,
                              margin=0,
                              n_neighbors=20,
                              algorithm='auto',
                              leaf_size=30,
                              metric='minkowski',
                              p=2,
                              metric_params=None,
                              contamination='auto',
                              novelty=False,
                              n_jobs=None):
    """Returns numpy array with data points labelled as outliers

    Parameters
    ----------
    n_neighbors : int, default=20
        Number of neighbors to use by default for :meth:`kneighbors` queries.
        If n_neighbors is larger than the number of samples provided,
        all samples will be used.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, default=30
        Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can
        affect the speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    metric : str or callable, default='minkowski'
        metric used for the distance computation. Any metric from scikit-learn
        or scipy.spatial.distance can be used.

        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a sparse matrix, in which case only "nonzero"
        elements may be considered neighbors.

        If metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating the
        distance between them. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
          'manhattan']

        - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
          'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
          'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
          'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
          'yule']

        See the documentation for scipy.spatial.distance for details on these
        metrics:
        https://docs.scipy.org/doc/scipy/reference/spatial.distance.html

    p : int, default=2
        Parameter for the Minkowski metric from
        :func:`sklearn.metrics.pairwise.pairwise_distances`. When p = 1, this
        is equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

    contamination : 'auto' or float, default='auto'
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. When fitting this is used to define the
        threshold on the scores of the samples.

        - if 'auto', the threshold is determined as in the
          original paper,
        - if a float, the contamination should be in the range [0, 0.5].

        .. versionchanged:: 0.22
           The default value of ``contamination`` changed from 0.1
           to ``'auto'``.

    novelty : bool, default=False
        By default, LocalOutlierFactor is only meant to be used for outlier
        detection (novelty=False). Set novelty to True if you want to use
        LocalOutlierFactor for novelty detection. In this case be aware that
        you should only use predict, decision_function and score_samples
        on new unseen data and not on the training set.

        .. versionadded:: 0.20

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.
    """
    lof = neighbors.LocalOutlierFactor(n_neighbors=n_neighbors,
                                       algorithm=algorithm,
                                       leaf_size=leaf_size,
                                       metric=metric,
                                       p=p,
                                       metric_params=metric_params,
                                       contamination=contamination,
                                       novelty=novelty,
                                       n_jobs=n_jobs)

    lof.fit(data)

    scores = -lof.negative_outlier_factor_
    scores = list(scores)

    lower_range, upper_range = iqr_threshold_method(scores, margin)

    outlier_points = []

    for i in range(len(scores)):
        if scores[i] < lower_range or scores[i] > upper_range:
            outlier_points.append(data[i])

    return outlier_points
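A hedged usage sketch; iqr_threshold_method is assumed to be the helper referenced above, returning (lower, upper) bounds for the score distribution.

import numpy as np

rng = np.random.RandomState(0)
data = np.r_[0.3 * rng.randn(100, 2), [[4., 4.], [-4., -4.]]]
flagged = LocalOutlierFactorOutlier(data, margin=0, n_neighbors=20)
print(len(flagged))  # points whose LOF score falls outside the IQR bounds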
Example #28
    def __init__(self, params, k):
        self.__distMetricsList = params["distance_metrics"]
        self.__n_neighbors_list = params['n_neighbors']
        self.__model = neighbors.LocalOutlierFactor()
        self.__fold_val = k
Example #29
# output vector
y_redox = df.loc[:, out[0]].values
y_pka = df.loc[:, out[1]].values
y_lnk = df.loc[:, out[2]].values
y_homo = df.loc[:, out[3]].values
y_tot = df.loc[:, out].values
y_extended = df.loc[:, out_extended].values
# one-hot
x = x.astype("float64")  # astype returns a copy, so reassign the result
y_extended = y_extended.astype("float64")
full_table = np.concatenate((x, y_extended), axis=1).astype("float64")

n_samples = 157
outliers_fraction = 0.1
clusters_separation = [0, 1, 2]
clf = neighbors.LocalOutlierFactor(novelty=True)
clf.fit(full_table)

# Use this to compare with other descriptors we generate later. This
# table gives what may be outliers. We could compare with chemical
# distance via other metrics.

print(clf.negative_outlier_factor_ > -1.5)

# regress via multivariate linear,
# bayesian, gd, huberregessor(applies linear loss to outliers)
# knn regressor
# NN
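A small follow-up sketch using the model fitted above; the -1.5 cut-off simply mirrors the ad-hoc threshold in the print statement.

# Row indices the fitted model scores as clearly abnormal.
suspects = np.where(clf.negative_outlier_factor_ <= -1.5)[0]
print(suspects)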
Example #30
n_neighbors = 20

fscores = []
accs = []
for z in range(0, 1):
    logfile = directory + "log-" + str(z) + ".csv"
    with open(logfile, "w") as file:
        file.write("test,PCALevel,acc,val_acc,f1\n")

    for x in range(1, 71):
        pca = PCA(n_components=x)
        Xall = pca.fit_transform(dftrain.iloc[:, 1:dftrain.shape[1]].values)
        clf = neighbors.LocalOutlierFactor(n_neighbors=n_neighbors,
                                           algorithm='auto',
                                           leaf_size=30,
                                           metric='minkowski',
                                           p=2,
                                           metric_params=None,
                                           contamination=cont,
                                           n_jobs=4)
        testPred = clf.fit_predict(Xall)

        print(len(Xall))
        score = 0.0
        for i in range(0, len(Xall)):
            if (testPred[i] == 1
                    and Yall[i] == "Normal") or (testPred[i] == -1
                                                 and Yall[i] == "Malicious"):
                score += 1
        testAcc = float(score) / len(Yall)

        preds = testPred