Example #1
def choose_models():

    isolFor = {
        'name': 'Isolation Forest',
        'class': ensemble.IsolationForest(),
        'parameters': {
            'n_estimators': [5, 10, 20, 50, 100, 150, 200]
        }
    }

    locOutFac = {
        'name': 'Local Outlier Factor',
        'class': neighbors.LocalOutlierFactor(novelty=True),
        'parameters': {
            'n_neighbors': range(5, 50, 5)
        }
    }
    # ocSVM = {'name': 'One Class SVM',
    #          'class': svm.OneClassSVM(),
    #          'parameters': {
    #              'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    #              'nu': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
    #          }
    #          }

    elEnv = {
        'name': 'Elliptic Envelope',
        'class': covariance.EllipticEnvelope(),
        'parameters': {
            'contamination': np.linspace(0.05, 0.45, 9)
        }
    }

    return [isolFor, locOutFac, elEnv]
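
# A minimal sketch (not part of the original snippet) of how the model
# descriptions returned by choose_models() might be fed to GridSearchCV.
# The scorer below and the array X_train are assumptions: these outlier
# detectors have no default .score(), so a placeholder scorer based on the
# mean score_samples() value is used here.
import numpy as np
from sklearn import model_selection

def mean_normality_score(estimator, X, y=None):
    # Higher average score_samples() means the data looks more "normal" to the model.
    return float(np.mean(estimator.score_samples(X)))

def grid_search_models(X_train):
    best = {}
    for model in choose_models():
        search = model_selection.GridSearchCV(model['class'],
                                              model['parameters'],
                                              scoring=mean_normality_score,
                                              cv=3)
        search.fit(X_train)
        best[model['name']] = search.best_estimator_
    return best
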
def remove_outlier(data, contamination=0.01):
    outlier_map = se.IsolationForest(contamination=contamination).fit_predict(
        data[[
            "Sales", "CompetitionDistance", "aggregated_promo2",
            "days_since_competition"
        ]])
    return data[outlier_map == 1]
Example #3
def test_score_samples_estimators():
    """Check the values of score_samples methods derived from sklearn.

    Check that the values are the same as those returned by the sklearn decision_function methods.
    This only concerns OCSVM and IsolationForest.
    """

    X = np.random.randn(50, 2)

    clf1 = IsolationForest(random_state=88)
    clf1.fit(X)

    clf2 = ensemble.IsolationForest(random_state=88)
    clf2.fit(X)

    assert_array_equal(clf1.score_samples(X), clf2.decision_function(X))

    nu = 0.4
    sigma = 3.0
    gamma = 1. / (2. * sigma**2)
    clf1 = OCSVM(sigma=sigma, nu=nu)
    clf1.fit(X)

    clf2 = OneClassSVM(gamma=gamma, nu=nu)
    clf2.fit(X)

    assert_array_equal(clf1.score_samples(X),
                       clf2.decision_function(X).ravel())
Example #4
 def set_isolation_forest_classifier(self):
     '''
     Deprecated for now, no meaningful results - performance metrics were similar to baseline results.
     '''
     return SkLearner(
         ensemble.IsolationForest(max_samples=100,
                                  random_state=42,
                                  contamination=0.1))
Example #5
def RemoveAbnormal(BigFeatures, contamination=0.05):
    print('******************** Removing abnormal samples ********************\n')
    from sklearn import ensemble
    clf = ensemble.IsolationForest(max_samples='auto', contamination=contamination, \
                                   max_features=1.0, bootstrap=False, random_state=42)
    clf.fit(BigFeatures)
    y_detection = clf.predict(BigFeatures)
    mask = (y_detection == -1)
    return mask  # boolean mask marking the abnormal samples
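
# Hypothetical usage (not part of the original snippet): the returned boolean
# mask flags the abnormal rows, so the clean subset is obtained by inverting it.
# BigFeatures is assumed to be a 2-D NumPy array of shape (n_samples, n_features).
outlier_mask = RemoveAbnormal(BigFeatures, contamination=0.05)
CleanFeatures = BigFeatures[~outlier_mask]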
Example #6
def occ_training(X_train,
                 model_type,
                 dict_params=None,
                 val_split=0.25,
                 random_state=108):
    """Trains one-class classifier by grid search.
    
    Args:
        X_train: np array, input for training
        model_type: str, type of model, example: svm, isoforest
        dict_params: dict, key: parameter, value: list of hyperparameters, default:model_params[model_type]
        val_split: float, validation split, default=0.25
        random_state: int, seed for splitting and isolation forest classifier
    
    Returns:
        best_model: sklearn model, best model
        best_params: dict, hyperparameters of best model
        best_accuracy: float, accuracy of best model
        
    """
    X_train, X_val, _, _ = model_selection.train_test_split(
        X_train,
        np.zeros(len(X_train)),
        test_size=val_split,
        random_state=random_state)

    if dict_params is None:
        dict_params = model_params[model_type]

    all_params = list(model_selection.ParameterGrid(dict_params))

    prev_accuracy = 0

    for tmp_params in all_params:
        if model_type == 'svm':
            tmp_model = svm.OneClassSVM(cache_size=5000)
            tmp_model.set_params(kernel=tmp_params['kernel'])
            tmp_model.set_params(nu=tmp_params['nu'])
        elif model_type == 'isoforest':
            tmp_model = ensemble.IsolationForest(n_jobs=-1,
                                                 warm_start=True,
                                                 random_state=random_state)
            tmp_model.set_params(n_estimators=tmp_params['n_estimators'])
            tmp_model.set_params(max_features=tmp_params['max_features'])

        tmp_model.fit(X_train)
        val_accuracy = occ_scorer(tmp_model, X_val)

        if val_accuracy > prev_accuracy:
            prev_accuracy = val_accuracy
            best_model = tmp_model
            best_params = tmp_params
            best_accuracy = val_accuracy

    return best_model, best_params, best_accuracy
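
# occ_scorer and model_params are referenced above but not defined in this
# snippet; a hypothetical sketch of what they might look like (the parameter
# grids and the scoring rule below are assumptions, not the original code):
import numpy as np

model_params = {
    'svm': {'kernel': ['rbf', 'sigmoid'], 'nu': [0.01, 0.05, 0.1]},
    'isoforest': {'n_estimators': [50, 100, 200], 'max_features': [0.5, 1.0]},
}

def occ_scorer(model, X_val):
    # Treat every validation sample as an inlier and score the fraction
    # predicted as such (predict() returns +1 for inliers, -1 for outliers).
    return np.mean(model.predict(X_val) == 1)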
Example #7
 def fit(self, X, y=None):
     if self.transformer is not None:
         print('Fit Transformer')
         self.transformer.fit(X, y)
     from sklearn import ensemble
     print('Fitting ISF')
     self.isf = ensemble.IsolationForest(n_estimators=self.n_estimators,
                                         max_samples='auto',
                                         contamination=self.contamination,
                                         n_jobs=-1,
                                         random_state=self.random_state)
     self.isf.fit(
         X if self.transformer is None else self.transformer.latent(X))
     self._estimate_threshold(X)
Example #8
    def handle_app(app_id, ids_entries, experiment):
        """ Full flow for one classifier. """

        verify_ids_entries(ids_entries, app_id, experiment.storer_printer)

        training, scoring = ids_tools.ids_entries_to_train_test(ids_entries)
        X_train, _ = IdsConverter().ids_entries_to_X_y(training)
        X_test, y_true = IdsConverter().ids_entries_to_X_y(scoring)

        classifiers = [sk_svm.OneClassSVM(), sk_ens.IsolationForest()]
        for classifier in classifiers:
            classifier.fit(X_train)
            y_pred = classifier.predict(X_test)
            experiment.visualise_store("SPEC", app_id, classifier, y_true,
                                       y_pred)
    def mapper(self, data):
        """Run the mapper algorithm on the data.

        Parameters
        ----------
        data : array-like
            The data to run the algorithm on; it can have almost any shape.

        Returns
        -------
        graph : The graph output from km.KeplerMapper(...).map(...)

        """
        # Initialize
        logging.info("Applying the mapping algorithm.")
        mapper = km.KeplerMapper(verbose=2)

        # We create a custom 1-D lens with Isolation Forest
        model = ensemble.IsolationForest()
        model.fit(data)
        isolation_forest = model.decision_function(data).reshape(
            (data.shape[0], 1))

        # Fit to and transform the data
        tsne_projection = mapper.fit_transform(
            data,
            projection=sklearn.manifold.TSNE(n_components=2,
                                             perplexity=20,
                                             init='pca'))

        lens = np.c_[isolation_forest, tsne_projection]

        # Create dictionary called 'graph' with nodes, edges and meta-information
        graph = mapper.map(tsne_projection,
                           coverer=km.Cover(10, 0.2),
                           clusterer=sklearn.cluster.DBSCAN(eps=1.0,
                                                            min_samples=2))

        color_function = np.array(
            [self._label_to_color(self.labels[i]) for i in range(len(data))])
        # Visualize it
        mapper.visualize(graph,
                         path_html="actions.html",
                         title="chunk",
                         custom_tooltips=self.tooltips,
                         color_function=color_function)

        return graph
Example #10
    def handle_all(experiment):
        """ Full flow for a one-fits-all classifier. """

        from ids.TEMP_IDS_CONVERTER import IdsConverter as TEMPCONVERTER
        converter = TEMPCONVERTER()
        log_entries = []

        for line in Dir.yield_lines(experiment.file_path, ITEM_LIMIT):
            log_entry = LogEntry.from_log_string(line)
            log_entries.append(log_entry)

        all_entries = converter.LOG_ENTRIES_TO_IDS_ENTRIES(log_entries,
                                                           binary=True)

        training_entries, scoring_entries = ids_tools.ids_entries_to_train_test(
            all_entries)
        X_train, _ = IdsConverter().ids_entries_to_X_y(training_entries)

        scoring_dict = {}
        for ids_entry in scoring_entries:
            if ids_entry.app_id not in scoring_dict:
                scoring_dict[ids_entry.app_id] = []
            scoring_dict[ids_entry.app_id].append(ids_entry)

        # Classify with all entries: training_entries
        classifiers = [sk_svm.OneClassSVM(), sk_ens.IsolationForest()]
        for classifier in classifiers:
            classifier.fit(X_train)

        # Score for each app: scoring_dict
        for app_id, app_entries in util.seqr.yield_items_in_key_order(
                scoring_dict):
            X_test, y_true = IdsConverter().ids_entries_to_X_y(app_entries)
            y_preds = [clf.predict(X_test) for clf in classifiers]
            for clf, y_pred in zip(classifiers, y_preds):
                experiment.visualise_store("ALL", app_id, clf, y_true, y_pred)
Example #11
    def fit(self,
            df,
            cluster_alg_ls=['KMeans', 'DBSCAN'],
            dim_reduction_alg_ls=[],
            n_evaluations=30,
            run_obj='quality',
            seed=27,
            cutoff_time=50,
            optimizer='smac',
            evaluator=get_evaluator(evaluator_ls=['silhouetteScore'],
                                    weights=[],
                                    clustering_num=None,
                                    min_proportion=.001,
                                    min_relative_proportion=0.01),
            n_folds=3,
            preprocess_dict={},
            isolation_forest_contamination='auto',
            warmstart=False,
            warmstart_datasets_dir='silhouette',
            warmstart_metafeatures_table_path='metaknowledge/metafeatures_table.csv',
            warmstart_n_neighbors=3,
            warmstart_top_n=20,
            general_metafeatures=[],
            numeric_metafeatures=[],
            categorical_metafeatures=[],
            verbose_level=2):
        """
        ---------------------------------------------------------------------------
        Arguments
        ---------------------------------------------------------------------------
        df: a DataFrame
        n_folds: number of folds used in k-fold cross validation
        preprocess_dict: should be a dictionary with keys 'numeric_cols', 'ordinal_cols', 'categorical_cols' and 'y_col'
        isolation_forest_contamination: 'contamination' parameter of the IsolationForest outlier removal model; a float or 'auto' (the default) is expected
        optimizer: 'smac' or 'random'
        cluster_alg_ls: list of clustering algorithms to explore
        dim_reduction_alg_ls: list of dimension reduction algorithms to explore
        n_evaluations: max # of evaluations done during optimization, higher values yield better results 
        run_obj: 'runtime' or 'quality', cutoff_time must be provided if 'runtime' chosen.
        cutoff_time: Maximum runtime, after which the target algorithm is cancelled. Required if run_obj is 'runtime'.
        shared_model: whether or not to use parallel SMAC 
        evaluator: a function for evaluating clustering result, must have the arguments X and y_pred
        verbose_level: integer, must be either 0, 1 or 2. The higher the number, the more logs/print statements are used. 
        """
        #############################################################
        # Logging/Printing                                          #
        #############################################################
        self._verbose_level = verbose_level

        #############################################################
        # Data preprocessing                                        #
        #############################################################
        # rename, save preprocess_dict for later use
        raw_data = df
        self._preprocess_dict = preprocess_dict

        # encode categorical and ordinal columns
        preprocess_dict['df'] = raw_data
        raw_data_np = PreprocessedDataset(**preprocess_dict).X

        # perform outlier detection
        predicted_labels = ensemble.IsolationForest(
            n_estimators=100,
            warm_start=True,
            behaviour='new',
            contamination=isolation_forest_contamination).fit_predict(
                raw_data_np)
        idx_np = np.where(predicted_labels == 1)

        # remove outliers
        raw_data_cleaned = raw_data.iloc[idx_np].reset_index(drop=True)
        self._log("{}/{} datapoints remaining after outlier removal".format(
            len(raw_data_cleaned), len(raw_data_np)),
                  min_verbose_level=1)

        # encode cleaned datasest
        preprocess_dict['df'] = raw_data_cleaned
        processed_data_np = PreprocessedDataset(**preprocess_dict).X

        #############################################################
        # Warmstarting (Optional)                                   #
        #############################################################

        # construct desired configuration space
        cs = build_config_space(cluster_alg_ls, dim_reduction_alg_ls)
        self._log(cs, min_verbose_level=2)

        # calculate metafeatures
        metafeatures_np = None
        metafeatures_ls = general_metafeatures + numeric_metafeatures + categorical_metafeatures
        if len(metafeatures_ls) > 0:
            metafeatures_np = calculate_metafeatures(raw_data_cleaned,
                                                     preprocess_dict,
                                                     metafeatures_ls)

        # perform warmstart, if needed
        initial_cfgs_ls = []
        if warmstart and len(metafeatures_ls) > 0:
            # create and train warmstarter
            warmstarter = KDTreeWarmstarter(metafeatures_ls)
            warmstarter.fit(warmstart_metafeatures_table_path)

            # query for suitable configurations
            initial_configurations = warmstarter.query(
                metafeatures_np,
                warmstart_n_neighbors,
                warmstart_top_n,
                datasets_dir=warmstart_datasets_dir)

            # construct configuration objects
            for cfg in initial_configurations:
                try:
                    initial_cfgs_ls.append(build_config_obj(cs, cfg[0]))
                except:
                    pass

        # if too little configurations available, just ignore
        initial_cfgs_ls = None if len(initial_cfgs_ls) < 2 else initial_cfgs_ls
        if initial_cfgs_ls is not None:
            self._log(
                'Found {} relevant initial configurations from warmstarter.'.
                format(len(initial_cfgs_ls)),
                min_verbose_level=1)

        #############################################################
        # Bayesian optimization (SMAC)                              #
        #############################################################
        # make sure n_evaluations is valid
        dim_reduction_min_size = 1 if len(dim_reduction_alg_ls) == 0 \
                                else min([Mapper.getClass(alg).n_possible_cfgs
                                          for alg in dim_reduction_alg_ls])
        clustering_min_size = min(
            [Mapper.getClass(alg).n_possible_cfgs for alg in cluster_alg_ls])
        n_evaluations = min(n_evaluations,
                            clustering_min_size * dim_reduction_min_size)
        initial_cfgs_ls = initial_cfgs_ls[
            0:n_evaluations] if initial_cfgs_ls is not None else None
        self._log('Truncated n_evaluations: {}'.format(n_evaluations),
                  min_verbose_level=1)

        # define scenario object to be passed into SMAC
        scenario_params = {
            "run_obj": run_obj,
            "runcount-limit": n_evaluations,
            "cutoff_time": cutoff_time,
            "cs": cs,
            "deterministic": "true",
            "output_dir": LogUtils.create_new_directory(
                '{}/smac'.format(self.log_dir)),
            "abort_on_first_run_crash": False,
        }
        scenario = Scenario(scenario_params)
        self._log('{}'.format(scenario_params), min_verbose_level=2)

        # functions required for SMAC optimization
        def fit_models(cfg, data):
            ################################################
            # Preprocessing                                #
            ################################################
            # fit standard scaler
            scaler = preprocessing.StandardScaler()
            scaler.fit(data)

            # standardize data
            scaled_data = scaler.transform(data)

            ################################################
            # Dimensionality reduction                     #
            ################################################
            # get the dimension reduction method chosen
            dim_reduction_alg = Mapper.getClass(
                cfg.get("dim_reduction_choice", None))
            dim_reduction_model = None

            # fit dimension reduction model
            compressed_data = scaled_data
            if dim_reduction_alg:
                cfg_dim_reduction = {
                    StringUtils.decode_parameter(k, dim_reduction_alg.name): v
                    for k, v in cfg.items() if StringUtils.decode_parameter(
                        k, dim_reduction_alg.name) is not None
                }

                # compress the data using chosen configurations
                dim_reduction_model = dim_reduction_alg.model(
                    **cfg_dim_reduction)
                compressed_data = dim_reduction_model.fit_transform(
                    scaled_data)

            ################################################
            # Clustering                                   #
            ################################################
            # get the model chosen
            clustering_alg = Mapper.getClass(cfg["clustering_choice"])

            # decode the encoded parameters
            cfg_clustering = {
                StringUtils.decode_parameter(k, clustering_alg.name): v
                for k, v in cfg.items() if StringUtils.decode_parameter(
                    k, clustering_alg.name) is not None
            }

            # train clustering model
            clustering_model = clustering_alg.model(**cfg_clustering)
            clustering_model.fit(compressed_data)

            return scaler, dim_reduction_model, clustering_model,

        def cfg_to_dict(cfg):
            # convert cfg into a dictionary
            cfg = {k: cfg[k] for k in cfg if cfg[k]}

            # remove keys with value == None
            return {k: v for k, v in cfg.items() if v is not None}

        def evaluate_model(cfg):
            # get cfg as dictionary
            cfg = cfg_to_dict(cfg)

            # logging
            self._log("Fitting configuration: \n{}".format(cfg),
                      min_verbose_level=1)

            ################################################
            # K fold cross validation                      #
            ################################################
            kf = model_selection.KFold(n_splits=n_folds,
                                       shuffle=True,
                                       random_state=seed)
            kf.get_n_splits(processed_data_np)

            # store score obtain by each fold
            score_ls = []

            for train_idx, valid_idx in kf.split(processed_data_np):
                # split data into train and test
                train_data, valid_data = processed_data_np[
                    train_idx], processed_data_np[valid_idx]

                # fit clustering and dimension reduction models on training data
                scaler, dim_reduction_model, clustering_model = fit_models(
                    cfg, train_data)

                # test on validation data
                scaled_valid_data = scaler.transform(valid_data)
                compressed_valid_data = scaled_valid_data
                if dim_reduction_model:
                    try:
                        compressed_valid_data = dim_reduction_model.transform(
                            scaled_valid_data)
                    except:
                        compressed_valid_data = dim_reduction_model.fit_transform(
                            scaled_valid_data)

                # predict on validation data
                if hasattr(clustering_model, 'fit_predict'):
                    y_pred = clustering_model.fit_predict(
                        compressed_valid_data)
                else:
                    y_pred = clustering_model.predict(compressed_valid_data)

                # evaluate using provided evaluator
                score = evaluator(X=compressed_valid_data, y_pred=y_pred)
                score_ls.append(score)

                # stop early if the fold score is not finite; membership tests
                # against NaN are unreliable, so check explicitly
                if not np.isfinite(score):
                    break

            if not np.all(np.isfinite(score_ls)):
                score = float('inf')
            else:
                score = np.mean(score_ls)

            self._log("Score obtained by this configuration: {}".format(score),
                      min_verbose_level=1)
            return score

        optimal_config = None
        if optimizer == 'smac':
            # reset
            self._random_optimizer_obj = None

            # run SMAC to optimize
            smac_params = {
                "scenario": scenario,
                "rng": np.random.RandomState(seed),
                "tae_runner": evaluate_model,
                "initial_configurations": initial_cfgs_ls,
            }
            self._smac_obj = SMAC(**smac_params)
            optimal_config = self._smac_obj.optimize()
            time_spent = round(self._smac_obj.stats.get_used_wallclock_time(),
                               2)

        elif optimizer == 'random':
            # reset
            self._smac_obj = None

            # run random optimizer
            t0 = time.time()
            self._random_optimizer_obj = RandomOptimizer(
                random_seed=seed,
                blackbox_function=evaluate_model,
                config_space=cs)
            optimal_config, score = self._random_optimizer_obj.optimize(
                n_evaluations=n_evaluations, cutoff=cutoff_time)
            time_spent = round(time.time() - t0, 2)

        # refit to get optimal model
        self._scaler, self._dim_reduction_model, self._clustering_model = fit_models(
            cfg_to_dict(optimal_config), processed_data_np)
        self._log("Optimization is complete.", min_verbose_level=1)
        self._log("Took {} seconds.".format(time_spent), min_verbose_level=1)
        self._log("The optimal configuration is \n{}".format(optimal_config),
                  min_verbose_level=1)

        # return a dictionary
        result = {
            "cluster_alg_ls": cluster_alg_ls,
            "dim_reduction_alg_ls": dim_reduction_alg_ls,
            "random_optimizer_obj": self._random_optimizer_obj,
            "smac_obj": self._smac_obj,
            "optimal_cfg": optimal_config,
            "metafeatures": metafeatures_np,
            "metafeatures_used": metafeatures_ls,
            "clustering_model": self._clustering_model,
            "dim_reduction_model": self._dim_reduction_model,
            "scaler": self._scaler
        }
        return result
Example #12
print(grid_lsvm_estimator.score(X_train1, y_train))
final_model = grid_lsvm_estimator.best_estimator_
final_model.coef_
final_model.intercept_


gbm_estimator = GradientBoostingClassifier(random_state=2017)
gbm_grid = {'n_estimators':[50, 100], 'max_depth':[3,4,5], 'learning_rate':[0.001,0.01,0.2,0.3]}
grid_gbm_estimator = model_selection.GridSearchCV(gbm_estimator, gbm_grid,scoring="roc_auc",cv=10,n_jobs=1)
grid_gbm_estimator.fit(X_train,y_train)
print(grid_gbm_estimator.grid_scores_)
print(grid_gbm_estimator.best_score_)
print(grid_gbm_estimator.best_params_)
print(grid_gbm_estimator.score(X_train, y_train))

isf=ensemble.IsolationForest(random_state=2017)


X_test = total_data2[train_data.shape[0]:]
pca = decomposition.PCA()
pca.fit(X_test)
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.cumsum())

pca = decomposition.PCA(150)
pca.fit(X_test)
X_test1 = pca.transform(X_test)
X_test.shape
X_test.info()
test_data['target'] = dt_estimator.predict_proba(X_test1)
Example #13
 def _get_best_detector(self, train):
     detector = ensemble.IsolationForest()
     detector.fit(train)
     return detector
Example #14
def remove_outliers(G,terrorist_graph,threshold,train_set,predict_set):


    rng = np.random.RandomState(42)
    # fit the model
    # X_Red=[X_train[i] for i in range(len(X_train)) if Y_train[i]==1 ]
    X_Blue={}
    X_Red={}
    keys=[]
    for point in train_set:
        if train_set[point][1]==0:
            X_Blue[point]=list(train_set[point][0].values())
            keys=list(train_set[point][0].keys())
        else:
            X_Red[point]=list(train_set[point][0].values())


    blue_train=[X_Blue[i] for i in X_Blue.keys()]

    blue_keys=list(X_Blue.keys())

    if len(blue_train)>0:
        #clf=neigh.LocalOutlierFactor()

        clf = en.IsolationForest(max_samples=100, random_state=rng)
        clf.fit(blue_train)
        y_pred_train = clf.predict(blue_train)

        no_red_among_blue=[n for n in blue_keys if G.node[n]['color']=="Red" ]
        no_identified=0
        no_removed=0

        for j in range(len(y_pred_train)):

            if y_pred_train[j]!=1:

                if blue_keys[j]  in no_red_among_blue:
                    no_identified+=1
                no_monitors=terrorist_graph.node[blue_keys[j]]["MonitorNumber"]
                # print no_monitors
                # prob_red=pow(prob_lying,no_monitors)
                # print prob_red
                #if terrorist_graph.node[blue_keys[j]]["MonitorNumber"]<4: #or terrorist_graph.node[blue_keys[j]]["RedConfidence"]>0.1 :

                if no_monitors<threshold+1:
                    #print no_monitors

                    predict_set[blue_keys[j]]=train_set[blue_keys[j]][0]
                    no_removed+=1
                    #predict_set.append()
                    del train_set[blue_keys[j]]
                    terrorist_graph.node[blue_keys[j]]["color"]="Black"
                    terrorist_graph.node[blue_keys[j]]['tempColor']="Black"
                    terrorist_graph.node[blue_keys[j]]["IsMonitor"]=False

        #print (no_identified,len(no_red_among_blue),no_removed)



    return train_set,predict_set,terrorist_graph
def outliers_isolation_forest(sparse_data):
    iso_forest = ensemble.IsolationForest(contamination=0.15)
    iso_forest.fit(sparse_data)
    y = iso_forest.predict(sparse_data)
    # list of outlier samples
    print([i for i in range(len(y)) if y[i] < 0])
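
# Hypothetical usage (not from the original source): IsolationForest also
# accepts scipy.sparse input, so e.g. a TF-IDF matrix can be passed directly.
from sklearn.feature_extraction.text import TfidfVectorizer
example_docs = ["normal text", "more normal text", "zzzz qqqq xxxx"]
example_tfidf = TfidfVectorizer().fit_transform(example_docs)  # sparse CSR matrix
outliers_isolation_forest(example_tfidf)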
def ex_1():
    X, y = datasets.fetch_openml('diabetes', as_frame=True, return_X_y=True)
    # print(X)

    # print(X.info())
    # print(X.describe())

    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_2 = X_train.copy()

    plt.figure()
    X_train.boxplot()
    X_train.hist(bins=20)
    plt.figure()
    sns.boxplot(x=X_train['mass'])

    imputer_mass = impute.SimpleImputer(missing_values=0.0, strategy='mean')
    imputer_skin = impute.SimpleImputer(missing_values=0.0, strategy='mean')

    X_train[['mass']] = imputer_mass.fit_transform(X_train[['mass']])
    X_train[['skin']] = imputer_skin.fit_transform(X_train[['skin']])

    X_test[['mass']] = imputer_mass.transform(X_test[['mass']])
    X_test[['skin']] = imputer_skin.transform(X_test[['skin']])

    df_mass = X_train[['mass']]
    # print(df_mass.head(5))

    # Anomaly detection, i.e. flagging outlying data points

    X_train_isolation = X_train.values
    X_train_isolation = X_train_isolation[:, [1, 5]]
    X_test_isolation = X_test.values
    X_test_isolation = X_test_isolation[:, [1, 5]]

    isolation_forest = ensemble.IsolationForest(contamination=0.05)
    isolation_forest.fit(X_train_isolation)
    y_predicted_outliers = isolation_forest.predict(X_test_isolation)
    print(y_predicted_outliers)

    plot_iris2d(X_test_isolation, y_predicted_outliers)

    clf = svm.SVC(random_state=42)
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)
    print(metrics.classification_report(y_test, y_predicted))

    X_train.hist()

    imputer_it = impute.IterativeImputer(missing_values=0.0)

    X_train_2[['mass']] = imputer_it.fit_transform(X_train_2[['mass']])
    X_train_2[['skin']] = imputer_it.fit_transform(X_train_2[['skin']])

    X_train_2.hist(bins=20)
    plt.figure()
    X_train_2.boxplot()

    clf_rf = ensemble.RandomForestClassifier(random_state=42)
    clf_rf.fit(X_train, y_train)
    y_predicted = clf_rf.predict(X_test)
    print(metrics.classification_report(y_test, y_predicted))

    importances = clf_rf.feature_importances_
    std = np.std([tree.feature_importances_ for tree in clf_rf.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")

    for f in range(X.shape[1]):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

    # Plot the impurity-based feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(X.shape[1]), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(X.shape[1]), indices)
    plt.xlim([-1, X.shape[1]])
    plt.show()
 # Anomaly detection under the elliptical-distribution assumption
 if 0:
     from sklearn import covariance
     contamination = 0.05 # the expected outlier proportion must be set
     clf = covariance.EllipticEnvelope(assume_centered=False, support_fraction=None, \
                                       contamination=contamination, random_state=42)
     clf.fit(BigFeatures)
     y_detection=clf.predict(BigFeatures)
     print(BigSamplenames[y_detection==-1])
     
 # Isolation Forest anomaly detection, well suited to high-dimensional datasets
 if 1:
     print('******************** Removing abnormal samples ********************\n')
     from sklearn import ensemble
     contamination = 0.05 # the expected outlier proportion must be set
     clf = ensemble.IsolationForest(max_samples='auto', contamination=contamination, \
                                    max_features=1.0, bootstrap=False, random_state=42)
     clf.fit(BigFeatures)
     y_detection=clf.predict(BigFeatures)
     print('Abnormal samples:\n',BigSamplenames[y_detection==-1])
     Samplenames,Labels,Features = \
     BigSamplenames[y_detection!=-1],BigLabels[y_detection!=-1],BigFeatures[y_detection!=-1,:]
     
 # OCSVM anomaly detection; the hyperparameters are hard to set well
 if 0:
     from sklearn import svm
     clf = svm.OneClassSVM(kernel='rbf', nu=0.5, max_iter=-1, random_state=42)
     clf.fit(BigFeatures)
     y_detection=clf.predict(BigFeatures)
     print(BigSamplenames[y_detection==-1])
 
 if 1:   
Example #18
import numpy as np
import pandas as pd
import kmapper as km
import sklearn
from sklearn import ensemble

# For data we use the Wisconsin Breast Cancer Dataset
# Via: https://www.kaggle.com/uciml/breast-cancer-wisconsin-data
df = pd.read_csv("data.csv")
feature_names = [c for c in df.columns if c not in ["id", "diagnosis"]]
df["diagnosis"] = df["diagnosis"].apply(lambda x: 1 if x == "M" else 0)
X = np.array(df[feature_names].fillna(0)) # quick and dirty imputation
y = np.array(df["diagnosis"])

# We create a custom 1-D lens with Isolation Forest
model = ensemble.IsolationForest(random_state=1729)
model.fit(X)
lens1 = model.decision_function(X).reshape((X.shape[0], 1))

# We create another 1-D lens with L2-norm
mapper = km.KeplerMapper(verbose=3)
lens2 = mapper.fit_transform(X, projection="l2norm")

# Combine both lenses to create a 2-D [Isolation Forest, L^2-Norm] lens
lens = np.c_[lens1, lens2]

# Create the simplicial complex
graph = mapper.map(lens,
                   X,
                   cover=km.Cover(n_cubes=15, perc_overlap=0.7),
                   clusterer=sklearn.cluster.KMeans(n_clusters=2,
Example #19
def iso_forest(X):
	clf = ensemble.IsolationForest(max_samples=X.shape[0], random_state=None)
	return clf.fit(X)
Example #20
    def trainAnomalyModel(self, data, logFolder, newPMMLFileName, lock,
                          kwargs):

        print('here' * 20, kwargs)
        paramToTrainModel = kwargs['data']
        idforData = kwargs['idforData']
        dataPath = kwargs['filePath']
        try:
            targetVar = kwargs['target_variable']
        except:
            targetVar = None
        algorithmToUse = kwargs['parameters']['algorithm']

        projectName = idforData
        projectPath = logFolder + projectName
        dataFolder = projectPath + '/dataFolder/'
        statusfileLocation = dataFolder + 'status' + '.txt'

        def upDateStatus():
            lock.acquire()
            sFile = open(statusfileLocation, 'r')
            sFileText = sFile.read()
            lock.release()
            data_details = json.loads(sFileText)
            return data_details

        try:
            dataMapperInner = autoMLutilities.createDataMapper(
                paramToTrainModel, targetVar)
        except Exception as e:
            data_details = upDateStatus()
            data_details['status'] = 'Training Failed'
            data_details[
                'errorMessage'] = 'Error while creating DataFrameMapper >> ' + str(
                    e)
            data_details['errorTraceback'] = traceback.format_exc()
            with open(statusfileLocation, 'w') as filetosave:
                json.dump(data_details, filetosave)
            # sys.exit()
            return

        mapper1 = DataFrameMapper(dataMapperInner)
        featureVar = list(data.columns)

        if algorithmToUse == 'IsolationForest':
            print('came here')
            from sklearn import ensemble
            modelT = ensemble.IsolationForest()
        elif algorithmToUse == 'OneClassSVM':
            from sklearn import svm
            modelT = svm.OneClassSVM()
        elif algorithmToUse == 'LinearSVR':
            print('Came to SVR')
            from sklearn import svm
            modelT = svm.LinearSVR()
        # else:
        #     data_details=upDateStatus()
        #     data_details['status']='Training Failed'
        #     data_details['errorMessage']='Model not supported >> '
        #     data_details['errorTraceback']='None'
        #     with open(statusfileLocation,'w') as filetosave:
        #         json.dump(data_details, filetosave)
        #     # sys.exit()
        #     return

        try:
            print('training started')
            pipeline = Pipeline([('feature_mapper', mapper1),
                                 ('model', modelT)])
            pipelObj = pipeline.fit(data)
            print('training completed')

        except Exception as e:
            data_details = upDateStatus()
            data_details['status'] = 'Training Failed'
            data_details[
                'errorMessage'] = 'Error while preparing Data and training model >> ' + str(
                    e)
            data_details['errorTraceback'] = traceback.format_exc()
            with open(statusfileLocation, 'w') as filetosave:
                json.dump(data_details, filetosave)
            # sys.exit()
            return

        data_details = upDateStatus()
        data_details['listOfModelAccuracy'] = []
        data_details['pmmlFilelocation'] = ''

        with open(statusfileLocation, 'w') as filetosave:
            json.dump(data_details, filetosave)

        finalPMMLfile = '../ZMOD/Models/' + newPMMLFileName
        toExportDict = {
            'model1': {
                'data': None,
                'hyperparameters': None,
                'preProcessingScript': None,
                'pipelineObj': Pipeline(pipelObj.steps[:-1]),
                'modelObj': pipelObj.steps[-1][1],
                'featuresUsed': featureVar,
                'targetName': None,
                'postProcessingScript': None,
                'taskType': 'score'
            }
        }
        try:
            print('toExportDict >>>>>>>>>>>> ', toExportDict)
            from nyoka.skl.skl_to_pmml import model_to_pmml
            model_to_pmml(toExportDict, PMMLFileName=finalPMMLfile)
            print('>>>>>>>>>>>>>>>>>>>>>>> Success')
        except Exception as e:
            data_details = upDateStatus()
            data_details['status'] = 'Training Failed'
            data_details[
                'errorMessage'] = 'Error while Saving Model >> ' + str(e)
            data_details['errorTraceback'] = traceback.format_exc()
            with open(statusfileLocation, 'w') as filetosave:
                json.dump(data_details, filetosave)
            # sys.exit()
            return
            print('>>>>>>>>>>>>>>>>>>>>>>> Failed Saving Trying again')

        with open(statusfileLocation, 'r') as sFile:
            sFileText = sFile.read()
        model_accuracy = []
        data_details = json.loads(sFileText)
        data_details['status'] = 'Complete'
        data_details['pmmlFilelocation'] = finalPMMLfile
        data_details['listOfModelAccuracy'] = model_accuracy
        with open(statusfileLocation, 'w') as filetosave:
            json.dump(data_details, filetosave)
Example #21
def main():
    URLKeyword, URLchar, action, title = get_dic()

    #load training dataset
    mainfile = './data/file_list_20170430_new的副本.txt'
    WebDirectory = './data/file的副本/'
    MD5_list, flag_list, URL_list = traverse_directory(WebDirectory, mainfile)
    X_train = list()
    Y_train = flag_list
    for i in range(len(MD5_list)):
        URL = URL_list[i]
        Web_data = read_file(MD5_list[i])
        web_vec = Web_feature(Web_data, title, action, MD5_list[i])
        URL_vec = URL_feature(URL, URLKeyword, URLchar)
        feature = np.hstack((web_vec, URL_vec))
        X_train.append(feature)
        # print(len(feature))
    print(len(X_train), len(Y_train))

    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)

    #feature selection
    X_train, Y_train, F_index = feature_selection(X_train, Y_train)
    print(F_index)

    # parameter selection
    #tuned_parameters = {'n_estimators':range(10,100,10),"max_depth":range(3,25,2),"max_features":range(3,20,2)}
    #split dataset
    # X_train, X_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.25, random_state=0, stratify=Y_train)

    #train model
    clf1 = ensemble.RandomForestClassifier(bootstrap=True,
                                           criterion='gini',
                                           max_depth=21,
                                           max_features=15,
                                           n_estimators=10)
    clf1.fit(X_train, Y_train)
    clf2 = ensemble.IsolationForest(contamination=0.06,
                                    n_estimators=90,
                                    max_samples=150,
                                    bootstrap=True)
    clf2.fit(X_train, Y_train)
    clf3 = XGBClassifier(learning_rate=0.5,
                         max_depth=6,
                         n_estimators=100,
                         objective="multi:softmax",
                         num_class=2)
    clf3.fit(X_train, Y_train)

    # print("best parameter:",clf.best_params_)
    # print(clf.grid_scores_)
    # joblib.dump(clf,'RF_model.m')
    """
    print("Traing Score:%f" % clf.score(X_train, Y_train))
    # print("Testing Score:%f"%clf.score(X_test,y_test))
    middle = time.clock()
    print(middle-start)
    """

    #load testing dataset
    mainfile1 = './data/file_list_10000.txt'
    WebDirectory1 = './data/file1/'
    MD5_list1, flag_list1, URL_list1 = traverse_directory_t(
        WebDirectory1, mainfile1)
    X_test = list()
    Y_test = flag_list1
    for h in range(len(MD5_list1)):
        s_fea = []
        URL1 = URL_list1[h]
        Web_data1 = read_file(MD5_list1[h])
        web_vec1 = Web_feature(Web_data1, title, action, MD5_list1[h])
        URL_vec1 = URL_feature(URL1, URLKeyword, URLchar)
        feature1 = np.hstack((web_vec1, URL_vec1))
        for j in F_index:
            s_fea.append(feature1[j])
        X_test.append(s_fea)
        # print("********")
    print(len(X_test), len(Y_test))

    #testing model
    y_score_1 = clf1.predict_proba(X_test)[:, 1]
    y_score_2 = clf2.decision_function(X_test)
    y_score_3 = clf3.predict_proba(X_test)[:, 1]
    fig_plot(Y_test, y_score_1, y_score_2, y_score_3)
def outliers_isolation_forest_dense(matrix_data):
    iso_forest = ensemble.IsolationForest(contamination=0.10, behaviour='new')
    iso_forest.fit(matrix_data)
    y = iso_forest.predict(matrix_data)
    # list of outlier samples
    print([i for i in range(len(y)) if y[i] < 0])
### Note that the high percentage of overlap must be taken into account.                                             ###
#######################################################################################################################
# Create a custom 1-D lens with **Isolation Forest**                                                                ###
#Return the anomaly score of each sample using the IsolationForest algorithm                                        ###
#The IsolationForest ‘isolates’ observations by randomly selecting a feature and then randomly                      ###
#selecting a split value between the maximum and minimum values of the selected feature.                            ###
#Since recursive partitioning can be represented by a tree structure, the number of splittings                      ###
#required to isolate a sample is equivalent to the path length from the root node to the terminating node.          ###
#This path length, averaged over a forest of such random trees, is a measure of normality and our decision function. ###
#Random partitioning produces noticeably shorter paths for anomalies. Hence, when a forest of random trees          ###
#collectively produce shorter path lengths for particular samples, they are highly likely to be anomalies.          ###
#######################################################################################################################

# Create a custom 1-D lens with Isolation Forest
model = ensemble.IsolationForest(
    random_state=1729
)  #If int, random_state is the seed used by the random number generator;
model.fit(X)
lens1 = model.decision_function(X).reshape((X.shape[0], 1))

# Create another 1-D lens with L2-norm
mapper = km.KeplerMapper(verbose=0)
lens2 = mapper.fit_transform(X, projection="l2norm")

# Combine both lenses to get a 2-D [Isolation Forest, L^2-Norm] lens
lens = np.c_[lens1, lens2]

###########################################################################################################################################
### Application of the Affinity Propagation clustering algorithm from the SKLearn library                                               ##
#                                                                                                                                        ##
#AffinityPropagation creates clusters by sending messages between pairs of samples until convergence.                                    ##
Example #24
def filter_outliers(x, y, **kwargs):
    xy = np.column_stack((x, y))
    filter_estimator = ensemble.IsolationForest(random_state=42, **kwargs)
    filter_estimator.fit(xy)
    is_inlier = filter_estimator.predict(xy)
    return x[is_inlier == 1], y[is_inlier == 1]
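
# Hypothetical usage (not part of the original snippet): drop joint (x, y)
# outliers from a noisy 1-D regression problem before fitting a model on it.
import numpy as np
rng = np.random.default_rng(0)
x_demo = rng.normal(size=200)
y_demo = 2.0 * x_demo + rng.normal(scale=0.1, size=200)
x_clean, y_clean = filter_outliers(x_demo, y_demo, contamination=0.05)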
Example #25
    def iso_forest(self, label, result_list):
        x_train = self.train_test_split['x_train']
        clf = ensemble.IsolationForest(max_samples=x_train.shape[0], random_state=None)

        return execute_decision_function(clf, self.train_test_split, label, result_list, self.image_creator,
                                         unsupervised=True)
Example #26
    return Y_pred_collection, Y


if __name__ == "__main__":
    np.random.seed(712)

    sz_t = 14
    sz_height_span = [
        1,
    ]
    sz_image_size = 24
    sz_downsample_size = 3

    rf_learner = ensemble.RandomForestClassifier(n_estimators=100, n_jobs=4)
    xgc_learner = xgb.XGBClassifier(max_depth=5,
                                    learning_rate=0.1,
                                    n_estimators=50,
                                    min_child_weight=1,
                                    subsample=1,
                                    colsample_bytree=1)
    isof_learner = ensemble.IsolationForest(contamination=0.001,
                                            max_samples=0.5)

    cross_validataion_classification_one_timeslot(sz_t,
                                                  sz_height_span,
                                                  sz_image_size,
                                                  sz_downsample_size,
                                                  xgc_learner,
                                                  detect_outlier=False)
ax = mtp.axes()
xx, yy = make_meshgrid(outd5x[:, 0], outd5x[:, 1])
plot_contours(ax, model, xx, yy, cmap=mtp.cm.coolwarm)
ax.scatter(outd5x[:, 0], outd5x[:, 1], c=outd5y, edgecolors='black')
ax.set_xlabel('feature 1')
ax.set_ylabel('feature 2')
purple_patch = mpatches.Patch(color='purple', label='class 0')
yellow_patch = mpatches.Patch(color='yellow', label='class 1')
mtp.legend(handles=[purple_patch, yellow_patch])
mtp.title(
    "Outlier removed data 5 plot with decision boundary using One class SVM (linear kernel)"
)
mtp.show()

# Using Isolation Forest
model = ensemble.IsolationForest(contamination=0.52)
out4_0 = model.fit_predict(d4x0)
# print(out4_0)
out4_1 = model.fit_predict(d4x1)

outd4x = np.zeros(shape=(np.count_nonzero(out4_0 == 1) +
                         np.count_nonzero(out4_1 == 1), 2))
outd4y = np.zeros(
    (np.count_nonzero(out4_0 == 1) + np.count_nonzero(out4_1 == 1)))
# d4o = np.zeros(shape=(np.count_nonzero(out4_0 == -1), 2))
# d4y0 = np.zeros(np.count_nonzero(d4y == 0))
# d4y1 = np.ones(np.count_nonzero(d4y == 1))

a = 0
for i in range(len(d4x0)):
    if out4_0[i] == 1:
Example #28
import matplotlib.pyplot as plt
from sklearn import ensemble
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, plot_confusion_matrix
from bunch import Bunch
import database_config
import config as cfg

# Configuration
TEST_SET_SIZE = 1500
VIEW_GRAPH = False

# Model Setup
# isolation_forest = ensemble.IsolationForest(n_estimators=50, max_features=3, random_state=cfg.RANDOM_SEED_MODEL)
isolation_forest = ensemble.IsolationForest(n_estimators=50,
                                            max_features=3,
                                            contamination=0.10,
                                            random_state=cfg.RANDOM_SEED_MODEL)

# Get Data
database_config.db.load(cfg.STORAGE_BASE_PATH_SIMULATED_DATA)
changes_df = database_config.db.get_table('MTA_CHANGES').get_data()
update_changes_df = pd.DataFrame(
    changes_df[changes_df['change_type'] == 'update'])

update_changes_df['price_delta'] = update_changes_df.old_value.astype(
    float) - update_changes_df.new_value.astype(float)

dataset = Bunch(
    # data=update_changes_df[['price_delta']].values,
    # data=update_changes_df[['old_value', 'new_value']].values,
    data=update_changes_df[['old_value', 'new_value', 'price_delta']].values,
Example #29
print(features.size)

features = sklearn.preprocessing.scale(features)
train_unlabeled = sklearn.preprocessing.scale(np.array(train_unlabeled))

# gnb = nb.MultinomialNB()
# gnb.fit(features,lables)
#
# yresult = gnb.predict(train_unlabeled)
# np.savetxt('gaussianNB.csv',yresult,delimiter=',')
validate = np.loadtxt(open('benchmark2100.csv'),delimiter = ",")
valiStored = validate
train_unlabeledStored = train_unlabeled
# print(accuracy_score(validate,yresult))

iso = en.IsolationForest(n_estimators=100, max_samples='auto', contamination=0.3, max_features=128, bootstrap=False, n_jobs=-1, random_state=None, verbose=2)

iso.fit(train_unlabeled,validate)
truthTable = iso.predict(train_unlabeled)

inlier = []
inlierLabel = []
count = 0
for i in range(21000):
    if truthTable[i] == 1:
        temp = train_unlabeledStored[i,]
        inlier = np.append(inlier,temp)
        inlierLabel = np.append(inlierLabel,valiStored[i])

inlier = np.reshape(inlier,[int(len(inlier)/128), 128])
all_features = np.concatenate((features,inlier),axis=0)
Example #30
from common_utils import *
from outlier_utils import *
from feature_reduction_utils import *
from sklearn.model_selection import train_test_split
from sklearn import metrics, preprocessing, tree, covariance, linear_model, ensemble, neighbors, svm, model_selection, feature_selection, kernel_ridge
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd
import numpy as np

card_data = pd.read_csv(os.path.join(path, 'creditcard.csv'))
card_data.info()
X = drop_features(card_data, ['Time', 'Amount', 'Class'])
y = card_data['Class']

tnse_data = feature_reduction_tsne(X, 3)
plot_data_3d_outliers(tnse_data, y, title="Credit card data")

iso_forest_estimator = ensemble.IsolationForest()
iso_forest_grid = {'contamination': [0.1, 0.2, 0.25, 0.3]}
grid_search_plot_models_outliers(iso_forest_estimator,
                                 iso_forest_grid,
                                 X,
                                 y,
                                 xlim=[-7, 7],
                                 ylim=[-7, 7])
iso_best_model = grid_search_best_model_outliers(iso_forest_estimator,
                                                 iso_forest_grid,
                                                 X,
                                                 y,
                                                 scoring='roc_auc')
plot_model_2d_outliers(iso_best_model, X, y)