Example #1
0
def random_miss_prop(train_x, label, miss_column, algorithm, file_name):
    """
    Measure detection quality while injecting missing values into random cells.

    For each missing proportion ``alpha`` in [0.0, 0.45) (step 0.05), NaNs are
    injected into random cells of a copy of ``train_x`` and the detector is
    scored three ways: MICE imputation, SimpleFill (mean) imputation, and the
    detector's own reduced scoring of the NaN data.

    :param train_x: training data matrix (also used as the test data).
    :param label: ground-truth anomaly labels consumed by ``metric``.
    :param miss_column: columns eligible for injection (unused here; kept for
        signature compatibility with the sibling experiments).
    :param algorithm: detector algorithm name passed to ``ADDetector``.
    :param file_name: dataset file name (unused; kept for compatibility).
    :return: DataFrame with columns miss_prop, num_max_miss_features, auc,
        auc_reduced, auc_impute, algorithm.
    """
    miss_prop = np.arange(0.0, 0.45, 0.05)
    ad_detector = ADDetector(alg_type=algorithm)
    mvi_object = MissingValueInjector()
    num_missing = 0
    ad_detector.train(train_x, ensemble_size=1)
    rows = []
    for alpha in miss_prop:
        test_x = train_x.copy()
        if alpha > 0:
            num_missing, _ = mvi_object.inject_missing_in_random_cell(
                test_x, alpha)
            # Impute the injected NaNs, then score: MICE and mean fill.
            mt_impute = metric(
                label,
                ad_detector.score(mvi_object.impute_value(test_x.copy(),
                                                          method="MICE"),
                                  check_miss=False))
            ms_score = ad_detector.score(mvi_object.impute_value(
                test_x.copy(), method="SimpleFill"),
                                         check_miss=False)
            mt_raw = metric(label, ms_score)
        else:
            # No missing values: imputation is a no-op, reuse the raw metric.
            ms_score = ad_detector.score(test_x, check_miss=False)
            mt_raw = metric(label, ms_score)
            mt_impute = mt_raw

        # Reduced/marginalized scoring handles the NaN cells directly.
        reduced_score = ad_detector.score(test_x, True)
        mt_reduced = metric(label, reduced_score)

        rows.append(pd.Series([alpha, num_missing, mt_raw[0], mt_reduced[0],
                               mt_impute[0], algorithm]))

    # Build the frame once; the per-row DataFrame.append used previously is
    # quadratic and was removed in pandas 2.0.
    result = pd.DataFrame(rows).reset_index(drop=True)
    result.rename(columns={
        0: "miss_prop",
        1: "num_max_miss_features",
        2: "auc",
        3: "auc_reduced",
        4: "auc_impute",
        5: "algorithm"
    },
                  inplace=True)
    return result
Example #2
0
def miss_proportions_exp(train_x, label, miss_column, algorithm):
    """
    Sweep the missing proportion and the number of missing features.

    For each ``alpha`` in [0.05, 0.45) and each count of missing features,
    score the clean copy (raw), the imputed data, and the NaN data via the
    detector's reduced scoring.

    :param train_x: training data matrix (also used as the test data).
    :param label: ground-truth anomaly labels consumed by ``metric``.
    :param miss_column: columns eligible for missing-value injection.
    :param algorithm: detector algorithm name passed to ``ADDetector``.
    :return: DataFrame with columns anom_prop, num_max_miss_features, auc,
        auc_reduced, auc_impute, ensemble_size, algorithm.
    """
    ensemble_size = 2  # range(1, 2): a single ensemble of size 1.
    miss_prop = np.arange(0.05, 0.45, 0.05)
    # Allow at most 80% of the missable features to go missing.
    fraction_missing_features = int(np.ceil(len(miss_column) * 0.8))
    ad_detector = ADDetector(alg_type=algorithm)
    mvi_object = MissingValueInjector()
    rows = []
    for en_size in range(1, ensemble_size):
        ad_detector.train(train_x, ensemble_size=en_size)
        for alpha in miss_prop:
            for num_missing in range(0, fraction_missing_features):
                test_x = train_x.copy()
                # Raw score is taken on the clean copy, before injection.
                ms_score = ad_detector.score(test_x, False)
                mt_raw = metric(label, ms_score)

                if num_missing > 0:
                    mvi_object.inject_missing_value(test_x, num_missing, alpha,
                                                    miss_column)
                    # Impute the injected NaNs, then score.
                    mt_impute = metric(
                        label,
                        ad_detector.score(mvi_object.impute_value(test_x),
                                          False))
                else:
                    # Imputation is not applicable without missing values.
                    mt_impute = [0, 0]
                mt_reduced = metric(label,
                                    ad_detector.score(test_x.copy(), True))

                rows.append(pd.Series(
                    [alpha, num_missing / float(len(miss_column)),
                     mt_raw[0], mt_reduced[0], mt_impute[0],
                     en_size, algorithm]))

    # Build the frame once; the per-row DataFrame.append used previously is
    # quadratic and was removed in pandas 2.0.
    result = pd.DataFrame(rows).reset_index(drop=True)
    result.rename(columns={
        0: "anom_prop",
        1: "num_max_miss_features",
        2: "auc",
        3: "auc_reduced",
        4: "auc_impute",
        5: "ensemble_size",
        6: "algorithm"
    },
                  inplace=True)
    return result
Example #3
0
def test_cell_injector():
    """Smoke-test random-cell NaN injection with pft.IsolationForest.

    Trains the forest on a clean copy, then prints scores and average depths
    for the clean data, the NaN-injected data, and the NaN-injected data
    scored with cmv=True. Python 2 print statements; output is eyeballed,
    nothing is asserted.
    """
    w = np.random.randn(10,5)
    test = w.copy()  # clean copy kept for training and reference scoring
    ad_in = MissingValueInjector()
    # NOTE(review): presumably returns the injected cell indices — confirm.
    ix = ad_in.inject_missing_in_random_cell(w,0.3)
    # print w
    #print ix
    ff = pft.IsolationForest(ntree=10)
    ff.train(test)
    print ff.score(test)
    print ff.average_depth()[0:3]

    print ff.score(w)
    print ff.average_depth()[0:3]

    # cmv=True: score while accounting for the missing cells.
    print ff.score(w, cmv=True)
    print ff.average_depth()[0:3]
    print pft.__file__
Example #4
0
def algo_miss_featuresX(train_x,
                        label,
                        miss_column,
                        algorithm,
                        file_name,
                        label_field=0):
    """
    For running locally with threaded process. This is useful, if the job can
    only run on a single node.

    Trains one detector, then fans the (alpha, num_miss) grid out to all CPU
    cores via joblib; each worker runs ``benchmarks`` on a combination.

    :param train_x: training data matrix.
    :param label: ground-truth labels consumed by ``metric`` in the workers.
    :param miss_column: columns eligible for missing-value injection.
    :param algorithm: detector algorithm name passed to ``ADDetector``.
    :param file_name: dataset file name forwarded to the detector's train().
    :param label_field: index of the label field, forwarded to ``ADDetector``.
    :return: DataFrame with one row per (alpha, num_miss) combination.
    """
    # Module-level globals so the worker function ``benchmarks`` can reach
    # the trained detector and the injector.
    global ad_detector, mvi_object
    miss_prop = np.arange(0, 1.1, 0.1)
    d = len(miss_column)
    # Allow at most 80% of the missable features to go missing.
    fraction_missing_features = int(np.ceil(
        d * 0.8))  # int(np.ceil(d / np.sqrt(d)))
    ad_detector = ADDetector(alg_type=algorithm, label=label_field)
    mvi_object = MissingValueInjector()
    ad_detector.train(train_x, ensemble_size=1, file_name=file_name)
    num_cores = multiprocessing.cpu_count()
    # ``check_pickle`` was deprecated and then removed from joblib.delayed;
    # plain delayed(benchmarks) is equivalent.
    result = Parallel(n_jobs=num_cores)(
        delayed(benchmarks)(train_x, label, miss_column,
                            algorithm, alpha, num_miss)
        for alpha in miss_prop
        for num_miss in range(1, fraction_missing_features))

    result = pd.DataFrame(result)
    result.rename(columns={
        0: "miss_prop",
        1: "miss_features_prop",
        2: "auc_mean_impute",
        3: "auc_reduced",
        4: "auc_MICE_impute",
        5: "ensemble_size",
        6: "algorithm"
    },
                  inplace=True)

    return result
Example #5
0
def algo_miss_featuresX(train_x,
                        label,
                        miss_column,
                        algorithm,
                        file_name,
                        label_field=0):
    """
    Parallel variant of the missing-feature benchmark.

    Trains one detector, then distributes the (alpha, num_miss) grid across
    all CPU cores with joblib; each worker runs ``benchmarks``.

    :param train_x: training data matrix.
    :param label: ground-truth labels consumed by ``metric`` in the workers.
    :param miss_column: columns eligible for missing-value injection.
    :param algorithm: detector algorithm name passed to ``ADDetector``.
    :param file_name: dataset file name forwarded to the detector's train().
    :param label_field: index of the label field, forwarded to ``ADDetector``.
    :return: DataFrame with one row per (alpha, num_miss) combination.
    """
    # Module-level globals so the worker function ``benchmarks`` can reach
    # the trained detector and the injector.
    global ad_detector, mvi_object
    miss_prop = np.arange(0, 1.1, 0.1)
    d = len(miss_column)
    # Allow at most 80% of the missable features to go missing.
    fraction_missing_features = int(np.ceil(
        d * 0.8))  #int(np.ceil(d / np.sqrt(d)))
    ad_detector = ADDetector(alg_type=algorithm, label=label_field)
    mvi_object = MissingValueInjector()
    ad_detector.train(train_x, ensemble_size=1, file_name=file_name)

    num_cores = multiprocessing.cpu_count()
    # ``check_pickle`` was deprecated and then removed from joblib.delayed;
    # plain delayed(benchmarks) is equivalent.
    result = Parallel(n_jobs=num_cores)(
        delayed(benchmarks)(train_x, label, miss_column,
                            algorithm, alpha, num_miss)
        for alpha in miss_prop
        for num_miss in range(1, fraction_missing_features))

    result = pd.DataFrame(result)
    result.rename(columns={
        0: "miss_prop",
        1: "miss_features_prop",
        2: "auc_mean_impute",
        3: "auc_reduced",
        4: "auc_MICE_impute",
        5: "ensemble_size",
        6: "algorithm"
    },
                  inplace=True)

    return result
Example #6
0
def algo_miss_features(train_x,
                       label,
                       miss_column,
                       algorithm,
                       file_name,
                       label_field=0):
    """
    Benchmark imputation vs. reduced (marginalized) scoring while injecting
    missing values into a fixed set of columns.

    :param train_x: training data matrix (scored after injection).
    :param label: ground-truth anomaly labels consumed by ``metric``.
    :param miss_column: columns eligible for missing-value injection.
    :param algorithm: detector algorithm name passed to ``ADDetector``;
        'BIFOR' skips the imputation runs and only uses reduced scoring.
    :param file_name: dataset file name forwarded to the detector's train().
    :param label_field: index of the label field, forwarded to ``ADDetector``.
    :return: DataFrame with one row per (ensemble size, alpha, num_missing).
    """
    ensemble_size = 2  # range(1, 2): a single ensemble of size 1.
    miss_prop = np.arange(0, 1.1, 0.1)
    # Allow at most 80% of the missable features to go missing.
    fraction_missing_features = int(np.ceil(
        len(miss_column) * 0.8))  # int(np.ceil(d /np.sqrt(d)))
    ad_detector = ADDetector(alg_type=algorithm, label=label_field)
    mvi_object = MissingValueInjector()

    rows = []
    for en_size in range(1, ensemble_size):
        ad_detector.train(train_x, ensemble_size=en_size, file_name=file_name)
        for alpha in miss_prop:
            for num_missing in range(1, fraction_missing_features):
                test_x = train_x.copy()
                mvi_object.inject_missing_value(test_x, num_missing, alpha,
                                                miss_column)
                if algorithm.upper() != 'BIFOR':
                    # Score with imputed values when anything is missing.
                    if num_missing * alpha > 0:
                        ms_score = ad_detector.score(
                            mvi_object.impute_value(test_x.copy(),
                                                    "SimpleFill"), False)
                        mt = metric(label, ms_score)
                        mt_impute = metric(label,
                                           ad_detector.score(
                                               mvi_object.impute_value(
                                                   test_x.copy(),
                                                   method="MICE"),
                                               False))  #impute
                    else:
                        ms_score = ad_detector.score(test_x, False)
                        mt = metric(label, ms_score)
                        mt_impute = mt
                else:
                    # Just to reduce computation, only run the reduced
                    # approach when BIFOR is used.
                    mt = mt_impute = [0.0, 0.0]
                # Reduced/bagging approach scores the NaN data directly.
                mt_reduced = metric(label,
                                    ad_detector.score(test_x, True))
                rows.append(pd.Series(
                    [alpha, num_missing / float(len(miss_column)),
                     mt[0], mt_reduced[0], mt_impute[0],
                     en_size, algorithm]))

    # Build the frame once; the per-row DataFrame.append used previously is
    # quadratic and was removed in pandas 2.0.
    result = pd.DataFrame(rows).reset_index(drop=True)
    result.rename(
        columns={
            0: "miss_prop",
            1: "miss_features_prop",
            2: "auc_mean_impute",
            3: "auc_reduced",
            4: "auc_MICE_impute",
            5: "ensemble_size"
        },  #, 6:"algorithm"},
        inplace=True)
    return result
Example #7
0
def single_benchmark(train_x,
                     label,
                     miss_column,
                     file_name,
                     label_field,
                     algorithm_list=ALGORITHMS,
                     task_id=1):
    """Run one grid-point of the benchmark across several detectors.

    The (missing proportion, fraction of missing features) pair is derived
    from ``task_id`` via ``algo_parameters``. Every detector in
    ``algorithm_list`` is trained on the clean data, then scored on:
    mean-imputed data, MICE-imputed data, and the raw NaN data via reduced
    scoring. Detectors that fail to train or score are skipped with a
    printed error (Python 2 print statements).

    :return: list of rows [frac_missing_prop, frac_features, auc, algo,
        method, basename(file_name)].
    """
    d = len(miss_column)  # size of missing features.
    frac_missing_prop, frac_features = algo_parameters(task_id)
    num_missing_att = int(np.ceil(d * frac_features))
    test_x = train_x.copy()
    miss_index = []
    scores_result = []

    algorithms = {}
    mvi_object = MissingValueInjector()
    # Imputes test_x as it is at call time — NOTE(review): appears to work on
    # test_x in place / by reference; order of calls below matters.
    x_impute = lambda method: mvi_object.impute_value(test_x, method=method)

    # Train every requested detector; a failing detector is skipped, not fatal.
    for algo in algorithm_list:
        try:

            algorithms[algo] = ADDetector(alg_type=algo, label=label_field)
            algorithms[algo].train(train_x,
                                   ensemble_size=1,
                                   file_name=file_name)
        except Exception as e:
            print "Error from {0:s}".format(algo), e.message
            continue

    def score_algo(test_x, method, score_bool=False):
        # Score every trained detector on ``test_x`` and collect AUC rows.
        # BIFOR is skipped for imputation methods (it handles NaNs itself).
        scores = []
        local_score = []
        for algo in algorithms:
            try:
                if (method in ["mean", "MICE"]) and algo == "BIFOR":
                    continue
                local_score = algorithms[algo].score(test_x, score_bool)
                logging.debug("score {0:d} - {1:s} - {2:s}-{3:s}".format(
                    len(local_score), file_name, str(frac_features),
                    str(frac_missing_prop)))
                auc_score = metric(label, local_score)[0]

                scores.append([
                    frac_missing_prop, frac_features, auc_score, algo, method,
                    os.path.basename(file_name)
                ])
            except Exception as e:
                print "Error from {0:s}".format(algo), e.message
                continue

        return scores

    if num_missing_att * frac_missing_prop > 0:
        # Inject NaNs once, remember where, then alternate impute / re-NaN.
        miss_index = mvi_object.inject_missing_value(
            data=test_x,
            num_missing_attribute=num_missing_att,
            alpha=frac_missing_prop,
            miss_att_list=miss_column)
        x_na_mean = x_impute("SimpleFill")
        scores_result += score_algo(x_na_mean, method="mean", score_bool=False)
        # Restore the NaNs that imputation filled before trying MICE.
        replace_with_nan(miss_index, test_x)
        x_na_mice = x_impute("MICE")
        scores_result += score_algo(x_na_mice, method="MICE",
                                    score_bool=False)  # append the list

    else:
        scores_result += score_algo(test_x, method="NoImpute")

    # Re-NaN once more (no-op when miss_index is empty) for reduced scoring.
    replace_with_nan(miss_index, test_x)
    scores_result += score_algo(test_x, method="reduced", score_bool=True)
    return scores_result