Example #1
def fisherProc(X,y):
	# obtain the score of each feature on the training set
	score = fisher_score.fisher_score(X, y)

	# rank features in descending order according to score
	idx = fisher_score.feature_ranking(score)
	return idx
def fisher(data):
    rank = []
    for i in range(6):
        X = data[i][:, :-1]  # features: all columns except the last
        Y = data[i][:, -1]   # labels: the last column
        score = fisher_score.fisher_score(X, Y)
        idx1 = fisher_score.feature_ranking(score)
        idx = samp(idx1.tolist())
        rank.append(idx)
    R = rankaggregate(rank)  # aggregate the per-subset rankings
    return R
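A minimal usage sketch for the fisherProc helper above, assuming the usual scikit-feature import path and small synthetic data (shapes and labels are illustrative only):

import numpy as np
from skfeature.function.similarity_based import fisher_score

X = np.random.rand(50, 10)       # 50 samples, 10 features
y = np.random.randint(0, 2, 50)  # binary class labels

idx = fisherProc(X, y)           # feature indices, highest Fisher score first
print(idx[:3])                   # the three top-ranked features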
Example #3
def seleciona_caracteristicas(vetor_caracteristicas, classes):
    caracteristicas_selecionadas = []
    rank_considerado = []  # default when no feature clears the threshold
    limiar_consideracao = 0

    score = fisher_score.fisher_score(vetor_caracteristicas, classes)
    rank = fisher_score.feature_ranking(score)
    features_consideradas = conta_features_limiar(score, limiar_consideracao)
    if features_consideradas > 1:
        rank_considerado = rank[:features_consideradas]
        caracteristicas_selecionadas = vetor_caracteristicas[:, rank_considerado]

    return caracteristicas_selecionadas, rank_considerado
Example #4
def seleciona_caracteristicas(vetor_caracteristicas, classes):
    caracteristicas_selecionadas = []
    rank_considerado = []  # default when no feature clears the threshold
    limiar_consideracao = 0

    score = fisher_score.fisher_score(vetor_caracteristicas, classes)
    rank = fisher_score.feature_ranking(score)
    features_consideradas = conta_features_limiar(score, limiar_consideracao)
    if features_consideradas > 1:
        rank_considerado = rank[0:features_consideradas:1]
        caracteristicas_selecionadas = vetor_caracteristicas[:,
                                                             rank_considerado]

    return caracteristicas_selecionadas, rank_considerado
Example #5
    def get_fisher_scores(self, max_dim):
        """ Получить меру Фишера и качество распознавания на основе AUC ROC.

        Выполняется отбор признаков для размерностей пространства признаков от 1 до max_dim. Для каждой размерности
        выполняется перекрестная проверка (cross-validation) и вычисляется интегральное значение меры Фишера и
        среднее по всем подвыборкам значение меры AUC ROC.

        Args:

            max_dim(int): число признаков до которого следует производить отбор.

        Returns:

            fisher_summary_scores: - вычисленные суммарные значения меры Фишера.
            auc_roc_scores: - вычисленные значения площади под кривой ROC.

        """

        x_train = scale(self.features)  # normalize features
        y_train = self.targets  # target ids
        # Fisher score estimation
        f_score = fisher_score.fisher_score(
            x_train, y_train)  # calculate Fisher score value
        ranked_f_score = fisher_score.feature_ranking(f_score)  # rank features
        print('Sequence of selected coefficients:')
        print(*list(self.feature_header[ranked_f_score[0:max_dim]]), sep=', ')
        fisher_summary_scores = list(
            it.accumulate(
                f_score[ranked_f_score[0:max_dim]]))  # integral Fisher scores
        # Cross validation
        k_fold = KFold(n_splits=5,
                       shuffle=True)  # setup cross-validation pattern
        ar_scorer = make_scorer(roc_auc_score)  # make scorer
        clf = SGDRegressor(max_iter=100, tol=1e-3, random_state=241
                           )  # stochastic gradient descend regression as a clf
        auc_roc_scores = []  # list for AUC ROC values
        for i in range(1,
                       max_dim + 1):  # iterate by number of features selected
            features = x_train[:, ranked_f_score[0:i]]  # select features
            t = y_train
            vect_auc_roc_score = cross_val_score(clf,
                                                 features,
                                                 t,
                                                 scoring=ar_scorer,
                                                 cv=k_fold)  # train
            auc_roc_scores.append(np.mean(vect_auc_roc_score)
                                  )  # add mean (over CV-subsets) AUC ROC value

        return fisher_summary_scores, auc_roc_scores
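The docstring above describes sweeping the number of Fisher-ranked features and cross-validating at each dimensionality. A simplified standalone sketch of that loop follows; the classifier, fold count, and synthetic data are illustrative and not the original SGDRegressor/AUC-ROC setup:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from skfeature.function.similarity_based import fisher_score

X = np.random.rand(120, 15)               # feature matrix
y = np.random.randint(0, 2, 120)          # binary targets

scores = fisher_score.fisher_score(X, y)  # per-feature Fisher scores
ranked = fisher_score.feature_ranking(scores)

cv_means = []
for dim in range(1, 6):                   # feature-space sizes 1..max_dim
    cols = ranked[:dim]                   # top-dim Fisher-ranked features
    cv = cross_val_score(LogisticRegression(max_iter=1000),
                         X[:, cols], y, cv=KFold(n_splits=5, shuffle=True))
    cv_means.append(cv.mean())            # mean CV score per dimensionality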
def get_fisher_score(data, label, k=30):
    score = fisher_score.fisher_score(data, label)
    ranking = fisher_score.feature_ranking(score)

    dfscores = pd.DataFrame(score)
    dfcolumns = pd.DataFrame(data.columns)

    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Feature', 'Score']  # name the dataframe columns
    result = featureScores.nlargest(k, 'Score')   # keep the k highest-scoring features
    
    return result, ranking
Example #7
def run_fold(trial,P,X,y,method,dataset,parttype):
    print('Obtaining features for %s %s %s fold: %2d' % (parttype, method, dataset, trial))
    n_samples, n_features = X.shape
    train = P[:,trial] == 1
    trnX = X[train]
    trnY = y[train]

    start_time = time.time()
    if method == 'fisher': 
        score = fisher_score.fisher_score(trnX,trnY)
        features = fisher_score.feature_ranking(score)
    elif method == 'chi2':
        score = chi_square.chi_square(trnX,trnY)
        features = chi_square.feature_ranking(score)
    elif method == 'relieff':
        score = reliefF.reliefF(trnX,trnY)
        features = reliefF.feature_ranking(score)
    elif method == 'jmi':
        features = JMI.jmi(trnX,trnY,  n_selected_features=n_features)
    elif method == 'mrmr':
        features = MRMR.mrmr(trnX,trnY,n_selected_features=n_features)
    elif method == 'infogain':
        features = MIM.mim(trnX,trnY,n_selected_features=n_features)
    elif method == 'svmrfe':
        features = svmrfe(trnX,trnY)
    elif method == 'hdmr':
        sobol_set_all = scipy.io.loadmat('sobol_set.mat')
        sobol_set     = sobol_set_all['sobol_set']
        sobol_set     = sobol_set.astype(float)
        params = {'sobol_set':sobol_set,'k':1,'p':3,'M':1000,'b':'L'}
        models  = hdmrlearn(trnX,trnY,params)
        features,w = hdmrselect(X,models)
    elif method == 'hdmrhaar':
        sobol_set_all = scipy.io.loadmat('sobol_set.mat')
        sobol_set     = sobol_set_all['sobol_set']
        sobol_set     = sobol_set.astype(float)
        params = {'sobol_set':sobol_set,'k':1,'p':255,'M':1000,'b':'H'}
        models  = hdmrlearn(trnX,trnY,params)
        features,w = hdmrselect(X,models)
    else:
        print(method + ' does not exist')

    cputime = time.time() - start_time
    print(features)
    print('cputime %f' % cputime)
    return {'features': features, 'cputime': cputime}
def naiveBayes(processed_train_features, processed_valid_features,
               train_labels, valid_labels, processed_test_features,
               test_labels):
    model1 = GaussianNB()
    model1.fit(processed_train_features, train_labels)
    naive_bayes_predict_train = model1.predict(processed_train_features)
    naive_bayes_predict_valid = model1.predict(processed_valid_features)
    #print("Naive Bayes Training accuracy ",accuracy_score(train_labels, naive_bayes_predict_train))
    print("Naive Bayes Valid accuracy ",
          accuracy_score(valid_labels, naive_bayes_predict_valid))
    naive_bayes_predict_train_before_fisher = model1.predict(
        processed_test_features)
    print("Naive Bayes Testing accuracy ",
          accuracy_score(test_labels, naive_bayes_predict_train_before_fisher))
    XFisher = processed_test_features.to_numpy()
    score = fs.fisher_score(XFisher, test_labels)
    ranked_features = fs.feature_ranking(score)
    topFeatures = ranked_features[:50]
    print(topFeatures)
    print(score.shape)
    print(XFisher.shape)
    intersection_cols = topFeatures
    colnamelist = []
    for i in topFeatures:
        colname = processed_train_features.columns[i]
        colnamelist.append(colname)
    test = processed_test_features.copy()
    valid_for_bayes = processed_valid_features.copy()
    size = 188
    test.drop(test.columns.difference(colnamelist), axis=1, inplace=True)
    valid_for_bayes.drop(valid_for_bayes.columns.difference(colnamelist),
                         axis=1,
                         inplace=True)
    model = GaussianNB()
    model.fit(test, test_labels)
    naive_bayes_predict_train_after_fisher = model.predict(test)
    print("Naive Bayes Testing accuracy ",
          accuracy_score(test_labels, naive_bayes_predict_train_after_fisher))
    naive_bayes_predict_valid_after_fisher = model.predict(valid_for_bayes)
    print("Naive Bayes Validation accuracy",
          accuracy_score(valid_labels, naive_bayes_predict_valid_after_fisher))
def main():
    # load data
    mat = scipy.io.loadmat("../data/COIL20.mat")
    X = mat["X"]  # data
    X = X.astype(float)
    y = mat["Y"]  # label
    y = y[:, 0]
    n_samples, n_features = X.shape  # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100  # number of selected features
    clf = svm.LinearSVC()  # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the score of each feature on the training set
        score = fisher_score.fisher_score(X[train], y[train])

        # rank features in descending order according to score
        idx = fisher_score.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print "Accuracy:", float(correct) / 10
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']  # data
    X = X.astype(float)
    y = mat['Y']  # label
    y = y[:, 0]
    n_samples, n_features = X.shape  # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100  # number of selected features
    clf = svm.LinearSVC()  # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the score of each feature on the training set
        score = fisher_score.fisher_score(X[train], y[train])

        # rank features in descending order according to score
        idx = fisher_score.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print('Accuracy:', float(correct) / 10)
    def rank_features_using_fisherscore(cls,
                                        data_frame,
                                        target_key,
                                        cols_to_ignore=None):
        X = data_frame.values
        keys = list(data_frame.keys())
        target_col_idx = keys.index(target_key)

        # Removing the target column from keys
        del keys[target_col_idx]

        # Remove all columns that are asked to be ignored
        if cols_to_ignore is not None:
            for col in cols_to_ignore:
                idx = keys.index(col)
                del keys[idx]

        Y = data_frame.loc[:, target_key].values
        X = data_frame.loc[:, keys]

        score = fisher_score.fisher_score(X, Y)
        rank = fisher_score.feature_ranking(score)
        ranked_features = [keys[i] for i in rank]
        return score, ranked_features, keys
Example #12
    def create_data_aware_features(self, train_log, test_log, ignored):
        # given log
        # 0.0. Extract events

        # 1.1. Apriori mine events to be used for constraints

        # 2. Find declare constraints to be used, On a limited set of declare templates
        # 2.1. Find support for all positive and negative cases for the constraints
        # 2.2. Filter the constraints according to support
        # -- Encode the data
        # 3. Sort constraints according to Fisher score (or other metric)
        # 4. Pick the constraint with highest Fisher score.
        # 5. Refine the constraint with data
        # 5.1. Together with data, try to create a better rule.
        # ---- In this case, every node will become a small decision tree of its own!

        # 5.2. If the Fisher score of the new rule is greater, replace the current rule with the refined rule
        # --- A refined rule is a constraint plus a decision rule/tree learned from the data
        # Reorder constraints for the next level of the decision tree, analogous to ranking splits by Gini impurity

        # Templates taken from Fabrizio's article
        """
        responded existence(A, B), data on A
        response(A, B), data on A
        precedence(A, B), data on B
        alternate response(A, B), data on A
        alternate precedence(A, B), data on B
        chain response(A,B), data on A
        chain precedence(A, B), data on B
        not resp. existence (A, B), data on A
        not response (A, B), data on A
        not precedence(A, B), data on B
        not chain response(A,B), data on A
        not chain precedence(A,B), data on B

        :param log:
        :param label:
        :return:
        """

        not_templates = [
            "not_responded_existence", "not_precedence", "not_response",
            "not_chain_response", "not_chain_precedence"
        ]

        templates = [
            "alternate_precedence", "alternate_response", "chain_precedence",
            "chain_response", "responded_existence", "response", "precedence"
        ]

        inp_templates = templates + not_templates

        # play around with thresholds

        constraint_threshold = 0.1
        candidate_threshold = 0.1

        # Extract unique activities from log
        events_set = extract_unique_events_transformed(train_log)

        # Brute force all possible candidates
        candidates = [(event, ) for event in events_set] + [
            (e1, e2) for e1 in events_set for e2 in events_set if e1 != e2
        ]

        # Count by class
        normal_count, deviant_count = count_classes(train_log)
        print("{} deviant and {} normal traces in train set".format(
            deviant_count, normal_count))
        ev_support_norm = int(normal_count * candidate_threshold)
        ev_support_dev = int(deviant_count * candidate_threshold)

        print("Filtering candidates by support")
        candidates = filter_candidates_by_support(candidates, train_log,
                                                  ev_support_norm,
                                                  ev_support_dev)
        print("Support filtered candidates:", len(candidates))

        constraint_support_dev = int(deviant_count * constraint_threshold)
        constraint_support_norm = int(normal_count * constraint_threshold)

        train_results = generate_train_candidate_constraints(
            candidates,
            inp_templates,
            train_log,
            constraint_support_norm,
            constraint_support_dev,
            filter_t=True)

        test_results = generate_test_candidate_constraints(
            candidates, inp_templates, test_log, train_results)
        print("Candidate constraints generated")

        ## Given the selected constraints, find fulfillments and violations for each constraint.
        ## In this manner, build positive and negative data samples

        X_train, y_train, feature_names, train_trace_names = transform_results_to_numpy(
            train_results, train_log)
        X_test, y_test, _, test_trace_names = transform_results_to_numpy(
            test_results, test_log)

        # Turn to pandas df
        train_df = pd.DataFrame(X_train,
                                columns=feature_names,
                                index=train_trace_names)

        train_df = train_df.transpose().drop_duplicates().transpose()

        # remove no-variance, constants
        train_df = train_df.loc[:, (train_df != train_df.iloc[0]).any()]

        X_train = train_df.values

        # Perform selection by Fisher

        scores = fisher_calculation(X_train, y_train)
        selected_ranks = fisher_score.feature_ranking(scores)

        threshold = 15
        #chosen = 500

        real_selected_ranks = []
        # Start selecting from selected_ranks until every trace is covered N times
        trace_remaining = dict()
        for i, trace_name in enumerate(train_df.index.values):
            trace_remaining[i] = threshold

        chosen = 0
        # Go from higher to lower
        for rank in selected_ranks:
            if len(trace_remaining) == 0:
                break
            chosen += 1
            # Get column
            marked_for_deletion = set()
            added = False
            for k in trace_remaining.keys():
                if train_df.iloc[k, rank] > 0:
                    if not added:
                        added = True
                        real_selected_ranks.append(rank)

                    trace_remaining[k] -= 1
                    if trace_remaining[k] <= 0:
                        marked_for_deletion.add(k)

            for k in marked_for_deletion:
                del trace_remaining[k]

        print("Constraints chosen {}".format(len(real_selected_ranks)))

        feature_names = train_df.columns[real_selected_ranks]

        print("Considered template count:", len(feature_names))
        train_df = train_df[feature_names]

        new_train_feature_names = []
        new_train_features = []

        new_test_feature_names = []
        new_test_features = []

        count = 0

        for key in train_df.columns:

            count += 1
            #print(key)
            # Go over all and find with data
            template = key[0]
            candidate = key[1]

            # First have to find all locations of fulfillments
            outp_train = find_fulfillments_violations(candidate, template,
                                                      train_log)
            outp_test = find_fulfillments_violations(candidate, template,
                                                     test_log)

            # Take data snapshots on all fulfilled indices - positives samples
            # Take data snapshots on all unfulfilled indices - negative samples
            # Build a decision tree with fulfilled and unfulfilled samples
            train_positive_samples = []
            train_negative_samples = []

            test_positive_samples = []
            test_negative_samples = []

            for i, trace in enumerate(outp_train):
                fulfilled = trace[1]
                violated = trace[2]
                positive, negative = get_data_snapshots(
                    train_log[i], fulfilled, violated)
                label = train_log[i]["label"]
                for s in positive:
                    train_positive_samples.append((s, label, i))
                for s in negative:
                    train_negative_samples.append((s, label, i))

            for i, trace in enumerate(outp_test):
                fulfilled = trace[1]
                violated = trace[2]
                positive, negative = get_data_snapshots(
                    test_log[i], fulfilled, violated)
                label = train_log[i]["label"]

                for s in positive:
                    test_positive_samples.append((s, label, i))

                for s in negative:
                    test_negative_samples.append((s, label, i))

            # Get all where fulfilled only. Train on train_positive_samples vs Label of log
            ignored_features = set(ignored)  # set([('Diagnose', 'literal')])

            collected_features = set()
            # Collect all possible features from the positive and negative activations
            for pos_act, _, __ in train_positive_samples:
                for key2, val in pos_act.items():
                    collected_features.add(key2)

            for neg_act, _, __ in train_negative_samples:
                for key2, val in neg_act.items():
                    collected_features.add(key2)

            features = list(collected_features)

            # Keep only features of boolean, literal, continuous and discrete
            features = [
                feature for feature in features if feature[1] in set(
                    ["boolean", "continuous", "discrete", "literal"])
            ]
            features = [
                feature for feature in features
                if feature[0] not in ignored_features
            ]

            # collect positive and negative samples for finding data condition:
            positive_samples = [(sample[2], sample[0])
                                for sample in train_positive_samples
                                if sample[1] == 1]
            negative_samples = [(sample[2], sample[0])
                                for sample in train_positive_samples
                                if sample[1] == 0]

            pos_activations = [(sample[2], sample[0])
                               for sample in train_positive_samples]
            neg_activations = [(sample[2], sample[0])
                               for sample in train_negative_samples]

            feature_train_samples = self.create_sample(
                pos_activations, features, 1) + self.create_sample(
                    neg_activations, features, 0)
            # Create pos and neg samples
            pos_samples = self.create_sample(positive_samples, features, 1)
            neg_samples = self.create_sample(negative_samples, features, 0)
            features_data = pos_samples + neg_samples
            features_label = ["id"] + features + ["Label"]
            # one-hot encode literal features
            literal_features = [
                feature for feature in features if feature[1] == "literal"
            ]

            # Extract positive test samples, where the constraint was fulfilled
            train_df = pd.DataFrame(features_data, columns=features_label)
            test_pos_smpl = [
                (sample[2], sample[0]) for sample in test_positive_samples
            ]  # if sample[1] == 1]
            test_neg_smpl = [
                (sample[2], sample[0]) for sample in test_negative_samples
            ]  # if sample[1] == 0]

            pos_test_samples = self.create_sample(test_pos_smpl, features, 1)
            neg_test_samples = self.create_sample(test_neg_smpl, features, 0)
            test_features_data = pos_test_samples + neg_test_samples

            feature_train_df = pd.DataFrame(feature_train_samples,
                                            columns=features_label)
            test_df = pd.DataFrame(test_features_data, columns=features_label)
            train_df.pop("id")
            train_ids = feature_train_df.pop("id")
            test_ids = test_df.pop("id")

            # Possible values for each literal value is those in train_df or missing

            if len(literal_features) > 0:
                for selection in literal_features:
                    train_df[selection] = pd.Categorical(train_df[selection])
                    test_df[selection] = pd.Categorical(test_df[selection])
                    feature_train_df[selection] = pd.Categorical(
                        feature_train_df[selection])
                    le = LabelEncoder()

                    le.fit(
                        list(test_df[selection]) +
                        list(feature_train_df[selection]))
                    classes = le.classes_
                    train_df[selection] = le.transform(train_df[selection])
                    test_df[selection] = le.transform(test_df[selection])
                    feature_train_df[selection] = le.transform(
                        feature_train_df[selection])

                    ohe = OneHotEncoder(
                        categories="auto")  # Remove this for server.
                    ohe.fit(
                        np.concatenate((test_df[selection].values.reshape(
                            -1, 1), feature_train_df[selection].values.reshape(
                                -1, 1)),
                                       axis=0), )

                    train_transformed = ohe.transform(
                        train_df[selection].values.reshape(-1, 1)).toarray()
                    test_transformed = ohe.transform(
                        test_df[selection].values.reshape(-1, 1)).toarray()
                    feature_train_transformed = ohe.transform(
                        feature_train_df[selection].values.reshape(
                            -1, 1)).toarray()

                    dfOneHot = pd.DataFrame(
                        train_transformed,
                        columns=[(selection[0] + "_" + classes[i],
                                  selection[1])
                                 for i in range(train_transformed.shape[1])])
                    train_df = pd.concat([train_df, dfOneHot], axis=1)
                    train_df.pop(selection)
                    dfOneHot = pd.DataFrame(
                        test_transformed,
                        columns=[(selection[0] + "_" + classes[i],
                                  selection[1])
                                 for i in range(train_transformed.shape[1])])
                    test_df = pd.concat([test_df, dfOneHot], axis=1)
                    test_df.pop(selection)

                    dfOneHot = pd.DataFrame(
                        feature_train_transformed,
                        columns=[(selection[0] + "_" + classes[i],
                                  selection[1])
                                 for i in range(train_transformed.shape[1])])
                    feature_train_df = pd.concat([feature_train_df, dfOneHot],
                                                 axis=1)
                    feature_train_df.pop(selection)

            data_dt = DecisionTreeClassifier(max_depth=3)
            y_train = train_df.pop("Label")
            train_data = train_df.values

            y_test = test_df.pop("Label")
            data_dt.fit(train_data, y_train)

            y_train_new = feature_train_df.pop("Label")
            feature_train_data = feature_train_df.values

            train_predictions = data_dt.predict(feature_train_data)
            test_predictions = data_dt.predict(test_df.values)

            train_fts = feature_train_df.columns
            # Go through all traces again
            # Save decision trees here. For later interpretation
            feature_train_df["id"] = train_ids
            test_df["id"] = test_ids

            feature_train_df["prediction"] = train_predictions
            test_df["prediction"] = test_predictions

            # Check for which activations the data condition holds. Filter everything else out.

            feature_train_df["Label"] = y_train_new
            test_df["Label"] = y_test

            new_train_feature = []
            for i, trace in enumerate(outp_train):
                # Get from train_df by number
                trace_id = i
                freq = trace[0]

                # Find all related to the id

                if freq == 0:
                    # vacuous case, no activations, will be same here.
                    new_train_feature.append(0)
                else:
                    # Previous violation case
                    # Find samples related to trace
                    samples = feature_train_df[feature_train_df.id == trace_id]
                    # Find samples related for which data condition holds
                    samples = samples[samples.prediction == 1]
                    # Count number of positive and negative labels
                    positive = samples[samples.Label == 1].shape[0]
                    negative = samples[samples.Label == 0].shape[0]

                    if negative > 0:
                        new_train_feature.append(-1)
                    else:
                        new_train_feature.append(positive)

            new_test_feature = []

            for i, trace in enumerate(outp_test):
                # Get from train_df by number
                trace_id = i
                freq = trace[0]

                # Find all related to the id

                if freq == 0:
                    # vacuous case, no activations, will be same here.
                    new_test_feature.append(0)
                else:
                    # Previous violation case
                    # Find samples related to trace
                    samples = test_df[test_df.id == trace_id]
                    # Find samples related for which data condition holds
                    samples = samples[samples.prediction == 1]
                    # Count number of positive and negative activations
                    positive = samples[samples.Label == 1].shape[0]
                    negative = samples[samples.Label == 0].shape[0]

                    if negative > 0:
                        new_test_feature.append(-1)
                    else:
                        new_test_feature.append(positive)

            # Find all activations

            count_fulfilled_train = sum(1 for i in new_train_feature if i > 0)
            count_fulfilled_test = sum(1 for i in new_test_feature if i > 0)

            if count_fulfilled_train > 0 and count_fulfilled_test > 0:
                # only then add new feature..
                new_train_features.append(new_train_feature)
                new_train_feature_names.append(
                    template +
                    ":({},{}):Data".format(candidate[0], candidate[1]))

                new_test_features.append(new_test_feature)
                new_test_feature_names.append(
                    template +
                    ":({},{}):Data".format(candidate[0], candidate[1]))

                # Save decision tree
                save_dt = False
                if save_dt:
                    export_graphviz(
                        data_dt,
                        out_file="sample_dwd_trees/outputfile_{}.dot".format(
                            str(key)),
                        feature_names=list(map(str, train_fts)))

        return new_train_feature_names, new_train_features, new_test_feature_names, new_test_features
Example #13
X = dataset.iloc[:, 2:32]  # all rows, columns 2 through 31 (excludes 'Unnamed: 32')
y = dataset.iloc[:, 1]  # all rows, the single column holding the cancer classes
labelencoder_Y = LabelEncoder()
y = labelencoder_Y.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)
X_train = X_train.values
X_test = X_test.values

# compute fisher scores
score = fisher_score(X_train, y_train)
idx = feature_ranking(score)
np.save('features/fisher.npy', idx)
print('Features saved')
#idx = np.load('features/fisher.npy')

# create copies of the data
X_train_copy = X_train
y_train_copy = y_train
X_test_copy = X_test
y_test_copy = y_test

# train and compute accuracy of final model trained on selected features
final_list = []
for num_fea in range(30, 0, -1):
    # load the copies of the original data
    X_train = X_train_copy
Example #14
def fisher_score_FS(X_train, y_train):
    score = fisher_score.fisher_score(X_train, y_train)
    idx = fisher_score.feature_ranking(score)
    return (idx, score)
Example #15
def fun_classify(inputFile, groupsSel, FeatSelect, Nfeats, scaleFeats=1):
    """
    AllStatsMean, AllStatsSTD = fun_classify(inputFile, groupsSel, FeatSelect, Nfeats)
    inputFile: the .csv file containt feature tables
    groups: The selected groups to classify. Full set is ["S","F","Z","N","O"],
    but ["S","F","Z"] are of most interest for the article (ictal, inter-ictal and normal EEG)
    FeatSelect: feature selection method: PCA, RFE, fisher or none
    Nfeats: number of selected features
    Returns:
    AllStatsMean: mean performance values
    AllStatsSTD: standard deviation of performance values  
    """
    #reads input features
    dfFeats = pd.read_csv(inputFile, sep=',', header=0)

    #only selected groups
    dfFeats = dfFeats[dfFeats["Group"].isin(groupsSel)]
    if "decTaime" in dfFeats:
        x = dfFeats.iloc[:, 2:]  #ignores decomposition method execution time
    else:
        x = dfFeats.iloc[:, 1:]
    y = dfFeats.iloc[:, 0].values
    if scaleFeats:  #scale feats?
        x = StandardScaler().fit_transform(x)
    #Feature selection
    if x.shape[1] > Nfeats:
        #RFE
        if FeatSelect == "RFE":
            rfeModel = SVC(kernel="linear",
                           C=0.025,
                           probability=True,
                           gamma='scale')
            rfeSelect = RFE(rfeModel, n_features_to_select=Nfeats)
            rfe_fit = rfeSelect.fit(x, y)
            x = x[:, rfe_fit.support_]

        if FeatSelect == "PCA":
            pca = PCA(n_components=Nfeats)
            x = pca.fit_transform(x)

        if FeatSelect == "fisher":
            fisherScore = fisher_score.fisher_score(x, y)
            idx = fisher_score.feature_ranking(fisherScore)
            x = x[:, idx[:Nfeats]]

    names = ["KNN", "Linear SVM", "RBF SVM", "GPC", "MLP"]

    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025, probability=True, gamma='scale'),
        SVC(probability=True, gamma='scale'),
        GaussianProcessClassifier(1.0 * RBF(1.0)),
        MLPClassifier(alpha=1, max_iter=200)
    ]

    #initialize performance variable
    AllStats = {}
    AllStatsMean = {}
    AllStatsSTD = {}

    for name in names:
        AllStats[name] = {
            "Accuracy": np.zeros([realizations, K_folds]),
            "SensitivityMean": np.zeros([realizations, K_folds]),
            "SpecificityMean": np.zeros([realizations, K_folds]),
            "AUC_Mean": np.zeros([realizations, K_folds]),
            "SensitivityIctal": np.zeros([realizations, K_folds]),
            "SpecificityIctal": np.zeros([realizations, K_folds]),
            "AUC_Ictal": np.zeros([realizations, K_folds]),
            "TTtimes": np.zeros([realizations, K_folds])
        }
        AllStatsMean[name] = {
            "Accuracy": 0.,
            "SensitivityMean": 0.,
            "SpecificityMean": 0,
            "AUC_Mean": 0.,
            "SensitivityIctal": 0.,
            "SpecificityIctal": 0.,
            "AUC_Ictal": 0.,
            "TTtimes": 0.
        }
        AllStatsSTD[name] = {
            "Accuracy": 0.,
            "SensitivityMean": 0.,
            "SpecificityMean": 0,
            "AUC_Mean": 0.,
            "SensitivityIctal": 0.,
            "SpecificityIctal": 0.,
            "AUC_Ictal": 0.,
            "TTtimes": 0.
        }
        #for each realization
    for i in range(realizations):
        skf = StratifiedKFold(n_splits=K_folds,
                              shuffle=True)  #5-fold validation

        for tupTemp, ki in zip(skf.split(x, y), range(K_folds)):
            train_idx, test_idx = tupTemp[0], tupTemp[1]
            X_train, X_test = x[train_idx], x[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            for name, clf in zip(names, classifiers):  #for each classifier
                tic = time.time(
                )  #check training/testing time of each classifier
                #Fit model and predict
                modelFit = clf.fit(X_train, y_train)
                yPredicted = modelFit.predict(X_test)
                probsTest = modelFit.predict_proba(X_test)
                toc = time.time()
                # AUC -  #ictal class as positive
                if len(np.unique(y)) > 2:
                    AUCs = roc_auc_score(
                        LabelBinarizer().fit_transform(y_test),
                        probsTest,
                        average=None)
                else:
                    AUCs = roc_auc_score(y_test, probsTest[:, 1], average=None)
                #Sensitivity and Specificity
                cMatrix = confusion_matrix(y_test, yPredicted)
                FP = cMatrix.sum(axis=0) - np.diag(cMatrix)
                FN = cMatrix.sum(axis=1) - np.diag(cMatrix)
                TP = np.diag(cMatrix)
                TN = cMatrix.sum() - (FP + FN + TP)
                # Sensitivity
                TPR = TP / (TP + FN)
                # Specificity or true negative rate
                TNR = TN / (TN + FP)
                #fill performance variable
                AllStats[name]["Accuracy"][i, ki] = accuracy_score(
                    y_test, yPredicted)
                AllStats[name]["SensitivityMean"][i, ki] = np.mean(TPR)
                AllStats[name]["SpecificityMean"][i, ki] = np.mean(TNR)
                AllStats[name]["SensitivityIctal"][i, ki] = TPR[0]
                AllStats[name]["SpecificityIctal"][i, ki] = TNR[0]
                AllStats[name]["AUC_Mean"][i, ki] = np.mean(AUCs)
                AllStats[name]["TTtimes"][i, ki] = toc - tic
                if len(np.unique(y)) > 2:
                    AllStats[name]["AUC_Ictal"][i, ki] = AUCs[0]
    AllStatsDF = [0] * len(names)
    for idx, name in enumerate(names):
        for istat in AllStats[name].keys():
            AllStats[name][istat] = np.mean(AllStats[name][istat], axis=1)
            AllStatsMean[name][istat] = np.mean(AllStats[name][istat])
            AllStatsSTD[name][istat] = np.std(AllStats[name][istat])
        AllStatsDF[idx] = pd.DataFrame.from_dict(AllStats[name])
        AllStatsDF[idx]["Nmodes"] = Nmodes
        AllStatsDF[idx]["Classifier"] = name

    return pd.DataFrame.from_dict(AllStatsMean), pd.DataFrame.from_dict(
        AllStatsSTD), pd.concat(AllStatsDF)
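A hedged usage sketch for fun_classify; the CSV path, the group labels, and the module-level globals the function reads (realizations, K_folds, Nmodes) are assumptions for illustration, not part of the original snippet:

# globals fun_classify expects at module level (illustrative values)
realizations = 10
K_folds = 5
Nmodes = 4

# features.csv is assumed to hold a "Group" column followed by feature columns
meanDF, stdDF, allDF = fun_classify("features.csv", ["S", "F", "Z"],
                                    FeatSelect="fisher", Nfeats=10)
print(meanDF)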
Example #16
    idx_rel = reliefF.feature_ranking(score_rel)
    #Laplacian score
    kwargs_W = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "k": 7,
        't': 1,
        'reliefF': True
    }
    W = construct_W.construct_W(X_train, **kwargs_W)
    score_lap = lap_score.lap_score(X_train, W=W)
    idx_lap = lap_score.feature_ranking(score_lap)
    #Fisher
    score_fish = fisher_score.fisher_score(X_train, y_train)
    print(score_fish)
    idx_fish = fisher_score.feature_ranking(score_fish)
    ###################################### Feature Integration
    idxM = idx_rel[:threshold]
    idxN = idx_lap[:threshold]
    idxO = idx_fish[:threshold]

    if combination_method == 1:
        #AND
        idx_and = reduce(np.intersect1d, (idxO, idxM, idxN))
        idx = idx_and
        print("number of selectes features (bins) = ", idx.shape[0])

    if combination_method == 2:
        #OR
        idx = np.concatenate((idxM, idxN, idxO))
        idx = np.unique(idx)
Example #17
    def fit(self, X, y):

        idx = []

        if self.tp == 'ITB':

            if self.name == 'MRMR':
                idx = MRMR.mrmr(X,
                                y,
                                n_selected_features=self.params['num_feats'])

        elif self.tp == 'filter':

            if self.name == 'Relief':
                score = reliefF.reliefF(X, y, k=self.params['k'])
                idx = reliefF.feature_ranking(score)

            if self.name == 'Fisher':
                # obtain the score of each feature on the training set
                score = fisher_score.fisher_score(X, y)

                # rank features in descending order according to score
                idx = fisher_score.feature_ranking(score)

            if self.name == 'MI':
                idx = np.argsort(
                    mutual_info_classif(
                        X, y, n_neighbors=self.params['n_neighbors']))[::-1]

        elif self.tp == 'wrapper':

            model_fit = self.model.fit(X, y)
            model = SelectFromModel(model_fit, prefit=True)
            idx = model.get_support(indices=True)
        elif self.tp == 'SLB':

            # one-hot-encode on target
            y = construct_label_matrix(y)

            if self.name == 'SMBA':
                scba = fs.SCBA(data=X,
                               alpha=self.params['alpha'],
                               norm_type=self.params['norm_type'],
                               verbose=self.params['verbose'],
                               thr=self.params['thr'],
                               max_iter=self.params['max_iter'],
                               affine=self.params['affine'],
                               normalize=self.params['normalize'],
                               step=self.params['step'],
                               PCA=self.params['PCA'],
                               GPU=self.params['GPU'],
                               device=self.params['device'])

                nrmInd, sInd, repInd, _ = scba.admm()
                if self.params['type_indices'] == 'nrmInd':
                    idx = nrmInd
                elif self.params['type_indices'] == 'repInd':
                    idx = repInd
                else:
                    idx = sInd

            if self.name == 'RFS':
                W = RFS.rfs(X, y, gamma=self.params['gamma'])
                idx = feature_ranking(W)

            if self.name == 'll_l21':
                # obtain the feature weight matrix
                W, _, _ = ll_l21.proximal_gradient_descent(X,
                                                           y,
                                                           z=self.params['z'],
                                                           verbose=False)
                # rank features according to the feature weight matrix
                idx = feature_ranking(W)
            if self.name == 'ls_l21':
                # obtain the feature weight matrix
                W, _, _ = ls_l21.proximal_gradient_descent(X,
                                                           y,
                                                           z=self.params['z'],
                                                           verbose=False)

                # rank features according to the feature weight matrix
                idx = feature_ranking(W)

            if self.name == 'LASSO':

                LASSO = Lasso(alpha=self.params['alpha'], positive=True)

                y_pred_lasso = LASSO.fit(X, y)

                if y_pred_lasso.coef_.ndim == 1:
                    coeff = y_pred_lasso.coef_
                else:
                    coeff = np.asarray(y_pred_lasso.coef_[0, :])

                idx = np.argsort(-coeff)

            if self.name == 'EN':  # elastic net L1

                enet = ElasticNet(alpha=self.params['alpha'],
                                  l1_ratio=1,
                                  positive=True)
                y_pred_enet = enet.fit(X, y)

                if y_pred_enet.coef_.ndim == 1:
                    coeff = y_pred_enet.coef_
                else:
                    coeff = np.asarray(y_pred_enet.coef_[0, :])

                idx = np.argsort(-coeff)

        return idx
Example #18
            "neighbor_mode": "knn",
            "weight_mode": "heat_kernel",
            "k": 5,
            't': 1
        }
        W = construct_W.construct_W(X_train, **kwargs_W)
        score = lap_score.lap_score(X_train, W=W)
        idx = lap_score.feature_ranking(score)
        selected_fea_train = X_train[:, idx[0:num_features]]
        selected_fea_test = X_test[:, idx[0:num_features]]
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

        # fisher_score
        score = fisher_score.fisher_score(X_train, y_train)
        idx = fisher_score.feature_ranking(score)
        selected_fea_train = X_train[:, idx[0:num_features]]
        selected_fea_test = X_test[:, idx[0:num_features]]
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

        # reliefF
        score = reliefF.reliefF(X_train, y_train)
        idx = reliefF.feature_ranking(score)
        selected_fea_train = X_train[:, idx[0:num_features]]
        selected_fea_test = X_test[:, idx[0:num_features]]
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

        # chi_square
        score = chi_square.chi_square(np.abs(X_train), y_train)
Example #19
    def Fisher_Score(self):
        score = fisher_score.fisher_score(X_train, y_train)
        idx = fisher_score.feature_ranking(score)
Example #20
def fisher(train, test, K):
    score = fisher_score.fisher_score(train[0], train[1])
    indices = fisher_score.feature_ranking(score)[:K]
    return indices
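A short usage sketch for Example #20, assuming train and test are (X, y) tuples and applying the returned top-K indices to both splits; the synthetic arrays are illustrative:

import numpy as np
from skfeature.function.similarity_based import fisher_score  # import assumed by the example above

X_tr, y_tr = np.random.rand(80, 12), np.random.randint(0, 2, 80)
X_te, y_te = np.random.rand(20, 12), np.random.randint(0, 2, 20)

top = fisher((X_tr, y_tr), (X_te, y_te), K=5)    # indices of the 5 best features
X_tr_sel, X_te_sel = X_tr[:, top], X_te[:, top]  # keep only those columns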