Example #1
class UnivariateSelectChiFPRPrim(primitive):
    def __init__(self, random_state=0):
        super(UnivariateSelectChiFPRPrim, self).__init__(name='UnivariateSelectChiFPR')
        self.id = 27
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: Select features whose p-values are below alpha, based on an FPR (False Positive Rate) test with the chi-square statistic. The FPR test controls the total amount of false detections."
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'd'

    def can_accept(self, data):
        return self.can_accept_d(data, 'Classification')

    def is_needed(self, data):
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector = SelectFpr(chi2, alpha=0.05)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        try:
            mask = self.selector.get_support(indices=False)
            final_cols = list(compress(cols, mask))
            output['X'] = pd.DataFrame(self.selector.transform(output['X']), columns=final_cols)
        except Exception as e:
            print(e)
        final_output = {0: output}
        return final_output
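The primitive above wraps sklearn's SelectFpr with the chi-square test; the column-mask pattern used in produce() can be reproduced on its own. A minimal, self-contained sketch on toy data, assuming nothing beyond scikit-learn, pandas and itertools:

import pandas as pd
from itertools import compress
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFpr, chi2

# Toy classification data with non-negative features (chi2 requires non-negative values).
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = iris.target

# Keep only the features whose chi2 p-value is below alpha (false positive rate control).
selector = SelectFpr(chi2, alpha=0.05).fit(X, y)
mask = selector.get_support(indices=False)
kept_cols = list(compress(X.columns, mask))
X_reduced = pd.DataFrame(selector.transform(X), columns=kept_cols)
print(X_reduced.shape, kept_cols)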
Example #2
class f_regressionFPRPrim(primitive):
    def __init__(self, random_state=0):
        super(f_regressionFPRPrim, self).__init__(name='f_regressionFPR')
        self.id = 29
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: Select features whose p-values are below alpha, based on an FPR (False Positive Rate) test with the F-value between label and feature, for regression tasks. The FPR test controls the total amount of false detections."
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector = SelectFpr(f_regression)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        mask = self.selector.get_support(indices=False)
        final_cols = list(compress(cols, mask))
        output['X'] = pd.DataFrame(self.selector.transform(output['X']), columns=final_cols)
        final_output = {0: output}
        return final_output
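The same pattern applies to the regression variant above. A small sketch on a synthetic regression problem (toy data, not the original pipeline):

from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectFpr, f_regression

# Synthetic regression data: 20 features, only 5 of them informative.
X, y = make_regression(n_samples=200, n_features=20, n_informative=5, noise=1.0, random_state=0)

# Keep the features whose F-test p-value against the target is below the default alpha of 0.05.
selector = SelectFpr(f_regression).fit(X, y)
print("kept feature indices:", selector.get_support(indices=True))
print("reduced shape:", selector.transform(X).shape)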
Example #3
def multisplit(skf, X, y, stepsize=1000):
    total_score = 0
    for train_index, test_index in skf:
        wl = []
        pred1 = np.matrix([])
        # Training
        for x in range(0, len(X[0]), stepsize):
            clf1 = plib.classif(X[train_index, x:x + stepsize], y[train_index])
            tmp_p = np.matrix(
                clf1.decision_function(X[train_index, x:x + stepsize]))
            if pred1.size == 0:
                pred1 = tmp_p
            else:
                pred1 = np.concatenate((pred1, tmp_p), axis=1)
            wl.append(clf1)
        #selectf = SelectKBest(f_classif, k=5).fit(pred1, y[train_index])
        selectf = SelectFpr().fit(pred1, y[train_index])
        clf3 = AdaBoostClassifier(n_estimators=100)
        #clf3 = svm.SVC(class_weight='auto')
        #clf3 = RandomForestClassifier(n_estimators=20)
        clf3.fit(selectf.transform(pred1), y[train_index])
        # Testing
        predtest = np.matrix([])
        k = 0
        for x in range(0, len(X[0]), stepsize):
            tmp_p = np.matrix(wl[k].decision_function(X[test_index,
                                                        x:x + stepsize]))
            if predtest.size == 0:
                predtest = tmp_p
            else:
                predtest = np.concatenate((predtest, tmp_p), axis=1)
            k += 1
        # Final prediction
        predfinal = clf3.predict(selectf.transform(predtest))
        print "Target     : ", y[test_index]
        print "Prediction : ", predfinal
        matchs = np.equal(predfinal, y[test_index])
        score = np.divide(np.sum(matchs), np.float64(matchs.size))
        total_score = score + total_score
    return np.divide(total_score, skf.n_folds)
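multisplit targets the pre-0.18 scikit-learn interface (iterating the CV object directly, skf.n_folds) and an external plib.classif weak learner. A compressed sketch of the same idea with the current StratifiedKFold API, using LinearSVC as a stand-in weak learner on synthetic data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import SelectFpr
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=300, n_features=3000, n_informative=20, random_state=0)
stepsize = 1000
scores = []
for train_index, test_index in StratifiedKFold(n_splits=3).split(X, y):
    train_scores, test_scores = [], []
    # Train one weak learner per block of `stepsize` features and stack its decision scores.
    for start in range(0, X.shape[1], stepsize):
        clf = LinearSVC().fit(X[train_index, start:start + stepsize], y[train_index])
        train_scores.append(clf.decision_function(X[train_index, start:start + stepsize]))
        test_scores.append(clf.decision_function(X[test_index, start:start + stepsize]))
    train_scores, test_scores = np.column_stack(train_scores), np.column_stack(test_scores)
    # Filter the stacked scores with SelectFpr (default f_classif, alpha=0.05), then boost on what survives.
    selectf = SelectFpr().fit(train_scores, y[train_index])
    meta = AdaBoostClassifier(n_estimators=100).fit(selectf.transform(train_scores), y[train_index])
    scores.append(meta.score(selectf.transform(test_scores), y[test_index]))
print("mean accuracy:", np.mean(scores))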
Example #5
def train_decisiontree_FPR(configurationname,
                           train_data,
                           score_function,
                           undersam=False,
                           oversam=False,
                           export=False):
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectFpr(score_function)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)

    fitted_ids = [i for i in result.get_support(indices=True)]

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    # if export:
    print("Exporting decision tree image...")
    export_graphviz(dtc,
                    out_file=DATAP + "/temp/trees/sltree_" +
                    configurationname + ".dot",
                    filled=True)
    transform(fitted_ids)

    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))

    return selector, dtc
Example #6
def feature_Univarselection(data, y, Alpha):
    xx = data.sort_values('pid').values
    xx_label = y.sort_values('pid')[sep].values
    select = SelectFpr(f_classif, alpha=Alpha).fit(xx, xx_label)
    # select = SelectFdr(f_classif, alpha=Alpha).fit(xx,xx_label)
    # select = SelectFwe(f_classif, alpha=Alpha).fit(xx,xx_label)
    # select = SelectKBest(chi2, k=num_feature).fit(xx,xx_label)
    # select = SelectFromModel(estimator=Lasso(), threshold=-np.inf, max_features=num_feature).fit(data,y)
    reduced_xx = select.transform(xx)
    new_data = select.inverse_transform(reduced_xx)
    new_data = pd.DataFrame(new_data,
                            index=data.sort_values('pid').index,
                            columns=data.sort_values('pid').columns)
    # idx = select.get_support()
    # print(idx)
    # new_data = np.delete(new_data,idx,1)
    return new_data
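feature_Univarselection relies on SelectFpr.inverse_transform to map the reduced matrix back onto the original column layout, filling dropped columns with zeros (the pid sorting and the sep label column are specific to the original project). A small round-trip sketch on toy data:

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFpr, f_classif

iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)
y = iris.target

select = SelectFpr(f_classif, alpha=1e-20).fit(data, y)   # very strict alpha so weaker columns are likely dropped
reduced = select.transform(data)                          # only the surviving columns
restored = pd.DataFrame(select.inverse_transform(reduced), index=data.index, columns=data.columns)

print("kept columns:", list(data.columns[select.get_support()]))
print("dropped columns are zero-filled:", bool((restored.loc[:, ~select.get_support()] == 0).all().all()))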
Example #7
def train_decisiontree_FPR(configurationname, train_data, score_function, undersam=False, oversam=False, export=False):
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectFpr(score_function)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)

    fitted_ids = [i for i in result.get_support(indices=True)]

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    if export:
        export_graphviz(dtc, out_file=DATAP + "/temp/trees/sltree_" + configurationname + ".dot", filled=True)
        transform(fitted_ids)

    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))

    return selector, dtc
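The selection, resampling and tree-fitting steps above can also be chained so that the selector and the sampler are refit on the training portion of every fold. A sketch assuming imbalanced-learn's sampler-aware Pipeline and synthetic imbalanced data (not the original project's data loading):

from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFpr, f_classif
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Imbalanced toy problem: roughly 90% / 10% class split.
X, y = make_classification(n_samples=500, n_features=30, weights=[0.9, 0.1], random_state=0)
print(Counter(y))

pipe = Pipeline([
    ("select", SelectFpr(f_classif, alpha=0.05)),       # FPR-controlled univariate selection
    ("oversample", SMOTE(random_state=42)),             # applied only to the training folds
    ("tree", DecisionTreeClassifier(random_state=0)),
])
print("mean CV accuracy:", cross_val_score(pipe, X, y, cv=5).mean())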
Example #8
class ExamDropExtractor:
    """ The Exam Drop Extractor deals with obtaining the train data and predict data for the Exam Layer. For this we use
    the following techniques: polynomial transformations, local outlier removal, zero variance removal, ANOVA F filter
    and PCA. Furthermore we resample the data such that every episode has the same likelihood of being picked. """

    BIN_STRATEGY = 'kmeans' # The method used to select the splits between the bins.

    def __init__(self, predict_season: int, predict_episode: int, train_seasons: Set[int], anova_f_significance: float,
                 pca_explain: float, max_splits: int):
        """ Constructor of the Exam Drop Extractor

        Arguments:
            predict_season (int): The season for which we make the prediction.
            predict_episode (int): The latest episode in the predict season that could be used.
            train_seasons (Set[int]): The seasons which are used as train data.
            anova_f_significance (float): Only features with a p-value lower than this value will be selected by the
                ANOVA F filter.
            pca_explain (float): PCA will select the least number of components that at least explain this amount
                of variance in the features.
            max_splits (int): How many additional bins should be used to discretize the features.
        """
        self.__predict_season = predict_season
        self.__predict_episode = predict_episode
        self.__train_seasons = train_seasons
        self.__anova_f_significance = anova_f_significance
        self.__pca_explain = pca_explain
        self.__max_splits = max_splits

    def get_train_data(self) -> Tuple[np.array, np.array, np.array]:
        """ Get the formatted and sampled train data with train weights useable for machine learning algorithms.

        Returns:
            The train input, train output and train weights in this order. The train input is a 2d array where each row
            represents a different train element. The train output is a 1d array of labels, such that the ith row of the
            train input corresponds to the ith element of the train output.
        """
        train_data = []
        for season in self.__train_seasons:
            train_data.extend(self.__get_season_data(season, sys.maxsize, True))
        train_input = np.array([ExamDropEncoder.extract_features(sample, sys.maxsize) for sample in train_data])
        train_output = np.array([1.0 if get_is_mol(sample.selected_player) else 0.0 for sample in train_data])

        num_bins = self.get_num_bins(train_input, self.__max_splits)
        self.__discretizer = KBinsDiscretizer(n_bins = num_bins, encode = "onehot-dense",
                                              strategy = ExamDropExtractor.BIN_STRATEGY)
        train_input = self.__discretizer.fit_transform(train_input)
        train_input = self.__add_answered_on_feature(train_data, train_input)
        self.__anova_f_filter = SelectFpr(f_classif, alpha = self.__anova_f_significance)
        train_input = self.__anova_f_filter.fit_transform(train_input, train_output)
        self.__pca = PCA(n_components = self.__pca_explain)
        train_input = self.__pca.fit_transform(train_input)
        return train_input, train_output, self.__get_train_weights(train_data)

    def get_predict_data(self) -> List[PredictSample]:
        """ Get all formatted predict data useable for the machine learning algorithms to do a prediction.

        Returns:
            A list of prediction samples, where each prediction sample consists of the set of players included in the
            answer and the set not included in the answer. Each prediction sample also contains the features for the
            participants included in the answer and not included in the answer.
        """
        predict_data = self.__get_season_data(self.__predict_season, self.__predict_episode, False)
        if not predict_data:
            return []

        predict_input = np.array([ExamDropEncoder.extract_features(sample, self.__predict_episode) for sample in predict_data])
        predict_input = self.__discretizer.transform(predict_input)
        predict_input = self.__add_answered_on_feature(predict_data, predict_input)
        predict_input = self.__anova_f_filter.transform(predict_input)
        predict_input = self.__pca.transform(predict_input)

        predict_samples = []
        weights = self.__get_train_weights(predict_data)
        for data, in_features, out_features, weight in zip(predict_data[::2], predict_input[1::2], predict_input[::2], weights):
            in_answer = data.answer
            out_answer = set(data.exam_episode.players).difference(data.answer)
            predict_samples.append(PredictSample(in_answer, out_answer, in_features, out_features, weight))
        return predict_samples

    @staticmethod
    def __get_season_data(season_num: int, max_episode: int, training_data: bool) -> List[TrainSample]:
        """ Get all raw answer data from a season.

        Arguments:
            season_num (int): The season from which we obtain this data.
            max_episode (int): The latest episode which can still be extracted. If this value is sys.maxsize then all
                raw answer data is obtained from this season.
            training_data (bool): True if the data is used as training data and False if the data is used as prediction
                data. The difference is that for prediction data we use a bool value as selected_player, whereas for
                training data selected_player is a Player.

        Returns:
            A list of train samples containing the raw answer data of this season: one sample per answer and player
            when used as training data, and one sample per answer and answered_on value when used as prediction data.
        """
        season = EXAM_DATA[season_num]
        drop_players = season.get_drop_mapping(DropType.EXECUTION_DROP, max_episode)
        all_answers = season.get_all_answers(set(drop_players.keys()), max_episode)
        season_data = []
        for answer in all_answers:
            exam_episode = answer.episode
            drop_episodes = drop_players[answer.player]
            drop_episodes = [episode for episode in drop_episodes if exam_episode <= episode]
            if drop_episodes:
                if training_data:
                    for player in exam_episode.players:
                        season_data.append(TrainSample(answer.player, season_num, min(drop_episodes), answer.episode,
                                                       answer.question, answer.answer, player))
                else:
                    for answer_on in [False, True]:
                        season_data.append(TrainSample(answer.player, season_num, min(drop_episodes), answer.episode,
                                                       answer.question, answer.answer, answer_on))
        return season_data

    @staticmethod
    def get_num_bins(train_input: np.array, max_splits: int) -> List[int]:
        """ Get the number of bins for all features. To determine this we use a forward stepwise information gain
        algorithm, which gives the feature with the highest entropy increase an additional bin.

        Arguments:
            train_input (np.array): All non-transformed train input.
            max_splits (int): How many times an additional bin should be added.

        Returns:
            A list of integers, which represent the number of bins for each feature.
        """
        num_bins = [2 for _ in train_input[0]]
        max_bins = [len(set(column)) for column in train_input.T]
        entropies = [ExamDropExtractor.__entropy(np.expand_dims(column, axis = 1), 2) for column in train_input.T]
        options = PriorityQueue()

        for i, data in enumerate(train_input.T):
            if max_bins[i] > 2:
                data = np.expand_dims(data, axis = 1)
                new_entropy = ExamDropExtractor.__entropy(data, 3)
                options.put((-(new_entropy - entropies[i]), i))

        for _ in range(max_splits):
            if options.empty():
                break

            entropy, i = options.get()
            num_bins[i] = num_bins[i] + 1
            entropies[i] = entropies[i] - entropy
            if num_bins[i] != max_bins[i]:
                data = np.expand_dims(train_input[:, i], axis = 1)
                new_entropy = ExamDropExtractor.__entropy(data, num_bins[i] + 1)
                options.put((-(new_entropy - entropies[i]), i))

        return num_bins

    @staticmethod
    def __entropy(data: np.array, bins: int) -> float:
        """ Compute the entropy of a KBinsDiscretizer split using a certain number of bins.

        Arguments:
            data (np.array): A list of all values for a particular feature.
            bins (int): In how many bins the values should be split.

        Returns:
            The entropy of this split.
        """
        discretizer = KBinsDiscretizer(n_bins = bins, encode = "onehot-dense", strategy = ExamDropExtractor.BIN_STRATEGY)
        data = discretizer.fit_transform(data)
        new_entropy = np.sum(data, axis = 0)
        return sc.stats.entropy(new_entropy / np.sum(new_entropy))

    @staticmethod
    def __add_answered_on_feature(samples: List[TrainSample], all_features: np.array) -> np.array:
        """ Translate the features such that answered_on is also included as feature which represents whether the player
        is included in the answer or not. This translation adds a new feature which is 1 if the player is in the answer
        and 0 if not. Also it adds all features multiplied by this answered on features.

        Arguments:
            samples (List[TrainSample]): All raw data corresponding to each row in all_features. This data is used
                to check if a player was included in the answer or not.
            all_features (np.array): All feature values so far.

        Returns:
            All new feature values translated by adding the answered_on feature to it.
        """
        new_features = []
        for sample, features in zip(samples, all_features):
            if isinstance(sample.selected_player, bool):
                answered_on = 1.0 if sample.selected_player else 0.0
            else:
                answered_on = 1.0 if sample.selected_player in sample.answer else 0.0
            features = np.append(features, answered_on * features)
            features = np.append(features, [answered_on])
            new_features.append(features)
        return np.array(new_features)

    @staticmethod
    def __get_train_weights(train_data: List[TrainSample]) -> np.array:
        """ Get the weight for the training data, which is 1 / num_answers where num_answers is the number of answers
        given by that player in that episode.

        Arguments:
            train_data (List[TrainSample]): All raw train data from which it is extracted to which episode an answer
                belongs.

        Returns:
            A 1d array of weights which pairwise corresponds to the train_data (and therefore pairwise corresponds
            with each row in the train input).
        """
        train_weights = []
        for sample in train_data:
            num_answers = sample.exam_episode.get_all_answers({sample.player}, sys.maxsize)
            train_weights.append(1 / len(num_answers))
        return np.array(train_weights)
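Stripped of the show-specific data handling, the transformation chain in get_train_data/get_predict_data is KBinsDiscretizer, an ANOVA F filter (SelectFpr) and PCA with a variance-explained target, fit on the train data and reused for the predict data. A compressed sketch of that chain on toy data, assuming plain scikit-learn:

from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFpr, f_classif
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer

X_train, y_train = make_classification(n_samples=400, n_features=10, random_state=0)
X_predict, _ = make_classification(n_samples=50, n_features=10, random_state=1)

pipe = Pipeline([
    ("bins", KBinsDiscretizer(n_bins=3, encode="onehot-dense", strategy="kmeans")),
    ("anova_f", SelectFpr(f_classif, alpha=0.05)),   # keep one-hot columns with p-value below alpha
    ("pca", PCA(n_components=0.95)),                 # least number of components explaining 95% of the variance
])
train_features = pipe.fit_transform(X_train, y_train)   # fit on the train data only
predict_features = pipe.transform(X_predict)            # reuse the fitted transformers, as get_predict_data does
print(train_features.shape, predict_features.shape)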
print "SelectPercentile -- chi2"
print X_fitted_4.scores_
print X_fitted_4.pvalues_
print X_fitted_4.get_support()
X_transformed_4 = X_fitted_4.transform(X)
print X_transformed_4.shape

#SelectFpr --- chi2
from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import chi2
X_fitted_5 = SelectFpr(chi2, alpha=2.50017968e-15).fit(X, y)
print("SelectFpr --- chi2")
print(X_fitted_5.scores_)
print(X_fitted_5.pvalues_)
print(X_fitted_5.get_support())
X_transformed_5 = X_fitted_5.transform(X)
print(X_transformed_5.shape)

#SelectFpr --- f_classif
from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import f_classif
X_fitted_6 = SelectFpr(f_classif, alpha=1.66966919e-31).fit(X, y)
print("SelectFpr --- f_classif")
print(X_fitted_6.scores_)
print(X_fitted_6.pvalues_)
print(X_fitted_6.get_support())
X_transformed_6 = X_fitted_6.transform(X)
print(X_transformed_6.shape)

# SelectFdr and SelectFwe are used in the same way as above; they only differ in the criterion applied when selecting
# features. What really determines the scores is the statistical test itself -- as the output above shows, the scores
# and p-values obtained with f_classif are identical across selectors.
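A short illustration of that point: fitted with the same score function, SelectFpr, SelectFdr and SelectFwe expose identical scores_ and pvalues_, and only the rule turning p-values into a support mask differs.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFpr, SelectFdr, SelectFwe, f_classif

X, y = load_iris(return_X_y=True)
alpha = 1e-20
fpr = SelectFpr(f_classif, alpha=alpha).fit(X, y)
fdr = SelectFdr(f_classif, alpha=alpha).fit(X, y)
fwe = SelectFwe(f_classif, alpha=alpha).fit(X, y)

print(np.allclose(fpr.pvalues_, fdr.pvalues_) and np.allclose(fpr.pvalues_, fwe.pvalues_))  # identical p-values
print(fpr.get_support())  # keeps features with p < alpha
print(fdr.get_support())  # Benjamini-Hochberg threshold on the same p-values
print(fwe.get_support())  # keeps features with p < alpha / n_features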
    selector.fit(features, labels)

    print('performing 6-fold cross-validation')
    kf = KFold(n_splits=6, shuffle=False)
    roc_scores = []
    for train_indices, test_indices in kf.split(features):
        X_train, X_test = [
            features[train_index] for train_index in train_indices
        ], [features[test_index] for test_index in test_indices]
        y_train, y_test = [
            labels[train_index] for train_index in train_indices
        ], [labels[test_index] for test_index in test_indices]

        test_model = LogisticRegression()

        X_train = selector.transform(X_train)
        X_test = selector.transform(X_test)

        test_model.fit(X_train, y_train)

        y_predicted = test_model.predict(X_test)
        predict_probabilities = test_model.predict_proba(X_test)
        positive_probabilities = [
            predict_probability[1]
            for predict_probability in predict_probabilities
        ]
        roc_scores.append(roc_auc_score(y_test, positive_probabilities))
    print "Features left: " + str(len(X_train[0])) + " out of " + str(
        len(features[0]))
    print "ROC auc score: " + str(sum(roc_scores) / float(len(roc_scores)))
    print ""