def selectionFwe(X, y, paramlist): k = paramlist['number _of_features'] fwe = SelectFpr(chi2, k=k) Xnew = fwe.fit_transform(X, y) indexarr = fwe.get_support(indices=True) scores_arr = fwe.scores_ return [Xnew, indexarr, scores_arr]
def select_with_fpr(train, test): train_data = train.drop('ID', axis=1) test_data = test.drop('ID', axis=1) train_y = train_data['TARGET'] train_X = train_data.drop('TARGET', 1) fpr = SelectFpr(alpha = 0.001) features = fpr.fit_transform(train_X, train_y) print('Fpr выбрал {} признаков.'.format(features.shape[1])) col_numbers = fpr.get_support() columns = np.delete(train_data.columns.values, train_data.shape[1] - 1, axis=0) features = [] i = 0 for i in range(len(columns)): if col_numbers[i] == True: features.append(columns[i]) new_train = train[['ID'] + features + ['TARGET']] new_train.to_csv('train_after_fpr.csv') new_test = test[['ID'] + features] new_test.to_csv('test_after_fpr.csv')
def SelectFpr_selector(data, target, sf): selector = SelectFpr(score_func=sf) data_new = selector.fit_transform(data.values, target.values.ravel()) outcome = selector.get_support(True) new_features = [] # The list of your K best features for ind in outcome: new_features.append(data.columns.values[ind]) return pd.DataFrame(data_new, columns=new_features)
#if covariates, #then include the FeatureUnion object if numcovar>0: cv_pipeline=Pipeline(vertex_covars_union+linear_est) #if not covar list #then just do: ##feature selection, ##avg/sum vertices, ##run estimator else: cv_pipeline=Pipeline(vertex_pipe+linear_est) only_vertices_train=selectvertices.fit_transform( X_train ) num_features_selected=anova_select.fit_transform( X=only_vertices_train, y=y_train ).shape[1] ####run pipeline if greater than 0 features or npdata as features#### ##if there are features select --> ##there are no features selected (can't fit) #no features selected for npdata as features so num_feat=NA if num_features_selected>0 or npdata_features==True: #fit cv_pipeline.fit(X=X_train,y=y_train) #predictions X_train_pred=cv_pipeline.predict(X_train) X_test_pred=cv_pipeline.predict(X_test) #metrics train_rsq=r2_score(y_train,X_train_pred) train_mse=mean_squared_error(y_train,X_train_pred) test_rsq=r2_score(y_test,X_test_pred)
class ExamDropExtractor: """ The Exam Drop Extractor deals with obtaining the train data and predict data for the Exam Layer. For this we use the following techniques: polynomial transformations, local outlier removal, zero variance removal, ANOVA F filter and PCA. Furthermore we resample the data such that every episode has the same likelihood of being picked. """ BIN_STRATEGY = 'kmeans' # The method used to select the splits between the bins. def __init__(self, predict_season: int, predict_episode: int, train_seasons: Set[int], anova_f_significance: float, pca_explain: float, max_splits: int): """ Constructor of the Exam Drop Extractor Arguments: predict_season (int): The season for which we make the prediction. predict_episode (int): The latest episode in the predict season that could be used. train_seasons (Set[int]): The seasons which are used as train data. anova_f_significance (float): Only features with a p-value lower than this value will be selected by the ANOVA F filter. pca_explain (float): PCA will select the least number of components that at least explain this amount of variance in the features. max_splits (int): How many additional bins should be used to discretize the features. """ self.__predict_season = predict_season self.__predict_episode = predict_episode self.__train_seasons = train_seasons self.__anova_f_significance = anova_f_significance self.__pca_explain = pca_explain self.__max_splits = max_splits def get_train_data(self) -> Tuple[np.array, np.array, np.array]: """ Get the formatted and sampled train data with train weights useable for machine learning algorithms. Returns: The train input, train output and train weights in this order. The train input is a 2d array where each row represents a different train element. The train output is 1d array of labels, such that the ith row of the train input corresponds to the ith element of the train output. """ train_data = [] for season in self.__train_seasons: train_data.extend(self.__get_season_data(season, sys.maxsize, True)) train_input = np.array([ExamDropEncoder.extract_features(sample, sys.maxsize) for sample in train_data]) train_output = np.array([1.0 if get_is_mol(sample.selected_player) else 0.0 for sample in train_data]) num_bins = self.get_num_bins(train_input, self.__max_splits) self.__discretizer = KBinsDiscretizer(n_bins = num_bins, encode = "onehot-dense", strategy = ExamDropExtractor.BIN_STRATEGY) train_input = self.__discretizer.fit_transform(train_input) train_input = self.__add_answered_on_feature(train_data, train_input) self.__anova_f_filter = SelectFpr(f_classif, alpha = self.__anova_f_significance) train_input = self.__anova_f_filter.fit_transform(train_input, train_output) self.__pca = PCA(n_components = self.__pca_explain) train_input = self.__pca.fit_transform(train_input) return train_input, train_output, self.__get_train_weights(train_data) def get_predict_data(self) -> List[PredictSample]: """ Get all formatted predict data useable for the machine learning algorithms to do a prediction. Returns: A list of prediction samples, where a prediction sample consists of a set of players included in the answer and not included in the answer. Also a prediction sample consist of the features for the participants included in the answer and not included in the answer. """ predict_data = self.__get_season_data(self.__predict_season, self.__predict_episode, False) if not predict_data: return [] predict_input = np.array([ExamDropEncoder.extract_features(sample, self.__predict_episode) for sample in predict_data]) predict_input = self.__discretizer.transform(predict_input) predict_input = self.__add_answered_on_feature(predict_data, predict_input) predict_input = self.__anova_f_filter.transform(predict_input) predict_input = self.__pca.transform(predict_input) predict_samples = [] weights = self.__get_train_weights(predict_data) for data, in_features, out_features, weight in zip(predict_data[::2], predict_input[1::2], predict_input[::2], weights): in_answer = data.answer out_answer = set(data.exam_episode.players).difference(data.answer) predict_samples.append(PredictSample(in_answer, out_answer, in_features, out_features, weight)) return predict_samples @staticmethod def __get_season_data(season_num: int, max_episode: int, training_data: bool) -> List[TrainSample]: """ Get all raw answer data from a season. Arguments: season_num (int): The season from which we obtain this data. max_episode (int): The latest episode which can still be extracted. If this value is sys.maxsize then all raw answer data is obtained from this season. training_data (bool): True if the data is used as training data and false if the data is used as prediction data. The difference is that in case it is used for predictions we use a bool value as selected_player and otherwise selected_player is a Player. Returns: A list of prediction samples, where a prediction sample consists of a set of players included in the answer and not included in the answer. Also a prediction sample consist of the features for the participants included in the answer and not included in the answer. """ season = EXAM_DATA[season_num] drop_players = season.get_drop_mapping(DropType.EXECUTION_DROP, max_episode) all_answers = season.get_all_answers(set(drop_players.keys()), max_episode) season_data = [] for answer in all_answers: exam_episode = answer.episode drop_episodes = drop_players[answer.player] drop_episodes = [episode for episode in drop_episodes if exam_episode <= episode] if drop_episodes: if training_data: for player in exam_episode.players: season_data.append(TrainSample(answer.player, season_num, min(drop_episodes), answer.episode, answer.question, answer.answer, player)) else: for answer_on in [False, True]: season_data.append(TrainSample(answer.player, season_num, min(drop_episodes), answer.episode, answer.question, answer.answer, answer_on)) return season_data @staticmethod def get_num_bins(train_input: np.array, max_splits: int) -> List[int]: """ Get the number of bins for all features. To determine this we use a forward stepwise information gain algorithm, which gives the feature with the highest entropy increase an additional bin. Arguments: train_input (np.array): All non-transformed train input. max_splits (int): How many times an additional bin should be added. Returns: A list of integers, which represent the number of bins for each feature. """ num_bins = [2 for _ in train_input[0]] max_bins = [len(set(column)) for column in train_input.T] entropies = [ExamDropExtractor.__entropy(np.expand_dims(column, axis = 1), 2) for column in train_input.T] options = PriorityQueue() for i, data in enumerate(train_input.T): if max_bins[i] > 2: data = np.expand_dims(data, axis = 1) new_entropy = ExamDropExtractor.__entropy(data, 3) options.put((-(new_entropy - entropies[i]), i)) for _ in range(max_splits): if options.empty(): break entropy, i = options.get() num_bins[i] = num_bins[i] + 1 entropies[i] = entropies[i] - entropy if num_bins[i] != max_bins[i]: data = np.expand_dims(train_input[:, i], axis = 1) new_entropy = ExamDropExtractor.__entropy(data, num_bins[i] + 1) options.put((-(new_entropy - entropies[i]), i)) return num_bins @staticmethod def __entropy(data: np.array, bins: int) -> float: """ Compute the entropy of a KBinsDiscretizer split using a certain number of bins. Arguments: data (np.array): A list of all values for a particular feature. bins (int): In how many bins the values should be split. Returns: The entropy of this split. """ discretizer = KBinsDiscretizer(n_bins = bins, encode = "onehot-dense", strategy = ExamDropExtractor.BIN_STRATEGY) data = discretizer.fit_transform(data) new_entropy = np.sum(data, axis = 0) return sc.stats.entropy(new_entropy / np.sum(new_entropy)) @staticmethod def __add_answered_on_feature(samples: List[TrainSample], all_features: np.array) -> np.array: """ Translate the features such that answered_on is also included as feature which represents whether the player is included in the answer or not. This translation adds a new feature which is 1 if the player is in the answer and 0 if not. Also it adds all features multiplied by this answered on features. Arguments: samples (List[TrainSample]): All raw data corresponding to each row in all_features. This data is used to check if a player was included in the answer or not. all_features (np.array): All feature values so far. Returns: All new feature values translated by adding the answered_on feature to it. """ new_features = [] for sample, features in zip(samples, all_features): if isinstance(sample.selected_player, bool): answered_on = 1.0 if sample.selected_player else 0.0 else: answered_on = 1.0 if sample.selected_player in sample.answer else 0.0 features = np.append(features, answered_on * features) features = np.append(features, [answered_on]) new_features.append(features) return np.array(new_features) @staticmethod def __get_train_weights(train_data: List[TrainSample]) -> np.array: """ Get the weight for the training data, which is 1 / num_answers where num_answers is the number of answers given by that player in that episode. Arguments: train_data (List[TrainSample]): All raw train data from which it is extracted to which episode an answer belongs. Returns: An 1d array of weights which pairwise corresponds to the train_data (and therefore pairwise corresponds with each row in the train input). """ train_weights = [] for sample in train_data: num_answers = sample.exam_episode.get_all_answers({sample.player}, sys.maxsize) train_weights.append(1 / len(num_answers)) return np.array(train_weights)
def feature_select(x, y): vt = SelectFpr(f_regression, alpha=0.05) samples_selected = vt.fit_transform(x, y) get_index_selected = vt.get_support(indices=True) return samples_selected, get_index_selected
T2 = np.hstack((XTEST, ytest[:, np.newaxis])) T = np.vstack((T, T2)) print 'Joined data:' print T.shape print len(F) df = pd.DataFrame(data=T, columns=F) # df.dropna(inplace=True) print XTRAIN.shape print ytrain.shape print len(aFeatNames_tr) fs = SelectFpr(f_classif, alpha=0.05) tmp = fs.fit_transform(XTRAIN, ytrain) feat_importance = fs.pvalues_ ind = np.argsort(feat_importance) # ind = ind[::-1] print 'Selected features:', sum(feat_importance <= 0.05) print '\n\n #### Sorted by importance' for i in range(0, len(ind)): print '{0:30}: {1}'.format(aFeatNames_tr[ind[i]], feat_importance[ind[i]]) # plt.figure() # plt.stem(feat_importance) # plt.xticks(range(0, len(feat_importance)), aFeatNames) # plt.grid(True) # dummy2, labels = plt.xticks() # plt.setp(labels, rotation=90)