def drop_low_var_columns(train_X, test_df, threshold):
    """Drop columns whose variance in train_X falls below threshold, from both frames in place."""
    select_features = feature_selection.VarianceThreshold(threshold=threshold)
    select_features.fit(train_X)
    cols_to_drop = train_X.columns[~select_features.get_support()]
    train_X.drop(columns=cols_to_drop, inplace=True)
    test_df.drop(columns=cols_to_drop, inplace=True)
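# A minimal usage sketch for drop_low_var_columns on toy data; the DataFrames,
# column names, and threshold below are illustrative assumptions, not part of
# the original snippet.
import pandas as pd
from sklearn import feature_selection

train_X = pd.DataFrame({'a': [1, 1, 1, 1], 'b': [0, 1, 0, 1]})
test_df = pd.DataFrame({'a': [1, 1], 'b': [1, 0]})
drop_low_var_columns(train_X, test_df, threshold=0.0)
# Column 'a' is constant (variance 0, not above the threshold), so it is
# dropped from both frames; 'b' survives.
print(train_X.columns.tolist())  # ['b']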
def feature_select_low_var(dataframe, vars_name=None, threshold_=1, ret_normal=False):
    '''Feature selection using the low-variance method, applied to the
    normalized features in dataframe. Features with a variance below the
    threshold are eliminated.

    Parameters
    ----------
    dataframe : features-only dataframe (response variable excluded)
    vars_name : list/array of strings with each feature name; inferred from
        the dataframe columns when omitted
    threshold_ : default 1. Beware: the data is normalized first, so
        threshold_ = 1 is tantamount to one standard deviation.
    ret_normal : default False. If True, the returned dataframe is
        normalized; otherwise the original data is returned.
    '''
    # Obtain the column names automatically unless provided:
    if vars_name is None:
        names = dataframe.columns
    else:
        names = pd.Index(vars_name)

    # Take out non-numeric variables (judged by the type of the first row),
    # keeping the names aligned with the remaining columns:
    type_idx = [not isinstance(value, str) for value in dataframe.iloc[0, :]]
    df = dataframe.iloc[:, type_idx]
    names = names[type_idx]

    # Normalize the numeric variables to remove scale effects on variance:
    df_t = preprocessing.StandardScaler().fit_transform(df)

    # Low-variance variable selection (a single fit is enough):
    sel_fit = fs.VarianceThreshold(threshold=threshold_).fit(df_t)
    selection = sel_fit.transform(df_t)

    # Retrieve the selected variable names:
    selec_vars = names[sel_fit.get_support()]

    if ret_normal:
        df_sel = pd.DataFrame(selection, columns=selec_vars)
        print('Returned variables are normalized')
    else:
        df_sel = pd.DataFrame(dataframe[selec_vars], columns=selec_vars)
        print('Returned variables are original')
    return df_sel
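# A minimal usage sketch for feature_select_low_var; the imports and the toy
# dataframe are illustrative assumptions. Note that after StandardScaler every
# non-constant column has unit variance, so a threshold_ below 1.0 keeps all
# varying columns while constant columns (variance 0) are dropped.
import pandas as pd
from sklearn import preprocessing, feature_selection as fs

toy = pd.DataFrame({'x1': [1.0, 2.0, 3.0, 4.0],
                    'x2': [10.0, 10.0, 10.0, 10.0]})  # constant column
selected = feature_select_low_var(toy, threshold_=0.5)
print(selected.columns.tolist())  # ['x1']: the constant column is dropped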
def get_lowvar_cols(df, threshold):
    """Return the low-variance columns of df, minus a hard-coded keep-list."""
    selector = feature_selection.VarianceThreshold(threshold=threshold)
    selector.fit(df)
    cols = df.columns[~selector.get_support()]
    # Keep these fields regardless of their variance:
    cols = [col for col in cols if col not in ["Field9", "Field8", "Field11", "Field12"]]
    return cols
def variance_threshold(arr0, threshold):
    matrix = np.array(arr0)
    temp = feature_selection.VarianceThreshold(threshold=threshold).fit(matrix)
    # Per-feature variances, computed column-wise:
    scores = [np.var(el) for el in matrix.T]
    indx = temp.get_support().tolist()
    # result = data_utility.retrieve_nan_index(temp.transform(matrix).tolist(), index)
    result = temp.transform(matrix).tolist()
    return scores, indx, result
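# A minimal usage sketch for variance_threshold; the input matrix is an
# illustrative assumption.
import numpy as np
from sklearn import feature_selection

scores, indx, result = variance_threshold([[0, 2.0], [0, 0.5], [0, 1.1]], threshold=0.1)
print(scores)  # per-column variances; column 0 is constant
print(indx)    # [False, True]: only the second column passes the threshold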
def stand():
    """Variance-threshold demo on a small matrix.

    :return: None
    """
    value = [[10, 30, 80], [0, 0.5, 1], [1, 1, 2]]
    std = feature_selection.VarianceThreshold(200)
    # Only the third column's variance exceeds 200, so only it is kept.
    print(std.fit_transform(value))
def removeZeroVarianceFeatures(X):
    '''Function to remove features which have zero variance.

    Parameters:
        X (np.array): Features for the dataset

    Return:
        X (np.array): Modified array of features with no zero-variance features
    '''
    varianceSelector = feature_selection.VarianceThreshold()
    X = varianceSelector.fit_transform(X)
    return X
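# A minimal usage sketch for removeZeroVarianceFeatures; the array below is an
# illustrative assumption.
import numpy as np
from sklearn import feature_selection

X = np.array([[1, 0], [1, 2], [1, 4]])  # first column is constant
print(removeZeroVarianceFeatures(X))    # only the varying second column remains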
def get_fs_model(model, method, train, target=None, cv=None):
    """Wrap the given model with the specified feature-selection method and
    train the resulting structure."""
    if method == "RFE":
        # RFE is supervised, so a target is required here.
        model = fs_scikit.RFE(model, n_features_to_select=2, step=5)
        return model.fit(train, target)
    elif method == "RFECV":
        model = fs_scikit.RFECV(model, step=3, cv=cv)
        return model.fit(train, target)
    elif method == "linearSVC":
        sel = SelectFromModel(LinearSVC(penalty='l1', dual=False))
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "fromModel":
        fm = fs_scikit.SelectFromModel(model)
        if target is not None:
            fm.fit(train, target)
        else:
            fm.fit(train)
        model = Pipeline([('feature_selection', fm), ('data_mining', model)])
    # elif method == "Anova":
    #     # ANOVA SVM-C
    #     anova_filter = fs_scikit.SelectKBest(f_regression, k=5)
    #     model = Pipeline([
    #         ('feature_selection', anova_filter),
    #         ('data_mining', model)
    #     ])
    elif method == "VarianceThreshold":
        sel = fs_scikit.VarianceThreshold(threshold=(.8 * (1 - .8)))
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectPercentile":
        sel = fs_scikit.SelectPercentile(fs_scikit.f_classif, percentile=30)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFpr":
        sel = fs_scikit.SelectFpr(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFdr":
        sel = fs_scikit.SelectFdr(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFwe":
        sel = fs_scikit.SelectFwe(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "ch2":
        sel = fs_scikit.SelectKBest(fs_scikit.chi2, k=2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    else:
        print("Feature selection method was not found: " + method)
        sys.exit(1)
    return model
def near_zero_var_df_sklearn(
        self, df: PandasDataFrame, excludes: List, file_name: str,
        thresh_variance: float = 0.05, to_search: bool = True
) -> [PandasDataFrame, CollectionsOrderedDict]:
    """Find and optionally remove the selected near-zero-variance features (Scikit algorithm).

    Feature selector that removes all low-variance features. This feature
    selection algorithm looks only at the features (X), not the desired
    outputs (y), and can thus be used for unsupervised learning.

    :param df: the features dataframe.
    :param excludes: the names of excluded features.
    :param file_name: the name of the summary output file.
    :param thresh_variance: features with a training-set variance lower than
        this threshold will be removed (default 0.05).
    :param to_search: to search, or to use the saved configuration.
    :return: the inputted dataframe, excluding the features that were selected
        for removal.
    """
    self.__logger.debug(
        "Remove features with near-zero-variance (if applicable), using Scikit algorithm.")
    df_excludes = df[excludes]
    excludes = set(excludes)
    matches = []
    indices = OrderedDict()
    summaries = OrderedDict()

    # Map column positions to labels ('.values' is an attribute, not a method):
    for label in df.columns.values:
        indices[df.columns.get_loc(label)] = label

    # Search:
    if to_search is True:
        selector = feature_selection.VarianceThreshold(thresh_variance)
        selector.fit(df)  # the selector must be fitted before get_support()
        # get_support(indices=True) returns the columns that are *kept*; the
        # near-zero-variance matches to remove are the complement.
        kept_indices = set(selector.get_support(indices=True))
        matches_indices = [index for index in indices if index not in kept_indices]
        matches_labels = [indices[index] for index in matches_indices]
        for match in matches_labels:
            if match not in excludes:
                matches += [match]

    # Delete:
    df = self.__remove(
        df, {'NZV': list(matches)}, to_search,
        os.path.join(self.__output_path, file_name + ".ini"))
    for name in excludes:
        df[name] = df_excludes[name]
    if any(np.isnan(df.index)):
        df = df.reset_index(drop=True)

    # Summaries:
    if to_search is True:
        summaries["Features Matches"] = matches
    return df, summaries
def remove_low_variance(data):
    # https://scikit-learn.org/stable/modules/feature_selection.html
    # X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
    # Drop (nearly) constant boolean features: the threshold p * (1 - p) with
    # p = 0.95 removes features that take the same value in more than ~95% of
    # samples.
    sel = feature_selection.VarianceThreshold(threshold=(1 - 0.05) * 0.05)
    new_dataset = sel.fit_transform(data)
    selected_feature_indices = sel.get_support(indices=True)
    print(len(selected_feature_indices))
    print(selected_feature_indices)
    np.save("list of chosen features_05", selected_feature_indices)
    np.save("low_variance_dataset_alpha05_new", new_dataset)
    return new_dataset
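# A short sketch of the Bernoulli-variance reasoning behind the threshold
# above: a boolean feature that equals 1 in a fraction p of samples has
# variance p * (1 - p), so p = 0.95 gives ~0.0475 as the cut-off.
p = 0.95
print(p * (1 - p))  # ~0.0475, the same value as (1 - 0.05) * 0.05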
def lars():
    behavior_data, conn_data = pu.load_data_full_subjects()
    conn_data = conn_data.astype(float)  # astype returns a copy, so reassign

    categorical_variables = ['smoking', 'deanxit_antidepressants',
                             'rivotril_antianxiety', 'sex']
    categorical_data = behavior_data[categorical_variables]
    dummy_coded_categorical = pu.dummy_code_binary(categorical_data)
    covariate_data = pd.concat([behavior_data['age'], dummy_coded_categorical], axis=1)

    ml_data = pd.concat([conn_data, covariate_data], axis=1)
    target = behavior_data['distress_TQ'].values.astype(float)

    feature_names = list(ml_data)
    continuous_features = [f for f in feature_names if 'categorical' not in f]
    continuous_indices = [ml_data.columns.get_loc(cont) for cont in continuous_features]

    categorical_features = [f for f in feature_names if 'categorical' in f]
    categorical_indices = [ml_data.columns.get_loc(cat) for cat in categorical_features]

    ml_continuous = ml_data.values[:, continuous_indices]
    ml_categorical = ml_data.values[:, categorical_indices]

    # Standardization for continuous data
    preproc = preprocessing.StandardScaler().fit(ml_continuous)
    ml_z = preproc.transform(ml_continuous)

    # Variance threshold for categorical data (drops constant dummies)
    varthresh = feature_selection.VarianceThreshold(threshold=0).fit(ml_categorical)
    ml_v = varthresh.transform(ml_categorical)

    ml_preprocessed = np.hstack((ml_z, ml_v))

    # Feature selection with extra trees
    clf = ensemble.ExtraTreesRegressor()
    model = feature_selection.SelectFromModel(clf, threshold="2*mean")

    # Transform the data with the feature-selection model
    ml_cleaned = model.fit_transform(ml_preprocessed, target)
    # NB: these indices refer to the stacked continuous+categorical column
    # order, which must match feature_names for the labels to be correct.
    feature_indices = model.get_support(indices=True)
    cleaned_features = [feature_names[i] for i in feature_indices]

    lars_classifier = linear_model.LarsCV(cv=3, normalize=False, fit_intercept=False)
    lars_classifier.fit(ml_cleaned, target)
    predicted = lars_classifier.predict(ml_cleaned)

    r2 = lars_classifier.score(ml_cleaned, target)
    exp_var = metrics.explained_variance_score(target, predicted)
    max_err = metrics.max_error(target, predicted)
    mae = metrics.mean_absolute_error(target, predicted)
    mse = metrics.mean_squared_error(target, predicted)
    print(r2)
def variance_threshold(data, alpha=0.05):
    """A wrapper of scikit-learn VarianceThreshold."""
    X_train, X_test, y_train, y_test = data
    # Z-scores.
    X_train_std, X_test_std = utils.train_test_z_scores(X_train, X_test)
    selector = feature_selection.VarianceThreshold(threshold=alpha)
    # NB: Cannot filter variance from standardized data, so fit on the raw data.
    selector.fit(X_train, y_train)
    support = _check_support(selector.get_support(indices=True), X_train_std)
    return _check_feature_subset(X_train_std, X_test_std, support)
def key_features(X_train, y_train, sub, variance_test=True):
    print('Features before reduction: ' + str(len(X_train[0])))
    if variance_test:
        # Remove features with low variance.
        sel = feature_selection.VarianceThreshold(threshold=(.8 * (1 - .8)))
        X_train = sel.fit_transform(X_train)
        sub = sel.transform(sub)
        print('Features after variance reduction: ' + str(len(X_train[0])))
    # 'auto' was renamed 'balanced' in later scikit-learn releases.
    estimator = linear_model.SGDClassifier(n_jobs=-1, class_weight='balanced')
    selector = feature_selection.RFECV(estimator, step=1, cv=5)
    features = selector.fit_transform(X_train, y_train)
    submission = selector.transform(sub)
    print('Features after recursive elimination: ' + str(len(features[0])))
    return (features, submission)
def clean_features(data, header, **kwargs):
    # extract parameters
    min_feature_variance = kwargs.get('min_feature_variance', .8 * (1 - .8))

    # remove features with variance below the threshold
    feature_selector = feature_selection.VarianceThreshold(
        threshold=min_feature_variance)
    reduced_data = feature_selector.fit_transform(data)

    # create a mask of the selected features
    mask = feature_selector.get_support(indices=True)

    # select the same indexes from the header
    reduced_header = np.take(header, mask)
    return reduced_data, reduced_header
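# A minimal usage sketch for clean_features; the data and header below are
# illustrative assumptions.
import numpy as np
from sklearn import feature_selection

data = np.array([[0, 1.0], [0, 3.0], [0, 5.0]])  # first column is constant
header = np.array(['const', 'signal'])
reduced_data, reduced_header = clean_features(data, header)
print(reduced_header)  # ['signal']: the constant column falls below the default threshold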
def learning(X, y, estimator, search_params):
    scoring = dict(roc_auc='roc_auc',
                   pr_auc=metrics.make_scorer(pr_auc_score),
                   accuracy='accuracy',
                   balanced_accuracy='balanced_accuracy',
                   precision='precision',
                   recall='recall',
                   f1='f1',
                   mcc=metrics.make_scorer(metrics.matthews_corrcoef))
    # NB: Pipeline must be imblearn's Pipeline for the sampler steps to work.
    pipe = Pipeline([
        ('var', feature_selection.VarianceThreshold(threshold=0)),  # remove constant features
        ('bal', None),  # balance the training data (see search_params['bal'] below)
        ('pre', None),  # scale (and center) the data (see search_params['pre'] below)
        ('clf', estimator)
    ])
    groups = X.group.tolist()
    X = X.drop(['group'], axis=1)
    search_params['bal'] = [
        None,
        RandomUnderSampler(sampling_strategy='majority', random_state=42),
        RandomOverSampler(sampling_strategy='minority', random_state=42)
    ]
    search_params['pre'] = [
        None,
        preprocessing.MinMaxScaler(),
        preprocessing.StandardScaler()
    ]
    search = model_selection.RandomizedSearchCV(pipe, search_params,
                                                cv=release_split(X, y, groups),
                                                scoring=scoring,
                                                refit='pr_auc',
                                                verbose=0)
    search.fit(X, y)
    return search.cv_results_, search.best_index_
def fit_transform(self, data):
    """Fit and transform using feature filtering.

    Fit and transform using several kinds of feature-filtering methods to
    select features in data.

    :param data: Dataframe. The Pandas dataframe to be converted.
    :return: Dataframe. The converted dataframe after feature filtering.
    """
    # Removing features with low variance.
    threshold = 0.0
    var_thre = fe.VarianceThreshold(threshold=threshold)
    result = var_thre.fit_transform(data[data.columns.difference(
        [self.target_column])])
    feature_select = data.columns.difference(
        [self.target_column])[var_thre.get_support()]
    result = pd.DataFrame(columns=feature_select, data=result)
    result[self.target_column] = data[self.target_column]
    # Store converter.
    self.variance_threshold = var_thre

    # Univariate feature selection, using univariate statistical tests.
    # NB: mode='fwe' needs a score function that returns p-values, so
    # f_regression is used here (mutual information provides no p-values).
    data = result
    univar_select = fe.GenericUnivariateSelect(
        score_func=fe.f_regression, mode='fwe', param=0.05)

    # Check whether it's regression or classification.
    # If classification, skip the univariate step.
    if len(data[self.target_column].value_counts()) <= 2:
        return result

    # If regression.
    result = univar_select.fit_transform(
        data[data.columns.difference([self.target_column])],
        np.asarray(data[self.target_column]))
    feature_select = data.columns.difference(
        [self.target_column])[univar_select.get_support()]
    result = pd.DataFrame(columns=feature_select, data=result)
    result[self.target_column] = data[self.target_column]
    # Store converter.
    self.univar_select = univar_select
    return result
def get_low_var_cols(df, threshold):
    """Analyse a Pandas DataFrame, extract the numeric columns and return a
    list of those which have a variance below the threshold.

    Args:
        :param df: Pandas DataFrame.
        :param threshold: Variance threshold.

    Returns:
        List of columns with variance below the threshold.

    Example:
        low_var_cols = get_low_var_cols(df, 0.01)
    """
    df = df.select_dtypes(['number'])
    selector = feature_selection.VarianceThreshold(threshold=threshold)
    selector.fit(df)
    return df.columns[~selector.get_support()]
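# A minimal usage sketch for get_low_var_cols; the frame below is an
# illustrative assumption (the string column is ignored by select_dtypes).
import pandas as pd
from sklearn import feature_selection

df = pd.DataFrame({'flat': [5.0, 5.0, 5.0],
                   'steep': [0.0, 10.0, 20.0],
                   'label': ['a', 'b', 'c']})
print(get_low_var_cols(df, 0.01))  # Index(['flat'], dtype='object')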
def fitness(data, metric='euclidean', k=10, seed=None):
    if metric == 'variance':
        sel = fs.VarianceThreshold()
        sel.fit(data)
        return np.average(np.array(sel.variances_))
    # A missing or negative seed means "no fixed random state"
    # (comparing None < 0 directly would raise a TypeError).
    if seed is None or seed < 0:
        random_seed = None
    else:
        random_seed = seed
    km = KMeans(n_clusters=k, random_state=random_seed, n_jobs=-1)
    labels = km.fit_predict(data)
    if metric == 'euclidean':
        return silhouette_score(data, labels)
    elif metric == 'cosine':
        return silhouette_score(data, labels, metric='cosine')
    else:
        return km.inertia_
def get_features(X, y, fsm):
    if fsm == '1':
        # SelectKBest
        kBest = math.ceil(len(X[0]) / 2)
        feature_scores = f_selection.SelectKBest(chi2, k=kBest).fit_transform(X, y)
        return feature_scores
    elif fsm == '2':
        # VarianceThreshold (vThreshold is expected to be defined at module level)
        feature_scores = f_selection.VarianceThreshold(
            threshold=vThreshold).fit_transform(X)
        return feature_scores
    elif fsm == '3':
        # SelectFromModel
        clf = ExtraTreesClassifier(random_state=200)
        clf = clf.fit(X, y)
        model = f_selection.SelectFromModel(clf).fit(X, y)
        feature_scores = model.transform(X)
        return feature_scores
    else:
        raise ValueError("invalid fsm")
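# A minimal usage sketch for get_features in VarianceThreshold mode (fsm='2').
# It assumes vThreshold and the f_selection alias live in the same module as
# get_features; the data below is an illustrative assumption.
import numpy as np
import sklearn.feature_selection as f_selection

vThreshold = 0.1  # assumed module-level threshold read by get_features
X = np.array([[1.0, 0.0], [1.0, 2.0], [1.0, 4.0]])
y = np.array([0, 1, 1])
print(get_features(X, y, fsm='2'))  # keeps only the varying second column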
def get_model(self, resume=False):
    if not resume:
        if self.method == 'variance':
            # Unsupervised
            p = .5
            selector = feature_selection.VarianceThreshold(
                threshold=(p * (1 - p)))
        elif self.method == 'rfe':
            estimator = LogisticRegression()
            selector = feature_selection.RFE(
                estimator, n_features_to_select=self.feat_limit, step=1, verbose=0)
        elif self.method == 'forward':
            estimator = ExtraTreesClassifier(n_estimators=100)
            selector = SelectFromModel(estimator)
        elif self.method == 'seq_bwd':
            estimator = LogisticRegression(solver='lbfgs')
            selector = SFS(estimator, k_features=self.feat_limit, forward=False,
                           floating=False, scoring='roc_auc', cv=4, n_jobs=-1)
        elif self.method == 'seq_fwd':
            estimator = LogisticRegression(solver='lbfgs')
            selector = SFS(estimator, k_features=self.feat_limit, forward=True,
                           floating=False, scoring='roc_auc', cv=4, n_jobs=-1)
    else:
        selector = joblib.load(self.model_save_path)
    if self.verbose > 2:
        print(selector)
    return selector
def train(self, features):
    # Setup:
    start_time = time.time()

    # Check feature set:
    assert (np.isfinite(features).all())

    # Normalizer:
    if self.normalize:
        standardizer = preprocessing.StandardScaler()
        features = standardizer.fit_transform(features)
        self.normalizer = standardizer

    # Option 1 (Random Projection):
    if self.reducer_type == Reducers.random_projection:
        transformer = random_projection.GaussianRandomProjection()
        transformer.fit(features)
        self.reducer = transformer

    # Option 2 (Feature Selection):
    if self.reducer_type == Reducers.feature_selection:
        threshold = (self.explained_variance) * (1 - self.explained_variance)
        selector = feature_selection.VarianceThreshold(threshold=threshold)
        selector.fit(features)
        self.reducer = selector

    # Option 3 (PCA):
    if self.reducer_type == Reducers.pca:
        pca = decomposition.PCA(n_components=self.explained_variance, svd_solver="full")
        pca.fit(features)
        self.reducer = pca

    # Calculate elapsed time:
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("Training preprocessor took %.2f seconds" % elapsed_time)
def pre_process(X, X_pred, mode='full'):
    preprocessings = {
        'standards': preprocessing.StandardScaler(),
        'minmaxs': preprocessing.MinMaxScaler(),
        'robusts': preprocessing.RobustScaler(),
        'PCA': PCA(),
        # NB: despite the key, this is a Normalizer, not a PowerTransformer.
        'PowerTransformer': preprocessing.Normalizer(),
        'variance_threshold': feature_selection.VarianceThreshold(threshold=0.5),
    }
    if mode == 'full':
        pipe_preprocessing = make_pipeline(
            # preprocessings['variance_threshold'],
            preprocessings['standards'],
            # preprocessings['minmaxs'],
            preprocessings['robusts'],
            # preprocessings['PCA'],
            preprocessings['PowerTransformer'],
        )
    elif mode == 'min_max_scale':
        pipe_preprocessing = make_pipeline(
            preprocessings['minmaxs'],
            preprocessings['PCA'],
        )
    full = np.concatenate((X, X_pred), axis=0)
    pipe_preprocessing.fit(full)
    X = pipe_preprocessing.transform(X)
    X_pred = pipe_preprocessing.transform(X_pred)
    return X, X_pred
def feature_selection_with_covariates(x_train, x_test, y_train, continuous_indices,
                                      categorical_indices, feature_names):
    # Split data for continuous, categorical preprocessing
    x_train_cont, x_test_cont = x_train[:, continuous_indices], x_test[:, continuous_indices]
    x_train_cat, x_test_cat = x_train[:, categorical_indices], x_test[:, categorical_indices]

    # Standardization for continuous data
    preproc = preprocessing.StandardScaler().fit(x_train_cont)
    x_train_z = preproc.transform(x_train_cont)
    x_test_z = preproc.transform(x_test_cont)

    # Variance threshold for categorical data
    varthresh = feature_selection.VarianceThreshold(threshold=0).fit(x_train_cat)
    x_train_v = varthresh.transform(x_train_cat)
    x_test_v = varthresh.transform(x_test_cat)

    x_train_data = np.hstack((x_train_z, x_train_v))
    x_test_data = np.hstack((x_test_z, x_test_v))

    # Feature selection with extra trees ('seed' is a module-level constant)
    extra_tree_fs = ensemble.ExtraTreesClassifier(random_state=seed)
    feature_model = feature_selection.SelectFromModel(extra_tree_fs, threshold="2*mean")

    # Transform train and test data with the feature-selection model
    x_train_feature_selected = feature_model.fit_transform(x_train_data, y_train)
    x_test_feature_selected = feature_model.transform(x_test_data)

    # NB: the indices below refer to the stacked continuous+categorical order,
    # so feature_names must be ordered accordingly for the labels to align.
    feature_indices = feature_model.get_support(indices=True)
    cleaned_features = [feature_names[i] for i in feature_indices]
    return x_train_feature_selected, x_test_feature_selected, cleaned_features
def clean_features(data, name):
    df = data[name]
    X_columns = df.columns

    # remove stellar classes
    flt = ~(("nbg" == X_columns) |
            X_columns.str.endswith("_id") |
            X_columns.str.contains("_scls_") |
            X_columns.str.endswith("AndersonDarling") |
            X_columns.str.endswith("StetsonJ") |
            X_columns.str.endswith("StetsonK"))
    X_columns = X_columns[flt]

    # remove signatures
    X_columns = X_columns[~X_columns.str.startswith("Signature_")]

    # columns with nan and null
    with_nulls = set()
    for df in data.values():
        for c in X_columns:
            if df[c].isnull().any():
                with_nulls.add(c)
    print("Removing {} because null".format(list(with_nulls)))
    X_columns = X_columns[~X_columns.isin(with_nulls)]

    # low variance
    df = pd.concat(data.values())
    y = df["nbg"].values
    vt = fs.VarianceThreshold()
    vt.fit(df[X_columns].values, y)
    print("Removing {} because low variance".format(
        list(X_columns[~vt.get_support()])))
    X_columns = X_columns[vt.get_support()]
    return X_columns
def fitness(self):
    data_projected = self.data[self.attributes]
    if self.metric == 'variance':
        sel = fs.VarianceThreshold()
        sel.fit(data_projected)
        return np.average(np.array(
            sel.variances_)) - np.log(1 + self.violations)
    km = KMeans(n_clusters=self.k, random_state=self.random_seed, n_jobs=-1)
    labels = km.fit_predict(data_projected)
    if self.metric == 'euclidean':
        return silhouette_score(data_projected, labels) - np.log(1 + self.violations)
    elif self.metric == 'cosine':
        return silhouette_score(
            data_projected, labels,
            metric=self.metric) - np.log(1 + self.violations)
    else:
        inertia = km.inertia_
        order = self.getOrder(inertia)
        return inertia + self.violations * order * 10**2
import math

import numpy as np
import numpy.random as nr
import pandas as pd
import scipy.stats as ss
import sklearn.feature_selection as fs
import sklearn.model_selection as ms
%matplotlib inline

Features = np.array(pd.read_csv('Credit_Features.csv'))
Labels = np.array(pd.read_csv('Credit_Labels.csv'))
print(Features.shape)
print(Labels.shape)

## -->> Eliminate low variance features
# VarianceThreshold function
## Define the variance threshold and fit the threshold to the feature array.
sel = fs.VarianceThreshold(threshold=(.8 * (1 - .8)))
Features_reduced = sel.fit_transform(Features)

## Print the support and shape for the transformed features
print(sel.get_support())
print(Features_reduced.shape)

## -->> Select k best features
# RFECV function
## Reshape the Label array
Labels = Labels.reshape(Labels.shape[0],)

## Set folds for nested cross validation
nr.seed(988)
feature_folds = ms.KFold(n_splits=10, shuffle=True)
def get_zero_variance_filter(X_train):
    tmp = feature_selection.VarianceThreshold()
    return tmp.fit(X_train)  # fit() needs the training data; the original passed nothing
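# A minimal usage sketch for the corrected get_zero_variance_filter; the array
# is an illustrative assumption.
import numpy as np
from sklearn import feature_selection

fitted = get_zero_variance_filter(np.array([[1, 0], [1, 2], [1, 4]]))
print(fitted.get_support())  # [False  True]: the constant first column is flagged out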
# Feature transformations
for i in range(len(names_without_sex)):
    data[names_without_sex[i] + '_log1p'] = data[names_without_sex[i]].map(np.log1p)
    data[names_without_sex[i] + '_sqrt'] = data[names_without_sex[i]].map(np.sqrt)
    for j in range(i + 1, len(names_without_sex)):
        data[names_without_sex[i] + '*' + names_without_sex[j]] = \
            data[names_without_sex[i]] * data[names_without_sex[j]]
del i, j

# Feature selection
names = data.columns.drop(['id', '血糖'])

# Split into training and test sets ('血糖' is the blood-glucose target column)
train_xs = data.loc[data['血糖'] != 'unknown', names]
train_ys = data.loc[data['血糖'] != 'unknown', '血糖']
test_x = data.loc[data['血糖'] == 'unknown', names]
test_y = data.loc[data['血糖'] == 'unknown', ['id', '血糖']]

# Feature selection: variance threshold
VarianceThreshold = feature_selection.VarianceThreshold(threshold=0.1).fit(train_xs)
train_xs = VarianceThreshold.transform(train_xs)
test_x = VarianceThreshold.transform(test_x)

# Feature selection: SelectPercentile
SelectPercentile = feature_selection.SelectPercentile(
    feature_selection.f_regression, percentile=50).fit(
        train_xs, train_ys.map(np.float64))
train_xs = SelectPercentile.transform(train_xs)
test_x = SelectPercentile.transform(test_x)

# Dimensionality reduction
PCA = decomposition.PCA(n_components=50).fit(train_xs)
train_xs = PCA.transform(train_xs)
test_x = PCA.transform(test_x)

# Standardization
scaler = preprocessing.StandardScaler().fit(train_xs)
train_xs = scaler.transform(train_xs)
test_x = scaler.transform(test_x)
def get_variances(self):
    sel = fs.VarianceThreshold()
    sel.fit(self.data)
    return zip(self.attributes, sel.variances_)
data_dir = 'E:/10.kaggle(dont-overfit2)'
train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
print(train.info())
print(train.columns)
sns.countplot(x='target', data=train)

# filter unique value features
train1 = train.iloc[:, 2:]
y = train['target'].astype(int)
X_train, X_eval, y_train, y_eval = model_selection.train_test_split(
    train1, y, test_size=0.1, random_state=1)

stages = [
    ('imputer', preprocessing.Imputer()),  # pre-0.22 sklearn; sklearn.impute.SimpleImputer in newer versions
    ('zv_filter', feature_selection.VarianceThreshold()),
    ('feature_selector', feature_selection.SelectKBest(score_func=feature_selection.f_classif)),
    # liblinear supports both the 'l1' and 'l2' penalties searched below
    ('classifier', linear_model.LogisticRegression(solver='liblinear'))
]
pipeline_ml = pipeline.Pipeline(stages)
pipeline_grid = {'feature_selector__k': [70, 75, 100],
                 'classifier__C': [0.001, 0.01, 0.1, 0.2, 0.5],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__class_weight': ['balanced', None]}
pipeline_generated = cutils.grid_search_best_model(pipeline_ml, pipeline_grid,
                                                   X_train, y_train, scoring="roc_auc")
final_estimator = pipeline_generated.named_steps['classifier']
print(pipeline_generated.score(X_eval, y_eval))

test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
print(test.info())
print(test.columns)
test1 = test.iloc[:, 1:]
test['target'] = np.round(pipeline_generated.predict_proba(test1)[:, 1], 2)
print('Loading Trainset')
DATA_FOLDER = '../../../data/'
RES_FOLDER = '../../../results/base_features/'

train_x, gene_names_x, cell_names_x = dl.load_data(DATA_FOLDER + 'train_data.csv.gz')
# TODO: FIX DATA_LOAD FUNCTION SUCH THAT THIS ISN'T NECESSARY
train_x = train_x[:, 1:]
gene_names_x = gene_names_x[1:]
cell_names_x = cell_names_x[1:]
train_y, cell_names_y = dl.load_response(DATA_FOLDER + 'response.csv.gz')

# Preprocessing: drop low-variance genes, then standardize
varThresh = fsel.VarianceThreshold(threshold=0.1).fit(train_x)
scaled_x = prep.scale(varThresh.transform(train_x))

print('Loading Herring 2017')
herring_x, gene_names_herring, cell_names_herring = dl.load_data(
    DATA_FOLDER + 'herring2017_data.csv.gz')
# TODO: FIX DATA_LOAD FUNCTION SUCH THAT THIS ISN'T NECESSARY
herring_x = herring_x[:, 1:]
gene_names_herring = gene_names_herring[1:]
herring_scaled = prep.scale(varThresh.transform(herring_x))

# Load Joost 2016 Data
print('Loading Joost 2016')
joost_x, gene_names_joost, cell_names_joost = dl.load_data(
    DATA_FOLDER + 'joost2016_data.csv.gz')
# TODO: FIX DATA_LOAD FUNCTION SUCH THAT THIS ISN'T NECESSARY