def analze_feature_importance(x_train, y_train):
    """Fit a random-forest regressor and print its per-feature importances.

    Name kept as-is (including the typo) for backward compatibility with
    existing callers.

    Parameters
    ----------
    x_train, y_train : array-likes accepted by ``RandomForestRegressor.fit``.

    Returns
    -------
    None — the importances are printed, not returned.
    """
    rf = RandomForestRegressor(n_estimators=100,
                               n_jobs=-1,
                               oob_score=True,
                               bootstrap=True,
                               random_state=42)
    rf.fit(x_train, y_train)
    # BUG FIX: RandomForestRegressor has no get_support(); that method exists
    # only on sklearn feature selectors (SelectFromModel, VarianceThreshold).
    # The original line raised AttributeError. Print the fitted importances.
    print(rf.feature_importances_)
def select(cfg, args):
    """Filter the genus abundance matrix down to a subset of taxa and save it.

    Three modes, chosen by flags on ``args``:

    * default (neither ``use_var`` nor ``use_rf``): keep only genera present
      in the phylogeny table; write the joined matrix to ``args.o``.
    * ``use_rf``: rank genera by RandomForestRegressor feature importances
      against the concatenated label layers; keep genera whose importance
      exceeds ``C`` times the mean importance.
    * ``use_var``: keep genera whose relative-abundance variance exceeds
      ``C`` times the mean variance (sklearn ``VarianceThreshold``).

    Side effects: reads HDF/CSV inputs named in ``args``; writes the filtered
    matrix to ``args.o`` (HDF key ``'genus'``) and, in the two selection
    modes, the reduced phylogeny CSV into ``args.tmp``.  ``cfg`` is unused in
    this function.
    """
    matrix_genus = pd.read_hdf(args.i, key='genus')
    phylo = pd.read_csv(args.phylo, index_col=0)
    C = args.C  # selection-strictness multiplier applied to the mean score
    print(matrix_genus)
    if not args.use_var and not args.use_rf:
        # Default mode: restrict the matrix to genera listed in the phylogeny.
        cols = set(phylo.columns.tolist())
        mat = phylo.set_index('genus').join(matrix_genus).fillna(0)
        # Drop the phylogeny's own columns so only abundance columns remain.
        mat = mat.drop(columns=(cols - {'genus'}))
        print(mat.head())
        mat.to_hdf(args.o, key='genus')
    elif args.use_rf:
        # Relative abundance, transposed: samples as rows, genera as columns.
        X = (matrix_genus / matrix_genus.sum()).T  # abundance
        # Stack all label layers l0..l{dmax-1} side by side as the target.
        Y = pd.concat([
            pd.read_hdf(args.labels, key='l' + str(layer))
            for layer in range(args.dmax)
        ], axis=1)
        selector = RandomForestRegressor(n_estimators=500,
                                         max_depth=10,
                                         n_jobs=args.p,
                                         random_state=1,
                                         verbose=5)
        selector.fit(X, Y)
        print('Done')
        importance = selector.feature_importances_
        # Keep features whose importance beats C times the mean importance.
        select_idx = importance > C * importance.mean()
        # NOTE(review): assumes phylo rows align 1:1 (same order) with the
        # rows of matrix_genus, i.e. with X's columns — confirm against how
        # the phylogeny CSV and the HDF matrix are produced upstream.
        new_phylo = phylo.iloc[select_idx, :].reset_index(drop=True)
        print('New phylogeny:')
        print(new_phylo)
        matrix_genus.loc[new_phylo['genus'], :].to_hdf(args.o, key='genus')
        new_phylo.to_csv(
            os.path.join(
                args.tmp,
                'phylogeny_selected_using_rf_importance_C{}.csv'.format(C)))
    elif args.use_var:
        X = (matrix_genus / matrix_genus.sum()).T
        variance = X.var(axis=0)
        # Threshold is relative to the mean variance across genera.
        selector = VarianceThreshold(threshold=C * variance.mean())
        selector.fit(X)
        select_idx = selector.get_support()
        new_phylo = phylo.iloc[select_idx, :].reset_index(drop=True)
        print('New phylogeny:')
        print(new_phylo)
        matrix_genus.loc[new_phylo['genus'], :].to_hdf(args.o, key='genus')
        new_phylo.to_csv(
            os.path.join(
                args.tmp,
                'phylogeny_selected_using_varianceThreshold_C{}.csv'.format(
                    C)))
    else:
        # Unreachable given the three conditions above; kept as a guard.
        raise InterruptedError(
            'Please specify `-use-var` or `-use-rf` or none of them. See GitHub for details.'
        )
def cleaning(data, y, params):
    """Clean a raw DataFrame and select the most important features.

    Pipeline (in order): drop ID-like / high-cardinality / mostly-null
    columns; split columns into categorical and numerical; fill categorical
    nulls with "Missing" and impute numerical nulls (IterativeImputer); fold
    rare categorical levels (< 1% frequency) into 'Rare'; one-hot encode;
    drop one of each pair of highly correlated (|r| >= 0.80) features; then
    keep the union of features selected by LassoCV and a RandomForest via
    SelectFromModel.

    Parameters
    ----------
    data : pd.DataFrame
        Raw feature table; mutated in place (columns dropped, nulls filled).
    y : array-like
        Target vector, returned unchanged.
    params : dict
        Mutated in place with keys 'nullcount', 'features', 'feat_count'.

    Returns
    -------
    (data_feat, y, params) : the reduced feature DataFrame, the unchanged
    target, and the updated params dict.
    """
    size = len(data)
    ids = ["ID", "id", "Id", "iD"]

    def isId(i):
        # True when the column name contains any ID-like substring.
        for j in ids:
            if j in i:
                return True
        return False

    print("=" * 100)
    print("Selecting unnecessary columns for dropping...")
    drp = set()
    for i in data.columns:
        # Drop ID columns and free-text-like object columns (> 300 levels).
        if (isId(i)) or (data[i].dtype == 'O'
                         and len(data[i].value_counts()) > 300):
            drp.add(i)
    params["nullcount"] = sum(data.isnull().sum() > 0)
    for i in data.columns:
        # Drop columns that are at least half null.
        if data[i].isnull().sum() >= size // 2:
            drp.add(i)
    data.drop(drp, axis=1, inplace=True)
    print("Dropped Columns - ", drp)
    print("=" * 100)

    print("Selecting Categorical and Numerical columns...")
    cat_col = []
    num_col = []
    for i in data.columns:
        if data[i].dtype == 'O':
            cat_col.append(i)
    num_col = list(data.select_dtypes(include=np.number).columns)
    print("Categorical columns are: ", cat_col)
    print("Numerical Columns are: ", num_col)
    print("=" * 100)

    print("Filling NULL values...")
    for i in cat_col:
        data[i] = data[i].fillna("Missing")
    imp = IterativeImputer(random_state=123)
    data[num_col] = pd.DataFrame(imp.fit_transform(data[num_col]),
                                 columns=num_col)
    print("NULL Values removed!")
    print("Checking is any null values are left: ",
          any(data.isnull().sum() > 0))
    print("=" * 100)

    if len(cat_col) != 0:
        # Fold categorical levels rarer than 1% into a single 'Rare' level.
        for i in cat_col:
            temp = data.groupby(i).size() / size
            data.loc[:, i + '_val'] = data[i].map(temp)
            data.loc[data[i + '_val'] <= 0.01, i] = 'Rare'
            data.drop(i + '_val', axis=1, inplace=True)

    print("Creating dummy variables...")
    data = pd.get_dummies(data, drop_first=True)
    print("Final shape of dataset: ", data.shape)
    print("=" * 100)

    print("Dropping Highly correlated features...")
    corr_matrix = data.corr().abs()
    # BUG FIX: np.bool was removed in NumPy >= 1.24; use the builtin bool.
    upper = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [
        column for column in upper.columns if any(upper[column] >= 0.80)
    ]
    data.drop(to_drop, axis=1, inplace=True)
    print("Features Dropped: ", to_drop)
    print("=" * 100)

    print("Selecting important Features...")
    mms = MinMaxScaler()
    data_scaled = mms.fit_transform(data)

    selector1 = LassoCV()
    selector1 = SelectFromModel(selector1)
    selector1.fit(data_scaled, y)
    selector1.get_support()
    feat_selected1 = list(data.columns[(selector1.get_support())])
    feat_imp = set()
    temp1 = list((selector1.estimator_.coef_)[(selector1.get_support())])
    mx = max(temp1)
    mn = min(temp1)

    def transform(x):
        # Min-max rescale of a coefficient to [0, 1].
        # BUG FIX: guard the degenerate case mx == mn (a single surviving
        # coefficient), which previously raised ZeroDivisionError.
        if mx == mn:
            return 0.0
        return (x - mn) / (mx - mn)

    for i in range(len(temp1)):
        feat_imp.add(
            (feat_selected1[i], round(transform(abs(temp1[i])) * 100, 2)))
    feat_imp = list(feat_imp)
    feat_imp.sort(key=lambda x: x[1], reverse=True)
    feat_show = feat_imp[:min(5, len(feat_imp))]
    # BUG FIX: only touch the last entry when there is one (IndexError when
    # Lasso selects no features).
    # NOTE(review): the "+ 1" bump on the last displayed importance is kept
    # from the original — its purpose is unclear; confirm it is intentional.
    if feat_show:
        feat_show[-1] = (feat_show[-1][0], feat_show[-1][1] + 1)
    params["features"] = feat_show
    print(params["features"])

    selector2 = RandomForestRegressor()
    selector2 = SelectFromModel(selector2)
    selector2.fit(data_scaled, y)
    selector2.get_support()
    feat_selected2 = list(data.columns[(selector2.get_support())])

    # Union of Lasso- and forest-selected features.
    feat_selected = list(set(feat_selected1 + feat_selected2))
    print("Found {} important features".format(len(feat_selected)))
    params["feat_count"] = len(feat_selected)
    data_feat = data[feat_selected]
    print("final shape of dataset", data_feat.shape)
    print("=" * 100)
    return data_feat, y, params