print "Number of features {}".format(df.shape[1]) column_with_nan = df.isnull().sum() # drop columns with no. of NaN values greater than 3000 df = df.loc[:, column_with_nan <= 3000] # fill missing values with mean column values df = df.fillna(df.mean()) # Above results in 4 fields with still NaN values Nan_columns = df.columns[df.isnull().any()].tolist() print "Columns with NaN values left are after removing features with Nan greater than 3000 :{}".format( Nan_columns) df = df.drop(columns=Nan_columns) df = df.apply(LabelEncoder().fit_transform) print "Number of features left {}".format(df.shape[1]) x = df.values # Normalize the features standard_scaler = StandardScaler() x_std = standard_scaler.fit_transform(x) pca = PCA(n_components=None) pca.fit(x_std) # uncomment lines below to see variance retained vs the number of # components # number_components = 0 for x in range(x_std.shape[1]): pca = PCA(n_components=x) pca.fit_transform(x_std) # Achieve around 94% of variance retention if sum(pca.explained_variance_ratio_) > 0.94: break x_std = pca.fit_transform(x_std) # Uncomment to see how variance varies with no. of components # plt.plot(range(0, 119), pca.explained_variance_ratio_)
# Assumed module-level imports for this method: copy, os, shutil,
# numpy as np, and sklearn.preprocessing.StandardScaler.
def __init__(self,
             df,
             sample_target_dict,
             columns_to_drop,
             apply_pca=True,
             pca_perc=.8,
             project_name="Default",
             overwrite_figure_path=None,
             show_visuals=True):
    """
    df: Must be a pandas dataframe object.

    sample_target_dict: Column name(s) to value(s) in the dataframe,
        used to create a pandas dataframe with just those value(s).

    columns_to_drop: Column names to drop from the dataframe.

    apply_pca: If False, the data is assumed to have already had PCA
        applied and is passed through as a matrix.

    pca_perc: PCA cutoff point (fraction of cumulative variance to keep).

    project_name: Starting folder name where the system saves the
        generated figures.

    overwrite_figure_path: Overwrites the absolute path for the images
        to be generated.
    """
    def enum(**enums):
        return type('Enum', (), enums)

    if overwrite_figure_path:
        output_fig_sub_dir = overwrite_figure_path
    else:
        if pca_perc > 1:
            pca_perc = 1
        output_fig_sub_dir = "/Figures/" + project_name + \
                             "/SampleRemoval_PCA_Features={0}".format(
                                 pca_perc)

    # Project directory structure
    self.__PROJECT = enum(
        PATH_TO_OUTPUT_FOLDER=''.join(
            os.getcwd().partition('/Libraries')[0:1]) + output_fig_sub_dir)

    # Copy the dataframe for later use
    df = copy.deepcopy(df)

    # Create dataframe of only target values
    for col, df_value in sample_target_dict.items():
        if isinstance(df_value, int):
            df_value = [df_value]
        for val in df_value:
            df = df[df[col] == val]

    for col in columns_to_drop:
        df.drop(columns=[col], inplace=True)

    # --- Apply pca ---
    if apply_pca:
        # Create scaler object
        scaler = StandardScaler()
        scaled = scaler.fit_transform(df)
        print("\nInspecting scaled results!")
        self.__inspect_feature_matrix(matrix=scaled,
                                      feature_names=df.columns)

        pca, scaled = self.__visualize_pca_variance(scaled, show_visuals)

        # Generate "dummy" feature names
        pca_feature_names = ["PCA_Feature_" + str(i)
                             for i in range(1, scaled.shape[1] + 1)]
        print("\nInspecting applied pca results!")
        self.__inspect_feature_matrix(matrix=scaled,
                                      feature_names=pca_feature_names)

        # Use only some of the features based on the PCA percentage
        if pca_perc < 1.0:
            cutoff_index = np.where(
                pca.explained_variance_ratio_.cumsum() > pca_perc)[0][0]
        # Use all features
        else:
            cutoff_index = scaled.shape[1] - 1

        print(
            "After applying pca with a cutoff percentage of {0}%"
            " for the cumulative index, using features 1 to {1}.".format(
                pca_perc, cutoff_index + 1))
        print("Old shape {0}".format(scaled.shape))
        scaled = scaled[:, :cutoff_index + 1]
        pca_feature_names = pca_feature_names[0:cutoff_index + 1]
        print("New shape {0}".format(scaled.shape))

        # Re-scale the reduced feature matrix
        scaled = scaler.fit_transform(scaled)
        print("\nInspecting re-applied scaled results!")
        self.__inspect_feature_matrix(matrix=scaled,
                                      feature_names=pca_feature_names)
        self.__scaled = scaled

    # Assumed PCA has already been applied; pass as matrix
    else:
        scaled = df.values
        self.__scaled = scaled

    # Rename the output folder to reflect the final feature count
    new_folder_path = ''.join(
        os.getcwd().partition('/Libraries')[0:1]) + "/Figures/" + \
        project_name + "/SampleRemoval_PCA_Features={0}".format(
            scaled.shape[1])

    if not os.path.exists(new_folder_path):
        os.rename(self.__PROJECT.PATH_TO_OUTPUT_FOLDER,
                  new_folder_path)
    else:
        shutil.rmtree(self.__PROJECT.PATH_TO_OUTPUT_FOLDER)
    self.__PROJECT = enum(PATH_TO_OUTPUT_FOLDER=new_folder_path)

    self.__df_index_values = df.index.values

    # Init dummy variables to only be used for multithreading
    self.__index_array = None
    self.__total_indexes = None
    self.__tmp_reduced_scaled = None
    self.__all_dp_dist_list = None
    self.__pbar = None
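# The feature cutoff above hinges on one expression: the first index at
# which the cumulative explained-variance ratio passes pca_perc. A
# minimal standalone sketch of that logic, using made-up ratios:
import numpy as np

# Hypothetical explained-variance ratios from a 5-component PCA
explained_variance_ratio = np.array([0.45, 0.25, 0.15, 0.10, 0.05])
pca_perc = 0.8

# cumsum -> [0.45, 0.70, 0.85, 0.95, 1.00]; the first entry above 0.8
# is at index 2, so columns 0..2 (three features) are kept.
cutoff_index = np.where(explained_variance_ratio.cumsum() > pca_perc)[0][0]
print(cutoff_index)  # 2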
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     train_test_split)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('ccdefault.csv', index_col='ID')
df.head()
df = df.dropna()  # dropna() is not in-place; reassign the result
X = df.iloc[:, 0:23].values
y = df.iloc[:, 23].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=33, stratify=y)

# Part 1: grid search over the forest size inside a scaling pipeline
rf = RandomForestClassifier(n_estimators=50, criterion='gini',
                            random_state=1, n_jobs=-1)
pipe = Pipeline([('sc', StandardScaler()),
                 ('randomforest', rf)])
params = {'randomforest__n_estimators': [20, 50, 75, 90, 100]}
grid = GridSearchCV(estimator=pipe, param_grid=params,
                    cv=2)  # scoring='roc_auc' is another option
grid.fit(X_train, y_train)

# Nested cross-validation: 5 outer folds, 2 inner tuning folds
scores = cross_val_score(grid, X_train, y_train,
                         scoring='accuracy', cv=5)
y_pred = grid.predict(X_test)
results = grid.cv_results_

print('')
print('GridSearch:')
print('Tuned Model Parameters: {}'.format(grid.best_params_))
# print('In-sample Accuracy: %.4f' % grid.best_score_)
print('In-sample CV Accuracy: %.4f +/- %.4f'
      % (np.mean(scores), np.std(scores)))
# best_estimator_ is already refit on the full training set,
# so grid can score the test set directly
forest = grid.best_estimator_
print('Out-of-sample Accuracy: %.4f' % grid.score(X_test, y_test))
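# Accuracy can be misleading on an imbalanced credit-default dataset,
# which is likely why scoring='roc_auc' is noted as an option above. A
# minimal sketch of scoring the tuned pipeline with ROC AUC instead,
# reusing grid, X_test, and y_test from above:
from sklearn.metrics import roc_auc_score

# Probability of the positive (default) class from the refit pipeline
y_proba = grid.predict_proba(X_test)[:, 1]
print('Out-of-sample ROC AUC: %.4f' % roc_auc_score(y_test, y_proba))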