print "Number of features {}".format(df.shape[1])
 column_with_nan = df.isnull().sum()
 # drop columns with no. of NaN values greater than 3000
 df = df.loc[:, column_with_nan <= 3000]
 # fill missing values with mean column values
 df = df.fillna(df.mean())
 # Above results in 4 fields with still NaN values
 Nan_columns = df.columns[df.isnull().any()].tolist()
 print "Columns with NaN values left are after removing features with Nan greater than 3000 :{}".format(
     Nan_columns)
 df = df.drop(columns=Nan_columns)
 df = df.apply(LabelEncoder().fit_transform)
 print "Number of features left {}".format(df.shape[1])
 x = df.values
 # Normalize the features
 standard_scaler = StandardScaler()
 x_std = standard_scaler.fit_transform(x)
 pca = PCA(n_components=None)
 pca.fit(x_std)
 # uncomment lines below to see variance retained vs the number of
 # components
 # number_components = 0
 for x in range(x_std.shape[1]):
     pca = PCA(n_components=x)
     pca.fit_transform(x_std)
     # Achieve around 94% of variance retention
     if sum(pca.explained_variance_ratio_) > 0.94:
         break
 x_std = pca.fit_transform(x_std)
 # Uncomment to see how variance varies with no. of components
 # plt.plot(range(0, 119), pca.explained_variance_ratio_)
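
# Alternative sketch (not part of the original code): instead of refitting PCA
# in a loop, one could keep the full PCA(n_components=None) fit from above and
# read the cutoff off its cumulative explained variance:
# import numpy as np
# cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# n_components_94 = int(np.argmax(cumulative_variance > 0.94)) + 1
# x_std = PCA(n_components=n_components_94).fit_transform(x_std)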
    def __init__(self,
                 df,
                 sample_target_dict,
                 columns_to_drop,
                 apply_pca=True,
                 pca_perc=.8,
                 project_name="Default",
                 overwrite_figure_path=None,
                 show_visuals=True,
                 ):
        """
        df:
            Must be a pandas dataframe object

        sample_target_dict:
            Mapping of column name(s) to value(s); the dataframe is filtered
            down to just the rows containing those value(s).

        columns_to_drop:
            Column names to drop from the dataframe

        apply_pca:
            Whether to run PCA on the data. Set this to False if PCA has
            already been applied and the dataframe can be passed in as-is.

        pca_perc:
            Cumulative explained-variance cutoff; only the PCA features
            needed to reach that percentage are kept.

        project_name:
            Starting folder name under which the system saves the generated
            figures.

        overwrite_figure_path:
            Overwrites the absolute path for the images to be generated
        """

        def enum(**enums):
            return type('Enum', (), enums)

        if overwrite_figure_path:
            output_fig_sub_dir = overwrite_figure_path
        else:
            if pca_perc > 1:
                pca_perc = 1
            output_fig_sub_dir = "/Figures/" + project_name + \
                                 "/SampleRemoval_PCA_Features={0}".format(
                                     pca_perc)

        # Project directory structure
        self.__PROJECT = enum(
            PATH_TO_OUTPUT_FOLDER=''.join(
                os.getcwd().partition('/Libraries')[0:1]) + output_fig_sub_dir)

        # Copy dataframes for later use
        df = copy.deepcopy(df)

        # Create dataframe of only target values
        for col, df_value in sample_target_dict.items():

            if isinstance(df_value, int):
                df_value = [df_value]

            for val in df_value:
                df = df[df[col] == val]

        for col in columns_to_drop:
            df.drop(columns=[col],
                    inplace=True)

        # --- Apply pca ---
        if apply_pca:

            # Create scaler object
            scaler = StandardScaler()
            scaled = scaler.fit_transform(df)

            print("\nInspecting scaled results!")
            self.__inspect_feature_matrix(matrix=scaled,
                                          feature_names=df.columns)

            pca, scaled = self.__visualize_pca_variance(scaled, show_visuals)

            # Generate "dummy" feature names
            pca_feature_names = ["PCA_Feature_" +
                                 str(i) for i in range(1,
                                                       scaled.shape[1] + 1)]

            print("\nInspecting applied pca results!")
            self.__inspect_feature_matrix(matrix=scaled,
                                          feature_names=pca_feature_names)

            # Use only some of the features based on the PCA percentage
            if pca_perc < 1.0:
                cutoff_index = np.where(
                    pca.explained_variance_ratio_.cumsum() > pca_perc)[0][0]
            # Use all features
            else:
                cutoff_index = scaled.shape[1] - 1

            print(
                "Applied pca with a cumulative explained-variance cutoff"
                " of {0:.0%}. Using features 1 to {1}".format(
                    pca_perc, cutoff_index + 1))

            print("Old shape {0}".format(scaled.shape))

            scaled = scaled[:, :cutoff_index + 1]
            pca_feature_names = pca_feature_names[0: cutoff_index + 1]

            print("New shape {0}".format(scaled.shape))

            scaled = scaler.fit_transform(scaled)

            print("\nInspecting re-applied scaled results!")
            self.__inspect_feature_matrix(matrix=scaled,
                                          feature_names=pca_feature_names)

            self.__scaled = scaled

        # Assumed PCA has already been applied; pass as matrix
        else:
            self.__scaled = df.values

        new_folder_path = ''.join(
            os.getcwd().partition('/Libraries')[0:1]) + "/Figures/" + \
            project_name + "/SampleRemoval_PCA_Features={0}".format(
                self.__scaled.shape[1])

        if not os.path.exists(new_folder_path):
            os.rename(self.__PROJECT.PATH_TO_OUTPUT_FOLDER,
                      new_folder_path)
        else:
            shutil.rmtree(self.__PROJECT.PATH_TO_OUTPUT_FOLDER)
        self.__PROJECT = enum(
            PATH_TO_OUTPUT_FOLDER=new_folder_path)

        self.__df_index_values = df.index.values

        # Init dummy variables to only be used for multithreading
        self.__index_array = None
        self.__total_indexes = None
        self.__tmp_reduced_scaled = None
        self.__all_dp_dist_list = None
        self.__pbar = None
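
# Hypothetical usage sketch (not from the source): the enclosing class name is
# assumed here to be SampleRemoval, based on the figure-folder naming above.
# import pandas as pd
# demo_df = pd.DataFrame({"age": [23, 45, 31, 52],
#                         "income": [40000, 82000, 55000, 91000],
#                         "defaulted": [0, 1, 0, 1],
#                         "customer_id": [1, 2, 3, 4]})
# remover = SampleRemoval(df=demo_df,
#                         sample_target_dict={"defaulted": 1},
#                         columns_to_drop=["customer_id"],
#                         apply_pca=True,
#                         pca_perc=.8,
#                         project_name="Demo")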
from sklearn.pipeline import Pipeline
from scipy import interpolate
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

df = pd.read_csv('ccdefault.csv', index_col='ID')
df.head()

df = df.dropna()

X = df.iloc[:, 0:23].values
y = df.iloc[:, 23].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=33, stratify=y)
# Part 1: random forest with the number of trees tuned by grid search
rf = RandomForestClassifier(n_estimators=50, criterion='gini',
                            random_state=1, n_jobs=-1)
pipe = Pipeline([('sc', StandardScaler()),
                 ('randomforest', rf)])
params = {'randomforest__n_estimators': [20, 50, 75, 90, 100]}
grid = GridSearchCV(estimator=pipe, param_grid=params, cv=2)  # scoring='roc_auc'
grid.fit(X_train, y_train)
scores = cross_val_score(grid, X_train, y_train, scoring='accuracy', cv=5)
y_pred = grid.predict(X_test)
results = grid.cv_results_
print('')
print('GridSearch:')
print('Tuned Model Parameters: {}'.format(grid.best_params_))
# print('In-sample Accuracy: %.4f' % grid.best_score_)
print('In-sample CV Accuracy: %.4f +/- %.4f' % (np.mean(scores), np.std(scores)))
forest = grid.best_estimator_
forest.fit(X_train, y_train)
print('Out-of-sample Accuracy: %.4f' % grid.score(X_test, y_test))
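
# Additional evaluation sketch (not part of the original script): use the
# held-out predictions computed above for per-class metrics.
from sklearn.metrics import confusion_matrix, classification_report
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))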