# One-hot encode every categorical column and append the dummy columns to hr.
for column in cat_vars:
    dummies = pd.get_dummies(hr[column], prefix=column)
    hr = hr.join(dummies)
hr.head()

# The raw categorical columns are redundant once the dummies exist.
hr.drop(columns=['department', 'salary'], axis=1, inplace=True)

# Task 4: Visualize class imbalance
from yellowbrick.target import ClassBalance

plt.style.use("ggplot")
plt.rcParams['figure.figsize'] = (12, 8)
visualizer = ClassBalance(labels=['stayed', 'quit']).fit(hr.quit)
visualizer.show()

# Task 5: Create training and test sets
X = hr.loc[:, hr.columns != 'quit']
y = hr.quit

from sklearn.model_selection import train_test_split

# Stratify on the target so both splits keep the stayed/quit ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2, stratify=y)

# Task 6: Build an interactive decision tree classifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
def main():
    """Run the end-to-end activity-recognition pipeline.

    Parses the command line (script name, data file, model name), loads
    the dataset, performs preprocessing (null checks, SMOTE oversampling,
    label encoding, covariance/correlation, train/test split, scaling),
    then dispatches to the classifier module the user selected.
    """
    script, fname, model = argcheck()
    df = filecheck(fname)
    print(df.head(5))

    # Data stats: number of rows and columns, overall shape.
    print(df.info())
    print("The number of rows")
    print(len(df))
    print("The number of columns")
    print(len(df.columns))
    print("Dataframe shape")
    print(df.shape)

    # Data preprocessing - step 1 (check for any null / N/A values)
    print("\n-------Data Preprocessing - Step 1--------")
    print("------------------------------------------")
    print("Checking for any N/A values")
    print(df.isna().values.any())
    print("Checking for any null values")
    print(df.isnull().values.any())

    # Data preprocessing - step 2 (addressing class imbalance)
    print("\n-------Data Preprocessing - Step 2--------")
    print("------------------------------------------")
    Y = pd.DataFrame(data=df['Activity'])
    X = df.drop(['Activity'], axis=1)
    print("Before applying SMOTE algorithm")
    print("Unique values and count of target column 'Activity -'")
    print(df.groupby('Activity').nunique())
    unique_labels, frequency = np.unique(Y, return_counts=True)

    # Generating class balance chart before applying SMOTE. The chart is
    # generated as 'Class-balance-Before-SMOTE.png' in the output directory.
    print("The class balance is generated as 'Class-balance-Before-SMOTE.png'")
    visualizer1 = ClassBalance(labels=unique_labels, size=(1400, 1000))
    visualizer1.fit(Y.values.ravel())
    visualizer1.show("output/Class-balance-Before-SMOTE.png")

    # Solving the class imbalance problem by oversampling the data.
    smote = SMOTE(random_state=1)
    X_1, Y_1 = smote.fit_resample(X, Y)
    print("After applying SMOTE algorithm")
    X_1_df = pd.DataFrame(data=X_1, columns=X.columns)
    Y_1_df = pd.DataFrame(data=Y_1, columns=Y.columns)
    print("The new shape of the X dataframe")
    print(X_1_df.shape)
    print("The new shape of the Y dataframe")
    print(Y_1_df.shape)
    unique, frequency = np.unique(Y_1, return_counts=True)
    print("Unique Values of new Y dataframe:", unique)
    print("Frequency Values of new Y dataframe:", frequency)

    # Generating class balance chart after applying SMOTE. The chart is
    # generated as 'Class-balance-After-SMOTE.png' in the output directory.
    print("The class balance is generated as 'Class-balance-After-SMOTE.png'")
    visualizer2 = ClassBalance(labels=unique_labels, size=(1400, 1000))
    visualizer2.fit(Y_1_df.values.ravel())
    visualizer2.show("output/Class-balance-After-SMOTE.png")

    # Data preprocessing - step 3 (label encoding)
    print("\n-------Data Preprocessing - Step 3--------")
    print("------------------------------------------")
    # Convert the string labels to integers:
    #   0 - LAYING, 1 - SITTING, 2 - STANDING,
    #   3 - WALKING, 4 - WALKING_DOWNSTAIRS, 5 - WALKING_UPSTAIRS
    label_encoder = preprocessing.LabelEncoder()
    Y_1_df['Activity'] = label_encoder.fit_transform(Y_1_df['Activity'])
    print("After label encoding, the target values are")
    classes = Y_1_df['Activity'].unique()
    print(Y_1_df['Activity'])

    # Data preprocessing - step 4 (covariance/correlation, standardization)
    print("\n-------Data Preprocessing - Step 4--------")
    print("------------------------------------------")
    # Covariance (biased estimator, i.e. normalized by N).
    dfCov = np.cov(X_1_df, Y_1_df, rowvar=False, bias=True)
    print(dfCov)
    # Pearson product-moment correlation coefficients. NOTE: the `bias`
    # argument of np.corrcoef is deprecated and has no effect, so the
    # original `bias=True` is dropped here (output is unchanged).
    dfCorr = np.corrcoef(X_1_df, Y_1_df, rowvar=False)
    print("Correlation coefficient obtained : ", dfCorr)

    # Data preprocessing - step 5 (splitting into training and testing dataset)
    print(
        "\n-------Data Preprocessing - Step 5(Splitting into training and testing dataset)--------"
    )
    print("------------------------------------------")
    X_train, X_test, y_train, y_test = train_test_split(X_1_df,
                                                        Y_1_df,
                                                        random_state=1,
                                                        test_size=0.2)

    # Data preprocessing - step 6 (standardize the dataset)
    print("\n-------Data Preprocessing - Step 6--------")
    print("------------------------------------------")
    sc_X = preprocessing.StandardScaler()
    X_trainscaled = sc_X.fit_transform(X_train)
    # Only transform (not fit) the test set to avoid data leakage.
    X_testscaled = sc_X.transform(X_test)
    print("Mean of the standardized training set : ", X_trainscaled.mean(axis=0))
    print("std of the standardized training set : ", X_trainscaled.std(axis=0))
    print("Mean of the standardized test set : ", X_testscaled.mean(axis=0))
    print("std of the standardized test set : ", X_testscaled.std(axis=0))

    # Execute the model module that matches the user's input.
    if model == 'decisiontree':
        decisiontree.decisionTreeTest(X_train, X_test, y_train, y_test,
                                      classes, X_1_df, Y_1_df)
    elif model == 'svm':
        svm.svmLinearTest(X_train, X_test, y_train, y_test, classes, X_1_df,
                          Y_1_df)
    elif model == 'svmnonlinear':
        svmnonlinear.svmNonLinearTest(X_train, X_test, y_train, y_test,
                                      X_1_df, Y_1_df)
    elif model == 'naivebayes':
        naiveBayes.naiveBayesClassifierTest(X_train, X_test, y_train, y_test,
                                            X_1_df, Y_1_df)
    elif model == 'logisticregression':
        logisticregression.logisticRegressionTest(X_train, X_test, y_train,
                                                  y_test, X_1_df, Y_1_df)
    elif model == 'knn':
        knn.knnTest(X_train, X_test, y_train, y_test, X_1_df, Y_1_df)
    elif model == 'bagging':
        bagging.baggingTest(X_train, X_test, y_train, y_test, X_1_df, Y_1_df)
    elif model == 'adaboost':
        adaboost.adaboostTest(X_train, X_test, y_train, y_test, X_1_df,
                              Y_1_df)
    elif model == 'randomforest':
        randomforest.randomForestTest(X_train, X_test, y_train, y_test,
                                      X_1_df, Y_1_df)
    elif model == 'ensemblevote':
        ensemble.ensembleClassifier(X_train, X_test, y_train, y_test, X_1_df,
                                    Y_1_df)
    else:
        print("please enter the correct classifier name")
        sys.exit()
# morg_2_test_strs_broken
# activ_inact_train
# activ_inact_test
frames = [morg_2_train_strs_broken, activ_inact_train]

import pandas as pd

# Concatenate fingerprints and activity labels column-wise, then
# drop rows with missing values.
dfrad = pd.concat(frames, axis=1)
dfrad = dfrad.dropna()
# dfrad.iloc[:, [2048]]
# dfrad.iloc[:, :100]

# CLASS BALANCE - No balanced
from yellowbrick.target import ClassBalance

visCB = ClassBalance(labels=[1, 0])
visCB.fit(dfrad['activities'])  # fit the target data to the visualizer
visCB.show()  # finalize and render the figure

# RANK 2D "Pearson correlation" - No balanced
from yellowbrick.features import Rank2D

visualizer = Rank2D(algorithm='pearson')
# Fit and transform on the first 50 feature columns only.
visualizer.fit(dfrad.iloc[:, :50], dfrad['activities'])
visualizer.transform(dfrad.iloc[:, :50])
visualizer.show()

# MANIFOLD - No balanced
from yellowbrick.features import Manifold

classes = [1, 0]

from sklearn import preprocessing

# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
import pandas as pd
import datetime
from yellowbrick.target import ClassBalance

print(datetime.datetime.now())

# Candidate input datasets; only the Tomek-resampled one is loaded below.
path = 'data/cleaned_data.csv'
pathr = 'data/resampled.csv'
pathr2 = 'data/resampled_borderline.csv'
pathr3 = 'data/resampled_adasyn.csv'
pathr4 = 'data/resampled_tomek.csv'

randomState = 42
classLabels = ['Not Bankrupt', 'Bankrupt']

df = pd.read_csv(pathr4, index_col=0)
print('Import done.')

# Extract labels from features
y = df['BK']
X = df.drop('BK', axis=1)

# Instantiate Visualizer
viz = ClassBalance(labels=classLabels)
viz.fit(y)
viz.show()
print(viz.support_)
def describe(
    context: MLClientCtx,
    table: Union[DataItem, str],
    label_column: str,
    class_labels: List[str],
    key: str = "table-summary",
) -> None:
    """Summarize a table

    TODO: merge with dask version

    :param context:      the function context
    :param table:        pandas dataframe (path to a parquet file)
    :param label_column: name of the target/label column
    :param class_labels: display labels for the target classes
    :param key:          key of table summary in artifact store
    """
    _gcf_clear(plt)

    base_path = context.artifact_path
    os.makedirs(base_path, exist_ok=True)
    os.makedirs(base_path + "/plots", exist_ok=True)

    print(f'TABLE {table}')
    table = pd.read_parquet(str(table))
    header = table.columns.values

    # Describe table: standard stats plus a per-column NaN count row.
    # DataFrame.append was removed in pandas 2.0 — build the extra row
    # and concatenate instead.
    sumtbl = table.describe()
    nan_row = (len(table.index) - table.count()).to_frame().T
    sumtbl = pd.concat([sumtbl, nan_row], ignore_index=True)
    sumtbl.insert(
        0, "metric",
        ["count", "mean", "std", "min", "25%", "50%", "75%", "max", "nans"])

    sumtbl.to_csv(os.path.join(base_path, key + ".csv"), index=False)
    context.log_artifact(key, local_path=key + ".csv")

    # Plot class balance, record relative class weight.
    _gcf_clear(plt)
    labels = table.pop(label_column)
    class_balance_model = ClassBalance(labels=class_labels)
    class_balance_model.fit(labels)

    scale_pos_weight = class_balance_model.support_[
        0] / class_balance_model.support_[1]
    # context.log_artifact("scale_pos_weight", f"{scale_pos_weight:0.2f}")
    context.log_artifact("scale_pos_weight", str(scale_pos_weight))

    class_balance_model.show(
        outpath=os.path.join(base_path, "plots/imbalance.png"))
    context.log_artifact(PlotArtifact("imbalance", body=plt.gcf()),
                         local_path="plots/imbalance.html")

    # Plot feature correlation heatmap.
    _gcf_clear(plt)
    tblcorr = table.corr()
    ax = plt.axes()
    sns.heatmap(tblcorr, ax=ax, annot=False, cmap=plt.cm.Reds)
    ax.set_title("features correlation")
    plt.savefig(os.path.join(base_path, "plots/corr.png"))
    context.log_artifact(PlotArtifact("correlation", body=plt.gcf()),
                         local_path="plots/corr.html")

    # Plot histogram.
    # NOTE(review): the source ended here with a dangling triple-quote,
    # which would have turned the rest of the file into an unterminated
    # string literal; the truncated histogram section was removed.
    _gcf_clear(plt)
#%%
# for column in X.columns[2:16]:
#     plt.scatter(X[column], y)
#     plt.xlabel(column)
#     plt.show()

#%%
from yellowbrick.features.radviz import RadViz

# RadViz projects the first 13 features onto a circle, colored by class.
features = X.columns[:13]
visualizer = RadViz(classes=class_labels, features=features)
visualizer.fit(X[features], y)
# Transform the same frame the visualizer was fitted on. The original
# passed df[features] here — identical values today, but it would
# silently diverge if X were ever filtered or scaled independently.
visualizer.transform(X[features])
visualizer.show()

#%%
from yellowbrick.target import FeatureCorrelation

visualizer = FeatureCorrelation(labels=features)
visualizer.fit(X[features], y)  # Fit the data to the visualizer
visualizer.show()  # Finalize and render the figure

#%%
from yellowbrick.features import JointPlotVisualizer

visualizer = JointPlotVisualizer()
visualizer.fit_transform(X["grade"], y)  # Fit and transform the data