def balance_class_balance(path="images/class_balance.png"):
    """Render a class-balance bar chart for the game dataset's outcome labels.

    Parameters
    ----------
    path : str
        Output file path for the rendered figure.

    Returns
    -------
    The value returned by the visualizer's render call (matplotlib Axes).
    """
    data = load_game()
    y = data["outcome"]
    oz = ClassBalance(labels=["draw", "loss", "win"])
    oz.fit(y)
    # ``poof`` was deprecated in Yellowbrick 1.0 in favor of ``show`` —
    # same behavior, same ``outpath`` keyword.
    return oz.show(outpath=path)
def balance():
    """Plot train/test class balance for the occupancy dataset and save it."""
    X, y = load_occupancy()
    # Only the label splits matter here; feature splits are discarded.
    _, _, y_train, y_test = tts(X, y, test_size=0.2)
    viz = ClassBalance(ax=newfig(), labels=["unoccupied", "occupied"])
    viz.fit(y_train, y_test)
    savefig(viz, "class_balance")
def target_visualizer(self, classes=None, params=None):
    """Generate target-analysis plots and save them under ``visualizer/``.

    Renders three Yellowbrick visualizers over ``self.y`` (and ``self.X``
    for feature correlation): BalancedBinningReference, ClassBalance and
    FeatureCorrelation. Each failure is logged and skipped (best-effort).

    Parameters
    ----------
    classes : list, optional
        Unused; kept for interface compatibility.
    params : dict, optional
        Mapping of visualizer class name to constructor kwargs, e.g.
        ``{'BalancedBinningReference': {'bins': 5}}`` (the default).
    """
    # FIX: avoid the mutable-default-argument pitfall; the previous
    # default value is preserved.
    if params is None:
        params = {'BalancedBinningReference': {'bins': 5}}
    LOGGER.info('Initializing target visualizer')
    out_dir = os.path.join(os.getcwd(), 'visualizer/')
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    y = self.y.squeeze()

    def _render(info_msg, make_default, fit_args, warn_msg):
        # Build one visualizer (replacing it entirely with the user's
        # kwargs when an override is present, as the original code did),
        # fit it, save the figure and clear the axes for the next plot.
        try:
            LOGGER.info(info_msg)
            visualizer = make_default()
            cls_name = visualizer.__class__.__name__
            if cls_name in params:
                visualizer = visualizer.__class__(**params[cls_name])
            visualizer.fit(*fit_args)
            visualizer.show(outpath=os.path.join(
                os.getcwd(), f"visualizer/{cls_name}.png"))
            plt.cla()
        # FIX: narrowed from a bare ``except:`` (which also swallowed
        # KeyboardInterrupt/SystemExit); ``warn`` is a deprecated alias
        # of ``warning``.
        except Exception:
            LOGGER.warning(warn_msg)

    _render('Visualizer BalancedBinningReference',
            BalancedBinningReference, (y,), 'ERROR BalancedBinning')
    _render('Visualizer CLassBalance',
            ClassBalance, (y,), 'ERROR ClassBalance')
    _render('Visualizer Feature Correlation',
            lambda: FeatureCorrelation(
                method='mutual_info-classification',
                feature_names=self.X.columns.tolist(),
                sort=True),
            (self.X, y), 'ERROR FeatureCorrelation')
def compare_class_balance(path="images/class_balance_compare.png"):
    """Compare train/test label distributions for the occupancy dataset.

    Parameters
    ----------
    path : str
        Output file path for the rendered figure.

    Returns
    -------
    The value returned by the visualizer's render call (matplotlib Axes).
    """
    data = load_occupancy()
    features = ["temperature", "relative_humidity", "light", "C02", "humidity"]
    classes = ['unoccupied', 'occupied']

    # Extract the numpy arrays from the data frame
    X = data[features]
    y = data["occupancy"]

    # Create the train and test data (only the label splits are used)
    _, _, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Instantiate the classification model and visualizer
    visualizer = ClassBalance(labels=classes)
    visualizer.fit(y_train, y_test)
    # ``poof`` was deprecated in Yellowbrick 1.0 in favor of ``show``.
    return visualizer.show(outpath=path)
def Imbalance(y):
    """Plot the class imbalance of the Ebola test labels.

    Parameters
    ----------
    y : vector of labels

    Returns
    -------
    None; renders a bar chart of 'Ebola negative' / 'Ebola positive'
    frequencies.
    """
    chart = ClassBalance(labels=['Ebola negative', 'Ebola positive'])
    chart.fit(y)  # fit the labels into the visualizer
    plt.show()    # finalize and render the figure
def Imbalance_out(y):
    """Plot the class imbalance of the outcome labels.

    Parameters
    ----------
    y : vector of labels

    Returns
    -------
    None; renders a bar chart of 'Survival' / 'Death' frequencies.
    """
    chart = ClassBalance(labels=['Survival', 'Death'])
    chart.fit(y)  # fit the labels into the visualizer
    plt.show()    # finalize and render the figure
def balance_yellowbrick(X, y, features):
    """Draw a class-balance chart over a small stratified holdout of ``y``.

    Uses the non-interactive 'agg' backend and returns the pyplot module so
    the caller can save or inspect the figure.
    """
    plt.switch_backend('agg')
    plt.clf()
    # Hold out 1% of the rows (stratified) and plot that slice's labels.
    X_train, X_test, y_train, y_test = train_test_split(
        X[features], y, stratify=y, test_size=0.01)
    X = pd.DataFrame(X_test, columns=features)
    y = pd.Series(y_test)
    viz = ClassBalance()
    viz.fit(y)
    viz.finalize()
    return plt
hr = pd.read_csv("F:/COLLEGE/ML Project/Employee Turnover/Turnover.csv") hr.profile_report(title="Data Report") pd.crosstab(hr.department, hr.quit).plot(kind='bar') plt.title("Turnover Frequency on Salary Bracket") plt.xlabel('Department') plt.ylabel('Frequency of Turnover') cat_vars = ['department', 'salary'] for var in cat_vars: cat_list = pd.get_dummies(hr[var], prefix=var) hr = hr.join(cat_list) print(hr.head()) hr.drop(columns=['department', 'salary'], axis=1, inplace=True) #balance class visualizer = ClassBalance(labels=['stayed', 'left']).fit(hr.quit) visualizer.show() X = hr.loc[:, hr.columns != 'quit'] y = hr.quit X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2, stratify=y) @interact def plot_tree(crit=['gini', 'entropy'], split=['best', 'random'], depth=IntSlider(min=1, max=30, value=2, continuous_update=False), min_split=IntSlider(min=2,
# One-hot encode the categorical columns, then drop the originals.
cat_vars = ['department', 'salary']
for var in cat_vars:
    cat_list = pd.get_dummies(hr[var], prefix = var)
    hr = hr.join(cat_list)
hr.head()
hr.drop(columns = ['department', 'salary'], axis = 1, inplace = True)

# Task 4: Visualize class imbalance
from yellowbrick.target import ClassBalance
plt.style.use("ggplot")
plt.rcParams['figure.figsize'] = (12,8)
visualizer = ClassBalance(labels = ['stayed','quit']).fit(hr.quit)
visualizer.show()

# Task 5: Create training and test sets
# (stratified split because the target classes are imbalanced)
X = hr.loc[:, hr.columns != 'quit']
y = hr.quit
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state = 0, test_size=0.2,stratify=y)

# Task 6: Build an interactive decision tree classifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
def main():
    """Entry point: load the activity dataset, preprocess it (null checks,
    SMOTE oversampling, label encoding, covariance/correlation report,
    train/test split, standardization) and dispatch to the classifier
    module chosen on the command line."""
    script, fname, model = argcheck()
    df = filecheck(fname)
    print(df.head(5))

    # Data stats: rows, columns and overall shape
    print(df.info())
    print("The number of rows")
    print(len(df))
    print("The number of columns")
    print(len(df.columns))
    print("Dataframe shape")
    print(df.shape)

    # Data preprocessing - step 1 (check for any null / N/A values)
    print("\n-------Data Preprocessing - Step 1--------")
    print("------------------------------------------")
    print("Checking for any N/A values")
    print(df.isna().values.any())
    # Check for any Null values
    print("Checking for any null values")
    print(df.isnull().values.any())

    # Data preprocessing - step 2 (addressing the class imbalance problem)
    print("\n-------Data Preprocessing - Step 2--------")
    print("------------------------------------------")
    Y = pd.DataFrame(data=df['Activity'])
    X = df.drop(['Activity'], axis=1)
    print("Before applying SMOTE algorithm")
    print("Unique values and count of target column 'Activity -'")
    print(df.groupby('Activity').nunique())
    unique_labels, frequency = np.unique(Y, return_counts=True)

    # Generating class balance chart before applying SMOTE. The chart is
    # generated as 'Class-balance-Before-SMOTE.png' in the 'output' directory.
    print("The class balance is generated as 'Class-balance-Before-SMOTE.png'")
    visualizer1 = ClassBalance(labels=unique_labels, size=(1400, 1000))
    visualizer1.fit(Y.values.ravel())
    visualizer1.show("output/Class-balance-Before-SMOTE.png")

    # Solving the class imbalance problem by oversampling the data
    smote = SMOTE(random_state=1)
    X_1, Y_1 = smote.fit_resample(X, Y)
    print("After applying SMOTE algorithm")
    X_1_df = pd.DataFrame(data=X_1, columns=X.columns)
    Y_1_df = pd.DataFrame(data=Y_1, columns=Y.columns)
    print("The new shape of the X dataframe")
    print(X_1_df.shape)
    print("The new shape of the Y dataframe")
    print(Y_1_df.shape)
    unique, frequency = np.unique(Y_1, return_counts=True)
    # print unique values array
    print("Unique Values of new Y dataframe:", unique)
    # print frequency array
    print("Frequency Values of new Y dataframe:", frequency)

    # Generating class balance chart after applying SMOTE. The chart is
    # generated as 'Class-balance-After-SMOTE.png' in the 'output' directory.
    print("The class balance is generated as 'Class-balance-After-SMOTE.png'")
    visualizer2 = ClassBalance(labels=unique_labels, size=(1400, 1000))
    visualizer2.fit(Y_1_df.values.ravel())
    visualizer2.show("output/Class-balance-After-SMOTE.png")

    # Data preprocessing - step 3 (label encoding)
    print("\n-------Data Preprocessing - Step 3--------")
    print("------------------------------------------")
    # Convert the string labels to integers:
    # 0 - 'LAYING', 1 - 'SITTING', 2 - 'STANDING', 3 - 'WALKING',
    # 4 - 'WALKING_DOWNSTAIRS', 5 - 'WALKING_UPSTAIRS'
    label_encoder = preprocessing.LabelEncoder()
    Y_1_df['Activity'] = label_encoder.fit_transform(Y_1_df['Activity'])
    print("After label encoding, the target values are")
    classes = Y_1_df['Activity'].unique()
    print(Y_1_df['Activity'])

    # Data preprocessing - step 4 (covariance/correlation, standardization)
    print("\n-------Data Preprocessing - Step 4--------")
    print("------------------------------------------")
    # Covariance and correlation
    dfCov = np.cov(X_1_df, Y_1_df, rowvar=False, bias=True)
    print(dfCov)
    # Calculates Pearson product-moment correlation coefficients
    # NOTE(review): 'bias' is a deprecated no-op parameter of np.corrcoef —
    # confirm the installed numpy version still accepts it.
    dfCorr = np.corrcoef(X_1_df, Y_1_df, rowvar=False, bias=True)
    print("Correlation coefficient obtained : ", dfCorr)

    # Data preprocessing - step 5 (splitting the training and testing dataset)
    print(
        "\n-------Data Preprocessing - Step 5(Splitting into training and testing dataset)--------"
    )
    print("------------------------------------------")
    X_train, X_test, y_train, y_test = train_test_split(X_1_df,
                                                        Y_1_df,
                                                        random_state=1,
                                                        test_size=0.2)

    # Data preprocessing - step 6 (standardize the dataset)
    print("\n-------Data Preprocessing - Step 6--------")
    print("------------------------------------------")
    # Fit the scaler on the training split only, then apply it to both.
    sc_X = preprocessing.StandardScaler()
    X_trainscaled = sc_X.fit_transform(X_train)
    X_testscaled = sc_X.transform(X_test)
    print("Mean of the standardized training set : ", X_trainscaled.mean(axis=0))
    print("std of the standardized training set : ", X_trainscaled.std(axis=0))
    print("Mean of the standardized test set : ", X_testscaled.mean(axis=0))
    print("std of the standardized test set : ", X_testscaled.std(axis=0))

    # Execute different model module based on input from user
    if model == 'decisiontree':
        decisiontree.decisionTreeTest(X_train, X_test, y_train, y_test,
                                      classes, X_1_df, Y_1_df)
    elif model == 'svm':
        svm.svmLinearTest(X_train, X_test, y_train, y_test, classes,
                          X_1_df, Y_1_df)
    elif model == 'svmnonlinear':
        svmnonlinear.svmNonLinearTest(X_train, X_test, y_train, y_test,
                                      X_1_df, Y_1_df)
    elif model == 'naivebayes':
        naiveBayes.naiveBayesClassifierTest(X_train, X_test, y_train, y_test,
                                            X_1_df, Y_1_df)
    elif model == 'logisticregression':
        logisticregression.logisticRegressionTest(X_train, X_test, y_train,
                                                  y_test, X_1_df, Y_1_df)
    elif model == 'knn':
        knn.knnTest(X_train, X_test, y_train, y_test, X_1_df, Y_1_df)
    elif model == 'bagging':
        bagging.baggingTest(X_train, X_test, y_train, y_test, X_1_df, Y_1_df)
    elif model == 'adaboost':
        adaboost.adaboostTest(X_train, X_test, y_train, y_test, X_1_df, Y_1_df)
    elif model == 'randomforest':
        randomforest.randomForestTest(X_train, X_test, y_train, y_test,
                                      X_1_df, Y_1_df)
    elif model == 'ensemblevote':
        ensemble.ensembleClassifier(X_train, X_test, y_train, y_test,
                                    X_1_df, Y_1_df)
    else:
        print("please enter the correct classifier name")
        sys.exit()
# ============================================================================= #morg_2_train_strs_broken #morg_2_test_strs_broken #activ_inact_train #activ_inact_test frames = [morg_2_train_strs_broken, activ_inact_train] import pandas as pd dfrad = pd.concat(frames, axis=1) dfrad = dfrad.dropna() #dfrad.iloc[:,[2048]] #dfrad.iloc[:,:100] #CLASS BALANCE - No balanced from yellowbrick.target import ClassBalance visCB = ClassBalance(labels=[1, 0]) visCB.fit(dfrad['activities']) #Fit the data to the visualizer visCB.show() #Finalize and render the figure #RANK 2D "Pearson correlation" -No balanced from yellowbrick.features import Rank2D visualizer = Rank2D(algorithm='pearson') visualizer.fit(dfrad.iloc[:, :50], dfrad['activities']) # Fit the data to the visualizer visualizer.transform(dfrad.iloc[:, :50]) # Transform the data visualizer.show() # Finalize and render the figure #MANIFOLD - No balanced from yellowbrick.features import Manifold classes = [1, 0] from sklearn import preprocessing
def visualizeClassImbalance(labels_train, lables_test=None):
    """Render a class-balance plot of train (and optional test) labels.

    NOTE(review): the parameter name 'lables_test' looks like a typo for
    'labels_test', but it is kept as-is so keyword callers keep working.
    """
    balance_plot = ClassBalance(labels=["boring", "interesting"])
    balance_plot.fit(labels_train, lables_test)
    balance_plot.poof()
def draw_class_balance(self):
    """Plot the training-label distribution, labeled with the encoder's classes."""
    balance = ClassBalance(labels=self.le.classes_)
    balance.fit(self.training_labels)
    balance.poof()
# Target: the 'eventdeath' column of the dataframe.
y = df['eventdeath']
display(X.shape)
display(y.shape)

#%%
import matplotlib.pyplot as plt
from yellowbrick.target import ClassBalance

# Side-by-side pie and bar charts of the survived/deceased label counts.
_, y_counts = np.unique(y, return_counts=True)
class_labels = ["survived", "deceased"]
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(9,4.5))
ax1.pie(y_counts, explode=(0, 0.05), labels = class_labels)
visualizer = ClassBalance(labels = class_labels, ax = ax2)
visualizer.fit(y)
visualizer.finalize()
plt.show()

#%%
print("Number of missing values:", X.isna().sum().sum())

#%%
X["timerecurrence"].describe()

#%%
# for column in X.columns[2:16]:
#     plt.scatter(X[column], y)
#     plt.xlabel(column)
# %%
# Turnover frequency per salary bracket.
pd.crosstab(hr.salary, hr.quit).plot(kind='bar')
plt.title('TurnOver Frequency on Salary Bracket')
plt.xlabel('Salary')
plt.ylabel('Frequency of TurnOver')
plt.show()

# %%
# Turnover frequency per department.
pd.crosstab(hr.department, hr.quit).plot(kind='bar')
plt.title('TurnOver Frequency for Department')
plt.xlabel('Department')
plt.ylabel('Frequency of TurnOver')
plt.show()

# %%
hr.drop(columns=['department', 'salary'], axis=1, inplace=True)

# %%
# Visualize the class imbalance of the target.
visualizer = ClassBalance(labels=['stayed', 'quit'])
visualizer.fit(hr.quit)
visualizer.show()

# %%
# Stratified train/test split (target is imbalanced).
X = hr.loc[:, hr.columns != 'quit']
y = hr.quit
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)

# %%
@interact
def plot_tree(crit=['gini', 'entropy'],
              split=['best', 'random'],
              # FIX: the ipywidgets keyword is 'continuous_update' — the
              # original 'continous_update' spelling is not a valid
              # IntSlider argument.
              depth=IntSlider(min=1, max=30, value=2,
                              continuous_update=False),
              min_split=IntSlider(min=2, max=5, value=2,
                                  continuous_update=False),
              min_leaf=IntSlider(min=1, max=5, value=1,
                                 continuous_update=False)):
    """Interactively fit a decision tree with the chosen hyper-parameters."""
    estimator = DecisionTreeClassifier(random_state=0,
                                       criterion=crit,
                                       splitter=split,
                                       max_depth=depth,
                                       min_samples_split=min_split,
                                       min_samples_leaf=min_leaf)
    estimator.fit(X_train, y_train)
# código pronto y_top25 = top_p(test_y) y_top25.mean() # In[69]: # código pronto from yellowbrick.target import ClassBalance visualizer = ClassBalance(labels=["75%", "25%"]) visualizer.fit(y_top25) visualizer.show() # ## Para saber mais: agrupando # # O `yellowbrick` possui uma função para visualizar possíveis binnings. O código a seguir mostra 4 sugestões de pontos para agrupamento. Não usaremos a sugestão do yellowbrick pois no nosso caso o cliente já definiu que queria os X% do topo. # In[70]: # código pronto from yellowbrick.target import BalancedBinningReference
# # ### Task 4: Visualize Class Imbalance # --- # In[16]: from yellowbrick.target import ClassBalance # machine learning diagnostic library plt.style.use("ggplot") plt.rcParams['figure.figsize'] = (12, 8) # In[17]: # SHOW THE TARGET VARIABLES IN 1 AND 0 IN THE FORM OF STAYED OR QUIT visualizer = ClassBalance(labels=['stayed', 'quit']).fit( data.quit) # fit(target_variable) visualizer.show() # ### Task 5: Create Training and Test Sets # --- # In[18]: x = data.loc[:, data.columns != 'quit'] y = data.quit # In[19]: from sklearn.model_selection import train_test_split # since the dataset is unbalanced, lets stratify the target variables
def class_balance(classes, y):
    """Show a class-balance bar chart of ``y`` using the given class labels."""
    from yellowbrick.target import ClassBalance

    chart = ClassBalance(labels=classes)
    chart.fit(y)
    chart.poof()
def describe( context: MLClientCtx, table: Union[DataItem, str], label_column: str, class_labels: List[str], key: str = "table-summary", ) -> None: """Summarize a table TODO: merge with dask version :param context: the function context :param table: pandas dataframe :param key: key of table summary in artifact store """ _gcf_clear(plt) base_path = context.artifact_path os.makedirs(base_path, exist_ok=True) os.makedirs(base_path + "/plots", exist_ok=True) print(f'TABLE {table}') table = pd.read_parquet(str(table)) header = table.columns.values # describe table sumtbl = table.describe() sumtbl = sumtbl.append(len(table.index) - table.count(), ignore_index=True) sumtbl.insert( 0, "metric", ["count", "mean", "std", "min", "25%", "50%", "75%", "max", "nans"]) sumtbl.to_csv(os.path.join(base_path, key + ".csv"), index=False) context.log_artifact(key, local_path=key + ".csv") # plot class balance, record relative class weight _gcf_clear(plt) labels = table.pop(label_column) class_balance_model = ClassBalance(labels=class_labels) class_balance_model.fit(labels) scale_pos_weight = class_balance_model.support_[ 0] / class_balance_model.support_[1] #context.log_artifact("scale_pos_weight", f"{scale_pos_weight:0.2f}") context.log_artifact("scale_pos_weight", str(scale_pos_weight)) class_balance_model.show( outpath=os.path.join(base_path, "plots/imbalance.png")) context.log_artifact(PlotArtifact("imbalance", body=plt.gcf()), local_path="plots/imbalance.html") # plot feature correlation _gcf_clear(plt) tblcorr = table.corr() ax = plt.axes() sns.heatmap(tblcorr, ax=ax, annot=False, cmap=plt.cm.Reds) ax.set_title("features correlation") plt.savefig(os.path.join(base_path, "plots/corr.png")) context.log_artifact(PlotArtifact("correlation", body=plt.gcf()), local_path="plots/corr.html") # plot histogram _gcf_clear(plt) """
if (ngram.isupper()): bs[index] = features else: o[(index * n_ngrams) + i] = features bs = bs[~np.all(bs == 0, axis=1)] o = o[~np.all(o == 0, axis=1)] binding_sites = bs other = o binding_sites_labels = np.ones(binding_sites.shape[0], dtype=np.uint8) other_labels = np.zeros(other.shape[0], dtype=np.uint8) X = np.concatenate((binding_sites, other)) y = np.concatenate((binding_sites_labels, other_labels)) # %% visualizer = ClassBalance(labels=class_names) visualizer.fit(y) visualizer.poof() # %% visualizer = ParallelCoordinates() visualizer.fit_transform(X, y) visualizer.poof() # %% visualizer = Rank1D() visualizer.fit(X, y) visualizer.transform(X) visualizer.poof() # %%
import pandas as pd
import datetime
from yellowbrick.target import ClassBalance

print(datetime.datetime.now())

# Input CSVs — presumably the cleaned dataset plus several resampled
# variants (borderline, ADASYN, Tomek); only 'pathr4' is actually read
# below. TODO confirm against the preprocessing step that wrote them.
path = 'data/cleaned_data.csv'
pathr = 'data/resampled.csv'
pathr2 = 'data/resampled_borderline.csv'
pathr3 = 'data/resampled_adasyn.csv'
pathr4 = 'data/resampled_tomek.csv'
randomState = 42
classLabels = ['Not Bankrupt', 'Bankrupt']

df = pd.read_csv(pathr4, index_col=0)
print('Import done.')

# Extract labels from features
y = df['BK']
X = df.drop('BK', axis=1)

# Instantiate Visualizer, plot the class balance and print per-class counts.
viz = ClassBalance(labels=classLabels)
viz.fit(y)
viz.show()
print(viz.support_)
#Task 3: Encode Categorical Features
# One-hot encode the categorical columns, then drop the originals.
cat_vars = ['department', 'salary']
for var in cat_vars:
    cat_list = pd.get_dummies(hr[var], prefix=var)
    hr = hr.join(cat_list)
hr.head()
hr.drop(columns=['department', 'salary'], axis=1, inplace=True)
hr.head()

#Task 4: Visualize Class Imbalance
from yellowbrick.target import ClassBalance
plt.style.use("ggplot")
plt.rcParams['figure.figsize'] = (12, 8)
# FIX: the ClassBalance keyword is 'labels' (not 'label'), and 'satyed'
# was a typo for 'stayed' (matching the sibling notebooks).
visualizer = ClassBalance(labels=['stayed', 'quit']).fit(hr.quit)
visualizer.show()

#Task 5: Create Training and Test Sets
X = hr.loc[:, hr.columns != 'quit']
y = hr.quit
from sklearn.model_selection import train_test_split as tts
# FIX: 'Y' was undefined — the target defined above is lowercase 'y'.
X_train, X_test, y_train, y_test = tts(X, y, stratify=y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=0,
                                                    test_size=0.2,
                                                    stratify=y)
df.head()
# Drop the raw categorical columns (already encoded earlier).
df.drop(columns=['department', 'salary'], axis=1, inplace=True)

"""### Now, it's really important to check for Class Imbalance in our dataset here

### Visualize Class Imbalance
---
"""

from yellowbrick.target import ClassBalance
plt.style.use("ggplot")
plt.rcParams['figure.figsize'] = (12,8)

visualizer = ClassBalance(labels=["stayed", "quit"])
visualizer.fit(df.quit)

"""### Create Training and Test Sets
---
"""

# Stratified split because the target classes are imbalanced.
x = df.loc[:,df.columns != 'quit']
y = df.quit

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state =0, test_size=0.2, stratify=y)

"""### Building an Interactive Decision Tree Classifier
---
df.head() # Dropping categorical variables df.drop(columns=['department','salary'], axis=1, inplace=True) """# Step 4: Visualize Class Imbalance --- """ from yellowbrick.target import ClassBalance plt.style.use("ggplot") plt.rcParams['figure.figsize'] = (12,8) visualizer = ClassBalance(labels=["stayed", "quit"]) visualizer.fit(df.quit) """# Step 5: Create Training and Test Sets --- """ X = df.loc[:, df.columns != 'quit'] y = df.quit from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y) """# Step 6: Classification using Decision Tree Classifier
# Sanity check: with p=0.90 only the top 10% (the last element of 1..10)
# should be flagged.
top_10 = top_p(pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), p=0.90).values
if not np.array_equal(top_10, [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]):
    print("Não retornou o top 10% corretamente, deveria ser", top_10)

# pre-written code: take the top 25% of the test targets
y_top25 = top_p(test_y)
y_top25.mean()

# pre-written code: visualize the 75% / 25% class split
from yellowbrick.target import ClassBalance
# FIX: the notebook export garbled this call
# ('ClassBalance(labels=["75get_ipython().run_line_magic(...') —
# restored to the intended labels, matching the sibling notebook cell.
visualizer = ClassBalance(labels=["75%", "25%"])
visualizer.fit(y_top25)
visualizer.poof()

# pre-written code: binning-reference suggestion over the training target
from yellowbrick.target import BalancedBinningReference
visualizer = BalancedBinningReference()
visualizer.fit(train_y)
visualizer.poof()

# pre-written code
from sklearn.dummy import DummyClassifier