for var in cat_vars:
  cat_list = pd.get_dummies(hr[var], prefix = var)
  hr = hr.join(cat_list)

hr.head()

hr.drop(columns = ['department', 'salary'], axis = 1, inplace = True)

#Task 4: Visualize class imbalance

from yellowbrick.target import ClassBalance
plt.style.use("ggplot")
plt.rcParams['figure.figsize'] = (12,8)

visualizer =  ClassBalance(labels = ['stayed','quit']).fit(hr.quit)
visualizer.show()

#Task 5: Create traning and tests sets

X = hr.loc[:, hr.columns != 'quit']
y = hr.quit

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state = 0, test_size=0.2,stratify=y)

#Task 6 : Build an interactive decision tree classifer

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
Ejemplo n.º 2
0
def main():

    script, fname, model = argcheck()

    df = filecheck(fname)

    print(df.head(5))

    #Data stats
    #Printing the number of rows and columns
    print(df.info())

    print("The number of rows")
    print(len(df))

    print("The number of columns")
    print(len(df.columns))

    print("Dataframe shape")
    print(df.shape)

    #Data preprocessing - step 1(Check for any null - N/A values)

    print("\n-------Data Preprocessing - Step 1--------")
    print("------------------------------------------")

    print("Checking for any N/A values")
    print(df.isna().values.any())

    #Check for any Null values
    print("Checking for any null values")
    print(df.isnull().values.any())

    #Data Preprocessing - step 2(Addressing class imbalance problem)

    print("\n-------Data Preprocessing - Step 2--------")
    print("------------------------------------------")

    Y = pd.DataFrame(data=df['Activity'])
    X = df.drop(['Activity'], axis=1)

    print("Before applying SMOTE algorithm")
    print("Unique values and count of target column 'Activity -'")
    print(df.groupby('Activity').nunique())

    unique_labels, frequency = np.unique(Y, return_counts=True)
    #Generating class balance chart before applying SMOTE. The chart is generated as 'Class-balance-Before-SMOTE.png' in the 'output directory'
    print("The class balance is generated as 'Class-balance-Before-SMOTE.png'")
    visualizer1 = ClassBalance(labels=unique_labels, size=(1400, 1000))
    visualizer1.fit(Y.values.ravel())
    visualizer1.show("output/Class-balance-Before-SMOTE.png")

    #Solving the class imbalance problem by oversampling the data
    smote = SMOTE(random_state=1)
    X_1, Y_1 = smote.fit_resample(X, Y)

    print("After applying SMOTE algorithm")
    X_1_df = pd.DataFrame(data=X_1, columns=X.columns)
    Y_1_df = pd.DataFrame(data=Y_1, columns=Y.columns)

    print("The new shape of the X dataframe")
    print(X_1_df.shape)

    print("The new shape of the Y dataframe")
    print(Y_1_df.shape)

    unique, frequency = np.unique(Y_1, return_counts=True)
    # print unique values array
    print("Unique Values of new Y dataframe:", unique)

    # print frequency array
    print("Frequency Values of new Y dataframe:", frequency)

    #Generating class balance chart after applying SMOTE. The chart is generated as 'Class-balance-After-SMOTE.png' in the 'output directory'
    print("The class balance is generated as 'Class-balance-After-SMOTE.png'")
    visualizer2 = ClassBalance(labels=unique_labels, size=(1400, 1000))
    visualizer2.fit(Y_1_df.values.ravel())
    visualizer2.show("output/Class-balance-After-SMOTE.png")

    #Data Preprocessing - step 3(Label Encoding)
    print("\n-------Data Preprocessing - Step 3--------")
    print("------------------------------------------")

    #Convert the string labels to integers
    # 0- 'LAYING'
    # 1 - 'SITTING'
    # 2 - 'STANDING'
    # 3 - 'WALKING'
    # 4 - 'WALKING_DOWNSTAIRS'
    # 5 - 'WALKING_UPSTAIRS'
    label_encoder = preprocessing.LabelEncoder()
    Y_1_df['Activity'] = label_encoder.fit_transform(Y_1_df['Activity'])
    print("After label encoding, the target values are")
    classes = Y_1_df['Activity'].unique()
    print(Y_1_df['Activity'])

    #Data Preprocessing - step 4(Covariance/Correlation, standardization)

    print("\n-------Data Preprocessing - Step 4--------")
    print("------------------------------------------")
    #Covariance and correlation - Task 1(Preeti)
    dfCov = np.cov(X_1_df, Y_1_df, rowvar=False, bias=True)
    print(dfCov)

    #Calculates Pearson product-moment correlation coefficients
    dfCorr = np.corrcoef(X_1_df, Y_1_df, rowvar=False, bias=True)
    print("Correlation coefficient obtained : ", dfCorr)

    #Data preprocessing - Step 5(Splitting the training and testing dataset) (JunYong or Preeti)
    print(
        "\n-------Data Preprocessing - Step 5(Splitting into training and testing dataset)--------"
    )
    print("------------------------------------------")
    X_train, X_test, y_train, y_test = train_test_split(X_1_df,
                                                        Y_1_df,
                                                        random_state=1,
                                                        test_size=0.2)

    #Data preprocessing - Step 6(Standardize the dataset)
    print("\n-------Data Preprocessing - Step 6--------")
    print("------------------------------------------")
    sc_X = preprocessing.StandardScaler()
    X_trainscaled = sc_X.fit_transform(X_train)
    X_testscaled = sc_X.transform(X_test)

    print("Mean of the standardized training set : ",
          X_trainscaled.mean(axis=0))
    print("std of the standardized training set : ", X_trainscaled.std(axis=0))

    print("Mean of the standardized test set : ", X_testscaled.mean(axis=0))
    print("std of the standardized test set : ", X_testscaled.std(axis=0))

    # Execute different model module based on input from user
    if model == 'decisiontree':
        decisiontree.decisionTreeTest(X_train, X_test, y_train, y_test,
                                      classes, X_1_df, Y_1_df)

    elif model == 'svm':
        svm.svmLinearTest(X_train, X_test, y_train, y_test, classes, X_1_df,
                          Y_1_df)

    elif model == 'svmnonlinear':
        svmnonlinear.svmNonLinearTest(X_train, X_test, y_train, y_test, X_1_df,
                                      Y_1_df)

    elif model == 'naivebayes':
        naiveBayes.naiveBayesClassifierTest(X_train, X_test, y_train, y_test,
                                            X_1_df, Y_1_df)

    elif model == 'logisticregression':
        logisticregression.logisticRegressionTest(X_train, X_test, y_train,
                                                  y_test, X_1_df, Y_1_df)

    elif model == 'knn':
        knn.knnTest(X_train, X_test, y_train, y_test, X_1_df, Y_1_df)
    elif model == 'bagging':
        bagging.baggingTest(X_train, X_test, y_train, y_test, X_1_df, Y_1_df)
    elif model == 'adaboost':
        adaboost.adaboostTest(X_train, X_test, y_train, y_test, X_1_df, Y_1_df)
    elif model == 'randomforest':
        randomforest.randomForestTest(X_train, X_test, y_train, y_test, X_1_df,
                                      Y_1_df)
    elif model == 'ensemblevote':
        ensemble.ensembleClassifier(X_train, X_test, y_train, y_test, X_1_df,
                                    Y_1_df)
    else:
        print("please enter the correct classifier name")
        sys.exit()
Ejemplo n.º 3
0
#morg_2_test_strs_broken
#activ_inact_train
#activ_inact_test

frames = [morg_2_train_strs_broken, activ_inact_train]
import pandas as pd
dfrad = pd.concat(frames, axis=1)
dfrad = dfrad.dropna()
#dfrad.iloc[:,[2048]]
#dfrad.iloc[:,:100]

#CLASS BALANCE - No balanced
from yellowbrick.target import ClassBalance
visCB = ClassBalance(labels=[1, 0])
visCB.fit(dfrad['activities'])  #Fit the data to the visualizer
visCB.show()  #Finalize and render the figure

#RANK 2D "Pearson correlation" -No balanced
from yellowbrick.features import Rank2D
visualizer = Rank2D(algorithm='pearson')
visualizer.fit(dfrad.iloc[:, :50],
               dfrad['activities'])  # Fit the data to the visualizer
visualizer.transform(dfrad.iloc[:, :50])  # Transform the data
visualizer.show()  # Finalize and render the figure

#MANIFOLD - No balanced
from yellowbrick.features import Manifold
classes = [1, 0]
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder(
)  #label_encoder object knows how to understand word labels.
import pandas as pd
import datetime
from yellowbrick.target import ClassBalance

print(datetime.datetime.now())

path = 'data/cleaned_data.csv'
pathr = 'data/resampled.csv'
pathr2 = 'data/resampled_borderline.csv'
pathr3 = 'data/resampled_adasyn.csv'
pathr4 = 'data/resampled_tomek.csv'
randomState = 42
classLabels = ['Not Bankrupt', 'Bankrupt']

df = pd.read_csv(pathr4, index_col=0)

print('Import done.')

# Extract labels from features
y = df['BK']
X = df.drop('BK', axis=1)

# Instantiate Visualizer
viz = ClassBalance(labels=classLabels)

viz.fit(y)
viz.show()
print(viz.support_)
Ejemplo n.º 5
0
def describe(
    context: MLClientCtx,
    table: Union[DataItem, str],
    label_column: str,
    class_labels: List[str],
    key: str = "table-summary",
) -> None:
    """Summarize a table

    TODO: merge with dask version

    :param context:         the function context
    :param table:           pandas dataframe
    :param key:             key of table summary in artifact store
    """
    _gcf_clear(plt)

    base_path = context.artifact_path
    os.makedirs(base_path, exist_ok=True)
    os.makedirs(base_path + "/plots", exist_ok=True)

    print(f'TABLE {table}')
    table = pd.read_parquet(str(table))
    header = table.columns.values

    # describe table
    sumtbl = table.describe()
    sumtbl = sumtbl.append(len(table.index) - table.count(), ignore_index=True)
    sumtbl.insert(
        0, "metric",
        ["count", "mean", "std", "min", "25%", "50%", "75%", "max", "nans"])

    sumtbl.to_csv(os.path.join(base_path, key + ".csv"), index=False)
    context.log_artifact(key, local_path=key + ".csv")

    # plot class balance, record relative class weight
    _gcf_clear(plt)

    labels = table.pop(label_column)
    class_balance_model = ClassBalance(labels=class_labels)
    class_balance_model.fit(labels)

    scale_pos_weight = class_balance_model.support_[
        0] / class_balance_model.support_[1]
    #context.log_artifact("scale_pos_weight", f"{scale_pos_weight:0.2f}")
    context.log_artifact("scale_pos_weight", str(scale_pos_weight))

    class_balance_model.show(
        outpath=os.path.join(base_path, "plots/imbalance.png"))
    context.log_artifact(PlotArtifact("imbalance", body=plt.gcf()),
                         local_path="plots/imbalance.html")

    # plot feature correlation
    _gcf_clear(plt)
    tblcorr = table.corr()
    ax = plt.axes()
    sns.heatmap(tblcorr, ax=ax, annot=False, cmap=plt.cm.Reds)
    ax.set_title("features correlation")
    plt.savefig(os.path.join(base_path, "plots/corr.png"))
    context.log_artifact(PlotArtifact("correlation", body=plt.gcf()),
                         local_path="plots/corr.html")

    # plot histogram
    _gcf_clear(plt)
    """
Ejemplo n.º 6
0
#%%
# for column in X.columns[2:16]:
#     plt.scatter(X[column], y)
#     plt.xlabel(column)
#     plt.show()

#%%
from yellowbrick.features.radviz import RadViz 

features = X.columns[:13]
visualizer = RadViz(classes=class_labels, features=features)

visualizer.fit(X[features], y)
visualizer.transform(df[features])
visualizer.show()

#%%
from yellowbrick.target import FeatureCorrelation

visualizer = FeatureCorrelation(labels=features)

visualizer.fit(X[features], y)        # Fit the data to the visualizer
visualizer.show()           # Finalize and render the figure

#%%
from yellowbrick.features import JointPlotVisualizer

visualizer = JointPlotVisualizer()

visualizer.fit_transform(X["grade"], y)        # Fit and transform the data