Example #1
 def visualize_target(self, df):
     logger.info("In DataVisualisation | visualize_target started")
     try:
         target_encoder = EncoderStore.get('target')
         labels = target_encoder.classes_
         logger.debug("Target Labels : " + str(labels))
         inverse_target = target_encoder.inverse_transform(df['target'])
         target_as_no = (inverse_target == 'no').sum()
         target_as_yes = (inverse_target == 'yes').sum()
         sizes = [target_as_no, target_as_yes]
         logger.debug("Target counts : " + str(sizes))
         colors = ['lightcoral', 'yellowgreen']
         patches, texts, percent = plt.pie(sizes,
                                           colors=colors,
                                           autopct='%1.1f%%',
                                           labels=labels,
                                           startangle=90,
                                           wedgeprops={'edgecolor': 'w'})
         plt.legend(patches, labels, loc="best")
         plt.axis('equal')
         plt.tight_layout()
         plt.ion()
         plt.show()
         plt.savefig(
             os.path.join(VISUALIZATION_SAVE_DIRECTORY,
                          'pie_visualization_target'))
         plt.pause(1)
         plt.close()
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In DataVisualisation | visualize_target finished")
 def get_binned_data(self, df=None, bins_per_col=4):
     logger.info("In DataFrameHandler | get_binned_data started")
     if df is None:
         df = self.data_frame_original
     try:
         binned_dataframe = df.copy()
         for col in self.numerical_cols:
             bins = np.linspace(binned_dataframe[col].min(),
                                binned_dataframe[col].max(),
                                bins_per_col + 1)
             binned_dataframe[col] = pd.cut(binned_dataframe[col],
                                            bins,
                                            precision=1,
                                            include_lowest=True,
                                            right=True)
             cat_list = pd.get_dummies(binned_dataframe[col], prefix=col)
             binned_dataframe = binned_dataframe.join(cat_list)
             binned_dataframe = binned_dataframe.drop(col, axis=1)
         logger.debug('Columns after Dummy Encoding : ' +
                      str(binned_dataframe.columns.values))
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In DataFrameHandler | get_binned_data finished")
     return binned_dataframe
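A minimal sketch of the same equal-width-binning-plus-dummies transform on a toy frame (the 'age' column here is made up):

import numpy as np
import pandas as pd

# Toy stand-in for self.data_frame_original
df = pd.DataFrame({'age': [22, 35, 47, 58, 63]})

# Same steps as get_binned_data: equal-width edges, cut, then dummy-encode
bins = np.linspace(df['age'].min(), df['age'].max(), 4 + 1)  # 4 bins need 5 edges
df['age'] = pd.cut(df['age'], bins, precision=1, include_lowest=True, right=True)
df = df.join(pd.get_dummies(df['age'], prefix='age')).drop('age', axis=1)
print(df.columns.tolist())  # four one-hot columns, one per 'age' interval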
Example #3
def create_model():
    logger.info('In RandomForestModel | create_model started')
    n_estimators = [int(x) for x in np.linspace(start=10, stop=1000, num=100)]
    max_features = ['auto', 'sqrt']  # note: 'auto' == 'sqrt' for classifiers; removed in sklearn >= 1.3
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap
    }
    random_forest = RandomForestClassifier(random_state=42)
    rand = RandomizedSearchCV(
        random_forest,
        random_grid,
        random_state=42,
        n_iter=10,
        cv=10,
        n_jobs=-1,
        scoring=['recall', 'accuracy', 'neg_log_loss', 'f1', 'roc_auc'],
        refit='accuracy')
    logger.info('In RandomForestModel | create_model finished')
    return random_forest, rand
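A hypothetical usage sketch for the returned pair, assuming the RandomForest create_model above is in scope and using synthetic data:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=10, random_state=42)
model, search = create_model()
search.fit(X, y)                  # samples 10 combinations, 10-fold CV each
print(search.best_params_)        # the combination that scored best on accuracy
best_rf = search.best_estimator_  # already refit on all data (refit='accuracy')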
Example #4
def create_model():
    logger.info('In LogisticRegression | create_model started')
    hyperparameters = [{
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
        'C': np.logspace(0, 10, 100)
    }, {
        'solver': ['newton-cg', 'sag', 'lbfgs'],
        'penalty': ['l2'],
        'C': np.logspace(0, 10, 100)
    }, {
        'solver': ['saga'],
        # note: sampling 'elasticnet' also requires an 'l1_ratio' entry in this grid
        'penalty': ['l1', 'l2', 'elasticnet'],
        'C': np.logspace(0, 10, 100)
    }]
    logreg = LogisticRegression(solver='saga', random_state=42)
    rand = RandomizedSearchCV(
        logreg,
        hyperparameters,
        random_state=42,
        n_iter=10,
        cv=10,
        n_jobs=-1,
        scoring=['recall', 'accuracy', 'neg_log_loss', 'f1', 'roc_auc'],
        refit='accuracy')
    logger.info('In LogisticRegression | create_model finished')
    return logreg, rand
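The hyperparameter grid is split per solver because scikit-learn rejects unsupported solver/penalty pairs at fit time; a quick check on toy data:

from sklearn.linear_model import LogisticRegression

try:
    # lbfgs supports only the l2 penalty, so this pairing is rejected
    LogisticRegression(solver='lbfgs', penalty='l1').fit([[0.0], [1.0]], [0, 1])
except ValueError as exc:
    print('rejected:', exc)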
 def impute_missing_values(self, df, missing_val_info, method='strategic'):
     logger.info("In MissingValue | impute_missing_value started")
     try:
         possible_methods = ['strategic', 'knn', 'mice']
         if method in possible_methods:
             if method == 'strategic':
                 for col in df.columns:
                     if missing_val_info[col]['percentage'] > 0:
                         logger.debug('Strategically imputing column : ' + str(col))
                         column_imputation_method = COLUMN_WISE_IMPUTE_TECHNIQUE_MAP.get(col)
                         if column_imputation_method == 'mode':
                             self.__impute_by_mode(df, col)
                         elif column_imputation_method == 'mean':
                             self.__impute_by_mean(df, col)
                         elif column_imputation_method == 'median':
                             self.__impute_by_median(df, col)
                         elif column_imputation_method == 'value':
                             self.__impute_by_value(df, col, 0)
             elif method == 'knn':
                 self.__impute_by_knn(df)
             elif method == 'mice':
                 self.__impute_by_mice(df)
         else:
             logger.error("Incorrect Imputation Method !!! Possible values : strategic, knn, mice")
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In MissingValue | impute_missing_value finished")
 def outlier_DBSCAN(self, df, numerical_cols):
     logger.info("In OutlierDetection | outlier_DBSCAN started")
     try:
         for col in numerical_cols:
             outlier_detector = DBSCAN(eps=.5,
                                       metric='euclidean',
                                       min_samples=5,
                                       n_jobs=-1)
             clusters = outlier_detector.fit_predict(df[[col]])
             cmap = cm.get_cmap('Set1')
             df.plot.scatter(x=col,
                             y='target',
                             c=clusters,
                             cmap=cmap,
                             colorbar=False)
             plt.ion()
             plt.show()
             plt.savefig(
                 os.path.join(VISUALIZATION_SAVE_DIRECTORY,
                              'outlier_visualization_dbscan' + str(col)))
             plt.pause(1)
             plt.close()
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In OutlierDetection | visualize_outlier finished")
Example #7
 def visualize_feature_correlation_heat_map(self, df):
     logger.info(
         "In DataVisualisation | visualize_feature_correlation_heat_map started"
     )
     try:
         fig, ax = plt.subplots(figsize=(20, 20))
         chart = sns.heatmap(df.corr(),
                             ax=ax,
                             annot=True,
                             vmin=-1,
                             vmax=1,
                             center=0,
                             cmap='coolwarm')
         chart.set_xticklabels(chart.get_xticklabels(), rotation=90)
         chart.set_yticklabels(chart.get_yticklabels(), rotation=0)
         plt.ion()
         plt.show()
         plt.savefig(
             os.path.join(VISUALIZATION_SAVE_DIRECTORY,
                          'sns_correlation_heatmap'))
         plt.pause(1)
         plt.close()
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info(
         "In DataVisualisation | visualize_feature_correlation_heat_map finished"
     )
 def __impute_by_mice(self, df):
     logger.info("In MissingValue | __impute_by_mice started")
     try:
         # note: mice (e.g. impyute's) may return a bare ndarray rather than a DataFrame
         df = mice(data=df)
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In MissingValue | __impute_by_mice finished")
     return df
 def __impute_by_value(self, df, col, value):
     logger.info("In MissingValue | __impute_by_value started")
     try:
         logger.debug("Value to replace NAN for column " + str(col) + " : " + str(value))
         df[col] = df[col].fillna(value)
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In MissingValue | __impute_by_value finished")
Example #10
 def detect_categorical_columns(self, df):
     logger.info("In PreProcessor | detect_categorical_columns started")
     try:
         logger.debug("In detect_categorical_columns | " + str(df.dtypes))
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In PreProcessor | detect_categorical_columns finished")
     return df.columns[df.dtypes == object]  # np.object was removed in NumPy >= 1.24
 def __impute_by_mode(self, df, col):
     logger.info("In MissingValue | __impute_by_mode started")
     try:
         column_mode = df[col].mode()[0]  # mode() returns a Series; take the first modal value
         logger.debug("Mode obtained for column " + str(col) + " : " + str(column_mode))
         df[col] = df[col].fillna(column_mode)
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In MissingValue | __impute_by_mode finished")
 def __impute_by_knn(self, df):
     logger.info("In MissingValue | __impute_by_knn started")
     try:
         logger.debug("Applying KNN for imputation with k=1")
         # note: fast_knn (e.g. impyute's) may return a bare ndarray rather than a DataFrame
         df = fast_knn(k=1, data=df)
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In MissingValue | __impute_by_knn finished")
     return df
Example #13
def create_model(colnum):
    logger.info('In NeuralNetworkModel | create_model Started')
    model = Sequential()
    model.add(Dense(150, activation='relu', input_shape=(colnum, )))
    model.add(Dense(100, activation='relu'))  # input_shape is only needed on the first layer
    model.add(Dense(50, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.summary()  # summary() prints directly; wrapping it in print() just echoes None
    logger.info('In NeuralNetworkModel | create_model finished')
    return model
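The two-unit softmax head implies one-hot targets; a minimal compile-and-train sketch (the optimizer, loss, and to_categorical import are assumptions not shown in the source):

import numpy as np
from tensorflow.keras.utils import to_categorical

X = np.random.rand(100, 20).astype('float32')        # 100 rows, colnum = 20 features
y = to_categorical(np.random.randint(0, 2, 100), 2)  # one-hot targets for softmax(2)
model = create_model(colnum=20)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=5, batch_size=32, verbose=0)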
def create_model():
    logger.info('In SVMModel | create_model started')
    C = np.logspace(0, 10, 100)
    kernel = ['linear', 'poly', 'rbf', 'sigmoid']
    gamma = ['scale', 'auto']
    hyperparameters = dict(C=C, kernel=kernel, gamma=gamma)
    svc_classifier = SVC(random_state=42, probability=True, cache_size=2000, tol=0.1)
    rand = RandomizedSearchCV(svc_classifier, hyperparameters, random_state=42, n_iter=100, cv=10, n_jobs=-1,
                              scoring=['recall', 'accuracy'], refit='accuracy')
    logger.info('In SVMModel | create_model finished')
    return svc_classifier, rand
Example #15
 def apply_min_max_scaling(self, df):
     logger.info("In PreProcessor | apply_min_max_scaling started")
     try:
         scaler = MinMaxScaler()
         for col in self.get_numeric_cols(df):
             df[col] = scaler.fit_transform(df[[col]])
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In PreProcessor | apply_min_max_scaling finished")
     return df
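MinMaxScaler scales each column independently anyway, so the per-column loop above can be collapsed into a single call; an equivalent sketch (column names made up):

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

df = pd.DataFrame({'age': [20, 40, 60], 'balance': [0.0, 500.0, 1000.0]})
num_cols = ['age', 'balance']
df[num_cols] = MinMaxScaler().fit_transform(df[num_cols])
print(df)  # both columns now span [0, 1]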
Example #16
 def load_data(self, default_directory=DEFAULT_DIRECTORY):
     logger.info("In PreProcessor | load_data started")
     try:
         data_file = os.path.join(default_directory, DATA_CSV_FILENAME)
         logger.debug("In load_data | Reading Data File : " + data_file)
         df = pd.read_csv(data_file)
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In PreProcessor | load_data finished")
     return df
 def convert_numerical_to_Categorical(self, df, eligible_cols):
     try:
         logger.info(
             "In FeatureEngineering | convert_numerical_to_Categorical started")
         d1 = df.copy()
         # assumed minimal conversion: cast each eligible column to the pandas 'category' dtype
         for col in eligible_cols:
             d1[col] = d1[col].astype('category')
         logger.info(
             "In FeatureEngineering | convert_numerical_to_Categorical finished")
         return d1
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
 def visualize_missing_value_heatmap(self, df):
     logger.info("In MissingValue | visualize_missing_value_heatmap started")
     try:
         msno.heatmap(df)
         plt.ion()
         plt.show()
         plt.savefig(os.path.join(VISUALIZATION_SAVE_DIRECTORY, 'correlation_heatmap.png'))
         plt.pause(1)
         plt.close()
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In MissingValue | visualize_missing_value_heatmap finished")
 def split_attribute_and_target(self, df=None):
     logger.info("In DataFrameHandler | split_attribute_and_target started")
     if df is None:
         df = self.data_frame_original
     try:
         target = df[self.target_col]
         attribute_set = df.drop(self.target_col, axis=1)
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info(
         "In DataFrameHandler | split_attribute_and_target finished")
     return {'attributes': attribute_set, 'target': target}
 def get_scaled_data(self, df=None):
     logger.info("In DataFrameHandler | get_scaled_data started")
     if df is None:
         df = self.data_frame_original
     try:
         scaled_dataframe = df.copy()
         scaled_dataframe[self.numerical_cols] = self.scaler.fit_transform(
             scaled_dataframe[self.numerical_cols])
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In DataFrameHandler | get_scaled_data finished")
     return scaled_dataframe
 def get_label_encoded_data(self, df=None):
     logger.info("In DataFrameHandler | get_label_encoded_data started")
     if df is None:
         df = self.data_frame_original
     try:
         label_encoded_dataframe = df.copy()
         for col in self.categorical_cols:
             label_encoded_dataframe[col] = self.labelEncoder.fit_transform(
                 label_encoded_dataframe[col])
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In DataFrameHandler | get_label_encoded_data finished")
     return label_encoded_dataframe
Example #22
 def visualize_job_vs_target(self, df):
     logger.info("In DataVisualisation | visualize_job_vs_target started")
     try:
         target_encoder = EncoderStore.get('target')
         inverse_target = target_encoder.inverse_transform(df['target'])
         job_encoder = EncoderStore.get('job')
         job_labels = job_encoder.classes_
         inverse_job = job_encoder.inverse_transform(df['job'])
         sizes_not_paid = []
         sizes_paid = []
         for label in job_labels:
             job_label_and_target_no = ((inverse_job == label) &
                                        (inverse_target == 'no')).sum()
             job_label_and_target_yes = ((inverse_job == label) &
                                         (inverse_target == 'yes')).sum()
             sizes_not_paid.append(job_label_and_target_no)
             sizes_paid.append(job_label_and_target_yes)
         colors = [
             "aqua", "azure", "brown", "chartreuse", "coral", "crimson",
             "cyan", "fuchsia", "goldenrod", "lavender", "purple", "teal"
         ]
         fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 20))
         ax1.pie(sizes_not_paid,
                 autopct='%1.1f%%',
                 labels=job_labels,
                 startangle=90,
                 colors=colors,
                 wedgeprops={'edgecolor': 'w'})
         ax1.axis('equal')
         ax1.set_title('Target: no')
         ax2.pie(sizes_paid,
                 autopct='%1.1f%%',
                 labels=job_labels,
                 startangle=90,
                 colors=colors,
                 wedgeprops={'edgecolor': 'w'})
         ax2.set_title('Target: yes')
         ax2.axis('equal')
         plt.tight_layout()
         plt.ion()
         plt.show()
         plt.savefig(
             os.path.join(VISUALIZATION_SAVE_DIRECTORY,
                          'pie_visualization_job_vs_target'))
         plt.pause(1)
         plt.close()
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In DataVisualisation | visualize_job_vs_target finished")
Example #23
 def convert_to_categorical_values(self,
                                   df,
                                   cat_cols,
                                   use_label_encoder=False):
     logger.info("In PreProcessor | convert_to_categorical_values started")
     try:
         if use_label_encoder:
             for col in cat_cols:
                 if col in COLUMNS_CATEGORIZATION_APPLICABLE:
                     logger.debug('Categorizing Column : ' + str(col))
                     encoder = LabelEncoder()
                     logger.debug('Column unique value : ' +
                                  str(df[col].unique()))
                     encoder.fit(df[col].unique())
                     df[col] = encoder.transform(df[col])  # already fitted; no need to refit
                     EncoderStore.save(col, encoder)
         if not use_label_encoder:
             # note: `sparse` was renamed `sparse_output` in sklearn >= 1.2
             one_hot_encoder = OneHotEncoder(sparse=False,
                                             handle_unknown='ignore')
             for col in cat_cols:
                 enc_df = pd.DataFrame(
                     one_hot_encoder.fit_transform(df[[col]]))
                 # deprecated sklearn API; newer releases use get_feature_names_out
                 enc_df.columns = one_hot_encoder.get_feature_names([col])
                 df = df.join(enc_df)
                 df = df.drop(col, axis=1)
             logger.info('Columns in dataframe after one hot encoding: ' +
                         str(df.columns))
             logger.info('Shape of dataframe after one hot encoding: ' +
                         str(df.shape))
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In PreProcessor | convert_to_categorical_values finished")
     return df
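Each fitted LabelEncoder is saved in EncoderStore so the visualize_* methods above can recover the original string labels with inverse_transform; a round-trip sketch:

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoded = encoder.fit_transform(['no', 'yes', 'no', 'yes', 'yes'])
print(encoded)                             # [0 1 0 1 1]
print(encoder.classes_)                    # ['no' 'yes']
print(encoder.inverse_transform(encoded))  # ['no' 'yes' 'no' 'yes' 'yes']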
 def create_category_percent(self, data, categorical_cols):
     try:
         logger.info(
             "In FeatureEngineering | create_category_percent started")
         d1 = data.copy()
         length = len(d1)
         for col in categorical_cols:
             d1[col + 'Pct'] = (d1[col].groupby(
                 d1[col]).transform('count')) * 100 / length
         logger.info(
             "In FeatureEngineering | create_category_percent finished")
         return d1
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
 def create_bin(self, data, numerical_cols, number_of_bins=4):
     try:
         logger.info("In FeatureEngineering | create_bin started")
         d1 = data.copy()
         for col in numerical_cols:
             bins = np.linspace(d1[col].min(), d1[col].max(),
                                number_of_bins + 1)  # n bins require n + 1 edges
             d1[col + '_bin'] = pd.cut(d1[col],
                                       bins,
                                       precision=1,
                                       include_lowest=True,
                                       right=True)
         logger.info("In FeatureEngineering | create_bin finished")
         return d1
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
Example #26
 def visualize_duration_vs_target(self, df):
     logger.info(
         "In DataVisualisation | visualize_duration_vs_target started")
     try:
         target_encoder = EncoderStore.get('target')
         bar_labels = target_encoder.classes_
         inverse_target = target_encoder.inverse_transform(df['target'])
         target_as_no = (inverse_target == 'no').sum()
         target_as_yes = (inverse_target == 'yes').sum()
         duration_gt_180_and_target_no = ((df['duration'] > 180) &
                                          (inverse_target == 'no')).sum()
         duration_lte_180_and_target_no = ((df['duration'] <= 180) &
                                           (inverse_target == 'no')).sum()
         duration_gt_180_and_target_yes = ((df['duration'] > 180) &
                                           (inverse_target == 'yes')).sum()
         duration_lte_180_and_target_yes = (
             (df['duration'] <= 180) & (inverse_target == 'yes')).sum()
         x_labels = ['duration>180', 'duration<=180']
         x = np.arange(2)
         ax = plt.subplot(1, 1, 1)
         w = 0.3
         not_paid = [
             duration_gt_180_and_target_no / target_as_no,
             duration_lte_180_and_target_no / target_as_no
         ]
         paid = [
             duration_gt_180_and_target_yes / target_as_yes,
             duration_lte_180_and_target_yes / target_as_yes
         ]
         plt.xticks(x + w / 2, x_labels)
         not_paid_bar = ax.bar(x, not_paid, color="lightcoral", width=w)
         paid_bar = ax.bar(x + w, paid, color="yellowgreen", width=w)
         plt.legend([not_paid_bar, paid_bar], bar_labels)
         plt.ion()
         plt.show()
         plt.savefig(
             os.path.join(VISUALIZATION_SAVE_DIRECTORY,
                          'bar_visualization_duration_vs_target'))
         plt.pause(1)
         plt.close()
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info(
         "In DataVisualisation | visualize_duration_vs_target finished")
Example #27
 def visualize_marital_status_vs_target(self, df):
     logger.info(
         "In DataVisualisation | visualize_marital_status_vs_target started"
     )
     try:
         target_encoder = EncoderStore.get('target')
         bar_labels = target_encoder.classes_
         inverse_target = target_encoder.inverse_transform(df['target'])
         target_as_no = (inverse_target == 'no').sum()
         target_as_yes = (inverse_target == 'yes').sum()
         marital_status_encoder = EncoderStore.get('marital')
         inverse_marital_status = marital_status_encoder.inverse_transform(
             df['marital'])
         x_labels = marital_status_encoder.classes_
         not_paid = []
         paid = []
         for stat in x_labels:
             marital_stat_and_target_no = ((inverse_marital_status == stat)
                                           &
                                           (inverse_target == 'no')).sum()
             marital_stat_and_target_yes = (
                 (inverse_marital_status == stat) &
                 (inverse_target == 'yes')).sum()
             not_paid.append(marital_stat_and_target_no / target_as_no)
             paid.append(marital_stat_and_target_yes / target_as_yes)
         x = np.arange(len(x_labels))  # one bar group per marital-status class
         ax = plt.subplot(1, 1, 1)
         w = 0.3
         plt.xticks(x + w / 2, x_labels)
         not_paid_bar = ax.bar(x, not_paid, color="lightcoral", width=w)
         paid_bar = ax.bar(x + w, paid, color="yellowgreen", width=w)
         plt.legend([not_paid_bar, paid_bar], bar_labels)
         plt.ion()
         plt.show()
         plt.savefig(
             os.path.join(VISUALIZATION_SAVE_DIRECTORY,
                          'bar_visualization_marital_status_vs_target'))
         plt.pause(1)
         plt.close()
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info(
         "In DataVisualisation | visualize_marital_status_vs_target finished"
     )
 def get_missing_values_info(self, df):
     logger.info("In MissingValue | get_missing_values_info started")
     info = {}
     try:
         for col in df.columns:
             missing_val_count = df[col].isnull().sum()
             total_row_count = df[col].shape[0]
             logger.debug("Missing values in Column " + col + " : " + str(missing_val_count))
             logger.debug("Total Entries in Column " + col + " : " + str(total_row_count))
             info[col] = {
                 'count': missing_val_count,
                 'percentage': (missing_val_count / total_row_count) * 100
             }
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In MissingValue | get_missing_values_info finished")
     return info
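The returned dict feeds impute_missing_values, whose 'strategic' path only touches columns with a positive missing percentage; a toy run of the same computation:

import numpy as np
import pandas as pd

df = pd.DataFrame({'age': [25, np.nan, 40], 'job': ['a', 'b', 'c']})
info = {col: {'count': df[col].isnull().sum(),
              'percentage': (df[col].isnull().sum() / df[col].shape[0]) * 100}
        for col in df.columns}
print(info['age']['percentage'])  # 33.33... -> 'age' would be imputed
print(info['job']['percentage'])  # 0.0      -> 'job' is skipped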
 def visualize_outlier(self, df):
     logger.info("In OutlierDetection | visualize_outlier started")
     try:
         chart = sns.boxplot(x='variable',
                             y='value',
                             data=pd.melt(df),
                             width=0.5,
                             palette="colorblind")
         chart.set_xticklabels(chart.get_xticklabels(), rotation=90)
         plt.ion()
         plt.show()
         plt.savefig(
             os.path.join(VISUALIZATION_SAVE_DIRECTORY,
                          'outlier_visualization'))
         plt.pause(1)
         plt.close()
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In OutlierDetection | visualize_outlier finished")
 def get_dummies_data(self, df=None):
     logger.info("In DataFrameHandler | get_dummies_data started")
     if df is None:
         df = self.data_frame_original
     try:
         dummies_dataframe = df.copy()
         for col in self.categorical_cols:
             cat_list = pd.get_dummies(dummies_dataframe[col], prefix=col)
             dummies_dataframe = dummies_dataframe.join(cat_list)
         all_dummies_cols = dummies_dataframe.columns.values.tolist()
         cols_to_keep = [
             col for col in all_dummies_cols
             if col not in self.categorical_cols
         ]
         dummies_dataframe = dummies_dataframe[cols_to_keep]
         logger.debug('Columns after Dummy Encoding : ' +
                      str(dummies_dataframe.columns.values))
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In DataFrameHandler | get_dummies_data finished")
     return dummies_dataframe