def chartSingleClassFrequency(self, data):
    """Display a bar chart of frequencies per label.

    # Arguments
        data: list, List of integer values corresponding to the actual
            question score.
    """
    num_classes = self.getNumClasses(data)
    count_map = Counter(data)
    counts = [count_map[i] for i in range(num_classes)]
    total_count = sum(counts)
    majority_class_count = max(counts)
    majority_acc = round((majority_class_count / total_count), 2)
    idx = np.arange(num_classes)

    # Flag minority classes (below the configured share of the total) in red.
    colors = []
    for count in counts:
        if count < (total_count * CONFIG.getfloat('VARIABLES', 'MinorityClassThreshold')):
            colors.append('r')
        else:
            colors.append('b')

    self.axes.cla()
    self.axes.bar(idx, counts, color=colors)
    self.axes.set_xlabel('Class')
    self.axes.set_ylabel('Number of Samples')
    self.axes.set_xticks(idx)
    self.axes.set_title(f"Majority class accuracy: {majority_acc}")

    # Annotate each bar with its sample count.
    rects = self.axes.patches
    for rect, label in zip(rects, counts):
        height = rect.get_height()
        self.axes.text(rect.get_x() + rect.get_width() / 2,
                       height + 5,
                       label,
                       ha='center',
                       va='bottom')
    self.draw()
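
# NOTE: illustrative usage only, not part of the original source; 'canvas' and
# 'score_column' are hypothetical names. The method expects the raw integer
# scores for a single question column, e.g.:
#
#     canvas.chartSingleClassFrequency(df['score_column'].astype(int).tolist())
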
def run(self):
    self._update_log('Beginning ModelTrain run')
    # * Run through the enumeration of columns. The second argument to
    # * enumerate tells Python where to begin the index count; here, 1 for our offset.
    try:
        for col_idx, col in enumerate(self.training_data.columns, 1):
            if col.endswith(self.tag_suffix):
                self._update_log(f'Current classification task: {col}', False)
                col_label = col.split(CONFIG.get('VARIABLES', 'TagDelimiter'))[0]
                col_path = os.path.join(self.version_directory, col_label)

                # * Find and drop any samples missing an index
                missing_idx_count = self.training_data.index.isna().sum()
                if missing_idx_count > 0:
                    self._update_log(
                        f"<b>Found {missing_idx_count} samples missing a value for index</b> "
                        f"(index_col = {CONFIG.get('VARIABLES', 'IndexColumn')}). "
                        "Removing those samples...")
                    valid_indexes = self.training_data.index.dropna()
                    self.training_data = self.training_data[
                        self.training_data.index.isin(valid_indexes)]
                    self._update_log(
                        f'Shape of dataset after removal: {self.training_data.shape}')

                # * Create dict to fill na samples with 'unanswered' and score of 0
                label_col_name = self.training_data.columns[col_idx]
                fill_dict = pd.DataFrame(data={col: 'unanswered',
                                               label_col_name: 0},
                                         index=[0])
                self.training_data.fillna(value=0, inplace=True, axis=1)

                x = self.training_data[col].copy()
                y = self.training_data[self.training_data.columns[col_idx]].copy().values
                results = pd.DataFrame(index=self.training_data.index)
                results[TRUTH_LABEL_SUFFIX] = y
                preds = np.empty(y.shape)
                probs = np.empty(shape=(y.shape[0], len(np.unique(y))))

                # * Initialize sklearn evaluation parameters
                sk_eval_type = self.training_eval_params['sklearn']['type']
                sk_eval_value = self.training_eval_params['sklearn']['value']

                # * SKLEARN
                for model, selected in self.selected_models['sklearn'].items():
                    if not self._is_running:
                        self.signals.training_complete.emit(pd.DataFrame())
                        break
                    if selected:
                        try:
                            if self.tune_models:
                                self._tune_model(x, y, model, col_path)
                            model_params = self.get_params_from_file(model, col_path)
                            self._update_log(f'Begin training {model}')
                            pipeline = Pipeline(self.get_pipeline(model_params['params']))
                            try:
                                if sk_eval_type == 'cv':
                                    skf = StratifiedKFold(n_splits=sk_eval_value,
                                                          random_state=RANDOM_SEED)
                                    # Accumulate out-of-fold predictions for every sample.
                                    for train, test in skf.split(x, y):
                                        with joblib.parallel_backend('dask'):
                                            preds[test] = pipeline.fit(
                                                x.iloc[train],
                                                y[train]).predict(x.iloc[test])
                                            if self.use_proba and hasattr(pipeline, 'predict_proba'):
                                                try:
                                                    probs[test] = pipeline.predict_proba(x.iloc[test])
                                                except AttributeError:
                                                    self.logger.debug(
                                                        '{} does not support predict_proba'.format(model))
                                                    print(model, 'does not support predict_proba')
                                            else:
                                                probs = np.array([])
                                elif sk_eval_type == 'test_split':
                                    # random_state must be an int, hence getint.
                                    x_train, x_test, y_train, y_test = train_test_split(
                                        x,
                                        y,
                                        test_size=sk_eval_value,
                                        stratify=y,
                                        random_state=CONFIG.getint('VARIABLES', 'RandomSeed'))
                                    preds = np.empty(len(y_test))
                                else:
                                    self._update_log('No evaluation type chosen.')
                            except (KeyboardInterrupt, SystemExit):
                                raise
                            except Exception:
                                self.logger.warning(
                                    '{} threw an exception during fit. '
                                    'Possible error with joblib multithreading.'.format(model),
                                    exc_info=True)
                                tb = traceback.format_exc()
                                print(tb)
                                self._update_log(
                                    '{} threw an exception during fit. '
                                    'Possible error with joblib multithreading.'.format(model),
                                    True, False)
                            model_scores = self.get_model_scores(y, preds)
                            self._update_log(f'Task completed on <b>{model}</b>.')
                            table_str = '''<table>
                                <thead>
                                    <tr>
                                        <th>Accuracy</th><th>F1-Score</th><th>Cohen's Kappa</th>
                                    </tr>
                                </thead>
                                <tbody>
                                    <tr>
                            '''
                            for metric, score in model_scores.items():
                                table_str += '<td style="border: 1px solid #333;">%.2f</td>' % score
                            table_str += '</tr></tbody></table><br>'
                            if sk_eval_type is not None:
                                self._update_log(table_str, False, True)

                            # Refit on the full dataset before saving the model.
                            self._update_log(f'Training {model} on full dataset')
                            with joblib.parallel_backend('dask'):
                                pipeline.fit(x, y)

                            pred_col_name = col_label + TAG_DELIMITER + model + PRED_LABEL_SUFFIX
                            prob_col_name = col_label + TAG_DELIMITER + model + PROB_LABEL_SUFFIX
                            results[pred_col_name] = preds.astype(int)
                            # If predicting probabilities and the probability array has
                            # values, use those values for the results.
                            if self.use_proba and probs.size:
                                results[prob_col_name] = np.amax(probs, axis=1)

                            save_path = os.path.join(col_path, model)
                            if not os.path.exists(save_path):
                                os.makedirs(save_path)
                            self.save_model(model, pipeline, save_path, model_scores)
                        except (KeyboardInterrupt, SystemExit):
                            raise
                        except Exception as e:
                            self.logger.error(f'ModelTrainer.run {model}:', exc_info=True)
                            tb = traceback.format_exc()
                            print(tb)
                            self._update_log(tb)

                # * Tensorflow training would reside here

                try:
                    if self.train_stacking_algorithm and self._is_running:
                        self.train_stacker(
                            results.drop(TRUTH_LABEL_SUFFIX, axis=1),
                            results[TRUTH_LABEL_SUFFIX].values,
                            col_path)
                    else:
                        self._update_log('Skipping Stacker training.')
                except ValueError as ve:
                    self.signals.training_complete.emit(pd.DataFrame())
                    self._update_log(f'Unable to train Stacking algorithm on {col_label}.')
                    tb = traceback.format_exc()
                    print(tb)
                except Exception as e:
                    self.logger.error(f'ModelTrainer.run {model}:', exc_info=True)
                    tb = traceback.format_exc()
                    print(tb)
                    self._update_log(tb)

        self._is_running = False
        self.signals.training_complete.emit(self.all_predictions_df)
    except Exception as e:
        self.signals.training_complete.emit(pd.DataFrame())
        self.logger.error('ModelTrainer.run (General):', exc_info=True)
        tb = traceback.format_exc()
        print(tb)
        self._update_log(tb)
# import package.utils.keras_models as keras_models
# import package.utils.SequenceTransformer as seq_trans

RANDOM_SEED = 1337
TOP_K = 20000
MAX_SEQUENCE_LENGTH = 1500
BASE_MODEL_DIR = './package/data/base_models'
BASE_TFIDF_DIR = './package/data/feature_extractors/TfidfVectorizer.json'
INPUT_SHAPE = (0, 0)

TAG_DELIMITER = CONFIG.get('VARIABLES', 'TagDelimiter')
PRED_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'PredictedLabelSuffix')
PROB_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'ProbabilityLabelSuffix')
TRUTH_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'TruthLabelSuffix')
STACKER_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'StackerLabelSuffix')
DISAGREEMENT_THRESHOLD = CONFIG.getfloat('VARIABLES', 'DisagreementThreshold')
BAMBOOZLED_THRESHOLD = CONFIG.getint('VARIABLES', 'BamboozledThreshold')


class ModelTrainerSignals(QObject):
    training_complete = pyqtSignal(pd.DataFrame)
    tuning_complete = pyqtSignal(bool, dict)
    update_progressbar = pyqtSignal(int, bool)
    update_training_logger = pyqtSignal(str, bool, bool)


class ModelTrainer(QRunnable):
    '''
    QRunnable tasked with running all model training/tuning.
    This could potentially take days to complete.
    '''
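
    # NOTE: illustrative sketch, not from the original source; constructor
    # arguments are omitted because they are not shown here, and
    # on_training_complete is a hypothetical slot. As a QRunnable, a
    # ModelTrainer is typically handed to a QThreadPool rather than having
    # run() called directly, e.g.:
    #
    #     trainer = ModelTrainer(...)  # args as required by __init__
    #     trainer.signals.training_complete.connect(on_training_complete)
    #     QThreadPool.globalInstance().start(trainer)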