def __init__(self,
             selected_models,
             version_directory,
             training_eval_params,
             training_data,
             tune_models,
             tuning_params,
             use_proba=False,
             train_stacking_algorithm=True,
             **kwargs):
    super(ModelTrainer, self).__init__()
    self.logger = logging.getLogger(__name__)
    self.signals = ModelTrainerSignals()

    self.allowed_pipeline_types = [
        'feature_extraction',
        'feature_selection'
    ]
    self.version_directory = version_directory
    self.selected_models = selected_models
    self.training_eval_params = training_eval_params
    self.training_data = training_data
    self.tune_models = tune_models
    self.tuning_params = tuning_params
    self.use_proba = use_proba
    self.train_stacking_algorithm = train_stacking_algorithm
    self.kwargs = kwargs

    self.all_predictions_df = pd.DataFrame(index=self.training_data.index)
    self.grid_search_time = None
    self.model_checksums = {}
    self._is_running = True
    self.tag_suffix = CONFIG.get('VARIABLES', 'TagDelimiter') + CONFIG.get(
        'VARIABLES', 'TagDataColumnSuffix')
def chartSingleClassFrequency(self, data):
    """Display a bar chart of frequencies per label

    # Arguments
        data: list, list of integer values corresponding to the actual
            question score.
    """
    num_classes = self.getNumClasses(data)
    count_map = Counter(data)
    counts = [count_map[i] for i in range(num_classes)]
    total_count = sum(counts)
    majority_class_count = max(counts)
    majority_acc = round((majority_class_count / total_count), 2)
    idx = np.arange(num_classes)

    # Flag minority classes in red, all others in blue.
    colors = []
    for count in counts:
        if count < (total_count *
                    CONFIG.getfloat('VARIABLES', 'MinorityClassThreshold')):
            colors.append('r')
        else:
            colors.append('b')

    self.axes.cla()
    self.axes.bar(idx, counts, color=colors)
    self.axes.set_xlabel('Class')
    self.axes.set_ylabel('Number of Samples')
    self.axes.set_xticks(idx)
    self.axes.set_title(f"Majority class accuracy: {majority_acc}")

    # Label each bar with its sample count.
    rects = self.axes.patches
    for rect, label in zip(rects, counts):
        height = rect.get_height()
        self.axes.text(rect.get_x() + rect.get_width() / 2,
                       height + 5,
                       label,
                       ha='center',
                       va='bottom')
    self.draw()
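
# A minimal standalone sketch (not part of the widget) of the baseline shown
# in the chart title above: the accuracy a constant classifier would get by
# always predicting the most frequent label. Names here are illustrative.
from collections import Counter

def majority_class_accuracy(labels):
    """Accuracy of always predicting the most common label."""
    counts = Counter(labels)
    return round(max(counts.values()) / len(labels), 2)

# majority_class_accuracy([0, 0, 0, 1, 2]) -> 0.6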
def save_params_to_file(self, model, best_params, model_param_path,
                        score_dict=None):
    if score_dict is None:
        score_dict = {}
    try:
        model_path = os.path.join(model_param_path, model + '.json')
        if not os.path.isfile(model_path):
            # Get default values
            model_path = os.path.join(
                CONFIG.get('PATHS', 'DefaultModelDirectory'), model,
                model + '.json')

        with open(model_path, 'r') as param_file:
            model_params = json.load(param_file)

        current_time = time.localtime()
        model_params['meta']['training_meta'].update({
            'last_train_date':
            time.strftime('%Y-%m-%d %H:%M:%S', current_time),
            'train_eval_score':
            score_dict,
            'checksum':
            self.model_checksums[model]
        })
        if self.tune_models:
            model_params['meta']['tuning_meta'].update({
                'last_tune_date':
                time.strftime('%Y-%m-%d %H:%M:%S', current_time),
                'n_iter':
                self.tuning_params['gridsearch']['n_iter'],
                'tuning_duration':
                self.grid_search_time,
                'tune_eval_score':
                score_dict
            })
        # Update model params to those discovered during tuning
        for param_type, parameters in model_params['params'].items():
            param_key = param_type.split('.')[-1]
            for k, v in best_params.items():
                best_param_key = k.split('__')[-1]
                if k.startswith(param_key) and best_param_key in parameters.keys():
                    parameters[best_param_key] = v

        save_path = os.path.join(model_param_path, model + '.json')
        with open(save_path, 'w') as outfile:
            json.dump(model_params, outfile, indent=2, cls=CATEncoder)
    except FileNotFoundError:
        self.logger.debug(
            'ModelTrainer.save_params_to_file {} not found'.format(
                model_path))
    except Exception:
        self.logger.error(
            'ModelTrainer.save_params_to_file {}:'.format(model),
            exc_info=True)
        tb = traceback.format_exc()
        print(tb)
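
# A hedged sketch of the key-mapping rule used above: RandomizedSearchCV
# reports best_params_ keyed by '<pipeline_step>__<param>', while the JSON
# files nest params under dotted class paths. All values below are invented
# for illustration.
model_params = {'sklearn.svm.LinearSVC': {'C': 1.0, 'max_iter': 1000}}
best_params = {'LinearSVC__C': 0.25, 'TfidfVectorizer__ngram_range': (1, 2)}

for param_type, parameters in model_params.items():
    param_key = param_type.split('.')[-1]      # 'LinearSVC'
    for k, v in best_params.items():
        best_param_key = k.split('__')[-1]     # e.g. 'C'
        if k.startswith(param_key) and best_param_key in parameters:
            parameters[best_param_key] = v

# model_params is now {'sklearn.svm.LinearSVC': {'C': 0.25, 'max_iter': 1000}}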
def get_params_from_file(self, model_name, base_path=None, tpot=False):
    '''Loads model parameters either from file (if the version has been
        saved) or grabs the defaults.
        # Arguments
            model_name: string, model name used to specify path
            base_path: string, optional pathing used for loading custom
                model parameters
            tpot: bool, when True, load parameters from base_path (used
                for TPOT-generated pipelines)
        # Returns
            model_params: dict, parameters from file or defaults
    '''
    try:
        if tpot or base_path is not None:
            model_path = os.path.join(base_path, model_name,
                                      model_name + '.json')
            if not os.path.isfile(model_path):
                model_path = os.path.join(
                    CONFIG.get('PATHS', 'DefaultModelDirectory'),
                    model_name, model_name + '.json')
        else:
            model_path = os.path.join(
                CONFIG.get('PATHS', 'DefaultModelDirectory'), model_name,
                model_name + '.json')

        with open(model_path, 'r') as param_file:
            model_params = json.load(param_file, object_hook=cat_decoder)
        return model_params
    except Exception:
        self.logger.error('ModelTrainer.get_params_from_file:',
                          exc_info=True)
        tb = traceback.format_exc()
        print(tb)
        self._update_log(tb, True, False)
def run(self):
    apply_cols = [
        col for col in self.data.columns if col.endswith('_text')
    ]
    self.data[apply_cols] = self.data[apply_cols].applymap(
        lambda x: processText(str(x), **self.options))

    if self.options['spell_correction']:
        sentences = self.data[apply_cols].applymap(
            lambda x: str(x).split()).values
        sc = SpellCheck(sentences,
                        CONFIG.getint('VARIABLES', 'TopKSpellCheck'))
        self.data[apply_cols] = self.data[apply_cols].applymap(
            lambda x: sc.correct_spelling(x))

    self.preprocessing_complete.emit(self.data)
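
# A self-contained sketch of the column-wise cleanup pattern above, with a
# stand-in cleaning function (processText and its options live elsewhere in
# the package):
import pandas as pd

df = pd.DataFrame({'q1_text': [' Hello  WORLD '], 'q1_score': [1]})
text_cols = [c for c in df.columns if c.endswith('_text')]
df[text_cols] = df[text_cols].applymap(
    lambda s: ' '.join(str(s).lower().split()))
print(df['q1_text'][0])  # 'hello world'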
def __init__(self, parent=None):
    super(SelectModelWidget, self).__init__(parent)
    self.logger = logging.getLogger(__name__)
    self.parent = parent
    self.threadpool = QThreadPool()
    self.logger.info(
        f"Multithreading enabled with a maximum of {self.threadpool.maxThreadCount()} threads."
    )
    print("Multithreading with maximum %d threads" %
          self.threadpool.maxThreadCount())

    self.training_data = pd.DataFrame()
    self.training_predictions = pd.DataFrame()
    self.selected_version = CONFIG.get('PATHS', 'DefaultModelDirectory')
    self.comms = Communicate()

    self.selected_models = {}
    self.selected_models['sklearn'] = {}
    self.selected_models['tensorflow'] = {}
    self.model_checkboxes = []
    # * Initialize training parameter dict.
    # * Has entry for both model base types
    self.training_params = {}
    self.training_params['sklearn'] = {}
    self.training_params['sklearn']['type'] = None
    self.training_params['sklearn']['value'] = None
    self.training_params['tensorflow'] = {}
    # * Init tuning param dict
    # * Currently only using gridsearch
    self.tuning_params = {}
    self.tuning_params['gridsearch'] = {
        'n_iter': 20,
        'cv': 3,
        'n_jobs': -1,
        'scoring': ['accuracy'],
        'tune_stacker': False
    }

    self.sklearn_model_dialogs = []
    self.sklearn_model_dialog_btns = []
    self.sklearn_training_inputs = []

    self.tensorflow_training_inputs = []
    self.tensorflow_model_dialogs = []
    self.tensorflow_model_dialog_btns = []

    self.main_layout = QVBoxLayout()
    self.upper_hbox = QHBoxLayout()

    self.version_form = QFormLayout()
    self.header_hbox = QHBoxLayout()
    self.header_hbox.addLayout(self.version_form)
    self.header_hbox.addStretch()
    self.tune_models_chkbox = QCheckBox("Tune Models")
    self.header_hbox.addWidget(self.tune_models_chkbox)
    self.tune_models_chkbox.stateChanged.connect(
        lambda state: self._enable_tuning_ui(state))
    self.main_layout.addLayout(self.header_hbox)
    self.main_layout.addLayout(self.upper_hbox)

    self.model_vbox = QVBoxLayout()
    self.tuning_vbox = QVBoxLayout()

    self.upper_hbox.addLayout(self.model_vbox)
    self.upper_hbox.addSpacing(10)
    self.upper_hbox.addLayout(self.tuning_vbox)
    self.upper_hbox.addSpacing(200)

    # * Build sklearn ui components
    self.sklearn_hbox = QHBoxLayout()
    self.sklearn_groupbox = QGroupBox("Sklearn")
    self.sklearn_groupbox.setLayout(self.sklearn_hbox)

    self.skmodel_groupbox = QGroupBox("Model Selection")
    self.sklearn_hbox.addWidget(self.skmodel_groupbox)
    self.sklearn_model_form = QFormLayout()
    self.sklearn_model_form.setFormAlignment(Qt.AlignTop)
    self.skmodel_groupbox.setLayout(self.sklearn_model_form)

    # * Sklearn training and tuning ui components
    self.sklearn_training_groupbox = QGroupBox("Training")
    self.sklearn_training_form = QFormLayout()
    self.sklearn_training_groupbox.setLayout(self.sklearn_training_form)
    self.sklearn_hbox.addWidget(self.sklearn_training_groupbox)

    self.model_vbox.addWidget(self.sklearn_groupbox)

    # * Build Tensorflow ui components
    self.tensorflow_hbox = QHBoxLayout()
    self.tensorflow_groupbox = QGroupBox("Tensorflow")
    self.tensorflow_groupbox.setLayout(self.tensorflow_hbox)

    self.tensorflow_model_groupbox = QGroupBox("Model Selection")
    self.tensorflow_hbox.addWidget(self.tensorflow_model_groupbox)
    self.tensorflow_model_form = QFormLayout()
    self.tensorflow_model_groupbox.setLayout(self.tensorflow_model_form)
    self.tensorflow_training_groupbox = QGroupBox("Training")
    self.tensorflow_training_form = QFormLayout()
    self.tensorflow_training_groupbox.setLayout(
        self.tensorflow_training_form)
    self.tensorflow_hbox.addWidget(self.tensorflow_training_groupbox)
    # * This is the tensorflow groupbox for models and training params.
    # * It is currently not added to the layout.
    # self.model_vbox.addWidget(self.tensorflow_groupbox)

    self.tuning_groupbox = QGroupBox("Tuning")
    self.tuning_form = QFormLayout()
    self.tuning_groupbox.setLayout(self.tuning_form)
    self.tuning_vbox.addWidget(self.tuning_groupbox)
    self.tuning_groupbox.setEnabled(False)
    self.model_form_grid = QGridLayout()

    self.setup_model_selection_ui()
    self.setup_training_ui()
    self.setup_tuning_ui()

    # * QTextEdit box for training/tuning status
    self.training_logger = QTextEdit()
    self.training_logger.setReadOnly(True)
    self.training_logger.setAcceptRichText(True)
    self.training_logger.insertHtml(
        "<i>Multithreading with maximum %d threads</i><br>" %
        self.threadpool.maxThreadCount())
    self.training_logger.setMinimumHeight(400)
    self.main_layout.addWidget(self.training_logger)

    self.clear_btn_hbox = QHBoxLayout()
    self.clear_text_btn = QPushButton('Clear')
    self.clear_text_btn.setMaximumWidth(50)
    self.clear_text_btn.clicked.connect(
        lambda: self.training_logger.clear())
    self.clear_btn_hbox.addStretch()
    self.clear_btn_hbox.addWidget(self.clear_text_btn)

    self.main_layout.addLayout(self.clear_btn_hbox)
    self.main_layout.addStretch()

    self.run_btn = QPushButton("&Train Models")
    self.run_btn.setMinimumWidth(200)
    self.run_btn.clicked.connect(lambda: self.train_models())
    self.run_btn.setEnabled(False)
    self.stop_btn = QPushButton('Sto&p')
    self.stop_btn.setSizePolicy(QSizePolicy.Minimum, QSizePolicy.Fixed)

    self.comms.enable_training_btn.connect(self.set_training_btn_state)

    self.button_hbox = QHBoxLayout()
    icon = QIcon()
    icon.addPixmap(QPixmap('icons/Programming-Save-icon.png'))
    self.save_results_btn = QPushButton()
    self.save_results_btn.setIcon(icon)
    self.save_results_btn.setEnabled(False)
    self.save_results_btn.setToolTip(
        'Save model evaluation predictions, agreement ratio, and bamboozled score'
    )
    self.save_results_btn.clicked.connect(lambda: self.save_predictions())

    self.button_hbox.addWidget(self.run_btn)
    self.button_hbox.addWidget(self.stop_btn)
    self.button_hbox.addStretch()
    self.button_hbox.addWidget(self.save_results_btn)
    self.main_layout.addLayout(self.button_hbox)
    self.setLayout(self.main_layout)

    # Trigger update to load model parameters
    self._update_version(self.version_selection.currentData())
def setup_model_selection_ui(self):
    """
    Setup model selection ui. The order of the parameters in ModelDialog
    matters. model_data must come first!
    """
    self.version_selection_label = QLabel("Select version: ")
    self.version_selection = QComboBox(objectName='version_select')
    self.version_selection.setMinimumWidth(100)
    # Default models live in their own unique directory; every saved
    # version under BASE_VERSION_DIR appears as an item in this combobox.
    available_versions = os.listdir(BASE_VERSION_DIR)
    for version in available_versions:
        v_path = os.path.join(BASE_VERSION_DIR, version)
        if os.path.isdir(v_path):
            self.version_selection.addItem(version, v_path)
    self.version_selection.currentIndexChanged.connect(
        lambda x, y=self.version_selection: self._update_version(
            y.currentData()))
    self.version_form.addRow(self.version_selection_label,
                             self.version_selection)

    # Load base TF-IDF features and feature selection data
    try:
        with open(CONFIG.get('PATHS', 'BaseTfidfDirectory'), 'r') as f:
            tfidf_data = json.load(f)
    except IOError as ioe:
        self.logger.error("Error loading base TFIDF params",
                          exc_info=True)
        exceptionWarning(
            'Error occurred while loading base TFIDF parameters.',
            repr(ioe))
    try:
        with open(CONFIG.get('PATHS', 'BaseFeatureSeletionDirectory'),
                  'r') as f:
            self.fs_params = json.load(f)
    except IOError as ioe:
        self.logger.error("Error loading base feature selector params",
                          exc_info=True)
        exceptionWarning(
            'Error occurred while loading base feature selector parameters.',
            repr(ioe))

    # Dynamically generate ModelDialogs for each model in the base model
    # directory. Only considers the *.json file extension.
    try:
        row = 0
        for filename in os.listdir(
                CONFIG.get('PATHS', 'BaseModelDirectory')):
            if filename.endswith('.json'):
                with open(
                        os.path.join(
                            CONFIG.get('PATHS', 'BaseModelDirectory'),
                            filename), 'r') as f:
                    model_data = json.load(f)
                model = model_data['model_class']
                model_base = model_data['model_base']
                model_module = model_data['model_module']
                #! The order of the arguments matters!  model_data must come first.
                if model_base == 'tensorflow':
                    continue
                if model_module == 'tpot':
                    model_dialog = TPOTModelDialog(self, model_data,
                                                   tfidf_data)
                else:
                    model_dialog = SkModelDialog(self, model_data,
                                                 tfidf_data,
                                                 self.fs_params)
                self.comms.version_change.connect(
                    model_dialog.update_version)
                # Initialize model as unselected
                self.selected_models[model_base][model] = False
                btn = QPushButton(model, objectName=model + '_btn')
                # partial allows the connection of dynamically generated QObjects
                btn.clicked.connect(
                    partial(self.open_dialog, model_dialog))
                chkbox = QCheckBox(objectName=model)
                chkbox.stateChanged.connect(
                    lambda state, x=model, y=model_base: self.
                    _update_selected_models(x, y, state))
                if model_base == 'tensorflow':
                    self.tensorflow_model_form.addRow(chkbox, btn)
                    self.tensorflow_model_dialogs.append(model_dialog)
                    self.tensorflow_model_dialog_btns.append(btn)
                else:
                    self.sklearn_model_form.addRow(chkbox, btn)
                    self.sklearn_model_dialogs.append(model_dialog)
                    self.sklearn_model_dialog_btns.append(btn)
                self.model_checkboxes.append(chkbox)
                row += 1
    except OSError as ose:
        self.logger.error("OSError opening model config files",
                          exc_info=True)
        exceptionWarning('OSError opening model config files!', ose)
        tb = traceback.format_exc()
        print(tb)
    except Exception as e:
        self.logger.error("Error opening model config files",
                          exc_info=True)
        exceptionWarning('Error occurred.', e)
        tb = traceback.format_exc()
        print(tb)
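
# A hedged, Qt-free sketch of why functools.partial is used when wiring the
# dynamically generated buttons above: a bare lambda in a loop late-binds its
# free variable, so every handler would see the *last* value; partial freezes
# the argument at creation time.
from functools import partial

handlers_lambda = [lambda: print(name) for name in ('svc', 'tree')]
handlers_partial = [partial(print, name) for name in ('svc', 'tree')]

handlers_lambda[0]()   # prints 'tree' -- late binding sees the final value
handlers_partial[0]()  # prints 'svc'  -- value captured when created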
def load_file(self, f_path):
    """
    Load data from a CSV file to the workspace.
    Column 0 is used for the index column.
    chardet attempts to determine encoding if file is not utf-8.
        # Attributes
            f_path(String): The filename selected via open_file
    """
    # FIXME: Reset status bar when new data is loaded.
    try:
        self.graph.clear_graph()
        self.available_column_model.loadData([], include_labels=False)
        self.prediction_data = pd.read_csv(
            f_path,
            encoding='utf-8',
            index_col=CONFIG.getint(
                'VARIABLES',
                'IndexColumn'))  # TODO: user defined index column
    except UnicodeDecodeError:
        self.logger.warning("UnicodeDecode error opening file",
                            exc_info=True)
        self.comms.update_statusbar.emit(
            "Attempting to determine file encoding...")
        detector = UniversalDetector()
        try:
            for line in open(f_path, 'rb'):
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
            print("chardet determined encoding type to be {}".format(
                detector.result['encoding']))
            self.prediction_data = pd.read_csv(
                f_path,
                encoding=detector.result['encoding'],
                index_col=0)
        except Exception as e:
            self.logger.error("Error detecting encoding", exc_info=True)
            exceptionWarning("Exception has occurred.", exception=e)
    except IOError as ioe:
        self.logger.error("IOError detecting encoding", exc_info=True)
        exceptionWarning("IO Exception occurred while opening file.",
                         exception=ioe)
    except Exception as e:
        self.logger.error("Error detecting encoding", exc_info=True)
        exceptionWarning("Error occurred opening file.", exception=e)

    try:
        columns = self.prediction_data.columns
        self.available_columns = []
        self.columns_with_truth = []
        # Columns with no missing values are candidates for ground truth.
        self.ground_truth_columns = self.prediction_data.columns[
            ~self.prediction_data.isna().any()].tolist()
        for column in columns:
            if column.lower().endswith("text"):
                self.available_columns.append(column)
                column_tag = column.split('__')[0]
                if (column_tag + '__actual') in self.ground_truth_columns:
                    self.columns_with_truth.append(column)

        if self.available_columns:
            self.available_column_model.loadData(self.available_columns,
                                                 include_labels=False)
            if self.columns_with_truth:
                self.available_column_model.setTruthData(
                    self.columns_with_truth)
            self.comms.update_statusbar.emit("CSV loaded.")
    except pd.errors.EmptyDataError as ede:
        exceptionWarning('Empty Data Error.\n', exception=ede)
    except Exception as e:
        self.logger.error("Error loading dataframe", exc_info=True)
        exceptionWarning("Exception occurred. PredictWidget.load_file.",
                         exception=e)
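
# A standalone sketch of the chardet fallback used above (the path is
# illustrative):
from chardet.universaldetector import UniversalDetector

def detect_encoding(path):
    """Feed a file line by line to chardet until it is confident."""
    detector = UniversalDetector()
    with open(path, 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']

# encoding = detect_encoding('some_file.csv')
# df = pd.read_csv('some_file.csv', encoding=encoding, index_col=0)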
from functools import partial
import hashlib

from chardet.universaldetector import UniversalDetector
import pandas as pd
import pkg_resources
from PyQt5.QtCore import QObject, pyqtSignal
from PyQt5.QtWidgets import QWidget

from package.evaluate.EvaluateTableModel import EvaluateTableModel
from package.utils.catutils import exceptionWarning, clearLayout
from package.utils.DataframeTableModel import DataframeTableModel
from package.utils.GraphWidget import GraphWidget
from package.utils.config import CONFIG

from sklearn.metrics import (accuracy_score, cohen_kappa_score, f1_score,
                             precision_score, recall_score)

DELIMITER = CONFIG.get('VARIABLES', 'TagDelimiter')
PRED_SUFFIX = CONFIG.get('VARIABLES', 'PredictedLabelSuffix')
TRUTH_SUFFIX = CONFIG.get('VARIABLES', 'TruthLabelSuffix')
AVG_TYPE = CONFIG.get('VARIABLES', 'MetricsAverageType')


class Communicate(QObject):
    version_change = pyqtSignal(str)
    enable_eval_btn = pyqtSignal(bool)
    data_load = pyqtSignal(pd.DataFrame)
    update_statusbar = pyqtSignal(str)
    update_progressbar = pyqtSignal(int, bool)


class EvaluateWidget(QWidget):
def train_stacker(self, x, y, col_path):
    def get_ratio(row):
        """
        Returns the ratio of agreement between column values
        (here, predictors) in a given row.
        """
        try:
            pred_value = row.iloc[-1]
            total_same = 0.0
            col_count = float(len(row.iloc[:-1]))
            for data in row.iloc[:-1]:
                if data == pred_value:
                    total_same += 1.0
            return total_same / col_count
        except ZeroDivisionError:
            return 0
        except Exception as e:
            self.logger.error("ModelTrainer.get_ratio", exc_info=True)
            exceptionWarning(
                'Exception occurred in ModelTrainer.get_ratio.', repr(e))

    def get_bamboozled_score(row):
        """
        Returns the difference between the number of models and the
        number of models that predicted incorrectly.  The lower this
        value, the more bamboozling the sample.
        """
        try:
            pred_value = row.iloc[-1]
            total_wrong = 0
            col_count = len(row.iloc[:-1])
            for data in row.iloc[:-1]:
                if data != pred_value:
                    total_wrong += 1
            return col_count - total_wrong
        except Exception as e:
            self.logger.error("ModelTrainer.get_bamboozled_score",
                              exc_info=True)
            exceptionWarning(
                'Exception occurred in ModelTrainer.get_bamboozled_score.',
                repr(e))

    # Dynamically import and instantiate the stacking algorithm named in
    # the config file.
    stacker_full_class = CONFIG.get(
        'VARIABLES', 'StackingAlgorithmClassName').split('.')
    final_preds = np.empty(y.shape)
    stacker_module = '.'.join(stacker_full_class[0:-1])
    inst_module = importlib.import_module(stacker_module)
    stacker_class = getattr(inst_module, stacker_full_class[-1])
    stacker = stacker_class()
    if self.tuning_params['gridsearch']['tune_stacker']:
        self._update_log(
            f'Beginning tuning run on Stacker <b>{".".join(stacker_full_class)}</b>...'
        )
        rscv = RandomizedSearchCV(
            estimator=stacker,
            n_jobs=self.tuning_params['gridsearch']['n_jobs']
            if self.tuning_params['gridsearch']['n_jobs'] != 0 else None,
            cv=self.tuning_params['gridsearch']['cv'],
            n_iter=self.tuning_params['gridsearch']['n_iter'],
            pre_dispatch=CONFIG.get('VARIABLES', 'PreDispatch'),
            verbose=CONFIG.getint('VARIABLES',
                                  'RandomizedSearchVerbosity'),
            scoring=self.tuning_params['gridsearch']['scoring']
            if len(self.tuning_params['gridsearch']['scoring']) > 0 else
            None,
            refit='accuracy')
        rscv.fit(x, y)
        best_params = rscv.best_params_
        stacker = stacker_class(**best_params)
        self._update_log('Stacker tuning completed!  Re-evaluating...')

    self._update_log(
        f'Training Stacking algorithm <b>{".".join(stacker_full_class)}</b>'
    )
    skf = StratifiedKFold(n_splits=5, random_state=RANDOM_SEED)
    for train, test in skf.split(x, y):
        with joblib.parallel_backend('dask'):
            stacker.fit(x.iloc[train], y[train])
        final_preds[test] = stacker.predict(x.iloc[test])
    self._update_log('Stacking training complete')
    stack_scores = self.get_model_scores(y, final_preds)

    table_str = '''<table>
        <thead>
            <tr>
                <th>Accuracy</th><th>F1-Score</th><th>Cohen's Kappa</th>
            </tr>
        </thead>
        <tbody>
            <tr>
    '''
    for metric, score in stack_scores.items():
        table_str += '<td style="border: 1px solid #333;">%.2f</td>' % score
    table_str += '</tr></tbody></table><br>'
    self._update_log(table_str, False, True)

    self._update_log('Retraining Stacker on full dataset')
    stacker.fit(x, y)
    save_path = os.path.join(col_path, 'Stacker')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    save_file = os.path.join(save_path, 'Stacker.pkl')
    self._update_log(f'Saving Stacking algorithm to : {save_file}', False)
    joblib.dump(stacker, save_file, compress=1)
    self.model_checksums['Stacker'] = hashlib.md5(
        open(save_file, 'rb').read()).hexdigest()
    self._update_log(f'Stacking hash: {self.model_checksums["Stacker"]}')

    # Save particulars to file
    col_name = os.path.split(col_path)[-1]
    stacker_info = {
        'column': col_name,
        'version_directory': self.version_directory,
        'last_train_date': time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime()),
        'train_eval_score': stack_scores,
        'model_checksums': self.model_checksums
    }
    stacker_json_save_file = os.path.join(save_path, 'Stacker.json')
    with open(stacker_json_save_file, 'w') as outfile:
        json.dump(stacker_info, outfile, indent=2)

    # Append the truth column, then compute per-sample agreement and
    # bamboozled scores across the predictor columns.
    x[col_name + TRUTH_LABEL_SUFFIX] = y
    agreement_ratios = x.apply(get_ratio, axis=1)
    bamboozled = x.apply(get_bamboozled_score, axis=1)

    x[col_name + TAG_DELIMITER + 'agreement_ratio'] = agreement_ratios
    x[col_name + TAG_DELIMITER + 'bamboozled_score'] = bamboozled
    pc_len = len(x[x[col_name + TAG_DELIMITER + 'agreement_ratio'] <=
                   DISAGREEMENT_THRESHOLD])
    bamboozled_len = len(x[x[col_name + TAG_DELIMITER +
                             'bamboozled_score'] <= BAMBOOZLED_THRESHOLD])
    self._update_log(
        f"Found {pc_len} samples for {col_name} that fall at or below the {DISAGREEMENT_THRESHOLD} predictor agreement."
    )
    self._update_log(
        f"Found {bamboozled_len} samples for {col_name} that have a bamboozled score of {BAMBOOZLED_THRESHOLD} or below."
    )

    x[col_name + STACKER_LABEL_SUFFIX] = final_preds
    self.all_predictions_df = pd.merge(self.all_predictions_df,
                                       x,
                                       how='outer',
                                       left_index=True,
                                       right_index=True)
    self._update_log('Run complete')
    self._update_log('<hr>', False, True)
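
# A toy, self-contained illustration of the two row metrics above. The last
# entry of a row is the appended ground truth; the values are invented.
import pandas as pd

row = pd.Series([1, 1, 0, 1])  # three model predictions + truth label (1)

agreement = sum(p == row.iloc[-1] for p in row.iloc[:-1]) / len(row.iloc[:-1])
bamboozled = len(row.iloc[:-1]) - sum(p != row.iloc[-1] for p in row.iloc[:-1])

print(agreement)   # 0.666... -> two of three predictors match the truth
print(bamboozled)  # 2 -> count of correct models; low means many were fooled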
def save_tpot_params_to_file(self, pipeline, model_param_path, score_dict):
    try:
        model = 'TPOTClassifier'
        model_path = os.path.join(model_param_path, model + '.json')
        if not os.path.isfile(model_path):
            # Get default values
            model_path = os.path.join(
                CONFIG.get('PATHS', 'DefaultModelDirectory'), model,
                model + '.json')

        with open(model_path, 'r') as param_file:
            model_params = json.load(param_file)

        best_params = pipeline.get_params()
        tpot_params = model_params['tpot_params']
        # * Remove any models under params that are not TfidfVectorizers
        for param_type in list(model_params['params'].keys()):
            param_key = param_type.split('.')[1]
            if param_key != 'feature_extraction':
                del model_params['params'][param_type]
        # * Update tfidf params to the best
        for param_type, parameters in model_params['params'].items():
            param_key = param_type.split('.')[-1]
            for k, v in best_params.items():
                best_param_key = k.split('__')[-1]
                if k.startswith(param_key) and best_param_key in parameters.keys():
                    parameters[best_param_key] = v

        current_time = time.localtime()
        model_params['meta']['training_meta'].update({
            'last_train_date':
            time.strftime('%Y-%m-%d %H:%M:%S', current_time),
            'train_eval_score':
            score_dict,
            'checksum':
            self.model_checksums[model]
        })
        if self.tune_models:
            model_params['meta']['tuning_meta'].update({
                'last_tune_date':
                time.strftime('%Y-%m-%d %H:%M:%S', current_time),
                'n_iter':
                self.tuning_params['gridsearch']['n_iter'],
                'tuning_duration':
                self.grid_search_time,
                'tune_eval_score':
                score_dict
            })
        # * Now to get the new model parameters
        for name, obj in pipeline.named_steps.items():
            if name == 'TfidfVectorizer':
                continue
            module_name = str(obj.__class__).split("'")[1]
            module_params = obj.get_params()
            model_params['params'].update({module_name: module_params})
        model_params['tpot_params'] = tpot_params
        with open(os.path.join(model_param_path, model + '.json'),
                  'w') as outfile:
            json.dump(model_params, outfile, indent=2, cls=CATEncoder)
    except FileNotFoundError:
        self.logger.debug(
            'ModelTrainer.save_tpot_params_to_file {} not found'.format(
                model_path))
    except Exception:
        self.logger.error(
            'ModelTrainer.save_tpot_params_to_file {}:'.format(model),
            exc_info=True)
        tb = traceback.format_exc()
        print(tb)
def grid_search(self,
                model,
                x,
                y,
                pipeline,
                tuning_params,
                n_jobs=-1,
                n_iter=20,
                scoring=None,
                include_tfidf=False,
                keras_params=None):
    '''Performs grid search on selected pipeline.
        # Arguments
            model: string, name of classifier in pipeline
            x: pandas.DataFrame, training data
            y: numpy.array, training labels
            pipeline: sklearn.pipeline.Pipeline, pipeline object containing
                feature extractors, feature selectors, and estimator
            tuning_params: dict, gridsearch parameters (n_jobs, cv, scoring)
            n_jobs: int, number of jobs to run in parallel
            n_iter: int, number of iterations to perform search
            scoring: list, scoring metrics to be used by the evaluator
            include_tfidf: bool, flag to indicate tfidf is included in
                the pipeline
            keras_params: dict, parameters necessary for model training
                outside of the regular hyperparams, e.g. input_shape,
                num_classes, num_features
    '''
    try:
        start_time = time.time()
        filepath = os.path.join(CONFIG.get('PATHS', 'BaseModelDirectory'),
                                model + '.json')
        with open(filepath, 'r') as f:
            model_data = json.load(f, object_hook=cat_decoder)

        grid_params = {}
        default_params = model_data[model]
        # Build a parameter distribution for every tunable hyperparameter.
        for param_types, types in default_params.items():
            for t, params in types.items():
                if params['tunable']:
                    param_name = model + '__' + t
                    if params['type'] == 'dropdown':
                        param_options = list(params['options'].values())
                    elif params['type'] == 'double':
                        param_options = scipy.stats.expon(
                            scale=params['step_size'])
                    elif params['type'] == 'int':
                        param_options = scipy.stats.randint(
                            params['min'], params['max'] + 1)
                    elif params['type'] == 'range':
                        param_options = [(1, 1), (1, 2), (1, 3), (1, 4)]
                    else:
                        param_options = None
                    grid_params.update({param_name: param_options})
                else:
                    continue

        if include_tfidf:
            with open(CONFIG.get('PATHS', 'BaseTfidfDirectory'),
                      'r') as f:
                model_data = json.load(f, object_hook=cat_decoder)
            model_class = model_data['model_class']
            default_params = model_data[model_class]
            for param_types, types in default_params.items():
                for t, params in types.items():
                    if params['tunable']:
                        param_name = model_class + '__' + t
                        if params['type'] == 'dropdown':
                            param_options = list(
                                params['options'].values())
                        elif params['type'] == 'double':
                            param_options = scipy.stats.expon(
                                scale=params['step_size'])
                        elif params['type'] == 'int':
                            param_options = scipy.stats.randint(
                                params['min'], params['max'] + 1)
                        elif params['type'] == 'range':
                            param_options = [(1, 1), (1, 2), (1, 3),
                                             (1, 4)]
                        else:
                            param_options = None
                        grid_params.update({param_name: param_options})
                    else:
                        continue

        self._update_log(f'Beginning RandomizedSearchCV on {model}...')
        rscv = RandomizedSearchCV(
            pipeline,
            grid_params,
            n_jobs=tuning_params['gridsearch']['n_jobs']
            if tuning_params['gridsearch']['n_jobs'] != 0 else None,
            cv=tuning_params['gridsearch']['cv'],
            n_iter=n_iter,
            pre_dispatch=CONFIG.get('VARIABLES', 'PreDispatch'),
            verbose=CONFIG.getint('VARIABLES',
                                  'RandomizedSearchVerbosity'),
            scoring=tuning_params['gridsearch']['scoring']
            if len(tuning_params['gridsearch']['scoring']) > 0 else None,
            refit='accuracy')
        # ! FIXME: Should we allow other, non-accuracy metrics here?
        with joblib.parallel_backend('dask'):
            rscv.fit(x, y)
        self.grid_search_time = time.time() - start_time
        self._update_log(
            f'RandomizedSearchCV on {model} completed in {self.grid_search_time}'
        )
        self._update_log(f'Best score for {model}: {rscv.best_score_}',
                         False)
        return rscv
    except FileNotFoundError:
        self.logger.debug(
            'ModelTrainer.grid_search {} not found'.format(filepath))
    except Exception:
        self.logger.error('ModelTrainer.grid_search {}:'.format(model),
                          exc_info=True)
        tb = traceback.format_exc()
        print(tb)
        self._update_log(tb)
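
# A toy, self-contained version of the sampling strategy built above: the
# 'double' and 'int' branches hand scipy.stats distributions to
# RandomizedSearchCV, which draws a fresh value per iteration instead of
# exhausting a grid. The estimator and data here are stand-ins.
import scipy.stats
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

x, y = make_classification(n_samples=200, random_state=0)
grid_params = {
    'C': scipy.stats.expon(scale=1.0),         # continuous ('double') param
    'max_iter': scipy.stats.randint(50, 201),  # integer ('int') param
}
rscv = RandomizedSearchCV(LogisticRegression(), grid_params,
                          n_iter=5, cv=3, random_state=0)
rscv.fit(x, y)
print(rscv.best_params_)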
def run(self):
    self._update_log('Beginning ModelTrain run')
    # * Run through an enumeration of columns.  The second argument to
    # * enumerate tells Python where to begin the index count; the 1 used
    # * here gives the offset of each label column from its text column.
    try:
        for col_idx, col in enumerate(self.training_data.columns, 1):
            if col.endswith(self.tag_suffix):
                self._update_log(f'Current classification task: {col}',
                                 False)
                col_label = col.split(
                    CONFIG.get('VARIABLES', 'TagDelimiter'))[0]
                col_path = os.path.join(self.version_directory, col_label)
                # * Find and drop any samples missing an index
                missing_idx_count = self.training_data.index.isna().sum()
                if missing_idx_count > 0:
                    self._update_log(
                        f"<b>Found {missing_idx_count} samples missing a value for index</b> "
                        f"(index_col = {CONFIG.get('VARIABLES', 'IndexColumn')}).  "
                        "Removing those samples...")
                    valid_indexes = self.training_data.index.dropna()
                    self.training_data = self.training_data[
                        self.training_data.index.isin(valid_indexes)]
                    self._update_log(
                        f'Shape of dataset after removal: {self.training_data.shape}'
                    )
                # * Fill na samples with 'unanswered' text and a score of 0
                label_col_name = self.training_data.columns[col_idx]
                self.training_data.fillna(value={
                    col: 'unanswered',
                    label_col_name: 0
                },
                                          inplace=True)
                x = self.training_data[col].copy()
                y = self.training_data[label_col_name].copy().values

                results = pd.DataFrame(index=self.training_data.index)
                results[TRUTH_LABEL_SUFFIX] = y
                preds = np.empty(y.shape)
                probs = np.empty(shape=(y.shape[0], len(np.unique(y))))

                # * Initialize sklearn evaluation parameters
                sk_eval_type = self.training_eval_params['sklearn']['type']
                sk_eval_value = self.training_eval_params['sklearn'][
                    'value']
                # * SKLEARN
                for model, selected in self.selected_models[
                        'sklearn'].items():
                    if self._is_running == False:
                        self.signals.training_complete.emit(pd.DataFrame())
                        break
                    if selected:
                        try:
                            if self.tune_models:
                                self._tune_model(x, y, model, col_path)
                            model_params = self.get_params_from_file(
                                model, col_path)
                            self._update_log(f'Begin training {model}')
                            pipeline = Pipeline(
                                self.get_pipeline(model_params['params']))
                            try:
                                if sk_eval_type == 'cv':
                                    skf = StratifiedKFold(
                                        n_splits=sk_eval_value,
                                        random_state=RANDOM_SEED)
                                    for train, test in skf.split(x, y):
                                        with joblib.parallel_backend(
                                                'dask'):
                                            preds[test] = pipeline.fit(
                                                x.iloc[train],
                                                y[train]).predict(
                                                    x.iloc[test])
                                        if self.use_proba and hasattr(
                                                pipeline,
                                                'predict_proba'):
                                            try:
                                                probs[test] = pipeline.predict_proba(
                                                    x.iloc[test])
                                            except AttributeError:
                                                self.logger.debug(
                                                    '{} does not support predict_proba'
                                                    .format(model))
                                                print(
                                                    model,
                                                    'does not support predict_proba'
                                                )
                                        else:
                                            probs = np.array([])
                                elif sk_eval_type == 'test_split':
                                    x_train, x_test, y_train, y_test = train_test_split(
                                        x,
                                        y,
                                        test_size=sk_eval_value,
                                        stratify=y,
                                        random_state=RANDOM_SEED)
                                    preds = np.empty(len(y_test))
                                else:
                                    self._update_log(
                                        'No evaluation type chosen.')
                            except (KeyboardInterrupt, SystemExit):
                                raise
                            except Exception:
                                msg = ('{} threw an exception during fit.  '
                                       'Possible error with joblib '
                                       'multithreading.').format(model)
                                self.logger.warning(msg, exc_info=True)
                                tb = traceback.format_exc()
                                print(tb)
                                self._update_log(msg, True, False)
                            model_scores = self.get_model_scores(y, preds)

                            self._update_log(
                                f'Task completed on <b>{model}</b>.')
                            table_str = '''<table>
                                <thead>
                                    <tr>
                                        <th>Accuracy</th><th>F1-Score</th><th>Cohen's Kappa</th>
                                    </tr>
                                </thead>
                                <tbody>
                                    <tr>
                            '''
                            for metric, score in model_scores.items():
                                table_str += '<td style="border: 1px solid #333;">%.2f</td>' % score
                            table_str += '</tr></tbody></table><br>'
                            if sk_eval_type is not None:
                                self._update_log(table_str, False, True)

                            self._update_log(
                                f'Training {model} on full dataset')
                            with joblib.parallel_backend('dask'):
                                pipeline.fit(x, y)

                            pred_col_name = col_label + TAG_DELIMITER + model + PRED_LABEL_SUFFIX
                            prob_col_name = col_label + TAG_DELIMITER + model + PROB_LABEL_SUFFIX
                            results[pred_col_name] = preds.astype(int)
                            # If predicting probabilities and the probability
                            # array has values, use those values for the
                            # results.
                            if self.use_proba and probs.size:
                                results[prob_col_name] = np.amax(probs,
                                                                 axis=1)

                            save_path = os.path.join(col_path, model)
                            if not os.path.exists(save_path):
                                os.makedirs(save_path)
                            self.save_model(model, pipeline, save_path,
                                            model_scores)
                        except (KeyboardInterrupt, SystemExit):
                            raise
                        except Exception:
                            self.logger.error(
                                f'ModelTrainer.run {model}:',
                                exc_info=True)
                            tb = traceback.format_exc()
                            print(tb)
                            self._update_log(tb)
                # __TENSORFLOW models would be trained here.
                try:
                    if self.train_stacking_algorithm and self._is_running:
                        self.train_stacker(
                            results.drop(TRUTH_LABEL_SUFFIX, axis=1),
                            results[TRUTH_LABEL_SUFFIX].values, col_path)
                    else:
                        self._update_log('Skipping Stacker training.')
                except ValueError:
                    self.signals.training_complete.emit(pd.DataFrame())
                    self._update_log(
                        f'Unable to train Stacking algorithm on {col_label}.'
                    )
                    tb = traceback.format_exc()
                    print(tb)
                except Exception:
                    self.logger.error(f'ModelTrainer.run {col_label}:',
                                      exc_info=True)
                    tb = traceback.format_exc()
                    print(tb)
                    self._update_log(tb)
        self._is_running = False
        self.signals.training_complete.emit(self.all_predictions_df)
    except Exception:
        self.signals.training_complete.emit(pd.DataFrame())
        self.logger.error('ModelTrainer.run (General):', exc_info=True)
        tb = traceback.format_exc()
        print(tb)
        self._update_log(tb)
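
# A self-contained sketch of the cross-validated evaluation pattern above:
# each fold's model predicts only its held-out slice, so `preds` ends up with
# one out-of-fold prediction per sample. The estimator and data are stand-ins.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

x, y = make_classification(n_samples=120, random_state=0)
preds = np.empty(y.shape)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)
for train, test in skf.split(x, y):
    model = LogisticRegression().fit(x[train], y[train])
    preds[test] = model.predict(x[test])

print((preds == y).mean())  # out-of-fold accuracy over the full dataset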
import package.utils.training_utils as tu
from package.utils.catutils import CATEncoder, cat_decoder, exceptionWarning
from package.utils.config import CONFIG
import package.utils.embedding_utils as embed_utils
# import package.utils.keras_models as keras_models
# import package.utils.SequenceTransformer as seq_trans

RANDOM_SEED = 1337
TOP_K = 20000
MAX_SEQUENCE_LENGTH = 1500
BASE_MODEL_DIR = './package/data/base_models'
BASE_TFIDF_DIR = './package/data/feature_extractors/TfidfVectorizer.json'
INPUT_SHAPE = (0, 0)

TAG_DELIMITER = CONFIG.get('VARIABLES', 'TagDelimiter')
PRED_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'PredictedLabelSuffix')
PROB_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'ProbabilityLabelSuffix')
TRUTH_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'TruthLabelSuffix')
STACKER_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'StackerLabelSuffix')
DISAGREEMENT_THRESHOLD = CONFIG.getfloat('VARIABLES',
                                         'DisagreementThreshold')
BAMBOOZLED_THRESHOLD = CONFIG.getint('VARIABLES', 'BamboozledThreshold')


class ModelTrainerSignals(QObject):
    training_complete = pyqtSignal(pd.DataFrame)
    tuning_complete = pyqtSignal(bool, dict)
    update_progressbar = pyqtSignal(int, bool)
    update_training_logger = pyqtSignal(str, bool, bool)
from chardet.universaldetector import UniversalDetector

from package.utils.catutils import exceptionWarning, clearLayout
from package.utils.preprocess_text import processText, get_avg_words_per_sample
from package.utils.spellcheck import SpellCheck
from package.utils.DataframeTableModel import DataframeTableModel
from package.utils.AttributeTableModel import AttributeTableModel
from package.utils.GraphWidget import GraphWidget
from package.utils.config import CONFIG
"""DataLoader imports a CSV file and returns a dataframe with the
appropriate columns.  For training data, DataLoader will consider the nth
column as a training sample and the nth+1 column as ground truth.
CSV files must be formatted accordingly.
"""
TAG_DELIMITER = CONFIG.get('VARIABLES', 'TagDelimiter')
DATA_COLUMN_SUFFIX = CONFIG.get('VARIABLES', 'TrainingDataColumnSuffix')
TRUTH_SUFFIX = CONFIG.get('VARIABLES', 'TruthLabelSuffix')


class DataLoader(QWidget):
    """
    TODO: Refactor this monstrosity into functions to set up the UI
    """
    data_load = pyqtSignal(pd.DataFrame)
    update_statusbar = pyqtSignal(str)
    update_progressbar = pyqtSignal(int, bool)

    def __init__(self, parent=None):
        super(DataLoader, self).__init__(parent)
        self.logger = logging.getLogger(__name__)
        # self.logger.setLevel(logging.DEBUG)
import errno

import pandas as pd
from PyQt5.QtCore import Qt, pyqtSlot, pyqtSignal
from PyQt5.QtWidgets import (QApplication, QHBoxLayout, QDialog, QHeaderView,
                             QAction, QMainWindow, QSizePolicy, QProgressBar,
                             QWidget, QVBoxLayout, QFormLayout, QGroupBox,
                             QLineEdit, QLabel, QDialogButtonBox, QMessageBox,
                             QComboBox, QPushButton)

from package.train.TrainWidget import TrainWidget
from package.utils.catutils import exceptionWarning
from package.utils.config import CONFIG

VERSION_BASE_DIR = CONFIG.get('PATHS', 'BaseVersionDirectory')
DEFAULT_QUESTION_LABELS = CONFIG.get('VARIABLES',
                                     'DefaultQuestionLabels').split(',')


class CatTrain(QMainWindow):
    """
    The central widget for the training component of CATScore.
    Most of the functionality is contained in this class.
    """
    def __init__(self, parent=None):
        super(CatTrain, self).__init__(parent)
        self.logger = logging.getLogger(__name__)
        self.title = 'CAT Train'
        self.setWindowTitle(self.title)
        geometry = QApplication.desktop().availableGeometry(self)
        parent_left = self.parent().geometry().left()
        parent_top = self.parent().geometry().top()