Example #1
    def chartSingleClassFrequency(self, data):
        """Display a bar chart of frequencies per label
            # Arguments
                data: list, List of integer values corresponding to the actual
                question score.
        """

        num_classes = self.getNumClasses(data)
        count_map = Counter(data)
        counts = [count_map[i] for i in range(num_classes)]
        total_count = sum(counts)
        majority_class_count = max(counts)
        majority_acc = round((majority_class_count / total_count), 2)
        idx = np.arange(num_classes)
        colors = []
        for count in counts:
            if count < (total_count * CONFIG.getfloat('VARIABLES', 'MinorityClassThreshold')):
                colors.append('r')
            else:
                colors.append('b')
        self.axes.cla()
        self.axes.bar(idx, counts, color=colors)
        self.axes.set_xlabel('Class')
        self.axes.set_ylabel('Number of Samples')
        self.axes.set_xticks(idx)
        self.axes.set_title(f"Majority class accuracy: {majority_acc}")
        rects = self.axes.patches
        for rect, label in zip(rects, counts):
            height = rect.get_height()
            self.axes.text(rect.get_x() + rect.get_width() / 2,
                           height + 5, label, ha='center', va='bottom')
        self.draw()
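
The same frequency chart can be reproduced outside the widget with plain Matplotlib. A minimal standalone sketch, using a hard-coded 0.2 minority threshold in place of the CONFIG lookup and a toy label list:

from collections import Counter

import matplotlib.pyplot as plt
import numpy as np

data = [0, 0, 0, 1, 2, 2, 2, 2, 1, 0, 2, 2]  # toy question scores
MINORITY_THRESHOLD = 0.2  # stand-in for CONFIG's MinorityClassThreshold

counts = [Counter(data)[i] for i in range(max(data) + 1)]
total = sum(counts)
idx = np.arange(len(counts))
# Classes holding less than the threshold share of samples are drawn in red.
colors = ['r' if c < total * MINORITY_THRESHOLD else 'b' for c in counts]

fig, ax = plt.subplots()
ax.bar(idx, counts, color=colors)
ax.set_xlabel('Class')
ax.set_ylabel('Number of Samples')
ax.set_xticks(idx)
ax.set_title(f'Majority class accuracy: {max(counts) / total:.2f}')
# Annotate each bar with its raw count.
for rect, label in zip(ax.patches, counts):
    ax.text(rect.get_x() + rect.get_width() / 2, rect.get_height(),
            label, ha='center', va='bottom')
plt.show()
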
Example #2
    def run(self):
        self._update_log('Beginning ModelTrain run')
        # * Run through the enumeration of columns.  The second argument to
        # * enumerate tells Python where to begin the index count; here, 1 for our offset.
        try:
            for col_idx, col in enumerate(self.training_data.columns, 1):
                if col.endswith(self.tag_suffix):
                    self._update_log(f'Current classification task: {col}',
                                     False)
                    col_label = col.split(
                        CONFIG.get('VARIABLES', 'TagDelimiter'))[0]
                    col_path = os.path.join(self.version_directory, col_label)
                    # * Find and drop any samples missing an index
                    missing_idx_count = self.training_data.index.isna().sum()
                    if missing_idx_count > 0:
                        self._update_log(
                            f"<b>Found {missing_idx_count} samples missing a value for index</b> "
                            f"(index_col = {CONFIG.get('VARIABLES', 'IndexColumn')}).  Removing those samples..."
                        )
                        self.training_data = self.training_data[
                            self.training_data.index.notna()]
                        self._update_log(
                            f'Shape of dataset after removal: {self.training_data.shape}'
                        )
                    # * Fill NA samples with 'unanswered' and a score of 0
                    label_col_name = self.training_data.columns[col_idx]
                    fill_dict = {col: 'unanswered', label_col_name: 0}
                    self.training_data.fillna(value=fill_dict, inplace=True)
                    x = self.training_data[col].copy()
                    y = self.training_data[label_col_name].copy().values

                    results = pd.DataFrame(index=self.training_data.index)
                    results[TRUTH_LABEL_SUFFIX] = y
                    preds = np.empty(y.shape)
                    probs = np.empty(shape=(y.shape[0], len(np.unique(y))))

                    # * Initialize sklearn evaluation parameters
                    sk_eval_type = self.training_eval_params['sklearn']['type']
                    sk_eval_value = self.training_eval_params['sklearn'][
                        'value']
                    # * SKLEARN
                    for model, selected in self.selected_models[
                            'sklearn'].items():
                        if not self._is_running:
                            self.signals.training_complete.emit(pd.DataFrame())
                            break
                        if selected:
                            try:
                                if self.tune_models:
                                    self._tune_model(x, y, model, col_path)
                                model_params = self.get_params_from_file(
                                    model, col_path)
                                self._update_log(f'Begin training {model}')
                                pipeline = Pipeline(
                                    self.get_pipeline(model_params['params']))
                                try:
                                    if sk_eval_type == 'cv':
                                        # shuffle must be enabled for
                                        # random_state to take effect
                                        skf = StratifiedKFold(
                                            n_splits=sk_eval_value,
                                            shuffle=True,
                                            random_state=RANDOM_SEED)
                                        for train, test in skf.split(x, y):
                                            with joblib.parallel_backend(
                                                    'dask'):
                                                preds[test] = pipeline.fit(
                                                    x.iloc[train],
                                                    y[train]).predict(
                                                        x.iloc[test])
                                            if self.use_proba and hasattr(
                                                    pipeline, 'predict_proba'):
                                                try:
                                                    probs[test] = pipeline.predict_proba(
                                                        x.iloc[test])
                                                except AttributeError:
                                                    self.logger.debug(
                                                        f'{model} does not support predict_proba')
                                                    print(model,
                                                          'does not support predict_proba')
                                            else:
                                                probs = np.array([])
                                    elif sk_eval_type == 'test_split':
                                        # random_state must be an int, so use
                                        # getint rather than getfloat here
                                        x_train, x_test, y_train, y_test = train_test_split(
                                            x,
                                            y,
                                            test_size=sk_eval_value,
                                            stratify=y,
                                            random_state=CONFIG.getint(
                                                'VARIABLES', 'RandomSeed'))
                                        # Fit on the training split and predict
                                        # the held-out split for evaluation
                                        with joblib.parallel_backend('dask'):
                                            pipeline.fit(x_train, y_train)
                                            preds = pipeline.predict(x_test)
                                        if self.use_proba and hasattr(
                                                pipeline, 'predict_proba'):
                                            probs = pipeline.predict_proba(
                                                x_test)
                                        else:
                                            probs = np.array([])
                                    else:
                                        self._update_log(
                                            'No evaluation type chosen.')
                                except (KeyboardInterrupt, SystemExit):
                                    raise
                                except Exception:
                                    self.logger.warning(
                                        f'{model} threw an exception during fit. '
                                        'Possible error with joblib multithreading.',
                                        exc_info=True)
                                    tb = traceback.format_exc()
                                    print(tb)
                                    self._update_log(
                                        f'{model} threw an exception during fit. '
                                        'Possible error with joblib multithreading.',
                                        True, False)
                                # For a test split, score against the held-out
                                # labels; with CV, preds covers every sample.
                                if sk_eval_type == 'test_split':
                                    model_scores = self.get_model_scores(
                                        y_test, preds)
                                else:
                                    model_scores = self.get_model_scores(
                                        y, preds)

                                self._update_log(
                                    f'Task completed on <b>{model}</b>.')
                                table_str = (
                                    "<table><thead><tr>"
                                    "<th>Accuracy</th><th>F1-Score</th>"
                                    "<th>Cohen's Kappa</th>"
                                    "</tr></thead><tbody><tr>")
                                for metric, score in model_scores.items():
                                    table_str += '<td style="border: 1px solid #333;">%.2f</td>' % score
                                table_str += '</tr></tbody></table><br>'
                                if sk_eval_type is not None:
                                    self._update_log(table_str, False, True)
                                self._update_log(
                                    f'Training {model} on full dataset')
                                with joblib.parallel_backend('dask'):
                                    pipeline.fit(x, y)

                                pred_col_name = col_label + TAG_DELIMITER + model + PRED_LABEL_SUFFIX
                                prob_col_name = col_label + TAG_DELIMITER + model + PROB_LABEL_SUFFIX
                                # With a test split, predictions cover only the
                                # held-out rows, so align them by index.
                                if sk_eval_type == 'test_split':
                                    results.loc[x_test.index,
                                                pred_col_name] = preds.astype(int)
                                    if self.use_proba and probs.size:
                                        results.loc[x_test.index,
                                                    prob_col_name] = np.amax(
                                                        probs, axis=1)
                                else:
                                    results[pred_col_name] = preds.astype(int)
                                    # If predicting probabilities and the
                                    # probability array has values, use them.
                                    if self.use_proba and probs.size:
                                        results[prob_col_name] = np.amax(
                                            probs, axis=1)

                                save_path = os.path.join(col_path, model)
                                if not os.path.exists(save_path):
                                    os.makedirs(save_path)
                                self.save_model(model, pipeline, save_path,
                                                model_scores)
                            except (KeyboardInterrupt, SystemExit):
                                raise
                            except Exception:
                                self.logger.error(f'ModelTrainer.run {model}:',
                                                  exc_info=True)
                                tb = traceback.format_exc()
                                print(tb)
                                self._update_log(tb)
                    # TensorFlow training would reside here
                    try:
                        if self.train_stacking_algorithm and self._is_running:
                            self.train_stacker(
                                results.drop(TRUTH_LABEL_SUFFIX, axis=1),
                                results[TRUTH_LABEL_SUFFIX].values, col_path)
                        else:
                            self._update_log('Skipping Stacker training.')
                    except ValueError:
                        self.signals.training_complete.emit(pd.DataFrame())
                        self._update_log(
                            f'Unable to train Stacking algorithm on {col_label}.'
                        )
                        tb = traceback.format_exc()
                        print(tb)
                    except Exception:
                        self.logger.error(
                            f'ModelTrainer.run (stacker) {col_label}:',
                            exc_info=True)
                        tb = traceback.format_exc()
                        print(tb)
                        self._update_log(tb)
            self._is_running = False
            self.signals.training_complete.emit(self.all_predictions_df)

        except Exception:
            self.signals.training_complete.emit(pd.DataFrame())
            self.logger.error('ModelTrainer.run (General):', exc_info=True)
            tb = traceback.format_exc()
            print(tb)
            self._update_log(tb)
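
The heart of the cross-validation branch above is filling preds with out-of-fold predictions, so every sample is scored by a model that never saw it during fitting. A minimal self-contained sketch of that pattern with synthetic data and plain scikit-learn (no dask backend):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=300, n_classes=3, n_informative=5,
                           random_state=1337)
preds = np.empty_like(y)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)
for train, test in skf.split(X, y):
    # Each test fold is predicted by a model fit only on the other folds.
    model = LogisticRegression(max_iter=1000)
    preds[test] = model.fit(X[train], y[train]).predict(X[test])

print('out-of-fold accuracy:', accuracy_score(y, preds))
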
Example #3
# import package.utils.keras_models as keras_models
# import package.utils.SequenceTransformer as seq_trans

RANDOM_SEED = 1337
TOP_K = 20000
MAX_SEQUENCE_LENGTH = 1500
BASE_MODEL_DIR = './package/data/base_models'
BASE_TFIDF_DIR = './package/data/feature_extractors/TfidfVectorizer.json'
INPUT_SHAPE = (0, 0)

TAG_DELIMITER = CONFIG.get('VARIABLES', 'TagDelimiter')
PRED_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'PredictedLabelSuffix')
PROB_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'ProbabilityLabelSuffix')
TRUTH_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'TruthLabelSuffix')
STACKER_LABEL_SUFFIX = CONFIG.get('VARIABLES', 'StackerLabelSuffix')
DISAGREEMENT_THRESHOLD = CONFIG.getfloat('VARIABLES', 'DisagreementThreshold')
BAMBOOZLED_THRESHOLD = CONFIG.getint('VARIABLES', 'BamboozledThreshold')


class ModelTrainerSignals(QObject):
    training_complete = pyqtSignal(pd.DataFrame)
    tuning_complete = pyqtSignal(bool, dict)
    update_progressbar = pyqtSignal(int, bool)
    update_training_logger = pyqtSignal(str, bool, bool)


class ModelTrainer(QRunnable):
    '''
    QRunnable tasked with running all model training/tuning.
    This could potentially take days to complete.
    '''