Ejemplo n.º 1
0
 def create_version(self):
     """
     Show the dialog and, when accepted, create the new version's directories.

     The version directory is VERSION_BASE_DIR joined with the current
     version name.  One sub-directory is created beneath it for every field
     input widget, named after the widget's text.  On success the
     ``version_created`` signal is emitted with the version directory.
     The form is reset afterwards whether or not creation succeeded.
     """
     if self.exec_() != QDialog.Accepted:
         return
     version_dir = os.path.join(VERSION_BASE_DIR, self.version_name)
     try:
         if not os.path.exists(version_dir):
             os.makedirs(version_dir)
         for widget in self.input_widgets.values():
             field_dir = os.path.join(version_dir, widget.text())
             if not os.path.exists(field_dir):
                 os.makedirs(field_dir)
         self.version_created.emit(version_dir)
     except Exception as e:
         exceptionWarning('Error occured when creating new version.', e, title='Create version exception')
     finally:
         # Clear all per-version state and rebuild the default fields so the
         # dialog starts fresh the next time it is shown.
         self.question_labels = {}
         self.version_name = None
         self.version_name_input.setText('')
         for widget in self.input_widgets.values():
             self.version_form.removeRow(widget)
         self.input_widgets = {}
         self.version_form.update()
         self._generate_fields()
Ejemplo n.º 2
0
    def _update_fields(self, state):
        """
        Add a new field input row to the version form, or drop the last one.

            # Arguments
                state(bool): If truthy, append a field row; otherwise remove
                    the last field row (a form with a single row is left
                    untouched).

            # Returns
                None
        """
        row_count = self.version_form.rowCount()
        if state:
            widget_idx = row_count - 1
            widget_key = str(widget_idx)
            field_label = QLabel('Field ' + str(row_count) + ':')
            field_input = QLineEdit(objectName=widget_key)
            # The index and the widget are bound as lambda defaults so each
            # connection keeps referring to its own row.
            field_input.textChanged.connect(
                lambda _text, x=widget_idx, y=field_input:
                    self._verify_unique_params(
                        x,
                        (y.text() if y.text() != '' else None)
                    )
            )
            self.version_form.insertRow(row_count, field_label, field_input)
            self.input_widgets[widget_key] = field_input
            field_input.setFocus()
        elif row_count == 1:
            return
        else:
            last_input = self.input_widgets[str(row_count - 2)]
            try:
                del self.input_widgets[last_input.objectName()]
                self.version_form.removeRow(row_count - 1)
                self.version_form.update()
            except Exception as e:
                exceptionWarning('Error updating version params.', e, title="Update version warning")
        self._verify_params()
Ejemplo n.º 3
0
 def load_trained_models(self, model_dir, model_checksums, col_name):
     """
     Verify saved models against their checksums and load their metadata.

     For every entry in ``model_checksums`` the MD5 digest of
     ``<model_dir>/<model>/<model>.pkl`` is compared with the saved
     checksum.  Models whose digest differs are reported and skipped.
     For the 'Stacker' model only the model path is added to its existing
     entry in ``self.trained_model_meta[col_name]``.  For every other model
     the training metadata is read from ``<model>.json`` and stored there
     together with the model path.

         # Arguments
             model_dir: string, directory holding one sub-directory per model
             model_checksums: dict, model name -> expected MD5 hex digest
             col_name: string, key into self.trained_model_meta

     Any exception is logged and printed; nothing is raised to the caller.
     """
     try:
         for model, checksum in model_checksums.items():
             model_path = os.path.join(model_dir, model, model + '.pkl')
             # Use a context manager so the file handle is closed right
             # after hashing instead of being left to the garbage collector.
             with open(model_path, 'rb') as model_file:
                 current_chksum = hashlib.md5(model_file.read()).hexdigest()
             if current_chksum != checksum:
                 self.logger.error(
                     "PredictWidget.load_trained_models: "
                     f"Checksums for model {model_path} do not match.  "
                     f"Model checksum: {current_chksum}, Saved checksum: {checksum}")
                 exceptionWarning(
                     f"Checksums for {model_path} are invalid.  Retrain or delete this model.  Skipping... "
                 )
                 continue
             # Update the stacker info with model directory for ease of access later
             if model == 'Stacker':
                 self.trained_model_meta[col_name][model].update(
                     {'model_path': model_path})
                 continue
             model_param_path = os.path.join(model_dir, model,
                                             model + '.json')
             with open(model_param_path, 'r') as infile:
                 model_data = json.load(infile, object_hook=cat_decoder)
             self.trained_model_meta[col_name][model] = model_data['meta'][
                 'training_meta']
             self.trained_model_meta[col_name][model].update(
                 {'model_path': model_path})
     except Exception as e:
         self.logger.error("PredictWidget.load_trained_models",
                           exc_info=True)
         print("Exception {}".format(e))
         tb = traceback.format_exc()
         print(tb)
Ejemplo n.º 4
0
    def update_tuning_params(self, model_base, param, value, scorer=False):
        '''
        Updates the tuning parameters passed to ModelTrainer.

        Missing entries are created on demand: an unknown model_base gets a
        new parameter dict, and a missing 'scoring' entry gets a new list.

            # Arguments
                model_base: string, Signifies which type of tuning parameter to update.  Currently, only
                    used by RandomizedSearchCV (sklearn)
                param: string, parameter name (or scorer name when scorer is True)
                value: [int, float, string], parameter value.  When scorer is True, a truthy
                    value adds param to the 'scoring' list and a falsy value removes it.
                scorer: bool, If True, update the 'scoring' list instead of a plain parameter.

            # Returns
                None.  Nothing happens when model_base or param is None.
        '''
        if model_base is None or param is None:
            return
        try:
            # Index first so a defaultdict can still build its own default;
            # fall back to a plain dict for ordinary mappings.
            try:
                base_params = self.tuning_params[model_base]
            except KeyError:
                base_params = self.tuning_params[model_base] = {}
            if scorer:
                scoring = base_params.setdefault('scoring', [])
                if value:
                    # Guard against listing the same scorer twice.
                    if param not in scoring:
                        scoring.append(param)
                elif param in scoring:
                    scoring.remove(param)
            else:
                base_params[param] = value
        except Exception as e:
            self.logger.error("SelectModelWidget.update_tuning_params",
                              exc_info=True)
            exceptionWarning('Exception occurred when updating tuning parameters.', e)
            tb = traceback.format_exc()
            print(tb)
        print(self.tuning_params)
Ejemplo n.º 5
0
    def display_selected_rows(self, selection=None):
        """
        Updates the stats and label distro plot when a question is selected.

        The selected column's title, sample count and average words per
        sample are shown, and the class frequency chart is drawn from the
        matching label column (column tag + TRUTH_SUFFIX).

            # Attributes
                selection: QItemSelectionModel, item currently selected by user.
                    When falsy, the first row of the column view is selected.
        """
        try:
            if selection:
                idx = selection.indexes()[0]
            else:
                # If no question selected, select the first in the list
                self.available_column_view.selectRow(0)
                self.available_column_view.setFocus()
                idx = QModelIndex(self.available_column_model.index(0, 0))
            col_name = self.available_column_model.data(idx)
            label_col_name = col_name.split(TAG_DELIMITER)[0] + TRUTH_SUFFIX
            self.text_stats_groupbox.setTitle(col_name)
            question_data = self.full_data[col_name].fillna(
                value="unanswered")
            avg_num_words = get_avg_words_per_sample(question_data, question_data.shape[0])
            self.current_question_count.setText(str(question_data.shape[0]))
            self.current_question_avg_word.setText("%.2f" % avg_num_words)

            self.graph.chartSingleClassFrequency(
                self.full_data[label_col_name].values.astype(int))
        except Exception as e:
            self.logger.error("Dataloader.display_selected_rows", exc_info=True)
            # Name this method in the warning so the dialog points at the
            # code that actually failed.
            exceptionWarning(
                "Exception occured.  DataLoader.display_selected_rows.", exception=e)
            tb = traceback.format_exc()
            print(tb)
Ejemplo n.º 6
0
    def train_models(self):
        """
        Build a ModelTrainer from the current selections and queue it on the thread pool.

        The trainer's logger and completion signals are connected to this
        widget, the stop signal and stop button are wired up, the run button
        is disabled and the stored training predictions are reset to an
        empty DataFrame before the trainer is started.  Any exception is
        logged, shown to the user and printed.
        """
        try:
            tune_models = self.tune_models_chkbox.isChecked()
            trainer = ModelTrainer(
                selected_models=self.selected_models,
                version_directory=self.selected_version,
                training_eval_params=self.training_params,
                training_data=self.training_data,
                tune_models=tune_models,
                tuning_params=self.tuning_params)
            self.model_trainer = trainer

            # Signal wiring; the progress bar update sits between the two
            # trainer connections, as before.
            trainer.signals.update_training_logger.connect(
                self.update_training_logger)
            self.update_progressbar.emit(1, True)
            trainer.signals.training_complete.connect(self.training_complete)
            self.comms.stop_training.connect(trainer.stop_thread)

            self.run_btn.setEnabled(False)
            # Keep the lambda: it discards the argument emitted by clicked.
            self.stop_btn.clicked.connect(lambda: self._abort_training())

            self.training_predictions = pd.DataFrame()
            self.threadpool.start(trainer)
        except Exception as e:
            self.logger.error("SelectModelWidget.train_models", exc_info=True)
            exceptionWarning('Exception occured when training models.', e)
            tb = traceback.format_exc()
            print(tb)
Ejemplo n.º 7
0
 def save_data(self):
     """
     Write the selected data to a CSV file chosen by the user.

     Warns and returns when no data is selected.  Nothing is written if the
     save dialog returns an empty file name.
     """
     if self.selected_data.empty:
         exceptionWarning('No data selected')
         return
     file_name, _ = QFileDialog.getSaveFileName(
         self, 'Save to CSV', os.getenv('HOME'), 'CSV(*.csv)')
     if not file_name:
         return
     # quoting=1 is the value of csv.QUOTE_ALL.
     csv_options = {
         'index_label': 'testnum',
         'quoting': 1,
         'encoding': 'utf-8',
     }
     self.selected_data.to_csv(file_name, **csv_options)
     self.update_statusbar.emit("Data saved successfully.")
Ejemplo n.º 8
0
    def _verify_unique_params(self, key, value):
        """
        Checks that field name is unique.  This is necessary as field values
        are used for version directory and data structure.

        The comparison against the existing question labels ignores case.
        On a duplicate the Ok button is disabled and a warning is shown;
        otherwise the remaining parameters are verified.

            # Arguments
                key(String): dict key for appropriate input widget
                value(String or None): field name.  Must be unique per version.
                    None (passed for an empty field) is never a duplicate.
        """
        # An empty field arrives as None; calling .lower() on it would raise
        # AttributeError, so only non-empty names are checked for duplicates.
        if value is not None:
            candidate = value.lower()
            if any(candidate == label.lower()
                   for label in self.question_labels.values()):
                self.buttonBox.button(QDialogButtonBox.Ok).setEnabled(False)
                exceptionWarning('Field names must be unique!')
                return
        try:
            self._verify_params()
        except Exception as e:
            exceptionWarning('Error updating version params.', e, title="Update version warning")
Ejemplo n.º 9
0
 def get_ratio(row):
     """
     Returns the fraction of values in a row that equal the row's last value.

     Every value except the last is compared with the last one; the result
     is the number of matches divided by the number of compared values.
     Returns 0 when there are no values to compare.  Any other failure is
     logged and reported, and None is returned implicitly.
     """
     try:
         reference = row.iloc[-1]
         others = row.iloc[:-1]
         matches = sum((1.0 for item in others if item == reference), 0.0)
         return matches / float(len(others))
     except ZeroDivisionError:
         return 0
     except Exception as e:
         self.logger.error("ModelTrainer.get_ratio", exc_info=True)
         exceptionWarning(
             'Exception occured in ModelTrainer.get_ratio.', repr(e))
Ejemplo n.º 10
0
 def get_bamboozled_score(row):
     """
     Returns the number of values in a row, excluding the last, minus how
     many of them differ from the last value.

     NOTE(review): the original description calls higher values "more
     bamboozling"; as computed, the result is the count of values that
     match the last one - confirm the intended interpretation.

     Any failure is logged and reported, and None is returned implicitly.
     """
     try:
         reference = row.iloc[-1]
         others = row.iloc[:-1]
         disagreements = sum(1 for item in others if item != reference)
         return len(others) - disagreements
     except Exception as e:
         self.logger.error(
             "ModelTrainer.get_bamboozled_score", exc_info=True)
         exceptionWarning(
             'Exception occured in ModelTrainer.get_bamboozled_score.', repr(e))
Ejemplo n.º 11
0
 def save_predictions(self):
     '''
     Write the training predictions to a CSV file chosen through a QFileDialog.

     Warns and returns when there are no predictions.  Nothing is written
     if the dialog returns an empty file name.  Permission errors and any
     other exception are logged and shown to the user.
     '''
     try:
         if self.training_predictions.empty:
             exceptionWarning('No predictions to save')
             return
         file_name, _ = QFileDialog.getSaveFileName(
             self, 'Save to CSV', os.getenv('HOME'), 'CSV(*.csv)')
         if not file_name:
             return
         # quoting=1 is the value of csv.QUOTE_ALL.
         csv_options = dict(index_label='testnum', quoting=1, encoding='utf-8')
         self.training_predictions.to_csv(file_name, **csv_options)
         self.comms.update_statusbar.emit("Predictions saved successfully.")
     except PermissionError:
         self.logger.warning("SelectModelWidget.save_predictions",
                             exc_info=True)
         exceptionWarning(
             f'Permission denied while attempting to save {file_name}')
     except Exception as e:
         self.logger.error("SelectModelWidget.save_predictions",
                           exc_info=True)
         exceptionWarning(
             "Exception occured.  SelectModelWidget.save_predictions.",
             exception=e)
         print(traceback.format_exc())
Ejemplo n.º 12
0
 def save_predictions(self):
     """
     Merge prediction columns into the full data and save it as CSV.

     Warns and returns when there are no predictions.  Once a file name is
     chosen, every prediction column whose name ends with 'predicted' or
     'ratio' is copied into ``self.full_data``, which is then written to
     the file.  Permission errors and any other exception are logged and
     shown to the user.
     """
     try:
         if self.predictions.empty:
             exceptionWarning('No predictions to save')
             return
         file_name, _ = QFileDialog.getSaveFileName(
             self, 'Save to CSV', os.getenv('HOME'), 'CSV(*.csv)')
         if not file_name:
             return
         for col in self.predictions.columns:
             if col.endswith(('predicted', 'ratio')):
                 self.full_data[col] = self.predictions[col]
         # quoting=1 is the value of csv.QUOTE_ALL.
         csv_options = dict(index_label='testnum', quoting=1, encoding='utf-8')
         self.full_data.to_csv(file_name, **csv_options)
         self.comms.update_statusbar.emit("Predictions saved successfully.")
     except PermissionError:
         self.logger.warning("PredictWidget.save_predictions",
                             exc_info=True)
         exceptionWarning(
             f'Permission denied while attempting to save {file_name}')
     except Exception as e:
         self.logger.error("PredictWidget.save_predictions", exc_info=True)
         exceptionWarning(
             "Exception occured.  PredictWidget.save_predictions.",
             exception=e)
         print(traceback.format_exc())
Ejemplo n.º 13
0
 def copy_version(self):
     """
     Copy version specified by the user.

     Shows the dialog and, when accepted, copies the selected version to
     VERSION_BASE_DIR joined with the name typed by the user.
     ``version_copied`` is emitted after a successful directory copy.  The
     new version is added to the version selection only when something was
     actually copied, so a failed copy (for example because the
     destination already exists) no longer leaves a dead entry behind.
     """
     if self.exec_() != QDialog.Accepted:
         return
     src_dir = self.version_selection.currentData()
     new_version_name = self.version_name_input.text()
     dest_dir = os.path.join(VERSION_BASE_DIR, new_version_name)
     copied = False
     try:
         shutil.copytree(src_dir, dest_dir)
         copied = True
         self.version_copied.emit(dest_dir)
     except OSError as e:
         if e.errno == errno.ENOTDIR:
             # The source is a single file rather than a directory tree.
             try:
                 shutil.copy(src_dir, dest_dir)
                 copied = True
             except OSError as copy_error:
                 exceptionWarning(f'Unable to copy {src_dir} to {dest_dir}', copy_error, title='Copy version exception')
         else:
             exceptionWarning(f'Unable to copy {src_dir} to {dest_dir}', e, title='Copy version exception')
             tb = traceback.format_exc()
             print(tb)
     except Exception as e:
         exceptionWarning('Error occured when copying version.', e, title='Copy version exception')
     finally:
         self.version_name = None
         if copied:
             self.version_selection.addItem(new_version_name, dest_dir)
Ejemplo n.º 14
0
 def save_data(self):
     """
     Write the selected (processed) data to a CSV file chosen by the user.

     Warns and returns when no data is selected.  Nothing is written if the
     save dialog returns an empty file name.  Permission errors and any
     other exception are logged and shown to the user.
     """
     try:
         if self.selected_data.empty:
             exceptionWarning('No data selected')
             return
         file_name, _ = QFileDialog.getSaveFileName(
             self, 'Save to CSV', os.getenv('HOME'), 'CSV(*.csv)')
         if not file_name:
             return
         # quoting=1 is the value of csv.QUOTE_ALL.
         csv_options = dict(index_label='testnum', quoting=1, encoding='utf-8')
         self.selected_data.to_csv(file_name, **csv_options)
         self.comms.update_statusbar.emit("Processed data saved successfully.")
     except PermissionError:
         self.logger.warning("PredictWidget.save_data", exc_info=True)
         exceptionWarning(
             f'Permission denied while attempting to save {file_name}')
     except Exception as e:
         self.logger.error("PredictWidget.save_data", exc_info=True)
         exceptionWarning("Exception occured.  PredictWidget.save_data.",
                          exception=e)
         print(traceback.format_exc())
Ejemplo n.º 15
0
    def setup_model_selection_ui(self):
        """
        Setup model selection ui.

        Fills the version combo box with the sub-directories of
        BASE_VERSION_DIR, loads the base TF-IDF and feature selection
        parameters, and creates a dialog, button and checkbox for every
        ``*.json`` model definition found in the base model directory.
        Models whose ``model_base`` is 'tensorflow' are skipped.

        If the TF-IDF parameters cannot be loaded the failure is reported
        once and no model dialogs are created.

        The order of the parameters in ModelDialog matters.  model_data must come first!
        """
        self.version_selection_label = QLabel("Select version: ")
        self.version_selection = QComboBox(objectName='version_select')
        self.version_selection.setMinimumWidth(100)
        # Default models are saved in their own unique directory, so no
        # 'default' entry is added to the version list here.
        for version in os.listdir(BASE_VERSION_DIR):
            v_path = os.path.join(BASE_VERSION_DIR, version)
            if os.path.isdir(v_path):
                self.version_selection.addItem(version, v_path)
        self.version_selection.currentIndexChanged.connect(
            lambda x, y=self.version_selection: self._update_version(
                y.currentData()))
        self.version_form.addRow(self.version_selection_label,
                                 self.version_selection)

        # Load base TF-IDF features
        # and feature selection data
        tfidf_data = None
        tfidf_loaded = False
        try:
            with open(CONFIG.get('PATHS', 'BaseTfidfDirectory'), 'r') as f:
                tfidf_data = json.load(f)
            tfidf_loaded = True
        except (IOError, ValueError) as ioe:
            # ValueError covers malformed JSON (json.JSONDecodeError).
            self.logger.error("Error loading base TFIDF params", exc_info=True)
            exceptionWarning(
                'Error occurred while loading base TFIDF parameters.',
                repr(ioe))
        try:
            with open(CONFIG.get('PATHS', 'BaseFeatureSeletionDirectory'),
                      'r') as f:
                self.fs_params = json.load(f)
        except (IOError, ValueError) as ioe:
            self.logger.error("Error loading base feature selector params",
                              exc_info=True)
            exceptionWarning(
                'Error occurred while loading base feature selector parameters.',
                repr(ioe))
        if not tfidf_loaded:
            # Every model dialog needs the TF-IDF parameters.  The failure
            # has already been reported above, so stop here instead of
            # failing on an unbound name inside the loop below.
            return
        # Dynamically generate ModelDialogs for each model in the base model directory.
        # Only considers *.json file extension.
        try:
            base_model_dir = CONFIG.get('PATHS', 'BaseModelDirectory')
            for filename in os.listdir(base_model_dir):
                if not filename.endswith('.json'):
                    continue
                # Only hold the file open while parsing it.
                with open(os.path.join(base_model_dir, filename), 'r') as f:
                    model_data = json.load(f)
                model = model_data['model_class']
                model_base = model_data['model_base']
                model_module = model_data['model_module']
                # NOTE: tensorflow models are skipped.  If they are
                # re-enabled, their widgets belong in
                # self.tensorflow_model_form, self.tensorflow_model_dialogs
                # and self.tensorflow_model_dialog_btns.
                if model_base == 'tensorflow':
                    continue
                #! The order of the arguments matters!  model_data must come first.
                if model_module == 'tpot':
                    model_dialog = TPOTModelDialog(
                        self, model_data, tfidf_data)
                else:
                    model_dialog = SkModelDialog(
                        self, model_data, tfidf_data, self.fs_params)
                self.comms.version_change.connect(
                    model_dialog.update_version)
                # Initialize model as unselected
                self.selected_models[model_base][model] = False
                btn = QPushButton(model, objectName=model + '_btn')
                # Partial allows the connection of dynamically generated QObjects
                btn.clicked.connect(
                    partial(self.open_dialog, model_dialog))
                chkbox = QCheckBox(objectName=model)
                # Bind model and model_base as defaults so each checkbox
                # keeps its own values.
                chkbox.stateChanged.connect(
                    lambda state, x=model, y=model_base: self.
                    _update_selected_models(x, y, state))
                self.sklearn_model_form.addRow(chkbox, btn)
                self.sklearn_model_dialogs.append(model_dialog)
                self.sklearn_model_dialog_btns.append(btn)
                self.model_checkboxes.append(chkbox)
        except OSError as ose:
            self.logger.error("OSError opening model config files",
                              exc_info=True)
            exceptionWarning('OSError opening model config files!', ose)
            tb = traceback.format_exc()
            print(tb)
        except Exception as e:
            self.logger.error("Error opening model config files",
                              exc_info=True)
            exceptionWarning('Error occured.', e)
            tb = traceback.format_exc()
            print(tb)
0
    def load_file(self, f_path):
        """
        Load data from a CSV file to the workspace.
        The index column is read from the VARIABLES/IndexColumn config value
        and is used for both the UTF-8 read and the detected-encoding retry.
        chardet attempts to determine encoding if file is not utf-8.

        After loading, columns whose name ends with "text" (case
        insensitive) are offered as available columns.  Columns without any
        missing values are recorded as ground truth columns, and an
        available column is marked as having truth data when
        ``<tag>__actual`` is among them.

            # Attributes
                f_path(String): The filename selected via open_file
        """
        # FIXME: Reset status bar when new data is loaded.
        try:
            # TODO: user defined index column
            index_col = CONFIG.getint('VARIABLES', 'IndexColumn')
            self.graph.clear_graph()
            self.available_column_model.loadData([], include_labels=False)
            self.prediction_data = pd.read_csv(
                f_path, encoding='utf-8', index_col=index_col)
        except UnicodeDecodeError as ude:
            self.logger.warning("UnicodeDecode error opening file",
                                exc_info=True)
            self.comms.update_statusbar.emit(
                "Attempting to determine file encoding...")
            detector = UniversalDetector()
            try:
                # Context manager closes the handle even when detection
                # stops early or feeding raises.
                with open(f_path, 'rb') as raw_file:
                    for line in raw_file:
                        detector.feed(line)
                        if detector.done:
                            break
                detector.close()
                print("chardet determined encoding type to be {}".format(
                    detector.result['encoding']))
                # Use the same index column as the UTF-8 path so the frame
                # layout does not depend on the file's encoding.
                self.prediction_data = pd.read_csv(
                    f_path,
                    encoding=detector.result['encoding'],
                    index_col=index_col)
            except Exception as e:
                self.logger.error("Error detecing encoding", exc_info=True)
                exceptionWarning("Exception has occured.", exception=e)
        except IOError as ioe:
            self.logger.error("IOError detecting encoding", exc_info=True)
            exceptionWarning("IO Exception occured while opening file.",
                             exception=ioe)
        except Exception as e:
            self.logger.error("Error detecting encoding", exc_info=True)
            exceptionWarning("Error occured opening file.", exception=e)

        try:
            columns = self.prediction_data.columns
            self.available_columns = []
            self.columns_with_truth = []

            # Columns that contain no missing values at all.
            self.ground_truth_columns = self.prediction_data.columns[
                ~self.prediction_data.isna().any()].tolist()

            for column in columns:
                if column.lower().endswith("text"):
                    self.available_columns.append(column)
                    column_tag = column.split('__')[0]
                    if (column_tag + '__actual' in self.ground_truth_columns):
                        self.columns_with_truth.append(column)

            if self.available_columns:
                self.available_column_model.loadData(self.available_columns,
                                                     include_labels=False)

            if self.columns_with_truth:
                self.available_column_model.setTruthData(
                    self.columns_with_truth)
                # self.full_text_count.setText(str(self.prediction_data.shape[0]))
                # self.display_selected_row(None)
                # self.select_all_btn.setEnabled(True)
                # self.deselect_all_btn.setEnabled(True)

            self.comms.update_statusbar.emit("CSV loaded.")
            # else:
            #     exceptionWarning("No allowable data discovered in file.")
        except pd.errors.EmptyDataError as ede:
            exceptionWarning('Empty Data Error.\n', exception=ede)
        except Exception as e:
            self.logger.error("Error loading dataframe", exc_info=True)
            exceptionWarning("Exception occured.  PredictWidget.load_file.",
                             exception=e)
Ejemplo n.º 17
0
    def display_selected_row(self, selection=None):
        """
        Refresh the text statistics and the training stats grid for a column.

        The grid is cleared and rebuilt with a header row followed by one
        row per trained model of the selected column, showing accuracy,
        weighted F1, Cohen's kappa and the last training date.

            # Attributes
                selection: QItemSelectionModel, item currently selected by user.
                    When falsy, the first row of the column view is selected.
        """
        try:
            if selection:
                idx = selection.indexes()[0]
            else:
                # Nothing selected: fall back to the first entry in the list.
                self.available_column_view.selectRow(0)
                self.available_column_view.setFocus()
                idx = QModelIndex(self.available_column_model.index(0, 0))
            col_name = self.available_column_model.data(idx)
            self.text_stats_groupbox.setTitle(col_name)
            question_data = self.full_data[col_name].fillna(value="unanswered")

            sample_count = question_data.shape[0]
            avg_num_words = get_avg_words_per_sample(question_data,
                                                     sample_count)
            self.current_question_count.setText(str(sample_count))
            self.current_question_avg_word.setText("%.2f" % avg_num_words)

            grid = self.training_stats_grid
            clearLayout(grid)

            # Header row: (title, alignment) in column order.
            headers = (
                ("Model", Qt.AlignHCenter),
                ("Accuracy", Qt.AlignTop),
                ("F1 (weighted)", Qt.AlignTop),
                ("Cohen's Kappa", Qt.AlignTop),
                ("Last Trained", Qt.AlignTop),
            )
            for column, (title, alignment) in enumerate(headers):
                header = QLabel(title)
                header.setFont(QFont("Times", weight=QFont.Bold))
                grid.addWidget(header, 0, column, alignment)

            # One row per trained model, starting below the header.
            score_keys = ('accuracy', 'f1_score', 'cohen_kappa')
            date_column = len(score_keys) + 1
            next_row = 1
            for model, meta in self.trained_model_meta[col_name].items():
                grid.addWidget(QLabel(model), next_row, 0, Qt.AlignTop)
                for column, key in enumerate(score_keys, start=1):
                    grid.addWidget(
                        QLabel("%.4f" % meta['train_eval_score'][key]),
                        next_row, column, Qt.AlignTop)
                grid.addWidget(QLabel(meta['last_train_date']), next_row,
                               date_column, Qt.AlignTop)
                next_row += 1

            spacer = QSpacerItem(40, 20, QSizePolicy.Minimum,
                                 QSizePolicy.Expanding)
            grid.addItem(spacer, next_row, 0, Qt.AlignTop)
            self.repaint()
        except Exception as e:
            self.logger.error("PredictWidget.display_selected_row",
                              exc_info=True)
            exceptionWarning(
                "Exception occured.  PredictWidget.display_selected_row.",
                exception=e)
            print(traceback.format_exc())
Ejemplo n.º 18
0
    def load_file(self, f_path):
        """
        Load data from a CSV file to the workspace.
        Column 0 is used for the index column.
        chardet attempts to determine encoding if file is not utf-8.

        After loading, columns whose name ends with "text" are offered as
        available columns.  When at least one is found the column model is
        populated, the row count is displayed and the select/deselect
        buttons are enabled; otherwise a warning is shown.

            # Attributes
                f_path(String): The filename selected via open_file
        """
        # FIXME: Reset status bar when new data is loaded.
        try:
            self.full_data = pd.read_csv(f_path, encoding='utf-8', index_col=0)
        except UnicodeDecodeError as ude:
            self.logger.warning("UnicodeDecode error opening file",
                                exc_info=True)
            self.comms.update_statusbar.emit(
                "Attempting to determine file encoding...")
            detector = UniversalDetector()
            try:
                # Context manager closes the handle even when detection
                # stops early or feeding raises.
                with open(f_path, 'rb') as raw_file:
                    for line in raw_file:
                        detector.feed(line)
                        if detector.done:
                            break
                detector.close()
                print("chardet determined encoding type to be {}".format(
                    detector.result['encoding']))
                self.full_data = pd.read_csv(
                    f_path, encoding=detector.result['encoding'], index_col=0)
            except Exception as e:
                self.logger.error("Error detecing encoding", exc_info=True)
                exceptionWarning("Exception has occured.", exception=e)
        except IOError as ioe:
            self.logger.error("IOError detecting encoding", exc_info=True)
            exceptionWarning("IO Exception occured while opening file.",
                             exception=ioe)
        except Exception as e:
            self.logger.error("Error detecting encoding", exc_info=True)
            exceptionWarning("Error occured opening file.", exception=e)

        try:
            columns = self.full_data.columns
            self.available_columns = []

            for column in columns:
                if column.endswith("text"):
                    self.available_columns.append(column)
            if self.available_columns:
                self.available_column_model.loadData(self.available_columns,
                                                     include_labels=False)

                self.available_column_model.setAllowableData(
                    self.allowable_columns)
                # drop_cols = [col for col in self.full_data.columns if col not in self.available_columns ]
                # self.full_data.drop(drop_cols, axis=1, inplace=True)
                # print("full_data columns: ", self.full_data.columns)
                self.full_text_count.setText(str(self.full_data.shape[0]))
                # self.display_selected_row(None)
                self.select_all_btn.setEnabled(True)
                self.deselect_all_btn.setEnabled(True)

                self.comms.update_statusbar.emit("CSV loaded.")
            else:
                exceptionWarning("No allowable data discovered in file.")
        except pd.errors.EmptyDataError as ede:
            exceptionWarning('Empty Data Error.\n', exception=ede)
        except Exception as e:
            self.logger.error("Error loading dataframe", exc_info=True)
            exceptionWarning("Exception occured.  PredictWidget.load_file.",
                             exception=e)
Ejemplo n.º 19
0
 def load_file(self, f_path):
     """
     Load data from a CSV file into the workspace.

     Column 0 is used as the index column and the delimiter is sniffed
     by pandas (``sep=None``).  The file is first read as UTF-8; if that
     raises a UnicodeDecodeError, chardet is used to guess the encoding
     and the file is read again with the detected encoding.

     After a successful read, every column ending in DATA_COLUMN_SUFFIX
     that has a matching label column (column prefix before TAG_DELIMITER
     plus TRUTH_SUFFIX) is offered in the available-column model together
     with that label column.

     Errors are logged and reported through exceptionWarning; they are
     not propagated to the caller.

         # Arguments
             f_path(String): The filename selected via open_file

         # Returns
             None
     """
     # FIXME: Reset status bar when new data is loaded.
     # Tracks whether self.full_data was actually replaced by this call, so
     # a failed read never falls through to processing a previous DataFrame.
     loaded = False
     try:
         self.update_progressbar.emit(0, True)
         self.available_column_model.loadData([])
         self.select_all_btn.setEnabled(False)
         self.deselect_all_btn.setEnabled(False)
         # sep=None is only supported by the python engine; naming the
         # engine explicitly avoids pandas' ParserWarning fallback.
         self.full_data = pd.read_csv(
             f_path, encoding='utf-8', index_col=0, sep=None,
             engine='python')
         loaded = True
     except UnicodeDecodeError:
         self.logger.warning(
             "UnicodeDecode error opening file", exc_info=True)
         print("UnicodeDecodeError caught.  File is not UTF-8 encoded. "
               "Attempting to determine file encoding...")
         self.update_statusbar.emit(
             "File is not UTF-8 encoded. Attempting to determine file encoding...")
         detector = UniversalDetector()
         try:
             # Context manager guarantees the handle is closed even when
             # detection stops early or feeding raises.
             with open(f_path, 'rb') as raw_file:
                 for line in raw_file:
                     detector.feed(line)
                     if detector.done:
                         break
             detector.close()
             encoding = detector.result['encoding']
             encoding_msg = "Chardet determined encoding type to be {}".format(
                 encoding)
             self.update_statusbar.emit(encoding_msg)
             self.logger.info(encoding_msg)
             # Same parsing options as the UTF-8 attempt so the result does
             # not depend on which encoding path was taken.
             self.full_data = pd.read_csv(
                 f_path, encoding=encoding, index_col=0, sep=None,
                 engine='python')
             loaded = True
         except Exception as e:
             self.logger.error("Error detecting encoding", exc_info=True)
             exceptionWarning("Exception has occured.", exception=e)
     except IOError as ioe:
         self.logger.error("IOError detecting encoding", exc_info=True)
         exceptionWarning(
             "IO Exception occured while opening file.", exception=ioe)
     except Exception as e:
         self.logger.error("Error detecting encoding", exc_info=True)
         exceptionWarning("Error occured opening file.", exception=e)

     if not loaded:
         # The failure has already been reported above; just release the
         # progress bar and leave the (already reset) column model empty.
         self.update_progressbar.emit(0, False)
         return

     #TODO: clean up dataset by removing NA for values or index
     try:
         columns = self.full_data.columns
         self.available_columns = []
         for column in columns:
             if column.endswith(DATA_COLUMN_SUFFIX):
                 label_col = column.split(TAG_DELIMITER)[0] + TRUTH_SUFFIX
                 # Only offer data columns that have a matching label column.
                 if label_col in columns:
                     self.available_columns.append(column)
                     self.available_columns.append(label_col)
         # If no data found, the model will be reset.
         if self.available_columns:
             self.available_column_model.loadData(self.available_columns)
             self.full_text_count.setText(str(self.full_data.shape[0]))
             self.display_selected_rows(None)
             self.update_statusbar.emit("CSV loaded.")
             self.select_all_btn.setEnabled(True)
             self.deselect_all_btn.setEnabled(True)
         else:
             exceptionWarning(f"No usable data found in {f_path}")
             self.logger.info(f"No usable data found in {f_path}")
             self.update_statusbar.emit("No usable data found in file")
         self.available_column_model.setCheckboxes(False)
         self.load_selected_data()
     except pd.errors.EmptyDataError as ede:
         # Message passed positionally, as at every other exceptionWarning
         # call site.
         exceptionWarning('Empty Data Error.\n', exception=ede)
     except Exception as e:
         self.logger.error("Error loading dataframe", exc_info=True)
         exceptionWarning(
             "Exception occured.  DataLoader.load_file.", exception=e)
         tb = traceback.format_exc()
         print(tb)
     finally:
         self.update_progressbar.emit(0, False)