def train(self, features, labels, train_type='offline'):
        """Train the current multilabel classifier.

        :param features: list of feature vectors, one per document
        :param labels: list of label lists, one per document
        :param train_type: only 'offline' is supported for multilabel models
        :raises CaughtException: if the underlying model fails to fit
        """

        self.trained = False

        # multilabel models cannot be trained incrementally
        if train_type != 'offline':
            self.logger.error('Can only train offline a multilabel classifier')
            return

        self.logger.info("Train using {} documents".format(len(features)))

        # change labels format for multilabel training
        labels = self._binarize_labels(labels)

        try:
            self.model.fit(features, labels)
        except Exception:
            # narrow catch: a bare 'except:' would also swallow
            # KeyboardInterrupt/SystemExit; the original cause is chained
            # automatically onto the raised CaughtException
            raise CaughtException(
                "Exception when {} training the {} multilabel classifier".
                format(train_type, self.name))
        else:
            self.trained = True

        # reset category binary labels to real category names
        self.model.classes_ = self.categories
    def store_transformation(self, input_file, output_file, dict_file,
                             tfidf_file):
        """
        Apply the transformation model on the given hash documents.
        Store transformed 'features' in file.

        :param input_file: corpus of documents; only entries having both
            'content' and 'id' keys are transformed
        :param output_file: destination corpus for transformed documents
        :param dict_file: dictionary file consumed by ``dictionary.load``
        :param tfidf_file: TFIDF model file; ignored for LDA models
        :raises CaughtException: if transforming/storing a document fails
        """

        self.check_model()

        utils.check_file_readable(dict_file)
        cdictionary = dictionary.load(dict_file)

        # LDA consumes the dictionary representation directly;
        # every other transformer needs the TFIDF model as well
        tfidf_model = None
        if self.name != 'LDA':
            utils.check_file_readable(tfidf_file)
            tfidf_model = self.TRANSFORMERS['TFIDF'].load(tfidf_file)

        sc = StreamCorpus(input_file)
        pc = None

        try:
            pc = PushCorpus(output_file)

            for doc in sc:
                # skip malformed documents lacking content or id
                if 'content' in doc and 'id' in doc:
                    doc['features'] = self.transform_doc(
                        cdictionary, tfidf_model, doc['content'], doc['id'])
                    pc.add(doc)
        except Exception as e:
            raise CaughtException(
                "Exception encountered when storing transformed documents: {}".
                format(e))
        else:
            self.logger.info("Stored {} documents to file".format(pc.size))
        finally:
            # pc stays None if PushCorpus() itself raised; the original code
            # hit a NameError here in that case, masking the real error
            if pc is not None:
                pc.close_stream()
    def shape(self):
        """Return the model's shape as a plain dictionary.

        For an MLPClassifier the dictionary contains the class list, layer
        sizes, activations, and all coefficients/intercepts converted to
        built-in floats. For any other model an empty dict is returned and
        a warning is logged.

        :raises ConfigError: if coefficients and intercepts disagree in size
        :raises CaughtException: if reading the model attributes fails
        """

        shape = {}

        try:
            if self.name == 'MLPClassifier':
                shape['name'] = self.name
                shape['classifier_type'] = 'multilabel'
                shape['classes'] = list(self.model.classes_)
                shape['n_classes'] = len(self.model.classes_)
                shape['n_features'] = len(self.model.coefs_[0])
                shape['hidden_activation'] = self.model.activation
                shape['output_activation'] = self.model.out_activation_

                # coefficients & intercepts of hidden layers
                hl_coeffs = self.model.coefs_[:-1]
                hl_intercepts = self.model.intercepts_[:-1]

                if len(hl_coeffs) != len(hl_intercepts):
                    raise ConfigError(
                        "Hidden coefficients&intercepts not equally sized {}/{}"
                        .format(len(hl_coeffs), len(hl_intercepts)))

                # convert numpy scalars to plain floats for serialization
                hcoeffs = []
                for layer in hl_coeffs:
                    hcoeffs.append([[float(x) for x in cx] for cx in layer])
                shape['hidden_coeffs'] = hcoeffs

                shape['hidden_intercepts'] = \
                    [[float(x) for x in ix] for ix in hl_intercepts]

                # coefficients & intercepts of output layer
                ocoeffs = self.model.coefs_[-1]
                ocoeffs = [[float(x) for x in ox] for ox in ocoeffs]
                ointercepts = self.model.intercepts_[-1]

                if len(ocoeffs[0]) != len(ointercepts):
                    raise ConfigError(
                        "Output coefficients&intercepts not equally sized {}/{}"
                        .format(len(ocoeffs[0]), len(ointercepts)))

                shape['output_coeffs'] = ocoeffs
                shape['output_intercepts'] = list(ointercepts)
            else:
                self.logger.warning(
                    "Unknown shape for {} classifier (WIP)".format(self.name))
        except Exception:
            # narrow catch instead of a bare 'except:'; the triggering
            # exception is chained automatically onto the CaughtException
            raise CaughtException("Exception encountered when recovering "
                                  "the {} classifier model's shape".format(
                                      self.name))

        return shape
# Esempio n. 4 (scraper separator -- not part of the code)
# 0
    def train(self, features, labels, train_type):
        """Train the classifier on the given data.

        :param features: list of feature vectors, one per document
        :param labels: list of labels, one per document
        :param train_type: 'offline' does a full fit; anything else does an
            incremental partial_fit over the known categories
        :raises CaughtException: if the underlying model fails to fit
        """

        self.trained = False
        self.logger.info("Train using {} documents".format(len(features)))

        try:
            if train_type == "offline":
                self.model.fit(features, labels)
            else:
                self.model.partial_fit(features,
                                       labels,
                                       classes=self.categories)
        except Exception:
            # narrow catch: a bare 'except:' would also swallow
            # KeyboardInterrupt/SystemExit
            raise CaughtException(
                "Exception when {} training the {} multiclass classifier".
                format(train_type, self.name))
        else:
            self.trained = True
# Esempio n. 5 (scraper separator -- not part of the code)
# 0
    def __init__(self, model_file):
        """Load classifier model from binary file.

        :param model_file: path to a pickled, already-trained model --
            presumably written by the matching save routine (verify)
        :raises CaughtException: if unpickling the model fails
        """

        super().__init__()

        utils.check_file_readable(model_file)

        # SECURITY NOTE(review): pickle.load executes arbitrary code embedded
        # in the file; only load model files from trusted sources.
        self.model = None
        with open(model_file, 'rb') as icstream:
            try:
                self.model = pickle.load(icstream)
            except Exception as e:
                raise CaughtException(
                    "Exception encountered when loading the classifier: {}".
                    format(e))

        # recover the classifier name and its category list directly from the
        # unpickled model object
        self.name = type(self.model).__name__
        self.categories = self.model.classes_

        self.logger.info("Loaded already-trained {} classifier model "
                         "from '{}' file".format(self.name, model_file))
    def __init__(self, classif_name, categories, **kwargs):
        """
        Initialize the classifier
        with its name and its list of known categories.

        :param classif_name: key into the multilabel CLASSIFIERS registry
        :param categories: iterable of known category names
        :param kwargs: forwarded to the underlying model constructor
        :raises CaughtException: if the model cannot be instantiated
        """

        super().__init__()

        self.name = classif_name
        self.trained = False
        # private copy: later mutation of the caller's sequence must not
        # change our category set
        self.categories = list(categories)

        classifiers = xi.ml.classify.TrainClassifier.CLASSIFIERS['multilabel']

        try:
            self.model = classifiers['models'][classif_name](**kwargs)
        except Exception:
            # narrow catch: a bare 'except:' would also swallow
            # KeyboardInterrupt/SystemExit
            raise CaughtException(
                "Exception when initializing the {} multilabel classifier ({})"
                .format(self.name, kwargs))

        self.logger.info("Initialized a {} classifier".format(classif_name))
# Esempio n. 7 (scraper separator -- not part of the code)
# 0
    def store_prediction(self, input_file, output_file):
        """
        Test the classifier on 'untagged' documents.
        Store prediction category and prediction probability in file.

        :param input_file: corpus of documents; only entries having a
            'features' key are classified
        :param output_file: destination corpus for classified documents
        :raises CaughtException: if classifying/storing a document fails
        """

        if not self.prediction_checkups():
            return

        utils.check_file_readable(input_file)
        utils.create_path(output_file)

        sc = StreamCorpus(input_file)
        pc = None

        try:
            pc = PushCorpus(output_file)

            for doc in sc:
                if 'features' in doc:
                    prediction = self.classify_doc(doc['features'])

                    # only store well-formed predictions
                    if isinstance(prediction, dict) and \
                        'category' in prediction and \
                        'probas' in prediction:

                        doc['season'] = prediction['category']
                        doc['season_prob'] = prediction['probas']
                        pc.add(doc)
        except Exception as e:
            raise CaughtException(
                "Exception encountered when storing classified documents: {}".
                format(e))
        else:
            self.logger.info("Stored {} documents to file".format(pc.size))
        finally:
            # pc stays None if PushCorpus() itself raised; the original code
            # hit a NameError here in that case, masking the real error
            if pc is not None:
                pc.close_stream()