def train(self, features, labels, train_type='offline'):
    """Train the current classifier.

    Binarizes the labels, fits the underlying multilabel model and, on
    success, restores the real category names on the fitted model.

    :param features: list of feature vectors (one per document)
    :param labels: list of label lists (one per document)
    :param train_type: only 'offline' is supported for multilabel models
    :raises CaughtException: when the underlying model's fit() fails
    """
    self.trained = False

    # Multilabel models cannot be trained incrementally here.
    if train_type != 'offline':
        self.logger.error('Can only train offline a multilabel classifier')
        return

    self.logger.info("Train using {} documents".format(len(features)))

    # change labels format for multilabel training
    labels = self._binarize_labels(labels)

    try:
        self.model.fit(features, labels)
    except Exception as e:
        # Narrowed from a bare 'except:'; chain the original exception
        # so the root cause stays visible in the traceback.
        raise CaughtException(
            "Exception when {} training the {} multilabel classifier".
            format(train_type, self.name)) from e
    else:
        self.trained = True

    # reset category binary labels to real category names
    self.model.classes_ = self.categories
def store_transformation(self, input_file, output_file, dict_file, tfidf_file):
    """
    Apply the transformation model on the given hash documents.
    Store transformed 'features' in file.

    :param input_file: corpus file of input documents (need 'content' and 'id')
    :param output_file: corpus file receiving the transformed documents
    :param dict_file: stored gensim-style dictionary file
    :param tfidf_file: stored TFIDF model file (unused for LDA)
    :raises CaughtException: when transforming/storing a document fails
    """
    self.check_model()
    utils.check_file_readable(dict_file)
    cdictionary = dictionary.load(dict_file)

    # LDA transforms the bag-of-words directly; other models go through TFIDF.
    tfidf_model = None
    if self.name != 'LDA':
        utils.check_file_readable(tfidf_file)
        tfidf_model = self.TRANSFORMERS['TFIDF'].load(tfidf_file)

    sc = StreamCorpus(input_file)

    # Bug fix: 'pc' must exist before the try block, otherwise a failing
    # PushCorpus() constructor leaves it unbound and the finally clause
    # raises NameError, masking the real error.
    pc = None
    try:
        pc = PushCorpus(output_file)

        for doc in sc:
            if 'content' in doc and 'id' in doc:
                doc['features'] = self.transform_doc(
                    cdictionary, tfidf_model, doc['content'], doc['id'])
                pc.add(doc)
    except Exception as e:
        raise CaughtException(
            "Exception encountered when storing transformed documents: {}".
            format(e)) from e
    else:
        self.logger.info("Stored {} documents to file".format(pc.size))
    finally:
        # pc stays None when PushCorpus() itself failed
        if pc is not None:
            pc.close_stream()
def shape(self):
    """The dictionary shape of the model.

    Builds a plain-data description of a trained MLPClassifier: classes,
    feature count, activations, and the hidden/output coefficients &
    intercepts converted to regular Python floats. Returns an empty dict
    for unsupported classifier types.

    :raises ConfigError: when coefficients and intercepts are inconsistent
    :raises CaughtException: on any other failure while reading the model
    """
    shape = {}

    try:
        if self.name == 'MLPClassifier':
            shape = {}
            shape['name'] = self.name
            shape['classifier_type'] = 'multilabel'
            shape['classes'] = list(self.model.classes_)
            shape['n_classes'] = len(self.model.classes_)
            shape['n_features'] = len(self.model.coefs_[0])
            shape['hidden_activation'] = self.model.activation
            shape['output_activation'] = self.model.out_activation_

            # coefficients & intercepts of hidden layers
            hl_coeffs = self.model.coefs_[:-1]
            hl_intercepts = self.model.intercepts_[:-1]

            if len(hl_coeffs) != len(hl_intercepts):
                raise ConfigError(
                    "Hidden coefficients&intercepts not equally sized {}/{}"
                    .format(len(hl_coeffs), len(hl_intercepts)))

            hcoeffs = []
            for layer in hl_coeffs:
                hcoeffs.append([[float(x) for x in cx] for cx in layer])
            shape['hidden_coeffs'] = hcoeffs
            shape['hidden_intercepts'] = \
                [[float(x) for x in ix] for ix in hl_intercepts]

            # coefficients & intercepts of output layer
            ocoeffs = self.model.coefs_[-1]
            ocoeffs = [[float(x) for x in ox] for ox in ocoeffs]
            ointercepts = self.model.intercepts_[-1]

            if len(ocoeffs[0]) != len(ointercepts):
                raise ConfigError(
                    "Output coefficients&intercepts not equally sized {}/{}"
                    .format(len(ocoeffs[0]), len(ointercepts)))

            shape['output_coeffs'] = ocoeffs
            shape['output_intercepts'] = list(ointercepts)
        else:
            self.logger.warning(
                "Unknown shape for {} classifier (WIP)".format(self.name))
    except ConfigError:
        # Bug fix: the previous bare 'except:' also caught the ConfigErrors
        # raised above and re-masked them as CaughtException; deliberate
        # consistency errors must propagate unchanged.
        raise
    except Exception as e:
        raise CaughtException("Exception encountered when recovering "
                              "the {} classifier model's shape".format(
                                  self.name)) from e

    return shape
def train(self, features, labels, train_type):
    """Train the classifier on the given data.

    :param features: list of feature vectors (one per document)
    :param labels: list of labels (one per document)
    :param train_type: 'offline' for a full fit; any other value performs
        an incremental partial_fit over the known categories
    :raises CaughtException: when the underlying model's training fails
    """
    self.trained = False
    self.logger.info("Train using {} documents".format(len(features)))

    try:
        if train_type == "offline":
            self.model.fit(features, labels)
        else:
            # partial_fit needs the full class list up front,
            # since each batch may not contain every category
            self.model.partial_fit(features, labels, classes=self.categories)
    except Exception as e:
        # Narrowed from a bare 'except:'; chain the original exception
        # so the root cause stays visible in the traceback.
        raise CaughtException(
            "Exception when {} training the {} multiclass classifier".
            format(train_type, self.name)) from e
    else:
        self.trained = True
def __init__(self, model_file):
    """Load classifier model from binary file.

    :param model_file: path to a pickled, already-trained classifier
        (must expose a 'classes_' attribute after loading)
    :raises CaughtException: when unpickling the model fails
    """
    super().__init__()

    utils.check_file_readable(model_file)

    self.model = None
    # NOTE(security): pickle.load executes arbitrary code embedded in the
    # file — only load model files from trusted sources.
    with open(model_file, 'rb') as icstream:
        try:
            self.model = pickle.load(icstream)
        except Exception as e:
            # Chain the original exception so the root cause stays visible.
            raise CaughtException(
                "Exception encountered when loading the classifier: {}".
                format(e)) from e

    self.name = type(self.model).__name__
    self.categories = self.model.classes_

    self.logger.info("Loaded already-trained {} classifier model "
                     "from '{}' file".format(self.name, model_file))
def __init__(self, classif_name, categories, **kwargs):
    """
    Initialize the classifier with its name and its list of known categories

    :param classif_name: key into the registered multilabel model classes
    :param categories: iterable of known category names (copied to a list)
    :param kwargs: forwarded to the underlying model's constructor
    :raises CaughtException: for an unknown name or a failing constructor
    """
    super().__init__()

    self.name = classif_name
    self.trained = False
    self.categories = list(categories)

    classifiers = xi.ml.classify.TrainClassifier.CLASSIFIERS['multilabel']
    try:
        self.model = classifiers['models'][classif_name](**kwargs)
    except Exception as e:
        # Narrowed from a bare 'except:': both KeyError (unknown classifier
        # name) and constructor failures land here; chain the cause.
        raise CaughtException(
            "Exception when initializing the {} multilabel classifier ({})"
            .format(self.name, kwargs)) from e

    self.logger.info("Initialized a {} classifier".format(classif_name))
def store_prediction(self, input_file, output_file):
    """
    Test the classifier on 'untagged' documents.
    Store prediction category and prediction probability in file.

    :param input_file: corpus file of documents carrying a 'features' field
    :param output_file: corpus file receiving the classified documents
    :raises CaughtException: when classifying/storing a document fails
    """
    if not self.prediction_checkups():
        return

    utils.check_file_readable(input_file)
    utils.create_path(output_file)

    sc = StreamCorpus(input_file)

    # Bug fix: 'pc' must exist before the try block, otherwise a failing
    # PushCorpus() constructor leaves it unbound and the finally clause
    # raises NameError, masking the real error.
    pc = None
    try:
        pc = PushCorpus(output_file)

        for doc in sc:
            if 'features' in doc:
                prediction = self.classify_doc(doc['features'])
                if isinstance(prediction, dict) and \
                        'category' in prediction and \
                        'probas' in prediction:
                    doc['season'] = prediction['category']
                    doc['season_prob'] = prediction['probas']
                    pc.add(doc)
    except Exception as e:
        raise CaughtException(
            "Exception encountered when storing classified documents: {}".
            format(e)) from e
    else:
        self.logger.info("Stored {} documents to file".format(pc.size))
    finally:
        # pc stays None when PushCorpus() itself failed
        if pc is not None:
            pc.close_stream()