def __init__(self, model=None):
    """
    Load a pickled classifier from disk and set up a featurizer.

    `model` is the path to the pickle file. When it is not supplied (or
    is otherwise falsy) the path is read from `settings.model`.
    """
    ## Fall back to the configured model path when none is given
    path = model if model else settings.model

    ## Deserialize the classifier.
    ## NOTE: pickle.load can execute arbitrary code while loading, so
    ## only point this at model files from a trusted source.
    with open(path, 'rb') as handle:
        self._classifier = pickle.load(handle)

    ## Featurizer used to turn product text into classifier features
    self.featurizer = ProductFeatures()
class ApparelClassifier(object):
    """
    Performs classification of products using a classifier that is loaded
    via a pickle at runtime. This classifier can be of any type, but we
    expect the Maximum Entropy classifier trained from a CSV corpus.
    """

    def __init__(self, model=None):
        """
        Pass in the path of the pickle classifier object. When no path is
        given, the path is taken from `settings.model`.
        """
        ## Get the default model from the settings if it isn't passed in
        model = model or settings.model

        ## Load the model from the pickle.
        ## NOTE: pickle.load can execute arbitrary code while loading, so
        ## the model file must come from a trusted source.
        with open(model, 'rb') as pkl:
            self._classifier = pickle.load(pkl)

        ## Create a featurizer to use
        self.featurizer = ProductFeatures()

    def classify(self, name, description=None, keywords=None, threshold=0.01):
        """
        Classifies the text using the internal classifier.

        Returns a list of (label, probability) tuples sorted from the most
        to the least probable label. Labels whose probability is not
        strictly greater than `threshold` (default 0.01) are left out.
        """
        features = self.featurizer.featurize(name, description, keywords)
        probdist = self._classifier.prob_classify(features)

        ## Look up each probability once, then filter on the stored value
        scored = ((label, probdist.prob(label)) for label in probdist.samples())
        labels = [pair for pair in scored if pair[1] > threshold]
        return sorted(labels, key=itemgetter(1), reverse=True)

    def explain(self, name, description=None, keywords=None):
        """
        Wrapper for classifier.explain - prints out (no way to capture the
        string output, unfortunately) the features contributing to the
        chosen classifier.
        """
        features = self.featurizer.featurize(name, description, keywords)
        self._classifier.explain(features)

    def labels(self):
        """
        Wrapper for classifier.labels - returns whatever the underlying
        classifier reports as its labels.
        """
        return self._classifier.labels()
def __init__(self, corpus=None, **kwargs):
    """
    Set up a builder for the given corpus.

    `corpus` falls back to `settings.corpus` when not supplied. Two
    optional keyword arguments are recognised: `validate` (default True)
    and `outpath` (default '.'). The model and info paths are computed
    immediately by calling `self.get_output_paths()`.
    """
    # Inputs and build options
    self.corpus = corpus if corpus else settings.corpus
    self.validate = kwargs.pop('validate', True)   # Perform cross validation
    self.outpath = kwargs.pop('outpath', '.')      # Where to write out the data

    # Resolve where the model and its info file will be written
    paths = self.get_output_paths()
    self.model_path, self.info_path = paths

    # Results of a build; all unset until a build has run
    self.accuracy = None                    # Accuracy of the model
    self.started = self.finished = None     # Start / finish timestamps
    self.buildtime = None                   # Time (seconds) of complete build
    self.feattime = None                    # Time (seconds) to get features
    self.traintime = None                   # Time (seconds) to train the model
    self.validtime = None                   # Time (seconds) to run the validation

    # Featurizer plus a slot for caching the extracted features
    self.featurizer = ProductFeatures()
    self._featureset = None
class ClassifierBuilder(object):
    """
    Creates a classifier model using MaximumEntropy and saves it as a pickle
    to disk. This class also writes out extra information (as JSON) to disk
    to ensure that the model can be identified in the future.
    """

    def __init__(self, corpus=None, **kwargs):
        """
        Set up a builder for `corpus` (defaults to `settings.corpus`).

        Recognised keyword arguments: `validate` (default True) and
        `outpath` (default '.'). The output paths are computed here, so
        this raises if either output file already exists.
        """
        self.corpus = corpus or settings.corpus
        self.validate = kwargs.pop('validate', True)   # Perform cross validation
        self.outpath = kwargs.pop('outpath', '.')      # Where to write out the data

        # Compute info and model paths
        self.model_path, self.info_path = self.get_output_paths()

        # Other required properties
        self.accuracy = None    # Accuracy of the model
        self.started = None     # Start timestamp of the build
        self.finished = None    # Finish timestamp of the build
        self.buildtime = None   # Time (seconds) of complete build
        self.feattime = None    # Time (seconds) to get features
        self.traintime = None   # Time (seconds) to train the model
        self.validtime = None   # Time (seconds) to run the validation

        # Create a featurizer
        self.featurizer = ProductFeatures()

        # Cache the features on the model
        self._featureset = None

    def featureset(self):
        """
        Opens the corpus path, reads the data and constructs features to
        pass to the classifier. The result is cached on the instance, so
        the corpus is only read on the first call.

        Returns a list of (features, label) tuples, one per corpus row:

            [(feats, label) for row in corpus]

        The label is taken from the 'category' column; the remaining
        columns are passed to the featurizer as keyword arguments.
        """
        if self._featureset is None:
            # Time how long it takes to extract features
            start = time.time()

            self._featureset = []
            with open(self.corpus, 'r') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    label = row.pop('category')
                    feats = self.featurizer.featurize(**row)
                    self._featureset.append((feats, label))

            # Record feature extraction time
            self.feattime = time.time() - start

        return self._featureset

    def train(self, featureset=None):
        """
        Trains the maximum entropy classifier. If a featureset is
        specified it trains on that, otherwise it trains on the model's
        own featureset. Pass in a featureset during cross validation.

        Returns a (classifier, seconds) tuple, where seconds is the time
        spent training.
        """
        # Only fall back to the corpus when nothing was passed at all; an
        # explicitly supplied (even empty) featureset is used as given.
        if featureset is None:
            featureset = self.featureset()

        # Time how long it takes to train
        start = time.time()
        classifier = MaxentClassifier.train(featureset, algorithm='megam',
                                            trace=1, gaussian_prior_sigma=1)
        delta = time.time() - start
        return classifier, delta

    def build(self):
        """
        Builds the model and writes to the outpath (which should be a
        directory). Two files are written:

            - the pickle of the model
            - a JSON file of associated data

        The paths are checked for existing files when the builder is
        constructed, to avoid overwriting a model by accident.
        """
        # Record the start time
        self.started = datetime.now()
        start = time.time()

        # Extract the features and train the model
        classifier, self.traintime = self.train()

        # Write the classifier to disk. Pickle data is bytes, so the file
        # has to be opened in binary mode.
        with open(self.model_path, 'wb') as f:
            pickle.dump(classifier, f, pickle.HIGHEST_PROTOCOL)

        # Begin accuracy validation
        if self.validate:
            self.cross_validate()

        # Record the finish time
        self.finished = datetime.now()
        self.buildtime = time.time() - start

        # Write the information to disk
        self.write_details()

    def cross_validate(self):
        """
        Performs cross validation by training the model on 90% of the
        corpus then checking the accuracy on the remaining 10%. Stores
        the result in `self.accuracy` and the elapsed seconds in
        `self.validtime`.
        """
        start = time.time()

        train, test = self._holdout_split(self.featureset())
        classifier, _ = self.train(train)
        self.accuracy = accuracy(classifier, test)

        self.validtime = time.time() - start

    @staticmethod
    def _holdout_split(featureset):
        """
        Shuffles a copy of the featureset and returns (train, test), where
        test is one tenth of the rows (rounded down) and train is the rest.
        """
        # Shuffle a copy so the cached featureset keeps its corpus order
        shuffled = list(featureset)
        random.shuffle(shuffled)

        # Integer division keeps the slice index an int on Python 3
        holdout = len(shuffled) // 10
        return shuffled[holdout:], shuffled[:holdout]

    def get_output_paths(self):
        """
        Returns two paths - the pickle path and the information JSON path.
        Raises an Exception if a file already exists at either path, so
        that existing output won't be overwritten.
        """
        # NOTE(review): '%Y-%d-%m' is year-day-month; '%Y-%m-%d' may have
        # been intended. Left as is because changing it renames the output.
        today = datetime.now().strftime('%Y-%d-%m')
        mname = os.path.join(self.outpath, "model-%s.pickle" % today)
        iname = os.path.join(self.outpath, "info-%s.json" % today)

        for name in (mname, iname):
            if os.path.exists(name):
                raise Exception("Can't overwrite file at '%s'!" % name)

        return mname, iname

    def write_details(self):
        """
        Writes the details of the classifier build to the info path as
        JSON. Expects `self.started` and `self.finished` to be set, which
        `build` does before calling this.
        """
        details = {
            'version': apparel.get_version(),
            'started': self.started.strftime(DATE_FORMAT),
            'finished': self.finished.strftime(DATE_FORMAT),
            'accuracy': self.accuracy,
            'validated': self.validate,
            'corpus': self.corpus,
            'paths': {
                'model': self.model_path,
                'info': self.info_path,
            },
            'classes': {
                'classifier': MaxentClassifier.__name__,
                'features': ProductFeatures.__name__,
            },
            'timer': {
                'build': self.buildtime,
                'features': self.feattime,
                'validation': self.validtime,
                'training': self.traintime,
            }
        }

        with open(self.info_path, 'w') as f:
            json.dump(details, f, indent=4)