Example #1
    def __init__(self, model=None):
        """
        Pass in the path to the pickled classifier object.
        """

        ## Get the default model from the settings if it isn't passed in
        model = model or settings.model

        ## Load the model from the pickle
        with open(model, 'rb') as pkl:
            self._classifier = pickle.load(pkl)

        ## Create a featurizer to use
        self.featurizer = ProductFeatures()
Example #2
class ApparelClassifier(object):
    """
    Performs classification of products using a classifier that is loaded
    via a pickle at runtime. This classifier can be of any type, but we
    expect the Maximum Entropy classifier trained from a CSV corpus.
    """

    def __init__(self, model=None):
        """
        Pass in the path to the pickled classifier object.
        """

        ## Get the default model from the settings if it isn't passed in
        model = model or settings.model

        ## Load the model from the pickle
        with open(model, 'rb') as pkl:
            self._classifier = pickle.load(pkl)

        ## Create a featurizer to use
        self.featurizer = ProductFeatures()

    def classify(self, name, description=None, keywords=None):
        """
        Classifies the text using the internal classifier. Returns a list
        of (label, probability) pairs sorted by probability, keeping only
        labels with probability above 1%.
        """
        features = self.featurizer.featurize(name, description, keywords)
        probdist = self._classifier.prob_classify(features)
        labels   = [(label, probdist.prob(label))
                    for label in probdist.samples()
                    if probdist.prob(label) > 0.01]
        return sorted(labels, key=itemgetter(1), reverse=True)

    def explain(self, name, description=None, keywords=None):
        """
        Wrapper for classifier.explain - prints the features contributing
        to the chosen label (unfortunately the output is printed directly
        and cannot be captured as a string).
        """
        features = self.featurizer.featurize(name, description, keywords)
        self._classifier.explain(features)

    def labels(self):
        """
        Wrapper for classifier.labels - returns a list of the labels.
        """
        return self._classifier.labels()
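
A minimal usage sketch for the ApparelClassifier above, assuming its module-level
dependencies (pickle, settings, ProductFeatures, itemgetter) are importable and a
pickled model already exists at the hypothetical path below (any file produced by
the ClassifierBuilder in the later examples would do):

    # Hypothetical model path and product text, for illustration only
    clf = ApparelClassifier(model='model-2016-01-25.pickle')

    # classify() returns (label, probability) pairs sorted high to low
    print(clf.classify(
        "Trail Walking Shoe",
        description="A lightweight mesh walking shoe.",
        keywords="shoes, walking, outdoor",
    ))

    # List the labels the underlying classifier was trained on
    print(clf.labels())
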
Example #3
    def __init__(self, corpus=None, **kwargs):
        self.corpus = corpus or settings.corpus
        self.validate = kwargs.pop('validate', True)  # Perform cross validation
        self.outpath = kwargs.pop('outpath', '.')     # Where to write out the data

        # Compute info and model paths
        self.model_path, self.info_path = self.get_output_paths()

        # Other required properties
        self.accuracy = None  # Accuracy of the model
        self.started = None  # Start timestamp of the build
        self.finished = None  # Finish timestamp of the build
        self.buildtime = None  # Time (seconds) of complete build
        self.feattime = None  # Time (seconds) to get features
        self.traintime = None  # Time (seconds) to train the model
        self.validtime = None  # Time (seconds) to run the validation

        # Create a featurizer
        self.featurizer = ProductFeatures()

        # Cache the features on the model
        self._featureset = None
Example #4
    def __init__(self, corpus=None, **kwargs):
        self.corpus      = corpus or settings.corpus
        self.validate    = kwargs.pop('validate', True)    # Perform cross validation
        self.outpath     = kwargs.pop('outpath', '.')      # Where to write out the data

        # Compute info and model paths
        self.model_path, self.info_path = self.get_output_paths()

        # Other required properties
        self.accuracy    = None  # Accuracy of the model
        self.started     = None  # Start timestamp of the build
        self.finished    = None  # Finish timestamp of the build
        self.buildtime   = None  # Time (seconds) of complete build
        self.feattime    = None  # Time (seconds) to get features
        self.traintime   = None  # Time (seconds) to train the model
        self.validtime   = None  # Time (seconds) to run the validation

        # Create a featurizer
        self.featurizer  = ProductFeatures()

        # Cache the features on the model
        self._featureset = None
Example #5
class ClassifierBuilder(object):
    """
    Creates a classifier model using MaximumEntropy and saves it as a
    pickle to disk. This class also writes out extra information to disk
    to ensure that the model can be identified in the future.
    """
    def __init__(self, corpus=None, **kwargs):
        self.corpus = corpus or settings.corpus
        self.validate = kwargs.pop('validate', True)  # Perform cross validation
        self.outpath = kwargs.pop('outpath', '.')     # Where to write out the data

        # Compute info and model paths
        self.model_path, self.info_path = self.get_output_paths()

        # Other required properties
        self.accuracy = None  # Accuracy of the model
        self.started = None  # Start timestamp of the build
        self.finished = None  # Finish timestamp of the build
        self.buildtime = None  # Time (seconds) of complete build
        self.feattime = None  # Time (seconds) to get features
        self.traintime = None  # Time (seconds) to train the model
        self.validtime = None  # Time (seconds) to run the validation

        # Create a featurizer
        self.featurizer = ProductFeatures()

        # Cache the features on the model
        self._featureset = None

    def featureset(self):
        """
        Opens the corpus path, reads the data and constructs features to
        pass to the classifier. The result is cached on the first call.

        Returns a list of (features, label) pairs, one per corpus row:

            [(feats, label) for row in corpus]

        This is the expected format for the MaxentClassifier.
        """

        if self._featureset is None:

            # Time how long it takes to extract features
            start = time.time()

            self._featureset = []
            with open(self.corpus, 'r') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    label = row.pop('category')
                    feats = self.featurizer.featurize(**row)
                    self._featureset.append((feats, label))

            # Record feature extraction time
            self.feattime = time.time() - start

        return self._featureset

    def train(self, featureset=None):
        """
        Trains the maximum entropy classifier and returns it. If a
        featureset is specified it trains on that, otherwise it trains on
        the model's featureset.

        Pass in a featureset during cross validation.
        Returns the classifier and the training time.
        """
        featureset = featureset or self.featureset()

        # Time how long it takes to train
        start = time.time()

        classifier = MaxentClassifier.train(featureset,
                                            algorithm='megam',
                                            trace=1,
                                            gaussian_prior_sigma=1)

        delta = time.time() - start
        return classifier, delta

    def build(self):
        """
        Builds the model and writes to the outpath (which should be a
        directory). Two files are written:

            - the pickle of the model
            - a JSON file of associated data

        Note, if a file already exists at the outpath, this will raise an
        exception (don't want to overwrite a model by accident!)
        """

        # Record the start time
        self.started = datetime.now()
        start = time.time()

        # Extract the features and train the model
        classifier, self.traintime = self.train()

        # Write the classifier to disk
        with open(self.model_path, 'wb') as f:
            pickle.dump(classifier, f, pickle.HIGHEST_PROTOCOL)

        # Begin accuracy validation
        if self.validate:
            self.cross_validate()

        # Record the finish time
        self.finished = datetime.now()
        self.buildtime = time.time() - start

        # Write the information to disk
        self.write_details()

    def cross_validate(self):
        """
        Performs cross validation by training the model on 90% of the
        corpus then checking the accuracy on the remaining 10%.
        """
        start = time.time()

        feats = self.featureset()
        offset = len(feats) // 10
        random.shuffle(feats)

        # Hold out 10% of the features for testing; train on the rest
        test = feats[:offset]
        train = feats[offset:]

        classifier, _ = self.train(train)
        self.accuracy = accuracy(classifier, test)

        self.validtime = time.time() - start

    def get_output_paths(self):
        """
        Returns two paths - the pickle path and the information JSON path.
        Ensures those paths don't already exist and won't be overwritten.
        """

        today = datetime.now().strftime('%Y-%m-%d')
        mname = os.path.join(self.outpath, "model-%s.pickle" % today)
        iname = os.path.join(self.outpath, "info-%s.json" % today)

        for name in (mname, iname):
            if os.path.exists(name):
                raise Exception("Can't overwrite file at '%s'!" % name)

        return mname, iname

    def write_details(self):
        """
        Writes the details of the classifier to a JSON file.
        """

        details = {
            'version': apparel.get_version(),
            'started': self.started.strftime(DATE_FORMAT),
            'finished': self.finished.strftime(DATE_FORMAT),
            'accuracy': self.accuracy,
            'validated': self.validate,
            'corpus': self.corpus,
            'paths': {
                'model': self.model_path,
                'info': self.info_path,
            },
            'classes': {
                'classifier': MaxentClassifier.__name__,
                'features': ProductFeatures.__name__,
            },
            'timer': {
                'build': self.buildtime,
                'features': self.feattime,
                'validation': self.validtime,
                'training': self.traintime,
            }
        }

        with open(self.info_path, 'w') as f:
            json.dump(details, f, indent=4)
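
For reference, featureset() above reads the corpus with csv.DictReader, pops the
'category' column off each row as the label, and passes the remaining columns to
ProductFeatures.featurize() as keyword arguments. Based on the featurize call in
the ApparelClassifier example, those remaining columns are presumably name,
description and keywords. A minimal sketch that writes such a corpus (the rows
and the products.csv path are hypothetical, for illustration only):

    import csv

    # Two hypothetical rows matching the columns featureset() expects
    rows = [
        {"name": "Trail Walking Shoe",
         "description": "A lightweight mesh walking shoe.",
         "keywords": "shoes, walking, outdoor",
         "category": "footwear"},
        {"name": "Merino Wool Scarf",
         "description": "A warm winter scarf in merino wool.",
         "keywords": "scarf, winter, wool",
         "category": "accessories"},
    ]

    with open("products.csv", "w", newline="") as f:
        writer = csv.DictWriter(
            f, fieldnames=["name", "description", "keywords", "category"])
        writer.writeheader()
        writer.writerows(rows)
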
Example #6
class ClassifierBuilder(object):
    """
    Creates a classifier model using MaximumEntropy and saves it as a
    pickle to disk. This class also writes out extra information to disk
    to ensure that the model can be identified in the future.
    """

    def __init__(self, corpus=None, **kwargs):
        self.corpus      = corpus or settings.corpus
        self.validate    = kwargs.pop('validate', True)    # Perform cross validation
        self.outpath     = kwargs.pop('outpath', '.')      # Where to write out the data

        # Compute info and model paths
        self.model_path, self.info_path = self.get_output_paths()

        # Other required properties
        self.accuracy    = None  # Accuracy of the model
        self.started     = None  # Start timestamp of the build
        self.finished    = None  # Finish timestamp of the build
        self.buildtime   = None  # Time (seconds) of complete build
        self.feattime    = None  # Time (seconds) to get features
        self.traintime   = None  # Time (seconds) to train the model
        self.validtime   = None  # Time (seconds) to run the validation

        # Create a featurizer
        self.featurizer  = ProductFeatures()

        # Cache the features on the model
        self._featureset = None

    def featureset(self):
        """
        Opens the corpus path, reads the data and constructs features to
        pass to the classifier. The result is cached on the first call.

        Returns a list of (features, label) pairs, one per corpus row:

            [(feats, label) for row in corpus]

        This is the expected format for the MaxentClassifier.
        """

        if self._featureset is None:

            # Time how long it takes to extract features
            start = time.time()

            self._featureset = []
            with open(self.corpus, 'r') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    label = row.pop('category')
                    feats = self.featurizer.featurize(**row)
                    self._featureset.append((feats, label))

            # Record feature extraction time
            self.feattime = time.time() - start

        return self._featureset

    def train(self, featureset=None):
        """
        Trains the maximum entropy classifier and returns it. If a
        featureset is specified it trains on that, otherwise it trains on
        the model's featureset.

        Pass in a featureset during cross validation.
        Returns the classifier and the training time.
        """
        featureset = featureset or self.featureset()

        # Time how long it takes to train
        start = time.time()

        classifier = MaxentClassifier.train(featureset,
                        algorithm='megam', trace=1, gaussian_prior_sigma=1)

        delta = time.time() - start
        return classifier, delta

    def build(self):
        """
        Builds the model and writes to the outpath (which should be a
        directory). Two files are written:

            - the pickle of the model
            - a JSON file of associated data

        Note, if a file already exists at the outpath, this will raise an
        exception (don't want to overwrite a model by accident!)
        """

        # Record the start time
        self.started  = datetime.now()
        start = time.time()

        # Extract the features and train the model
        classifier, self.traintime = self.train()

        # Write the classifier to disk
        with open(self.model_path, 'wb') as f:
            pickle.dump(classifier, f, pickle.HIGHEST_PROTOCOL)

        # Begin accuracy validation
        if self.validate:
            self.cross_validate()

        # Record the finish time
        self.finished = datetime.now()
        self.buildtime = time.time() - start

        # Write the information to disk
        self.write_details()

    def cross_validate(self):
        """
        Performs cross validation by training the model on 90% of the
        corpus then checking the accuracy on the remaining 10%.
        """
        start  = time.time()

        feats  = self.featureset()
        offset = len(feats) // 10
        random.shuffle(feats)

        # Hold out 10% of the features for testing; train on the rest
        test   = feats[:offset]
        train  = feats[offset:]

        classifier, _  = self.train(train)
        self.accuracy  = accuracy(classifier, test)

        self.validtime = time.time() - start

    def get_output_paths(self):
        """
        Returns two paths - the pickle path and the information JSON path.
        Ensures those paths don't already exist and won't be overwritten.
        """

        today = datetime.now().strftime('%Y-%m-%d')
        mname = os.path.join(self.outpath, "model-%s.pickle" % today)
        iname = os.path.join(self.outpath, "info-%s.json" % today)

        for name in (mname, iname):
            if os.path.exists(name):
                raise Exception("Can't overwrite file at '%s'!" % name)

        return mname, iname

    def write_details(self):
        """
        Writes the details of the classifier to a JSON file.
        """

        details = {
            'version': apparel.get_version(),
            'started': self.started.strftime(DATE_FORMAT),
            'finished': self.finished.strftime(DATE_FORMAT),
            'accuracy': self.accuracy,
            'validated': self.validate,
            'corpus': self.corpus,
            'paths': {
                'model': self.model_path,
                'info': self.info_path,
            },
            'classes': {
                'classifier': MaxentClassifier.__name__,
                'features': ProductFeatures.__name__,
            },
            'timer': {
                'build': self.buildtime,
                'features': self.feattime,
                'validation': self.validtime,
                'training': self.traintime,
            }
        }

        with open(self.info_path, 'w') as f:
            json.dump(details, f, indent=4)
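
A minimal end-to-end sketch of running the builder above, assuming a corpus like
the one sketched after Example #5, an existing ./models directory, and that the
external megam binary required by algorithm='megam' is installed (the megam,
corpus and output paths are all hypothetical):

    import nltk.classify.megam

    # MaxentClassifier.train(algorithm='megam') shells out to the megam
    # binary; tell NLTK where it lives (adjust for the local install)
    nltk.classify.megam.config_megam('/usr/local/bin/megam')

    # Builds the model, cross validates it, and writes
    # model-YYYY-MM-DD.pickle and info-YYYY-MM-DD.json into ./models
    builder = ClassifierBuilder(corpus='products.csv', outpath='./models')
    builder.build()

    print(builder.accuracy)   # Held-out accuracy from cross_validate()
    print(builder.traintime)  # Seconds spent training the final model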