Ejemplo n.º 1
0
class FeaturesPerceptronRanker(BasePerceptronRanker):
    """Base class for a global, feature-based ranker of whole trees.

    Feature dictionaries are produced by a `Features` object, turned into
    vectors by a `DictVectorizer` and, unless binarization is requested,
    scaled by a `StandardScaler`. The vectorizer and the normalizer are
    only created (and fitted) in `_init_training`.
    """

    def __init__(self, cfg):
        """Set up the feature functions from the configuration.

        Recognized `cfg` keys (all optional; a falsy `cfg` is treated as an
        empty dict for the purposes of this class):
          * 'binarize' -- if true, use a binarizing vectorizer instead of
            a normalizer (default: False),
          * 'features' -- additional feature specifications, appended
            after the built-in bias feature,
          * 'intermediate_features' -- passed as the second argument to
            `Features` (default: empty list).
        """
        # the parent class receives the configuration exactly as it was given
        super(FeaturesPerceptronRanker, self).__init__(cfg)
        if not cfg:
            cfg = {}
        # both are created and fitted later, in _init_training()
        self.vectorizer = None
        self.normalizer = None
        self.binarize = cfg.get('binarize', False)
        # initialize feature functions; the bias feature always comes first
        feat_specs = ['bias: bias']
        if 'features' in cfg:
            feat_specs.extend(cfg['features'])
        self.feats = Features(feat_specs, cfg.get('intermediate_features', []))

    def _extract_feats(self, tree, da):
        """Return the feature vector for a single (tree, DA) pair.

        The features are vectorized with the trained vectorizer and, if a
        normalizer has been set, normalized by it. `_init_training` must
        have run before, since `self.vectorizer` is None until then.
        """
        feats = self.vectorizer.transform(
            [self.feats.get_features(tree, {'da': da})])
        if self.normalizer:
            feats = self.normalizer.transform(feats)
        # a one-instance batch was transformed; return its only row
        return feats[0]

    def _init_training(self, das_file, ttree_file, data_portion):
        """Initialize training and precompute training data features.

        After the parent-class initialization, features are collected for
        all pairs of `self.train_das` and `self.train_trees`, optionally
        pruned, and stored as a matrix in `self.train_feats`. The
        vectorizer (and the normalizer, if not binarizing) is fitted on
        that data along the way.
        """
        # NOTE(review): train_das, train_trees and prune_feats are not set
        # in this class -- presumably by the parent class; confirm.
        super(FeaturesPerceptronRanker,
              self)._init_training(das_file, ttree_file, data_portion)

        # precompute training data features
        X = []
        for da, tree in zip(self.train_das, self.train_trees):
            X.append(self.feats.get_features(tree, {'da': da}))
        # a minimum count of 1 (or less) would not remove anything
        if self.prune_feats > 1:
            self._prune_features(X)
        # vectorize and binarize or normalize (+train vectorizer/normalizer)
        if self.binarize:
            self.vectorizer = DictVectorizer(sparse=False,
                                             binarize_numeric=True)
            self.train_feats = self.vectorizer.fit_transform(X)
        else:
            self.vectorizer = DictVectorizer(sparse=False)
            self.normalizer = StandardScaler(copy=False)
            self.train_feats = self.normalizer.fit_transform(
                self.vectorizer.fit_transform(X))

        log_info('Features matrix shape: %s' % str(self.train_feats.shape))

    def _prune_features(self, X):
        """Prune features - remove all entries from X that involve features
        not having a specified minimum occurrence count.

        The occurrence count of a feature is the number of instances in X
        in which its key is present; the minimum is `self.prune_feats`.
        The instances in X are modified in place and nothing is returned.

        @param X: list of feature dictionaries (one per training instance)
        """
        counts = defaultdict(int)
        for inst in X:
            for key in inst:
                counts[key] += 1
        for inst in X:
            # iterate over a snapshot of the keys: deleting entries while
            # iterating over the live dictionary fails on Python 3
            for key in list(inst):
                if counts[key] < self.prune_feats:
                    del inst[key]
Ejemplo n.º 2
0
class FeaturesPerceptronRanker(BasePerceptronRanker):
    """Base class for a global, feature-based ranker of whole trees.

    Feature dictionaries come from a `Features` object; they are vectorized
    by a `DictVectorizer` and, unless binarization is switched on, scaled
    by a `StandardScaler`. Both are created and fitted in `_init_training`.
    """

    def __init__(self, cfg):
        """Set up the feature functions from the configuration.

        Keys read from `cfg` (all optional; a falsy `cfg` is handled as an
        empty dict within this class):
          * 'binarize' -- use a binarizing vectorizer and no normalizer
            (default: False),
          * 'features' -- extra feature specifications, added after the
            built-in bias feature,
          * 'intermediate_features' -- second argument for `Features`
            (default: empty list).
        """
        # the parent class gets the configuration unchanged
        super(FeaturesPerceptronRanker, self).__init__(cfg)
        if not cfg:
            cfg = {}
        # filled in by _init_training()
        self.vectorizer = None
        self.normalizer = None
        self.binarize = cfg.get('binarize', False)
        # initialize feature functions; the bias feature is always present
        feat_specs = ['bias: bias']
        if 'features' in cfg:
            feat_specs.extend(cfg['features'])
        self.feats = Features(feat_specs, cfg.get('intermediate_features', []))

    def _extract_feats(self, tree, da):
        """Return the feature vector for a single (tree, DA) pair.

        Uses the trained vectorizer and, if one is set, the normalizer.
        `self.vectorizer` is None until `_init_training` has been run.
        """
        feats = self.vectorizer.transform([self.feats.get_features(tree, {'da': da})])
        if self.normalizer:
            feats = self.normalizer.transform(feats)
        # only one instance was transformed; return its row
        return feats[0]

    def _init_training(self, das_file, ttree_file, data_portion):
        """Initialize training and precompute training data features.

        Runs the parent-class initialization, extracts features for all
        pairs of `self.train_das` and `self.train_trees`, prunes them if
        requested, and stores the resulting matrix in `self.train_feats`,
        fitting the vectorizer (and the normalizer, if not binarizing).
        """
        # NOTE(review): train_das, train_trees and prune_feats are not set
        # in this class -- presumably by the parent class; confirm.
        super(FeaturesPerceptronRanker, self)._init_training(das_file, ttree_file, data_portion)

        # precompute training data features
        X = []
        for da, tree in zip(self.train_das, self.train_trees):
            X.append(self.feats.get_features(tree, {'da': da}))
        # pruning with a minimum count of 1 or less would be a no-op
        if self.prune_feats > 1:
            self._prune_features(X)
        # vectorize and binarize or normalize (+train vectorizer/normalizer)
        if self.binarize:
            self.vectorizer = DictVectorizer(sparse=False, binarize_numeric=True)
            self.train_feats = self.vectorizer.fit_transform(X)
        else:
            self.vectorizer = DictVectorizer(sparse=False)
            self.normalizer = StandardScaler(copy=False)
            self.train_feats = self.normalizer.fit_transform(self.vectorizer.fit_transform(X))

        log_info('Features matrix shape: %s' % str(self.train_feats.shape))

    def _prune_features(self, X):
        """Prune features - remove all entries from X that involve features
        not having a specified minimum occurrence count.

        A feature's occurrence count is the number of instances in X that
        contain its key; the required minimum is `self.prune_feats`.
        X is modified in place, nothing is returned.

        @param X: list of feature dictionaries (one per training instance)
        """
        counts = defaultdict(int)
        for inst in X:
            for key in inst:
                counts[key] += 1
        for inst in X:
            # take a snapshot of the keys first: removing entries while
            # iterating over the live dictionary raises on Python 3
            for key in list(inst):
                if counts[key] < self.prune_feats:
                    del inst[key]
Ejemplo n.º 3
0
    def _init_training(self, das_file, ttree_file, data_portion):
        """Run the parent-class training setup, then build the training
        feature matrix.

        Features are extracted for every pair of `self.train_das` and
        `self.train_trees`, pruned when `self.prune_feats` exceeds 1, and
        stored (vectorized, then binarized or normalized) in
        `self.train_feats`. The vectorizer/normalizer are fitted here.
        """
        super(FeaturesPerceptronRanker, self)._init_training(
            das_file, ttree_file, data_portion)

        # one feature dictionary per training instance
        inst_feats = [self.feats.get_features(tree, {'da': da})
                      for da, tree in zip(self.train_das, self.train_trees)]
        if self.prune_feats > 1:
            self._prune_features(inst_feats)

        if not self.binarize:
            # plain vectorization followed by normalization
            self.vectorizer = DictVectorizer(sparse=False)
            self.normalizer = StandardScaler(copy=False)
            vectorized = self.vectorizer.fit_transform(inst_feats)
            self.train_feats = self.normalizer.fit_transform(vectorized)
        else:
            # binarizing vectorizer, no normalizer is set up
            self.vectorizer = DictVectorizer(sparse=False,
                                             binarize_numeric=True)
            self.train_feats = self.vectorizer.fit_transform(inst_feats)

        shape_str = str(self.train_feats.shape)
        log_info('Features matrix shape: %s' % shape_str)
Ejemplo n.º 4
0
    def _init_training(self, das_file, ttree_file, data_portion):
        """Initialize training via the parent class and precompute the
        matrix of training features.

        One feature dictionary is collected per (DA, tree) pair from
        `self.train_das` and `self.train_trees`; the dictionaries are pruned
        if `self.prune_feats` is greater than 1 and then vectorized into
        `self.train_feats`, fitting the vectorizer (and the normalizer,
        unless binarizing) in the process.
        """
        super(FeaturesPerceptronRanker, self)._init_training(das_file, ttree_file, data_portion)

        # collect the raw feature dictionaries
        feat_dicts = [self.feats.get_features(tree, {'da': da})
                      for da, tree in zip(self.train_das, self.train_trees)]
        if self.prune_feats > 1:
            self._prune_features(feat_dicts)

        if not self.binarize:
            # vectorize, then normalize
            self.vectorizer = DictVectorizer(sparse=False)
            self.normalizer = StandardScaler(copy=False)
            raw_matrix = self.vectorizer.fit_transform(feat_dicts)
            self.train_feats = self.normalizer.fit_transform(raw_matrix)
        else:
            # vectorize with binarization; the normalizer stays untouched
            self.vectorizer = DictVectorizer(sparse=False, binarize_numeric=True)
            self.train_feats = self.vectorizer.fit_transform(feat_dicts)

        matrix_shape = str(self.train_feats.shape)
        log_info('Features matrix shape: %s' % matrix_shape)