Example 1
    def train_model(self, data, model_library, model):
        """data is an instance of DragnetModelData
        model_library is a list of model definitions as input to
         run_train_models
        model provides model.make_features to make the features
        """
        from mozsci.map_train import run_train_models

        # to train the model we need the full set of features and their labels;
        # both are computed at the block level

        # build the block-level feature matrix, labels and weights from the data
        features, labels, weights = self.make_features_from_data(data, model)

        # cap weights!
        weights = np.minimum(weights, 200)

        # do kfold cross validation
        folds = cv_kfold(len(labels), self.kfolds, seed=2)

        if self.weighted:
            errors = run_train_models(processes=4, model_library=model_library,
                X=features, y=labels, folds=folds, weights=weights)
        else:
            errors = run_train_models(processes=4, model_library=model_library,
                X=features, y=labels, folds=folds)

        return errors, features, labels, weights, folds
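
Note: as the test in Example 4 below shows, the folds returned by cv_kfold(n, k, seed) are a list of [train_indices, test_indices] pairs over range(n). A minimal sketch of consuming such folds by hand, with a hypothetical classifier factory standing in for the model definitions passed to run_train_models:

    import numpy as np

    def evaluate_over_folds(X, y, folds, make_classifier):
        """Fit a fresh classifier on each training split and return per-fold accuracy.

        make_classifier is a hypothetical factory returning an object with
        scikit-learn style fit/predict methods.
        """
        accuracies = []
        for train_idx, test_idx in folds:
            clf = make_classifier()                      # fresh model per fold
            clf.fit(X[train_idx], y[train_idx])          # fit on the training split
            pred = clf.predict(X[test_idx])              # score the held-out split
            accuracies.append(float(np.mean(pred == y[test_idx])))
        return accuracies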
Example 2
    def setUp(self):
        np.random.seed(5)
        self.X = np.linspace(0, 1, 100).reshape(100, 1)
        self.y = (5 * self.X.reshape(100, ) - 2 + np.random.rand(100) >
                  0).astype(int)

        self.folds = cv_kfold(100, 4, seed=2)
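
A sanity check that could accompany this fixture (hypothetical test method; it assumes cv_kfold partitions the index set the way the hard-coded folds in Example 4 suggest, with each fold's training and test splits disjoint and jointly covering all indices):

    def test_folds_partition_indices(self):
        # every index 0..99 should appear in exactly one test split
        all_test = sorted(idx for _, test_idx in self.folds for idx in test_idx)
        self.assertEqual(all_test, list(range(100)))
        # within each fold the two splits are disjoint and cover all 100 indices
        for train_idx, test_idx in self.folds:
            self.assertEqual(len(train_idx) + len(test_idx), 100)
            self.assertEqual(set(train_idx) & set(test_idx), set())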
Example 3
    def train_model(self, data, model_library, features_to_use):
        """data is an instance of DragnetModelData
        model_library: the block_models to train as a list of model
            definitions as input to run_train_models
        features_to_use: a list of the features to use.  Each must be a
            feature known by AllFeatures
        """
        from . import AllFeatures
        from .blocks import TagCountReadabilityBlockifier as Blkr

        from mozsci.map_train import run_train_models

        # assemble the features
        feature_instances = []
        for f in features_to_use:
            feature_instances.append(AllFeatures.get(f))

        # do feature centering
        print("Initializing features")
        for f in feature_instances:
            # check whether this feature needs to be initialized; if so,
            # initialize it, then take the returned parameters and set
            # them on the feature (they can be serialized to json)
            if hasattr(f, 'init_params'):
                # initialize it
                model_init = ContentExtractionModel(Blkr, [f], None)
                features, labels, weights = self.make_features_from_data(
                    data, model_init, train=True)
                mean_std = f.init_params(features)
                f.set_params(mean_std)

        model_to_train = ContentExtractionModel(Blkr, feature_instances, None)

        # train the model
        print("Training the model")
        features, labels, weights = self.make_features_from_data(
            data, model_to_train, training_or_test='training')

        # cap weights!
        weights = np.minimum(weights, 200)

        # do kfold cross validation
        if self.kfolds > 1:
            folds = cv_kfold(len(labels), self.kfolds, seed=2)
        else:
            folds = None

        if self.weighted:
            errors = run_train_models(
                processes=1, model_library=model_library,
                X=features, y=labels, folds=folds, weights=weights)
        else:
            errors = run_train_models(
                processes=1, model_library=model_library,
                X=features, y=labels, folds=folds)

        return errors, features, labels, weights, folds
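
The feature-centering loop above relies on an init_params/set_params protocol. Below is a hypothetical feature illustrating that protocol (a sketch of the apparent contract, not dragnet's actual feature code): init_params computes per-column mean and standard deviation from the training feature matrix and returns them, and set_params stores them so the feature can standardize its output later.

    import numpy as np

    class CenteredFeatureSketch(object):
        """Hypothetical feature following the init_params/set_params pattern."""

        def __init__(self):
            self._mean = None
            self._std = None

        def init_params(self, features):
            # features: (num_blocks, num_feature_columns) array built from
            # the training data
            return features.mean(axis=0), features.std(axis=0)

        def set_params(self, mean_std):
            self._mean, self._std = mean_std

        def standardize(self, raw):
            # apply the stored centering to a raw feature array
            return (raw - self._mean) / self._std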
Example 4
    def test_cv_kfold(self):
        folds = cross_validate.cv_kfold(20, 4, seed=2)

        sum_training = np.sum([len(ele[0]) for ele in folds])
        self.assertTrue(sum_training == 3 * 20)

        sum_training = np.sum([len(ele[1]) for ele in folds])
        self.assertTrue(sum_training == 20)

        actual_folds = [
            [[0, 3, 4, 5, 8, 9, 17, 2, 7, 10, 11, 13, 15, 16, 18],
             [1, 6, 12, 14, 19]],
            [[1, 6, 12, 14, 19, 2, 7, 10, 11, 13, 15, 16, 18],
             [0, 3, 4, 5, 8, 9, 17]],
            [[1, 6, 12, 14, 19, 0, 3, 4, 5, 8, 9, 17, 15, 16, 18],
             [2, 7, 10, 11, 13]],
            [[1, 6, 12, 14, 19, 0, 3, 4, 5, 8, 9, 17, 2, 7, 10, 11, 13],
             [15, 16, 18]]
        ]

        self.assertEqual(actual_folds, folds)
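
The hard-coded folds above suggest how cv_kfold builds its splits: each index is assigned to one of k groups, each group (in sorted order) serves as one fold's test split, and the remaining groups are concatenated in order to form that fold's training split. A sketch of that construction (an assumption about the behaviour, not mozsci's implementation; with a different RNG it will not reproduce the exact indices above):

    import numpy as np

    def kfold_like(n, k, seed=None):
        """Build [train_indices, test_indices] pairs in the same shape as cv_kfold."""
        rng = np.random.RandomState(seed)
        assignment = rng.randint(0, k, size=n)       # random group id per index
        groups = [np.where(assignment == i)[0].tolist() for i in range(k)]
        folds = []
        for i in range(k):
            train = [idx for j, grp in enumerate(groups) if j != i for idx in grp]
            folds.append([train, groups[i]])
        return folds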