def train_model(self, data, model_library, model):
    """Train every model in ``model_library`` on block-level features.

    data is an instance of DragnetModelData, model_library is a list of
    model definitions as input to run_train_models, and model provides
    model.make_features to make the features.

    Returns (errors, features, labels, weights, folds).
    """
    from mozsci.map_train import run_train_models

    # Features and labels are block level; extract them all up front
    # from the first pass over the data.
    features, labels, weights = self.make_features_from_data(data, model)

    # cap weights!
    weights = np.minimum(weights, 200)

    # k-fold cross-validation splits (fixed seed for reproducibility)
    folds = cv_kfold(len(labels), self.kfolds, seed=2)

    # Assemble the common training arguments once; the weighted case
    # only adds the sample weights on top.
    train_kwargs = dict(
        processes=4,
        model_library=model_library,
        X=features,
        y=labels,
        folds=folds,
    )
    if self.weighted:
        train_kwargs['weights'] = weights
    errors = run_train_models(**train_kwargs)

    return errors, features, labels, weights, folds
def setUp(self):
    """Build a deterministic 1-D binary-classification fixture.

    Creates:
      self.X     -- 100 evenly spaced points on [0, 1], shape (100, 1)
      self.y     -- {0, 1} labels from a noisy linear threshold
      self.folds -- 4-fold CV index splits over the 100 samples
    """
    np.random.seed(5)  # fixed seed so the noisy labels are reproducible
    self.X = np.linspace(0, 1, 100).reshape(100, 1)
    # FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the documented replacement and yields the
    # same integer labels.
    self.y = (5 * self.X.reshape(100, ) - 2 + np.random.rand(100) > 0).astype(int)
    self.folds = cv_kfold(100, 4, seed=2)
def train_model(self, data, model_library, features_to_use):
    """Initialize the requested features, then train the block models.

    data is an instance of DragnetModelData.
    model_library: the block_models to train as a list of model
    definitions as input to run_train_models.
    features_to_use: a list of the features to use.  Must be one of the
    features known by AllFeatures.

    Returns (errors, features, labels, weights, folds).
    """
    from . import AllFeatures
    from .blocks import TagCountReadabilityBlockifier as Blkr
    from mozsci.map_train import run_train_models

    # look up each requested feature by name
    feature_instances = [AllFeatures.get(name) for name in features_to_use]

    # do feature centering
    print("Initializing features")
    for feature in feature_instances:
        # Only features exposing init_params need their normalization
        # parameters computed from the training data before use.
        if not hasattr(feature, 'init_params'):
            continue
        # NOTE(review): this call passes train=True while the later call
        # passes training_or_test='training' -- presumably both select the
        # training split; confirm against make_features_from_data.
        bootstrap_model = ContentExtractionModel(Blkr, [feature], None)
        feats, _labels, _weights = self.make_features_from_data(
            data, bootstrap_model, train=True)
        feature.set_params(feature.init_params(feats))

    model_to_train = ContentExtractionModel(Blkr, feature_instances, None)

    # train the model
    print("Training the model")
    features, labels, weights = self.make_features_from_data(
        data, model_to_train, training_or_test='training')

    # cap weights!
    weights = np.minimum(weights, 200)

    # cross-validate only when more than one fold was requested
    if self.kfolds > 1:
        folds = cv_kfold(len(labels), self.kfolds, seed=2)
    else:
        folds = None

    # Shared keyword arguments; the weighted variant adds sample weights.
    train_kwargs = dict(
        processes=1,
        model_library=model_library,
        X=features,
        y=labels,
        folds=folds,
    )
    if self.weighted:
        train_kwargs['weights'] = weights
    errors = run_train_models(**train_kwargs)

    return errors, features, labels, weights, folds
def test_cv_kfold(self):
    """cv_kfold(20, 4) covers every sample and matches the known splits."""
    folds = cross_validate.cv_kfold(20, 4, seed=2)

    # With k=4 folds, each sample appears in training splits k-1 = 3 times...
    train_total = np.sum([len(fold[0]) for fold in folds])
    self.assertTrue(train_total == 3 * 20)
    # ...and in exactly one test split.
    test_total = np.sum([len(fold[1]) for fold in folds])
    self.assertTrue(test_total == 20)

    # Exact splits pinned for seed=2 to catch any RNG/ordering regression.
    expected_folds = [
        [[0, 3, 4, 5, 8, 9, 17, 2, 7, 10, 11, 13, 15, 16, 18],
         [1, 6, 12, 14, 19]],
        [[1, 6, 12, 14, 19, 2, 7, 10, 11, 13, 15, 16, 18],
         [0, 3, 4, 5, 8, 9, 17]],
        [[1, 6, 12, 14, 19, 0, 3, 4, 5, 8, 9, 17, 15, 16, 18],
         [2, 7, 10, 11, 13]],
        [[1, 6, 12, 14, 19, 0, 3, 4, 5, 8, 9, 17, 2, 7, 10, 11, 13],
         [15, 16, 18]],
    ]
    self.assertEqual(expected_folds, folds)
def test_cv_kfold(self):
    """Check split sizes and exact fold contents for cv_kfold(20, 4, seed=2)."""
    folds = cross_validate.cv_kfold(20, 4, seed=2)

    # Training splits together hold each of the 20 samples 3 (= k-1) times.
    self.assertTrue(np.sum([len(pair[0]) for pair in folds]) == 3 * 20)
    # Test splits together hold each sample exactly once.
    self.assertTrue(np.sum([len(pair[1]) for pair in folds]) == 20)

    # Known-good splits for this seed; any change in the RNG or the
    # fold assembly order will break this comparison.
    expected = [
        [[0, 3, 4, 5, 8, 9, 17, 2, 7, 10, 11, 13, 15, 16, 18],
         [1, 6, 12, 14, 19]],
        [[1, 6, 12, 14, 19, 2, 7, 10, 11, 13, 15, 16, 18],
         [0, 3, 4, 5, 8, 9, 17]],
        [[1, 6, 12, 14, 19, 0, 3, 4, 5, 8, 9, 17, 15, 16, 18],
         [2, 7, 10, 11, 13]],
        [[1, 6, 12, 14, 19, 0, 3, 4, 5, 8, 9, 17, 2, 7, 10, 11, 13],
         [15, 16, 18]],
    ]
    self.assertEqual(expected, folds)