Example #1
    def go_details(self, obj, show, data, mock_get_trainer,
                   mock_get_vect_data):
        should_be_called = not obj.is_weights_calculated
        trainer = TrainerStorage.loads(MODEL_TRAINER)
        mock_get_trainer.return_value = trainer

        mock_get_vect_data.return_value = [0.123, 0.0] * 500

        url = self._get_url(id=obj.id, show=show, data=data)
        resp = self.client.get(url, headers=HTTP_HEADERS)
        self.assertEqual(
            resp.status_code, 200,
            "code: {0}, data: {1}".format(resp.status_code, resp.data))
        self.assertEqual(mock_get_trainer.called, should_be_called)
        return json.loads(resp.data)['test_example']
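A note on wiring: the two mock arguments are not created inside this helper; they are injected by stacked mock.patch decorators on the calling test. A minimal sketch of a caller, with hypothetical patch targets (the real dotted paths depend on the project layout), might look like:

    from mock import patch

    @patch('api.ml_models.models.Model.get_vect_data')  # hypothetical target
    @patch('api.ml_models.models.Model.get_trainer')    # hypothetical target
    def test_weights_details(self, mock_get_trainer, mock_get_vect_data):
        # The bottom-most patch is passed first, matching the order
        # of mock parameters in go_details' signature.
        data = self.go_details(self.obj, 'weights', None,
                               mock_get_trainer, mock_get_vect_data)
        self.assertTrue(data)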
Example #2
    def test_generate_visualization_tree(self, get_trainer_mock):
        """
        Checks generate_visualization_tree task with decision tree
        clf without segmentation.
        """
        # Using a non-existent model id
        from api.ml_models.tasks import generate_visualization_tree, \
            VisualizationException
        invalid_model_id = -101
        self.assertRaises(ValueError, generate_visualization_tree,
                          invalid_model_id, 10)

        # Trying to generate tree for logistic regression classifier
        self.assertRaises(VisualizationException, generate_visualization_tree,
                          self.obj.id, 10)

        # Re-generating tree for decision tree classifier
        model = Model.query.filter_by(name='decision_tree_clf_model').one()
        from cloudml.trainer.store import TrainerStorage
        trainer = TrainerStorage.loads(DECISION_TREE)
        get_trainer_mock.return_value = trainer
        # In this model 'all_weights' was not saved to
        # visualization_data while training, so it's impossible
        # to re-generate the tree.
        self.assertRaises(VisualizationException,
                          generate_visualization_tree,
                          model.id,
                          deep=2)

        from cloudml.trainer.trainer import DEFAULT_SEGMENT
        model.visualization_data = {DEFAULT_SEGMENT: TREE_VISUALIZATION_DATA}
        model.save()

        from random import randint
        deep = randint(2, 10)
        res = generate_visualization_tree(model.id, deep=deep)
        self.assertEqual(res, "Tree visualization was completed")
        print "using deep %s" % deep
        self.assertEqual(
            model.visualization_data[DEFAULT_SEGMENT]['parameters']['deep'],
            deep)
        tree = model.visualization_data[DEFAULT_SEGMENT]['tree']
        self.assertEqual(determine_deep(tree), deep)
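determine_deep is a test helper that is referenced but not defined in this snippet. A minimal sketch, assuming each node of the visualization tree is a dict that keeps its child nodes under a 'children' key (an assumption, not confirmed by the source), could be:

    def determine_deep(tree):
        # Depth of a tree node: 1 for a leaf, otherwise 1 plus the
        # deepest child. The 'children' key is an assumed layout.
        children = tree.get('children') or []
        if not children:
            return 1
        return 1 + max(determine_deep(child) for child in children)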
Example #3
        def do_train(exclude_labels):
            trainer = TrainerStorage.loads(MULTICLASS_MODEL)
            mock_get_trainer.return_value = trainer

            import gzip
            from StringIO import StringIO
            with gzip.open('./api/import_handlers/fixtures/'
                           'multiclass_ds.gz', 'r') as dataset:
                examples = []
                for line in dataset.readlines():
                    example = json.loads(line)
                    if example['hire_outcome'] in exclude_labels:
                        continue
                    examples.append(json.dumps(example))
                s = StringIO()
                s.write('\n'.join(examples))
                s.seek(0)
                mock_get_data_stream.return_value = s

            return run_test([self.dataset.id, ], test.id)
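Because do_train is a closure over the mocks, a test can run the pipeline several times with different label subsets and compare the results; for instance (the label value below is purely illustrative):

    # Run once on the full fixture, then with one class filtered out;
    # 'class_3' is a hypothetical hire_outcome value.
    result_all = do_train(exclude_labels=[])
    result_subset = do_train(exclude_labels=['class_3'])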
Example #4
    def test_edit(self, mock_update_at_server, list_mock, set_meta,
                  mock_get_trainer):
        # set up
        trainer = TrainerStorage.loads(MODEL_TRAINER)
        mock_get_trainer.return_value = trainer
        model2 = Model.query.filter_by(name=ModelData.model_02.name).one()
        model2.trainer = 'trainer_file2'
        list_mock.return_value = [{
            'id': str(self.model.id),
            'name': self.model.name
        }, {
            'id': str(model2.id),
            'name': model2.name
        }]
        files_list = list(self.server.list_keys(FOLDER_MODELS))
        obj_id = files_list[0]['id']

        url = '{0}{1}/'.format(self.BASE_URL, obj_id)
        # correct data
        resp = self.client.put(url,
                               data={'name': 'new name'},
                               headers=HTTP_HEADERS)
        self.assertEqual(200, resp.status_code)
        self.assertTrue(mock_update_at_server.delay.called)
        resp_data = json.loads(resp.data)
        self.assertEqual(obj_id, resp_data[self.RESOURCE.OBJECT_NAME]['id'])

        # test edit with same name
        resp = self.client.put(url,
                               data={'name': files_list[1]['name']},
                               headers=HTTP_HEADERS)
        self.assertEqual(400, resp.status_code)
        self.assertIn('already exists on the server', resp.data)

        # non-existing
        set_meta.side_effect = AmazonS3ObjectNotFound('not found')
        url = '{0}{1}/'.format(self.BASE_URL, 'bbb.model')
        resp = self.client.put(url, data={'name': 'nnn'}, headers=HTTP_HEADERS)
        self.assertEqual(404, resp.status_code)
        self.assertIn('not found', resp.data)
Example #5
    def _check_run_test(self, test, metrics_mock_class, _fake_raw_data,
                        load_mock, mock_get_data_stream, mock_get_trainer):
        mocks = [mock_get_data_stream, mock_get_trainer]
        import numpy
        import scipy

        if _fake_raw_data is None:
            _fake_raw_data = {
                "default": [{'application_id': '123',
                             'hire_outcome': '0',
                             'title': 'A1'}] * 100}

        def _fake_test(self, *args, **kwargs):
            # 'self' here is the Trainer instance: this function
            # replaces Trainer.test via the patch below.
            _fake_test.called = True
            self._raw_data = _fake_raw_data
            metrics_mock = metrics_mock_class()
            preds = Mock()
            preds.size = 100
            preds.__iter__ = Mock(return_value=iter([0] * 100))
            metrics_mock._preds = preds

            metrics_mock._probs = [numpy.array([0.1, 0.2])] * 100

            metrics_mock._true_data = scipy.sparse.coo_matrix(
                [[0, 0, 0]] * 100)

            return metrics_mock

        # Set up mock trainer
        trainer = TrainerStorage.loads(MODEL_TRAINER)
        mock_get_trainer.return_value = trainer

        with patch('cloudml.trainer.trainer.Trainer.test',
                   _fake_test) as mock_test:
            mocks.append(mock_test)
            return run_test([self.dataset.id, ], test.id), mocks
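The metrics_mock_class argument only has to accept attribute assignment, since _fake_test fills in _preds, _probs and _true_data after instantiation. A minimal stand-in, purely illustrative, would be:

    class FakeMetrics(object):
        # Bare container; _check_run_test's _fake_test assigns
        # _preds, _probs and _true_data onto instances of it.
        pass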
Example #6
    def set_trainer(self, trainer):
        from bson import Binary
        from cloudml.trainer.store import TrainerStorage
        trainer_data = Binary(TrainerStorage(trainer).dumps())
        self.trainer = trainer_data
        self.trainer_size = len(trainer_data)
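The matching getter appears throughout these examples as TrainerStorage.loads. Sketched under the assumption that self.trainer still holds the serialized bytes written above, it could be:

    def get_trainer(self):
        # Counterpart sketch: deserialize what set_trainer stored.
        from cloudml.trainer.store import TrainerStorage
        return TrainerStorage.loads(self.trainer)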
Example #7
def upload_model_to_server(server_id, model_id, user_id):
    """
    Upload model to S3 for cloudml-predict.
    """
    init_logger('trainmodel_log', obj=int(model_id))
    logging.info('Starting upload to cloudml_predict')

    try:
        server = Server.query.get(server_id)
        user = User.query.get(user_id)
        model = Model.query.get(model_id)

        # TODO: Check the name, whether there is enough memory, etc.
        model_files = server.list_keys(FOLDER_MODELS)
        for file_ in model_files:
            if file_['name'] == model.name:
                raise ValueError('Model with name "{0}" already exists on '
                                 'the server {1}'.format(
                                     model.name, server.name))

        uid = get_a_Uuid()

        # TODO: Shall we use another account?
        s3 = AmazonS3Helper(
            bucket_name=app.config['CLOUDML_PREDICT_BUCKET_NAME'])
        path = '{0}/{1}/{2}.model'.format(server.folder.strip('/'),
                                          FOLDER_MODELS, uid)
        meta = {
            'id': model.id,
            'object_name': model.name,
            'name': model.name,
            'user_id': user.id,
            'user_name': user.name,
            'hide': "False",
            'uploaded_on': str(datetime.now())
        }

        trainer = model.get_trainer()
        from cloudml.trainer.store import TrainerStorage
        from bson import Binary
        trainer_data = Binary(TrainerStorage(trainer).dumps())
        logging.info('Serialized trainer size: %s bytes', len(trainer_data))
        s3.save_key_string(path, trainer_data, meta)
        s3.close()
        model.locked = True
        s_ids = list(model.servers_ids) if (isinstance(model.servers_ids,
                                                       list)) else []
        s_ids.append(server.id)
        model.servers_ids = list(s_ids)
        model.save()
        feature_set = model.features_set
        feature_set.locked = True
        feature_set.save()
        logging.info('Creating grafana dashboard for model')
        update_grafana_dashboard(server, model)
        logging.info('Model has been uploaded: %s' % model.name)

        return '{0}/{1}.model'.format(FOLDER_MODELS, uid)
    except Exception as e:
        logging.error("Got exception on uploading model to predict: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
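Example #8 below calls this task directly; the call shape is simply the three ids, and the return value is the model's key path under FOLDER_MODELS:

    # Upload a trained model to the server's S3 folder.
    path = upload_model_to_server(server.id, model.id, user.id)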
Example #8
    def test_get_models_action(self, grafana_mock, save_mock, list_mock,
                               load_mock, mock_get_trainer):
        # no models or import handlers on the server yet
        trainer = TrainerStorage.loads(MODEL_TRAINER)
        mock_get_trainer.return_value = trainer
        list_mock.return_value = []
        result = self._check(server=self.obj.id,
                             id=self.obj.id,
                             action='models')
        self.assertEqual(0, len(result['files']))

        # should return model data
        model = Model.query.filter_by(name=ModelData.model_01.name).one()
        model.trainer = 'trainer_file'
        model.train_import_handler = get_importhandler()
        model.test_import_handler = get_importhandler()
        model.name = 'BestMatch.v31'
        model.save()
        user = User.query.first()
        upload_model_to_server(self.obj.id, model.id, user.id)
        upload_import_handler_to_server(self.obj.id, 'xml',
                                        model.test_import_handler.id, user.id)

        def list_side_effect(*args, **kwargs):
            side_effect_data = {
                FOLDER_MODELS: [{
                    'id': str(model.id),
                    'object_name': model.name,
                    'object_id': str(model.id),
                    'name': model.name,
                    'user_id': user.id,
                    'user_name': user.name,
                }],
                FOLDER_IMPORT_HANDLERS: [{
                    'id': str(model.test_import_handler.id),
                    'object_name': model.test_import_handler.name,
                    'size': 100,
                    'name': model.test_import_handler.name,
                    'object_id': str(model.test_import_handler.id),
                    'object_type': model.test_import_handler.type,
                    'user_id': user.id,
                    'user_name': user.name,
                    'crc32': 'crc32'
                }]
            }
            if 'folder' in kwargs:
                return side_effect_data[kwargs['folder']]
            else:
                return []

        list_mock.side_effect = list_side_effect
        result = self._check(server=self.obj.id,
                             id=self.obj.id,
                             action='models')
        self.assertEqual(1, len(result['files']))
        f = result['files'][0]
        self.assertEqual(f['model_name'], 'BestMatch.v31')
        self.assertEqual(f['model_metadata']['name'], 'BestMatch.v31')
        self.assertEqual(f['model']['id'], model.id)
        self.assertEqual(f['import_handler_name'],
                         f['import_handler_metadata']['name'])
        self.assertEqual(f['import_handler_metadata']['object_id'],
                         str(model.test_import_handler.id))
        self.assertEqual(f['import_handler']['id'],
                         model.test_import_handler.id)
Example #9
    def test_upload_segment_features_transformers_task(self, get_trainer_mock,
                                                       save_mock, dl_mock):
        from cloudml.trainer.store import TrainerStorage
        from api.ml_models.tasks import upload_segment_features_transformers
        from zipfile import ZipFile, ZIP_DEFLATED
        import os

        model = Model.query.filter_by(name=ModelData.model_01.name).first()

        trainer = TrainerStorage.loads(MODEL_TRAINER)
        get_trainer_mock.return_value = trainer
        for segment_name in trainer.features:
            s = Segment()
            s.name = segment_name
            s.records = 111
            s.model_id = model.id
            s.save()

        segment = Segment.query.filter(Segment.name == s.name).one()

        # check that nothing fails
        upload_segment_features_transformers(model.id, segment.id, 'json')

        # repeat the task's logic here, except for posting to Amazon S3
        try:
            fformat = 'json'

            def _save_content(content, feature_name, transformer_type):
                filename = "{0}-{1}-{2}-data.{3}".format(
                    segment.name, feature_name, transformer_type, fformat)
                if fformat == 'csv':
                    import csv
                    import StringIO
                    si = StringIO.StringIO()
                    if len(content):
                        fieldnames = content[0].keys()
                        writer = csv.DictWriter(si, fieldnames=fieldnames)
                        writer.writeheader()
                        for c in content:
                            writer.writerow(c)
                    response = si.getvalue()
                else:
                    import json
                    response = json.dumps(content, indent=2)
                with open(filename, 'w') as fh:
                    fh.write(response)
                return filename

            trainer = model.get_trainer()
            if segment.name not in trainer.features:
                raise Exception("Segment %s doesn't exist in the trained "
                                "model" % segment.name)
            files = []
            for name, feature in trainer.features[segment.name].iteritems():
                if "transformer" in feature and \
                                feature["transformer"] is not None:
                    try:
                        data = feature["transformer"].load_vocabulary()
                        files.append(
                            _save_content(data, name,
                                          feature["transformer-type"]))
                    except AttributeError:
                        continue

            arc_name = "{0}-{1}-{2}.zip".format(model.name, segment.name,
                                                fformat)
            with ZipFile(arc_name, "w") as z:
                for f in files:
                    z.write(f, compress_type=ZIP_DEFLATED)
            for f in files:
                os.remove(f)

            self.assertEqual(arc_name,
                             "{0}-default-json.zip".format(model.name))
            fh = open(arc_name, 'rb')
            z = ZipFile(fh)
            for name in z.namelist():
                outpath = "./"
                z.extract(name, outpath)
            file_list = z.namelist()
            fh.close()

            self.assertEqual(len(file_list), 2)
            self.assertEqual(
                set(file_list),
                set([
                    "default-contractor.dev_blurb-Tfidf-data.json",
                    "default-contractor.dev_profile_title-Tfidf-data.json"
                ]))
            fh = open("default-contractor.dev_blurb-Tfidf-data.json", 'r')
            content = fh.read()
            res = json.loads(content)
            self.assertEqual(set(res[0].keys()),
                             set(["word", "index", "weight"]))
        finally:
            for f in files:
                os.remove(f)
            if os.path.exists(arc_name):
                os.remove(arc_name)