def go_details(self, obj, show, data, mock_get_trainer, mock_get_vect_data):
    should_called = not obj.is_weights_calculated
    trainer = TrainerStorage.loads(MODEL_TRAINER)
    mock_get_trainer.return_value = trainer
    mock_get_vect_data.return_value = [0.123, 0.0] * 500
    url = self._get_url(id=obj.id, show=show, data=data)
    resp = self.client.get(url, headers=HTTP_HEADERS)
    self.assertEquals(
        resp.status_code, 200,
        "code: {0}, data: {1}".format(resp.status_code, resp.data))
    self.assertEquals(mock_get_trainer.called, should_called)
    return json.loads(resp.data)['test_example']
def test_generate_visualization_tree(self, get_trainer_mock):
    """
    Checks the generate_visualization_tree task with a decision tree
    classifier without segmentation.
    """
    # Using a non-existent model id
    from api.ml_models.tasks import generate_visualization_tree, \
        VisualizationException
    invalid_model_id = -101
    self.assertRaises(ValueError, generate_visualization_tree,
                      invalid_model_id, 10)

    # Trying to generate a tree for a logistic regression classifier
    self.assertRaises(VisualizationException,
                      generate_visualization_tree, self.obj.id, 10)

    # Re-generating the tree for a decision tree classifier
    model = Model.query.filter_by(name='decision_tree_clf_model').one()
    from cloudml.trainer.store import TrainerStorage
    trainer = TrainerStorage.loads(DECISION_TREE)
    get_trainer_mock.return_value = trainer

    # 'all_weights' was not saved to visualization_data while training
    # this model, so it's impossible to re-generate the tree.
    self.assertRaises(VisualizationException,
                      generate_visualization_tree, model.id, deep=2)

    from cloudml.trainer.trainer import DEFAULT_SEGMENT
    model.visualization_data = {DEFAULT_SEGMENT: TREE_VISUALIZATION_DATA}
    model.save()

    from random import randint
    deep = randint(2, 10)
    res = generate_visualization_tree(model.id, deep=deep)
    self.assertEquals(res, "Tree visualization was completed")
    print "using deep %s" % deep
    self.assertEquals(
        model.visualization_data[DEFAULT_SEGMENT]['parameters']['deep'],
        deep)
    tree = model.visualization_data[DEFAULT_SEGMENT]['tree']
    self.assertEquals(determine_deep(tree), deep)
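# Note: the last assertion above relies on a determine_deep() helper defined
# elsewhere in the test module. A minimal sketch of how such a helper could
# measure the depth of the visualization tree is given below; the node layout
# (a 'children' key on each dict node) is an assumption for illustration only,
# not taken from the source.
def determine_deep_sketch(tree):
    # Recursively walk the tree dict and return its depth; a leaf-only
    # root counts as depth 1. Adjust the 'children' key to whatever the
    # actual visualization_data format uses.
    children = tree.get('children') or []
    if not children:
        return 1
    return 1 + max(determine_deep_sketch(child) for child in children)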
def do_train(exclude_labels):
    # Run the test on the multiclass fixture dataset, skipping examples
    # whose hire_outcome is in exclude_labels.
    trainer = TrainerStorage.loads(MULTICLASS_MODEL)
    mock_get_trainer.return_value = trainer
    import gzip
    from StringIO import StringIO
    with gzip.open('./api/import_handlers/fixtures/'
                   'multiclass_ds.gz', 'r') as dataset:
        examples = []
        for line in dataset.readlines():
            example = json.loads(line)
            if example['hire_outcome'] in exclude_labels:
                continue
            examples.append(json.dumps(example))
    s = StringIO()
    s.write('\n'.join(examples))
    s.seek(0)
    mock_get_data_stream.return_value = s
    return run_test([self.dataset.id, ], test.id)
def test_edit(self, mock_update_at_server, list_mock, set_meta,
              mock_get_trainer):
    # set up
    trainer = TrainerStorage.loads(MODEL_TRAINER)
    mock_get_trainer.return_value = trainer
    model2 = Model.query.filter_by(name=ModelData.model_02.name).one()
    model2.trainer = 'trainer_file2'
    list_mock.return_value = [{
        'id': str(self.model.id),
        'name': self.model.name
    }, {
        'id': str(model2.id),
        'name': model2.name
    }]
    files_list = [f for f in self.server.list_keys(FOLDER_MODELS)]
    obj_id = files_list[0]['id']
    url = '{0}{1}/'.format(self.BASE_URL, obj_id)

    # correct data
    resp = self.client.put(url, data={'name': 'new name'},
                           headers=HTTP_HEADERS)
    self.assertEqual(200, resp.status_code)
    self.assertTrue(mock_update_at_server.delay.called)
    resp_data = json.loads(resp.data)
    self.assertEqual(obj_id, resp_data[self.RESOURCE.OBJECT_NAME]['id'])

    # test edit with same name
    resp = self.client.put(url, data={'name': files_list[1]['name']},
                           headers=HTTP_HEADERS)
    self.assertEqual(400, resp.status_code)
    self.assertIn('already exists on the server', resp.data)

    # non-existing
    set_meta.side_effect = AmazonS3ObjectNotFound('not found')
    url = '{0}{1}/'.format(self.BASE_URL, 'bbb.model')
    resp = self.client.put(url, data={'name': 'nnn'},
                           headers=HTTP_HEADERS)
    self.assertEqual(404, resp.status_code)
    self.assertIn('not found', resp.data)
def _check_run_test(self, test, metrics_mock_class, _fake_raw_data,
                    load_mock, mock_get_data_stream, mock_get_trainer):
    mocks = [mock_get_data_stream, mock_get_trainer]
    import numpy
    import scipy.sparse
    if _fake_raw_data is None:
        _fake_raw_data = {
            "default": [{'application_id': '123',
                         'hire_outcome': '0',
                         'title': 'A1'}] * 100}

    def _fake_test(self, *args, **kwargs):
        _fake_test.called = True
        self._raw_data = _fake_raw_data
        metrics_mock = metrics_mock_class()
        preds = Mock()
        preds.size = 100
        preds.__iter__ = Mock(return_value=iter([0] * 100))
        metrics_mock._preds = preds
        metrics_mock._probs = [numpy.array([0.1, 0.2])] * 100
        metrics_mock._true_data = scipy.sparse.coo_matrix(
            [[0, 0, 0]] * 100)
        return metrics_mock

    # Set up mock trainer
    trainer = TrainerStorage.loads(MODEL_TRAINER)
    mock_get_trainer.return_value = trainer

    with patch('cloudml.trainer.trainer.Trainer.test',
               _fake_test) as mock_test:
        mocks.append(mock_test)
        return run_test([self.dataset.id, ], test.id), mocks
def set_trainer(self, trainer):
    """
    Serializes the trainer with TrainerStorage and stores it on the
    model together with its size.
    """
    from bson import Binary
    from cloudml.trainer.store import TrainerStorage
    trainer_data = Binary(TrainerStorage(trainer).dumps())
    self.trainer = trainer_data
    self.trainer_size = len(trainer_data)
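# The tests above mock out the model's get_trainer(), so the loading
# counterpart is not shown here. A minimal sketch of what it could look
# like, assuming the stored value is the TrainerStorage dump written by
# set_trainer() above (hypothetical helper, not taken from the source):
def load_trainer_from_model(model):
    from cloudml.trainer.store import TrainerStorage
    # TrainerStorage.loads() is used throughout the tests to turn serialized
    # trainer bytes back into a Trainer; model.trainer holds the bson Binary
    # written by set_trainer().
    return TrainerStorage.loads(str(model.trainer))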
def upload_model_to_server(server_id, model_id, user_id):
    """ Upload model to S3 for cloudml-predict. """
    init_logger('trainmodel_log', obj=int(model_id))
    logging.info('Starting uploading to cloudml_predict')
    try:
        server = Server.query.get(server_id)
        user = User.query.get(user_id)
        model = Model.query.get(model_id)

        # TODO: Check the name, whether there is enough memory, etc.
        model_files = server.list_keys(FOLDER_MODELS)
        for file_ in model_files:
            if file_['name'] == model.name:
                raise ValueError('Model with name "{0}" already exists on '
                                 'the server {1}'.format(
                                     model.name, server.name))

        uid = get_a_Uuid()
        # TODO: Shall we use another account?
        s3 = AmazonS3Helper(
            bucket_name=app.config['CLOUDML_PREDICT_BUCKET_NAME'])
        path = '{0}/{1}/{2}.model'.format(server.folder.strip('/'),
                                          FOLDER_MODELS, uid)
        meta = {
            'id': model.id,
            'object_name': model.name,
            'name': model.name,
            'user_id': user.id,
            'user_name': user.name,
            'hide': "False",
            'uploaded_on': str(datetime.now())
        }

        trainer = model.get_trainer()
        from cloudml.trainer.store import TrainerStorage
        from bson import Binary
        trainer_data = Binary(TrainerStorage(trainer).dumps())
        logging.info(len(trainer_data))

        s3.save_key_string(path, trainer_data, meta)
        s3.close()

        model.locked = True
        s_ids = list(model.servers_ids) \
            if isinstance(model.servers_ids, list) else []
        s_ids.append(server.id)
        model.servers_ids = list(s_ids)
        model.save()

        feature_set = model.features_set
        feature_set.locked = True
        feature_set.save()

        logging.info('Creating grafana dashboard for model')
        update_grafana_dashboard(server, model)
        logging.info('Model has been uploaded: %s' % model.name)
        return '{0}/{1}.model'.format(FOLDER_MODELS, uid)
    except Exception as e:
        logging.error("Got exception on uploading model to predict: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
def test_get_models_action(self, grafana_mock, save_mock, list_mock,
                           load_mock, mock_get_trainer):
    # no models and import handlers
    trainer = TrainerStorage.loads(MODEL_TRAINER)
    mock_get_trainer.return_value = trainer
    list_mock.return_value = []
    result = self._check(server=self.obj.id, id=self.obj.id,
                         action='models')
    self.assertEqual(0, len(result['files']))

    # should return model data
    model = Model.query.filter_by(name=ModelData.model_01.name).one()
    model.trainer = 'trainer_file'
    model.train_import_handler = get_importhandler()
    model.test_import_handler = get_importhandler()
    model.name = 'BestMatch.v31'
    model.save()
    user = User.query.first()
    upload_model_to_server(self.obj.id, model.id, user.id)
    upload_import_handler_to_server(self.obj.id, 'xml',
                                    model.test_import_handler.id,
                                    user.id)

    def list_side_effect(*args, **kwargs):
        side_effect_data = {
            FOLDER_MODELS: [{
                'id': str(model.id),
                'object_name': model.name,
                'object_id': str(model.id),
                'name': model.name,
                'user_id': user.id,
                'user_name': user.name,
            }],
            FOLDER_IMPORT_HANDLERS: [{
                'id': str(model.test_import_handler.id),
                'object_name': model.test_import_handler.name,
                'size': 100,
                'name': model.test_import_handler.name,
                'object_id': str(model.test_import_handler.id),
                'object_type': model.test_import_handler.type,
                'user_id': user.id,
                'user_name': user.name,
                'crc32': 'crc32'
            }]
        }
        if 'folder' in kwargs:
            return side_effect_data[kwargs['folder']]
        else:
            return []

    list_mock.side_effect = list_side_effect
    result = self._check(server=self.obj.id, id=self.obj.id,
                         action='models')
    self.assertEqual(1, len(result['files']))
    f = result['files'][0]
    self.assertEqual(f['model_name'], 'BestMatch.v31')
    self.assertEqual(f['model_metadata']['name'], 'BestMatch.v31')
    self.assertEqual(f['model']['id'], model.id)
    self.assertEqual(f['import_handler_name'],
                     f['import_handler_metadata']['name'])
    self.assertEqual(f['import_handler_metadata']['object_id'],
                     str(model.test_import_handler.id))
    self.assertEqual(f['import_handler']['id'],
                     model.test_import_handler.id)
def test_upload_segment_features_transformers_task(self, get_trainer_mock,
                                                   save_mock, dl_mock):
    from cloudml.trainer.store import TrainerStorage
    from api.ml_models.tasks import upload_segment_features_transformers
    from zipfile import ZipFile, ZIP_DEFLATED
    import os

    model = Model.query.filter_by(name=ModelData.model_01.name).first()
    trainer = TrainerStorage.loads(MODEL_TRAINER)
    get_trainer_mock.return_value = trainer
    for segment_name in trainer.features:
        s = Segment()
        s.name = segment_name
        s.records = 111
        s.model_id = model.id
        s.save()
    segment = Segment.query.filter(Segment.name == s.name).one()

    # check that nothing fails
    upload_segment_features_transformers(model.id, segment.id, 'json')

    # repeat the task's logic here, except posting to Amazon S3
    try:
        fformat = 'json'

        def _save_content(content, feature_name, transformer_type):
            filename = "{0}-{1}-{2}-data.{3}".format(
                segment.name, feature_name, transformer_type, fformat)
            if fformat == 'csv':
                import csv
                import StringIO
                si = StringIO.StringIO()
                if len(content):
                    fieldnames = content[0].keys()
                    writer = csv.DictWriter(si, fieldnames=fieldnames)
                    writer.writeheader()
                    for c in content:
                        writer.writerow(c)
                response = si.getvalue()
            else:
                import json
                response = json.dumps(content, indent=2)
            with open(filename, 'w') as fh:
                fh.write(response)
            return filename

        trainer = model.get_trainer()
        if segment.name not in trainer.features:
            raise Exception("Segment %s doesn't exist in the trained model"
                            % segment.name)
        files = []
        for name, feature in trainer.features[segment.name].iteritems():
            if "transformer" in feature and \
                    feature["transformer"] is not None:
                try:
                    data = feature["transformer"].load_vocabulary()
                    files.append(
                        _save_content(data, name,
                                      feature["transformer-type"]))
                except AttributeError:
                    continue
        arc_name = "{0}-{1}-{2}.zip".format(model.name, segment.name,
                                            fformat)
        with ZipFile(arc_name, "w") as z:
            for f in files:
                z.write(f, compress_type=ZIP_DEFLATED)
        for f in files:
            os.remove(f)

        self.assertEqual(arc_name,
                         "{0}-default-json.zip".format(model.name))

        fh = open(arc_name, 'rb')
        z = ZipFile(fh)
        for name in z.namelist():
            outpath = "./"
            z.extract(name, outpath)
        file_list = z.namelist()
        fh.close()
        self.assertEqual(len(file_list), 2)
        self.assertEqual(
            set(file_list),
            set([
                "default-contractor.dev_blurb-Tfidf-data.json",
                "default-contractor.dev_profile_title-Tfidf-data.json"
            ]))
        fh = open("default-contractor.dev_blurb-Tfidf-data.json", 'r')
        content = fh.read()
        res = json.loads(content)
        self.assertEqual(set(res[0].keys()),
                         set(["word", "index", "weight"]))
    finally:
        # clean up; the per-feature files may already have been removed
        # after archiving, so guard against double removal
        for f in files:
            if os.path.exists(f):
                os.remove(f)
        if os.path.exists(arc_name):
            os.remove(arc_name)