def transform_dataset_for_download(model_id, dataset_id):
    """ Vectorize a dataset with the model's feature pipeline and upload
    the result to S3 as a compressed .npz archive.

    model_id: int
        ID of the model whose transformations are applied.
    dataset_id: int
        ID of the dataset to transform.

    Returns a signed S3 download URL valid for 7 days.
    Raises TaskException when transformation or upload fails.
    """
    model = Model.query.get(model_id)
    dataset = DataSet.query.get(dataset_id)
    init_logger('transform_for_download_log', obj=int(dataset_id))
    logging.info('Starting Transform For Download Task')
    try:
        transformed = model.transform_dataset(dataset)
        logging.info('Saving transformed data to disk')
        temp_file = tempfile.NamedTemporaryFile()
        numpy.savez_compressed(temp_file, **transformed)
        # BUGFIX: flush the buffered archive to disk before handing the
        # file *path* to S3 -- save_key() re-opens the file by name, and
        # without the flush the tail of the npz could be missing.
        temp_file.flush()
        s3_filename = "dataset_{0}_vectorized_for_model_{1}.npz".format(
            dataset.id, model.id)
        from api.amazon_utils import AmazonS3Helper
        s3 = AmazonS3Helper()
        logging.info('Uploading file {0} to s3 with name {1}...'.format(
            temp_file.name, s3_filename))
        s3.save_key(s3_filename, temp_file.name, {
            'model_id': model.id,
            'dataset_id': dataset.id}, compressed=False)
        s3.close()
        # Closing the NamedTemporaryFile deletes it from disk.
        temp_file.close()
        return s3.get_download_url(s3_filename, 60 * 60 * 24 * 7)
    except Exception as e:
        logging.error("Got exception when transforming dataset: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
def request_spot_instance(instance_type=None, model_id=None):
    """ Requests instance on the Amazon spot market.

    instance_type: string
        The type of instance to run.
    model_id: int
        Id of the Model which tries to request spot instance for
        training.
    """
    init_logger('trainmodel_log', obj=int(model_id))
    model = Model.query.get(model_id)
    model.status = model.STATUS_REQUESTING_INSTANCE
    model.save()

    try:
        helper = AmazonEC2Helper()
        logging.info('Request spot instance type: %s' % instance_type)
        spot_request = helper.request_spot_instance(instance_type)
        logging.info('Request id: %s:' % spot_request['SpotInstanceRequestId'])
    except ClientError as err:
        # Record the AWS failure on the model and surface it as a task error.
        model.set_error(err.error_message)
        raise TaskException(err.error_message, err)

    request_id = spot_request['SpotInstanceRequestId']
    model.spot_instance_request_id = request_id
    model.save()
    return request_id
def upload_dataset(dataset_id):
    """ Upload dataset to S3.

    dataset_id: int
        ID of the dataset to upload.

    Marks the dataset STATUS_UPLOADING, stores its file on S3, then marks
    it STATUS_IMPORTED. Raises TaskException on any failure.
    """
    dataset = DataSet.query.get(dataset_id)
    try:
        if not dataset:
            raise ValueError('DataSet not found')
        init_logger('importdata_log', obj=dataset.id)
        logging.info('Uploading dataset %s' % dataset.id)
        dataset.status = dataset.STATUS_UPLOADING
        dataset.save()
        logging.info('Saving file to Amazon S3')
        dataset.save_to_s3()
        logging.info('File saved to Amazon S3')
        dataset.status = dataset.STATUS_IMPORTED
        if dataset.records_count is None:
            dataset.records_count = 0
        dataset.save()
    except Exception as exc:
        logging.error('Got exception when uploading dataset: {0}\n {1}'.format(
            exc.message, get_task_traceback(exc)))
        # BUGFIX: when the dataset lookup failed, `dataset` is None and
        # calling set_error() on it raised AttributeError, masking the
        # original "DataSet not found" error.
        if dataset is not None:
            dataset.set_error(exc)
        raise TaskException(exc.message, exc)
def upload_import_handler_to_server(server_id, handler_type, handler_id,
                                    user_id):
    """ Upload importhandler to S3 for cloudml-predict. """
    init_logger('importdata_log', obj=int(handler_id))
    logging.info('Starting uploading to cloudml_predict')
    try:
        server = Server.query.get(server_id)
        user = User.query.get(user_id)
        handler = XmlImportHandler.query.get(handler_id)

        # Refuse to upload when a handler with the same name is already
        # deployed on the target server.
        handler_files = server.list_keys(FOLDER_IMPORT_HANDLERS)
        for file_ in handler_files:
            if file_['name'] == handler.name:
                raise ValueError('Import Handler with name "{0}" already exist'
                                 ' on the server {1}'.format(
                                     handler.name, server.name))

        uid = get_a_Uuid()
        # TODO: Shall we use another account?
        s3 = AmazonS3Helper(
            bucket_name=app.config['CLOUDML_PREDICT_BUCKET_NAME'])
        # Key layout: <server folder>/<handlers folder>/<uuid>.<xml|json>
        path = '{0}/{1}/{2}.{3}'.format(
            server.folder.strip('/'),
            FOLDER_IMPORT_HANDLERS,
            uid,
            'xml' if handler_type == XmlImportHandler.TYPE else 'json')
        # S3 metadata used by the predict side to display/identify the file.
        meta = {
            'id': handler.id,
            'name': handler.name,
            'object_name': handler.name,
            'type': handler.TYPE,
            'user_id': user.id,
            'user_name': user.name,
            'hide': "False",
            'uploaded_on': str(datetime.now()),
            'crc32': handler.crc32
        }
        handler_data = handler.get_plan_config()

        # Lock the handler and remember which server it was deployed to.
        handler.locked = True
        s_ids = list(handler.servers_ids) if (isinstance(
            handler.servers_ids, list)) else []
        s_ids.append(server.id)
        handler.servers_ids = list(s_ids)
        handler.save()

        s3.save_key_string(path, handler_data, meta)
        s3.close()
        logging.info('Import Handler has been uploaded: %s' % handler.name)

        # Returned path is relative (without the server folder prefix).
        return '{0}/{1}.{2}'.format(
            FOLDER_IMPORT_HANDLERS,
            uid,
            'xml' if handler_type == XmlImportHandler.TYPE else 'json')
    except Exception as e:
        logging.error("Got exception on uploading import handler to predict: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
def get_classifier_parameters_grid(grid_params_id):
    """ Exhaustive search over specified parameter values for an estimator.
    Fills parameters_grid field of the ClassifierGridParams object with
    grid search result.

    grid_params_id: int
        ID of the ClassifierGridParams model, that contains link to the
        model, datasets for training and testing, parameters, choosen by
        user.
    """
    init_logger('gridsearch_log', obj=int(grid_params_id))
    logging.info('Parameters grid search task started')
    from api.ml_models.models import ClassifierGridParams
    grid_params = ClassifierGridParams.query.get(grid_params_id)
    grid_params.status = 'Calculating'
    grid_params.save()
    try:
        feature_model = FeatureModel(
            grid_params.model.get_features_json(), is_file=False)
        trainer = Trainer(feature_model)

        def _get_iter(dataset):
            # Stream the dataset rows, guaranteeing the underlying data
            # stream is closed even if iteration stops early.
            # (Previously the function began with dead code
            # `fp = None; if fp: fp.close()` and only closed the stream
            # on normal completion.)
            fp = dataset.get_data_stream()
            try:
                for row in dataset.get_iterator(fp):
                    yield row
            finally:
                fp.close()

        clfs = trainer.grid_search(
            grid_params.parameters,
            _get_iter(grid_params.train_dataset),
            _get_iter(grid_params.test_dataset),
            score=grid_params.scoring)
        # Summarize sklearn grid_scores_ per segment: chosen parameters,
        # mean CV score and its standard deviation.
        grids = {}
        for segment, clf in clfs.iteritems():
            grids[segment] = [{
                'parameters': item.parameters,
                'mean': item.mean_validation_score,
                'std': np.std(item.cv_validation_scores)}
                for item in clf.grid_scores_]
        grid_params.parameters_grid = grids
        grid_params.status = 'Completed'
        grid_params.save()
    except Exception as e:
        logging.error('Got exception on grid params search: {0} \n {1}'
                      .format(e.message, get_task_traceback(e)))
        grid_params.status = 'Error'
        grid_params.save()
        raise TaskException(e.message, e)
def run(self, verification_id, count, *args, **kwargs):
    """Execute the model verification task.

    verification_id: id of the verification record to process.
    count: number of items to verify, passed through to process().

    On failure the verification is marked STATUS_ERROR with the error
    text and a TaskException is raised.
    """
    self.init_logger(verification_id)
    logging.info('Starting model verification')
    self.verification = self.get_verification(verification_id)
    try:
        self.process(count)
    except Exception as err:
        verification = self.verification
        verification.status = verification.STATUS_ERROR
        verification.error = str(err)
        verification.save()
        logging.error(
            "Exception when verifying the model: {0} \n {1}".format(
                err.message, get_task_traceback(err)))
        raise TaskException(err.message, err)
def train_model(dataset_ids, model_id, user_id, delete_metadata=False):
    """ Train or re-train the model.

    dataset_ids: list of integers
        List of dataset ids used for model training.
    model_id: int
        Id of the model to train.
    user_id: int
        Id of the user, who initiate training the model.
    delete_metadata: bool
        Whether we need model related db logs, test results and other
        model related data.
    """
    init_logger('trainmodel_log', obj=int(model_id))
    logging.info('Start training task')
    user = User.query.get(user_id)
    model = Model.query.get(model_id)
    datasets = DataSet.query.filter(DataSet.id.in_(dataset_ids)).all()
    logging.info('Preparing the model `%s` for training.' % model.name)
    try:
        model.prepare_fields_for_train(user=user, datasets=datasets)
        logging.info(
            'DataSet files chosen for training: %s'
            % ', '.join(['{0} (id #{1})'.format(dataset.filename, dataset.id)
                         for dataset in datasets]))
        logging.info('Perform model training')
        feature_model = FeatureModel(model.get_features_json(), is_file=False)
        trainer = Trainer(feature_model)
        get_or_create_data_folder()

        def _chain_datasets(ds_list):
            # Lazily yield rows from each dataset in turn, keeping at most
            # one data stream open at a time; the previous stream is
            # closed before opening the next one.
            fp = None
            for d in ds_list:
                if fp:
                    fp.close()
                fp = d.get_data_stream()
                for row in d.get_iterator(fp):
                    yield row
            if fp:
                fp.close()

        train_iter = _chain_datasets(datasets)

        from memory_profiler import memory_usage
        # mem_usage = memory_usage((trainer.train,
        #                           (train_iter,)), interval=0)

        train_begin_time = datetime.datetime.utcnow()

        # Transformers are looked up through the app-level getter so that
        # pre-trained transformers can be reused during training.
        from api.ml_models.models import get_transformer
        trainer.set_transformer_getter(get_transformer)
        trainer.train(train_iter)
        # Sample current process memory once, right after training.
        mem_usage = memory_usage(-1, interval=0, timeout=None)

        trainer.clear_temp_data()

        logging.info('Store trainer to s3')
        model.set_trainer(trainer)
        model.save()
        logging.info('Set train records')
        model.memory_usage = max(mem_usage)
        model.train_records_count = int(sum((
            d.records_count for d in model.datasets)))
        train_end_time = datetime.datetime.utcnow()
        model.training_time = int((train_end_time - train_begin_time).seconds)
        model.save()
        logging.info('Get segment info')
        segments = trainer._get_segments_info()
        if not segments or not segments.keys():
            raise Exception('No segments in the model')
        model.create_segments(segments)
        # Fan out per-segment visualization tasks, then compute the model
        # parts size once all of them finish (celery chord).
        chord((visualize_model.s(model.id, segment.id)
               for segment in model.segments),
              calculate_model_parts_size.s(model.id))()
    except Exception, exc:
        app.sql_db.session.rollback()
        logging.error('Got exception when train model: {0} \n {1}'
                      .format(exc.message, get_task_traceback(exc)))
        model.status = model.STATUS_ERROR
        # Error column is limited; truncate to fit.
        model.error = str(exc)[:299]
        model.save()
        raise TaskException(exc.message, exc)
def generate_visualization_tree(model_id, deep):
    """ Generates Visualization tree with specified `deep`.

    Parameters
    ----------
    model_id: integer
        Database id of the model
    deep : positive integer
        Maximum tree deep.

    Notes
    -----
    Decision Tree Classifier and Random Forest Classifier are supported.
    """
    from cloudml.trainer.classifier_settings import DECISION_TREE_CLASSIFIER, \
        RANDOM_FOREST_CLASSIFIER, EXTRA_TREES_CLASSIFIER, XGBOOST_CLASSIFIER
    from exceptions import VisualizationException
    init_logger('trainmodel_log', obj=int(model_id))
    model = Model.query.get(model_id)
    try:
        if model is None:
            raise ValueError('model not found: %s' % model_id)
        if model.classifier is None or 'type' not in model.classifier:
            raise ValueError('model has invalid classifier')
        clf_type = model.classifier['type']
        # Only tree-based classifiers can be visualized as trees.
        if clf_type not in (DECISION_TREE_CLASSIFIER,
                            RANDOM_FOREST_CLASSIFIER,
                            EXTRA_TREES_CLASSIFIER,
                            XGBOOST_CLASSIFIER):
            raise VisualizationException(
                "model with %s classifier doesn't support tree"
                " visualization" % clf_type,
                VisualizationException.CLASSIFIER_NOT_SUPPORTED)

        # Checking that all_weights had been stored while training model
        # For old models we need to retrain the model.
        if model.visualization_data is None:
            raise VisualizationException(
                "we don't support the visualization re-generation for "
                "models trained before may 2015."
                "please re-train the model to use this feature.",
                error_code=VisualizationException.ALL_WEIGHT_NOT_FILLED)

        trainer = model.get_trainer()
        # Work on a copy so the stored visualization data is only replaced
        # wholesale at the end via model.visualize_model(data).
        from copy import deepcopy
        data = deepcopy(model.visualization_data)
        segments = [segment.name for segment in model.segments] \
            or [DEFAULT_SEGMENT]
        for segment in segments:
            # Every segment must carry the stored 'all_weights' from
            # training; otherwise regeneration is impossible.
            if segment not in model.visualization_data \
                    or 'all_weights' not in model.visualization_data[segment]:
                raise VisualizationException(
                    "we don't support the visualization re-generation for "
                    "models trained before may 2015."
                    "please re-train the model to use this feature.",
                    error_code=VisualizationException.ALL_WEIGHT_NOT_FILLED)
            if clf_type == DECISION_TREE_CLASSIFIER:
                # Single tree for a decision tree classifier.
                tree = trainer.model_visualizer.regenerate_tree(
                    segment, data[segment]['all_weights'], deep=deep)
                data[segment]['tree'] = tree
            elif clf_type == RANDOM_FOREST_CLASSIFIER or \
                    clf_type == EXTRA_TREES_CLASSIFIER:
                # Ensemble classifiers produce one tree per estimator.
                trees = trainer.model_visualizer.regenerate_trees(
                    segment, data[segment]['all_weights'], deep=deep)
                data[segment]['trees'] = trees
            elif clf_type == XGBOOST_CLASSIFIER:
                # TODO XGBOOST visualize
                trees = trainer.model_visualizer.regenerate_trees(
                    segment, data[segment]['all_weights'], deep=deep)
                data[segment]['trees'] = trees
            data[segment]['parameters'] = {'deep': deep, 'status': 'done'}
        model.visualize_model(data)
    except Exception as e:
        logging.error("Got exception on tree visualization: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
    return "Tree visualization was completed"
def visualize_model(model_id, segment_id=None):
    """ Generates Trained model visualization. Fills weights and creates
    weights tree.

    model_id: int
        Id of the model.
    segment_id: int
        ID of the model segment.
    """
    init_logger('trainmodel_log', obj=int(model_id))
    model, segment = _get_model_and_segment_or_raise(model_id, segment_id)
    logging.info(
        "Starting to visualize trained model {0}. Segment: {1}".format(
            model.name, segment.name))
    trainer = model.get_trainer()

    # need to sync model weights
    # Refuse to run twice for the same segment: weights already stored.
    count = len(segment.weights)
    if count > 0:
        from api.base.exceptions import InvalidOperationError
        raise InvalidOperationError(
            'Weights for model %s already filled: %s' % (model_id, count))

    weights_dict = None

    model.status = model.STATUS_FILLING_WEIGHTS
    model.save()
    try:
        visualization_data = trainer.get_visualization(segment.name)
        have_weights = 'weights' in visualization_data
        if have_weights:
            from utils import fill_model_weights
            # Weights are popped out of the visualization payload and
            # persisted to db per class label instead.
            weights_dict = visualization_data.pop('weights')
            weights_added = 0
            categories_added = 0
            classes_processed = 0
            for clazz in weights_dict.keys():
                c, w = fill_model_weights(model, segment, clazz,
                                          weights_dict[clazz])
                categories_added += c
                weights_added += w
                classes_processed += 1
            # Single commit after all classes are filled.
            app.sql_db.session.commit()
            logging.info(
                'Model %s parameters weights was added to db. %s weights, '
                'in %s categories for %s classes' % (
                    model.name, weights_added,
                    categories_added, classes_processed))
        else:
            logging.info('Weights are unavailable for the classifier')
        # TODO:
        from cloudml.trainer.classifier_settings import CLASSIFIER_MODELS
        clf_type = model.classifier.get('type')
        # Backfill labels from the trainer for classifier models that
        # don't have them stored yet.
        if clf_type in CLASSIFIER_MODELS and not model.labels:
            model.labels = trainer._get_labels()
        model.visualize_model(segment=segment.name,
                              data=visualization_data, commit=False)
        model.status = model.STATUS_TRAINED
        model.weights_synchronized = have_weights
        model.save()
    except Exception, exc:
        logging.error('Got exception when visualize the model: {0} \n {1}'
                      .format(exc.message, get_task_traceback(exc)))
        model.status = model.STATUS_ERROR
        # Error column is limited; truncate to fit.
        model.error = str(exc)[:299]
        model.save()
        raise TaskException(exc.message, exc)
def get_csv_results(model_id, test_id, fields):
    """ Get test classification results in csv format and saves file to
    Amazon S3.

    model_id: int
        ID of the model
    test_id: int
        ID of the test, which examples it planned to export.
    fields: list of string
        List of field names from TestExample to export to csv file.

    Returns a signed S3 download URL (7 days) or None when the test is
    not found.
    """
    from api.amazon_utils import AmazonS3Helper

    def generate(test, name):
        # Write the examples to a local CSV file and return its path.
        from api.base.io_utils import get_or_create_data_folder
        path = get_or_create_data_folder()
        filename = os.path.join(path, name)

        header = list(fields)
        if 'prob' in header:
            # The single 'prob' column is expanded into one prob_<label>
            # column per class, preserving its position in the header.
            prob_index = header.index('prob')
            for label in reversed(test.classes_set):
                header.insert(prob_index, 'prob_%s' % label)
            header.remove('prob')

        with open(filename, 'w') as fp:
            writer = csv.writer(fp, delimiter=',',
                                quoting=csv.QUOTE_ALL)
            writer.writerow(header)
            for example in TestExample.get_data(test_id, fields):
                rows = []
                for field in fields:
                    # NOTE: deliberate cascade -- '_id' is first mapped to
                    # 'id', and then 'id' (including the one just mapped)
                    # is mapped to 'example_id'.
                    if field == '_id':
                        field = 'id'
                    if field == 'id':
                        field = 'example_id'
                    val = example[field] if field in example else ''
                    if field == 'prob':
                        # 'prob' holds a list of per-class probabilities;
                        # extend to match the expanded header columns.
                        rows += val
                    else:
                        rows.append(val)
                writer.writerow(rows)
        return filename

    init_logger('runtest_log', obj=int(test_id))
    try:
        test = TestResult.query.filter_by(model_id=model_id,
                                          id=test_id).first()
        if test is None:
            logging.error('Test not found')
            return
        name = 'Examples-{0!s}.csv'.format(uuid.uuid1())
        expires = 60 * 60 * 24 * 7  # 7 days
        logging.info('Creating file {0}...'.format(name))
        s3 = AmazonS3Helper()
        filename = generate(test, name)
        logging.info('Uploading file {0} to s3...'.format(filename))
        s3.save_key(name, filename, {
            'model_id': model_id,
            'test_id': test_id
        }, compressed=False)
        s3.close()
        # Local copy is no longer needed once it is on S3.
        os.remove(filename)
        url = s3.get_download_url(name, expires)
        return url
    except Exception as e:
        logging.error("Got exception on getting test classification results: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
def run_test(dataset_ids, test_id):
    """ Running test for trained model.

    dataset_ids: list
        List of dataset ids used for testing model. Now only first one in
        the list used. (Multidataset testing hasn't implemented yet).
    model_id: int
        ID of the model to test.
    """
    init_logger('runtest_log', obj=int(test_id))

    test = TestResult.query.get(test_id)
    datasets = DataSet.query.filter(DataSet.id.in_(dataset_ids)).all()
    # Only the first dataset is used (see docstring).
    dataset = datasets[0]
    model = test.model
    try:
        if model.status != model.STATUS_TRAINED:
            raise InvalidOperationError("Train the model before")

        test.status = test.STATUS_IN_PROGRESS
        test.error = ""
        test.save()

        logging.info('Getting metrics and raw data')
        from memory_profiler import memory_usage
        logging.info('Using {0} (id #{1}) dataset'.format(
            dataset.filename, dataset.id))
        result = Model.run_test(model, dataset)
        metrics, raw_data = result
        logging.info("Memory usage: %f",
                     memory_usage(-1, interval=0, timeout=None)[0])

        metrics_dict = metrics.get_metrics_dict()
        if 'accuracy' in metrics_dict:
            test.accuracy = metrics.accuracy
            logging.info('Accuracy: %f', test.accuracy)
        # TODO: Refactor this. Here are possible issues with conformity
        # between labels and values
        if 'confusion_matrix' in metrics_dict:
            # Pair each matrix row with its (stringified) class label.
            confusion_matrix = metrics_dict['confusion_matrix']
            confusion_matrix_ex = []
            for i, val in enumerate(metrics.classes_set):
                confusion_matrix_ex.append((str(val), confusion_matrix[i]))
            metrics_dict['confusion_matrix'] = confusion_matrix_ex
        if 'precision_recall_curve' in metrics_dict:
            # Downsample curve points to roughly 100 entries
            # (integer division; n is at least 1).
            n = len(metrics._labels) / 100 or 1
            if metrics.classes_count == 2:
                metrics_dict['precision_recall_curve'][1] = \
                    metrics_dict['precision_recall_curve'][1][0::n]
                metrics_dict['precision_recall_curve'][0] = \
                    metrics_dict['precision_recall_curve'][0][0::n]
        if 'roc_curve' in metrics_dict:
            # Same downsampling for ROC curves; binary models only keep
            # the positive class curve (index 1).
            n = len(metrics._labels) / 100 or 1
            calc_range = [1] if metrics.classes_count == 2 \
                else range(len(metrics.classes_set))
            for i in calc_range:
                label = metrics.classes_set[i]
                sub_fpr = metrics_dict['roc_curve'][label][0][0::n]
                sub_tpr = metrics_dict['roc_curve'][label][1][0::n]
                metrics_dict['roc_curve'][label] = [sub_fpr, sub_tpr]
        if 'roc_curve' in metrics_dict:
            test.roc_auc = metrics_dict.pop('roc_auc')
        test.metrics = metrics_dict
        test.classes_set = list(metrics.classes_set)
        test.status = TestResult.STATUS_STORING

        if not model.comparable:
            # TODO: fix this
            model = Model.query.get(model.id)
            model.comparable = True
            model.save()

        all_count = metrics._preds.size
        test.dataset = dataset
        # lock dataset used for testing
        dataset.locked = True
        dataset.save()
        test.examples_count = all_count
        test.memory_usage = memory_usage(-1, interval=0, timeout=None)[0]

        vect_data = metrics._true_data
        from bson import Binary
        test.save()

        def fill_weights(trainer, test, segment):
            # Persist per-test feature weights: for each class the
            # trainer reports positive and negative weight lists; store
            # each value under this test's id on the matching Weight row.
            for clazz, weights in trainer.get_weights(
                    segment.name).iteritems():
                positive = weights['positive']
                negative = weights['negative']

                def process_weights(weight_list):
                    for weight_dict in weight_list:
                        weight = Weight.query.filter_by(
                            model=model, segment=segment,
                            class_label=str(clazz),
                            name=weight_dict['name']).one()
                        if weight.test_weights is not None:
                            # Copy so the ORM notices the mutation.
                            test_weights = weight.test_weights.copy()
                        else:
                            test_weights = {}
                        test_weights[test_id] = weight_dict['feature_weight']
                        weight.test_weights = test_weights
                        weight.save(commit=False)
                        app.sql_db.session.add(weight)

                process_weights(positive)
                process_weights(negative)

        # Filling test weights
        if test.fill_weights:
            trainer = model.get_trainer()
            for segment in model.segments:
                logging.info('Storing test feature weights for segment %s',
                             segment.name)
                fill_weights(trainer, test, segment)

        # Flatten per-segment raw data into one list of rows.
        data = []
        for segment, d in raw_data.iteritems():
            data = data + d
        logging.info('Storing test examples')
        if metrics._probs is None:
            probs = list(repeat(None, len(data)))
        else:
            probs = metrics._probs
        examples = izip(range(len(data)),
                        data,
                        metrics._labels,
                        metrics._preds,
                        probs)
        logging.info("Memory usage: %f",
                     memory_usage(-1, interval=0, timeout=None)[0])
        for n, row, label, pred, prob in examples:
            # Commit in ~10% batches to keep the session small.
            if n % (all_count / 10) == 0:
                if all_count > 0:
                    app.sql_db.session.commit()
                    logging.info('Processed %s rows so far' % n)
            example, new_row = _add_example_to_db(test, row, label,
                                                  pred, prob, n)
            # test.examples_size += (get_doc_size(example) / 1024.0 / 1024.0)
            # example_ids.append(str(example.id))
        app.sql_db.session.commit()
        test.status = TestResult.STATUS_COMPLETED
        test.save()
        logging.info('Test %s completed' % test.name)
    except Exception, exc:
        if isinstance(exc, SQLAlchemyError):
            app.sql_db.session.rollback()
        logging.error('Got exception when test model: {0} \n {1}'.format(
            exc.message, get_task_traceback(exc)))
        test.status = test.STATUS_ERROR
        # Fit the error text into the db column, truncating with a marker.
        error_column_size = TestResult.error.type.length
        str_exc = str(exc)
        msg = ' ... TRUNCATED'
        test.error = str_exc if len(str_exc) <= error_column_size else \
            (str_exc[:error_column_size - len(msg)] + msg)
        test.save()
        raise TaskException(exc.message, exc)
def calculate_confusion_matrix(test_id, weights):
    """ Calculate confusion matrix for test with weights.

    test_id: int
        ID of the model test.
    weights: list of tuples (label, weight)
        where label - test.model.label - class label
        weight: positive float class weight

    Note:
        Now we support calculating confusion matrix for binary and multi
        class classifiers.
    """
    # Log under the async task record when running inside celery,
    # otherwise fall back to the test id.
    log_id = test_id
    from api.async_tasks.models import AsyncTask
    if calculate_confusion_matrix.request.id is not None:
        tasks = AsyncTask.query\
            .filter_by(task_id=calculate_confusion_matrix.request.id).limit(1)
        log_id = tasks[0].id

    init_logger('confusion_matrix_log', obj=int(log_id))
    logging.info('Starting re-calculating confusion matrix')
    logging.info('Checking confusion matrix weights')
    try:
        # Weights must be non-negative and not all zero.
        all_zero = True
        for weight in weights:
            if weight[1] != 0:
                all_zero = False
            if weight[1] < 0:
                logging.error("Negative weights found")
                raise ValueError('Negative weights are not allowed')
        if all_zero:
            logging.error("All weights are zero")
            raise ValueError('All weights can not be 0')

        test = TestResult.query.get(test_id)
        if test is None:
            logging.error('Test with id {0!s} not found!'.format(test_id))
            raise ValueError('Test with id {0!s} not found!'.format(test_id))
        if test.status != TestResult.STATUS_COMPLETED:
            logging.error('Test is not completed! Please wait and try again')
            raise ValueError('Test is not completed!')
        model = test.model
        if model is None:
            logging.error('Model with id {0!s} not found!'.format(
                test.model_id))
            raise ValueError('Model with id {0!s} not found!'.format(
                test.model_id))

        logging.info('Start calculating confusion matrix for test id {0!s}, '
                     'log id {1} with weights {2}'.format(
                         test_id, log_id, weights))
        dim = len(weights)
        matrix = [[0 for x in range(dim)] for x in range(dim)]

        i = 1
        for example in test.examples:
            true_value_idx = model.labels.index(example.label)
            # Normalizer for the weighted probabilities of this example.
            weighted_sum = 0
            for weight in weights:
                index = model.labels.index(weight[0])
                weighted_sum += weight[1] * example.prob[index]
            if weighted_sum == 0:
                import json
                logging.error("Weighted sum is 0 on calculating test example "
                              "#{0} (probabilities: {1})".format(
                                  i, json.dumps(example.prob)))
                raise ValueError("Weighted sum is 0. Try another weights "
                                 "or retest model.")
            # Re-predict the class as argmax of the re-weighted,
            # normalized probabilities.
            weighted_prob = []
            for weight in weights:
                index = model.labels.index(weight[0])
                weighted_prob.append(
                    weight[1] * example.prob[index] / weighted_sum)
            predicted = weighted_prob.index(max(weighted_prob))
            matrix[true_value_idx][predicted] += 1
            if i % 500 == 0:
                logging.info("{0} test examples processed".format(i))
            i += 1
        logging.info("Confusion matrix calculation completed")
        # Each matrix row is paired with its true-class label.
        return zip(model.labels, matrix)
    except Exception as e:
        logging.error("Got exception om matrix recalculation: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
def upload_model_to_server(server_id, model_id, user_id):
    """ Upload model to S3 for cloudml-predict. """
    init_logger('trainmodel_log', obj=int(model_id))
    logging.info('Starting uploading to cloudml_predict')
    try:
        server = Server.query.get(server_id)
        user = User.query.get(user_id)
        model = Model.query.get(model_id)

        # TODO: Checking name, whether it's enough of the memory, etc.
        # Refuse to upload when a model with the same name is already
        # deployed on the target server.
        model_files = server.list_keys(FOLDER_MODELS)
        for file_ in model_files:
            if file_['name'] == model.name:
                raise ValueError('Model with name "{0}" already exist on '
                                 'the server {1}'.format(
                                     model.name, server.name))

        uid = get_a_Uuid()
        # TODO: Shall we use another account?
        s3 = AmazonS3Helper(
            bucket_name=app.config['CLOUDML_PREDICT_BUCKET_NAME'])
        # Key layout: <server folder>/<models folder>/<uuid>.model
        path = '{0}/{1}/{2}.model'.format(server.folder.strip('/'),
                                          FOLDER_MODELS, uid)
        # S3 metadata used by the predict side to display/identify the file.
        meta = {
            'id': model.id,
            'object_name': model.name,
            'name': model.name,
            'user_id': user.id,
            'user_name': user.name,
            'hide': "False",
            'uploaded_on': str(datetime.now())
        }

        trainer = model.get_trainer()
        #from cloudml.trainer.store import load_trainer
        #trainer = load_trainer(trainer_data)
        from cloudml.trainer.store import TrainerStorage
        from bson import Binary
        import cPickle as pickle
        # Serialized trainer payload that will be stored on S3.
        trainer_data = Binary(TrainerStorage(trainer).dumps())
        logging.info(len(trainer_data))
        #trainer.visualization = None
        #trainer_data = store_trainer(trainer)
        #trainer_data = model.trainer
        s3.save_key_string(path, trainer_data, meta)
        s3.close()

        # Lock the model and remember which server it was deployed to.
        model.locked = True
        s_ids = list(model.servers_ids) if (isinstance(model.servers_ids,
                                                       list)) else []
        s_ids.append(server.id)
        model.servers_ids = list(s_ids)
        model.save()

        # The feature set is locked together with the model.
        feature_set = model.features_set
        feature_set.locked = True
        feature_set.save()

        logging.info('Creating grafan dashboard for model')
        update_grafana_dashboard(server, model)
        logging.info('Model has been uploaded: %s' % model.name)

        # Returned path is relative (without the server folder prefix).
        return '{0}/{1}.model'.format(FOLDER_MODELS, uid)
    except Exception as e:
        logging.error("Got exception on uploading model to predict: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
def import_data(dataset_id, model_id=None, test_id=None, transformer_id=None):
    """ Import data from database. """
    def get_parent_object():
        # The optional parent is the object the import was started for:
        # a model (training), a test result (testing) or a transformer.
        if model_id is not None:
            return Model.query.get(model_id)
        if test_id is not None:
            return TestResult.query.get(test_id)
        if transformer_id is not None:
            return Transformer.query.get(transformer_id)

    def set_error(err, ds=None, parent=None):
        # Propagate the error message to the dataset and/or its parent.
        if ds is not None:
            ds.set_error(err)
        if parent is not None:
            msg = 'Got exception when importing dataset #{0}. Error: {1}'\
                .format(ds.id, err or 'Unknown')
            parent.set_error(msg)

    obj = get_parent_object()
    parent_handler = None
    dataset = DataSet.query.get(dataset_id)
    # NOTE(review): set_error does not abort -- when dataset is None the
    # next attribute access will still raise; presumably callers always
    # pass a valid dataset_id. TODO confirm.
    if dataset is None:
        set_error("Dataset with id %s not found" % dataset_id, parent=obj)
    dataset.status = dataset.STATUS_IMPORTING
    dataset.save()
    import_handler = dataset.import_handler
    if import_handler is None:
        set_error("Import handler for dataset %s not found" % dataset_id,
                  ds=dataset, parent=obj)
    if obj:
        obj.status = obj.STATUS_IMPORTING
        obj.save()
    try:
        logger = init_logger('importdata_log', obj=dataset.id)
        if obj:
            # Mirror import log lines into the parent's own log stream
            # while the import is being set up.
            name = 'trainmodel_log' if isinstance(obj, Model) \
                else 'runtest_log'
            parent_handler = logger_handler(name, obj=obj.id)
            if parent_handler:
                logger.addHandler(parent_handler)
        logging.info('Loading dataset %s' % dataset.id)
        import_start_time = datetime.now()

        def callback(**kwargs):
            # Progress callback from the import handler (EMR/pig jobs):
            # records cluster info and pig progress on the dataset.
            jobflow_id = kwargs.get('jobflow_id')  # required
            master_dns = kwargs.get('master_dns', None)
            s3_logs_folder = kwargs.get('s3_logs_folder', None)
            step_number = kwargs.get('step_number', None)
            pig_row = kwargs.get('pig_row', None)
            # No-op expression (kept from the original source).
            jobflow_id, master_dns, s3_logs_folder, step_number
            cluster = Cluster.query.filter(
                Cluster.jobflow_id == jobflow_id
            ).first()
            if cluster is None:
                cluster = Cluster(jobflow_id=jobflow_id)
            if master_dns is not None:
                cluster.master_node_dns = master_dns
            if s3_logs_folder is not None:
                cluster.logs_folder = s3_logs_folder
            cluster.save()
            dataset.cluster = cluster
            if step_number is not None:
                dataset.pig_step = step_number
            if pig_row is not None:
                dataset.pig_row = pig_row
            dataset.save()
            if master_dns is not None:
                logging.info('Master dns %s' % master_dns)
                cluster.create_ssh_tunnel()

        logging.info("Import dataset using import handler '%s' \
with%s compression", import_handler.name, '' if dataset.compress else 'out')
        if parent_handler:
            logger.removeHandler(parent_handler)
        # Snapshot the handler definition used for this import.
        dataset.import_handler_xml = import_handler.data
        handler_iterator = import_handler.get_iterator(
            dataset.import_params, callback)
        logging.info('The dataset will be stored to file %s',
                     dataset.filename)
        if dataset.format == dataset.FORMAT_CSV:
            handler_iterator.store_data_csv(
                dataset.filename, dataset.compress)
        else:
            handler_iterator.store_data_json(
                dataset.filename, dataset.compress)
        logging.info('Import dataset completed')

        # Read back the first row just to discover the field names.
        logging.info('Retrieving data fields')
        with dataset.get_data_stream() as fp:
            if dataset.format == dataset.FORMAT_CSV:
                reader = csv.DictReader(
                    fp,
                    quotechar="'",
                    quoting=csv.QUOTE_ALL
                )
                row = next(reader)
            else:
                row = next(fp)
                if row:
                    row = json.loads(row)
        if row:
            dataset.data_fields = row.keys()
        logging.info('Dataset fields: {0!s}'.format(dataset.data_fields))

        dataset.filesize = long(_get_uncompressed_filesize(dataset.filename))
        dataset.records_count = handler_iterator.count
        dataset.status = dataset.STATUS_UPLOADING
        dataset.save()

        logging.info('Saving file to Amazon S3')
        dataset.save_to_s3()
        logging.info('File saved to Amazon S3')

        dataset.time = (datetime.now() - import_start_time).seconds
        dataset.status = dataset.STATUS_IMPORTED
        if obj:
            obj.status = obj.STATUS_IMPORTED
            obj.save()
        dataset.save()
        logging.info('DataSet was loaded')
    except Exception, exc:
        # Re-attach the parent handler so the failure is visible in the
        # parent's log as well.
        if parent_handler:
            logger.addHandler(parent_handler)
        logging.error('Got exception when import dataset: {0}\n {1}'.format(
            exc.message, get_task_traceback(exc)))
        set_error(exc.message, ds=dataset, parent=obj)
        raise TaskException(exc.message, exc)
def upload_segment_features_transformers(model_id, segment_id, fformat):
    """ Export the vocabularies of a segment's feature transformers as a
    zip of csv/json files and upload it to S3.

    model_id: int
        ID of the trained model.
    segment_id: int
        ID of the model segment whose transformers are exported.
    fformat: string
        Output format for each transformer dump: 'csv' or json.

    Returns a signed S3 download URL valid for 7 days.
    """
    model = Model.query.get(model_id)
    segment = Segment.query.get(segment_id)
    # Log under the async task record when running inside celery,
    # otherwise fall back to the segment id.
    log_id = segment_id
    from api.async_tasks.models import AsyncTask
    if upload_segment_features_transformers.request.id is not None:
        tasks = AsyncTask.query\
            .filter_by(
                task_id=upload_segment_features_transformers.request.id
            ).limit(1)
        log_id = tasks[0].id
    init_logger('prepare_transformer_for_download_log', obj=int(log_id))
    logging.info('Start preparing segment features transformers for download')
    try:
        from zipfile import ZipFile, ZIP_DEFLATED
        from api.amazon_utils import AmazonS3Helper
        import os
        from tempfile import NamedTemporaryFile
        files = []
        arc_name = "{0}-{1}-{2}.zip".format(model.name, segment.name, fformat)

        def _save_content(content, feature_name, transformer_type):
            # Dump one transformer vocabulary to a local file in the
            # requested format and return the file name.
            filename = "{0}-{1}-{2}-data.{3}".format(segment.name,
                                                     feature_name,
                                                     transformer_type,
                                                     fformat)
            logging.info("Creating %s" % filename)
            if fformat == 'csv':
                import csv
                import StringIO
                si = StringIO.StringIO()
                if len(content):
                    fieldnames = content[0].keys()
                    writer = csv.DictWriter(si, fieldnames=fieldnames)
                    writer.writeheader()
                    for c in content:
                        writer.writerow(c)
                response = si.getvalue()
            else:
                import json
                response = json.dumps(content, indent=2)

            with open(filename, 'w') as fh:
                fh.write(response)
                fh.close()
            return filename

        trainer = model.get_trainer()
        if segment.name not in trainer.features:
            raise TaskException("Segment %s doesn't exists in trained model"
                                % segment.name)
        for name, feature in trainer.features[segment.name].iteritems():
            if "transformer" in feature and \
                    feature["transformer"] is not None:
                try:
                    data = feature["transformer"].load_vocabulary()
                    files.append(_save_content(data, name,
                                               feature["transformer-type"]))
                except AttributeError:
                    # Transformer type has no vocabulary (or the feature
                    # was never transformed) -- skip it with a warning.
                    logging.warning(
                        "Can't load transformer data for segment {0} feature "
                        "{1} transformer {2}. Transformer doesn't have "
                        "vocabulary to return or feature haven't been "
                        "transformed on model training"
                        .format(segment.name, name,
                                feature["transformer-type"]))
                    continue
        logging.info("Add files to archive")
        with ZipFile(arc_name, "w") as z:
            for f in files:
                z.write(f, compress_type=ZIP_DEFLATED)
            z.close()

        s3 = AmazonS3Helper()
        logging.info('Uploading archive to s3 with name {0}'.format(arc_name))
        s3.save_key(arc_name, arc_name, {
            'model_id': model.id,
            'segment_id': segment_id}, compressed=False)
        s3.close()
        return s3.get_download_url(arc_name, 60 * 60 * 24 * 7)
    except Exception, e:
        logging.error("Got exception when preparing features transformers "
                      "of segment {0} for download: {1} \n {2}"
                      .format(segment.name, e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
def export_results_to_db(model_id, test_id, datasource_id,
                         table_name, fields):
    """
    Export model test examples data to PostgreSQL db.

    model_id: int
        ID of the model
    test_id: int
        ID of the test, which examples it planned to export.
    datasource_id: int
        ID of datasource, which would be used to connect to PostgreSQL db.
    table_name: string
        Name of the target table; it is dropped and recreated.
    fields: list of string
        List of field names from TestExample to export to the database.

    Raises TaskException on any failure; silently returns when the test
    or the datasource cannot be found.
    """
    import psycopg2
    import psycopg2.extras
    init_logger('runtest_log', obj=int(test_id))
    try:
        test = TestResult.query.filter_by(model_id=model_id,
                                          id=test_id).first()
        if not test:
            logging.error('Test not found')
            return
        datasource = PredefinedDataSource.query.get(datasource_id)
        if not datasource:
            logging.error('Datasource not found')
            return

        # Build the column list; the 'prob' pseudo-field expands to one
        # column per class label.
        db_fields = []
        for field in fields:
            if 'prob' == field:
                for label in test.classes_set:
                    db_fields.append("prob_%s varchar(25)" % label)
            else:
                db_fields.append(
                    "%s varchar(25)" % field.replace('->', '__'))

        logging.info('Creating db connection %s' % datasource.db)
        conn = psycopg2.connect(datasource.db['conn'])
        try:
            # One reusable cursor; the original did
            # `cursor = conn.cursor().execute(...)`, which leaks a fresh
            # cursor per statement and always binds None.
            cursor = conn.cursor()
            # NOTE(review): identifiers can't be parameterized, so
            # table_name is interpolated into the DDL directly — it is
            # assumed to come from a trusted caller; confirm.
            cursor.execute("drop table if exists %s;" % table_name)
            logging.info('Creating table %s' % table_name)
            cursor.execute("CREATE TABLE %s (%s);" % (
                table_name, ','.join(db_fields)))

            count = 0
            for example in TestExample.get_data(test_id, fields):
                if count % 10 == 0:
                    logging.info('Processed %s rows so far' % (count, ))
                row = []
                for field in fields:
                    # Sequential renames: '_id' maps to 'id', which the
                    # next check maps on to 'example_id' (so '_id' also
                    # ends up as 'example_id') — preserved from the
                    # original logic.
                    if field == '_id':
                        field = 'id'
                    if field == 'id':
                        field = 'example_id'
                    val = example[field] if field in example else ''
                    if field == 'prob':
                        # 'prob' holds one value per class label.
                        row += val
                    else:
                        row.append(val)
                # All columns are varchar; stringify like the original's
                # "'%s'" formatting did, but let psycopg2 escape the
                # values instead of interpolating them into the SQL.
                row = ["%s" % v for v in row]
                placeholders = ",".join(["%s"] * len(row))
                cursor.execute(
                    "INSERT INTO %s VALUES (%s)" % (
                        table_name, placeholders),
                    row)
                count += 1
            conn.commit()
        finally:
            # The original never closed the connection.
            conn.close()
        logging.info('Export completed.')
    except Exception as e:
        logging.error("Got exception on exporting to db: "
                      " {0} \n {1}".format(e.message,
                                           get_task_traceback(e)))
        raise TaskException(e.message, e)
    return
def train_transformer(dataset_ids, transformer_id, user_id,
                      delete_metadata=False):
    """
    Train the transformer.

    dataset_ids: list of integers
        List of dataset ids used for transformer training.
    transformer_id: int
        Id of the transformer to train.
    user_id: int
        Id of the user, who initiate training the transformer.
    delete_metadata: bool
        Whether we need transformer related db logs.

    On failure the session is rolled back, the transformer is moved to
    STATUS_ERROR and TaskException is raised.
    """
    init_logger(LogMessage.TRAIN_TRANSFORMER, obj=int(transformer_id))

    user = User.query.get(user_id)
    transformer = Transformer.query.get(transformer_id)
    datasets = DataSet.query.filter(DataSet.id.in_(dataset_ids)).all()
    logging.info('Prepare for transformer %s training.' % transformer.name)

    try:
        # Persist the in-training state before the (long) train run so
        # other requests see the transformer as busy.
        transformer.datasets = datasets
        transformer.status = transformer.STATUS_TRAINING
        transformer.error = ""
        transformer.trained_by = user
        transformer.save(commit=True)

        if delete_metadata:
            # Retraining: drop log records of previous training runs.
            logging.info('Remove logs on retrain transformer')
            LogMessage.delete_related_logs(
                transformer.id, type_=LogMessage.TRAIN_TRANSFORMER)

        logging.info('Perform transformer training')
        from api.base.io_utils import get_or_create_data_folder
        get_or_create_data_folder()

        def _chain_datasets(ds_list):
            # Yield rows of each dataset in turn, keeping at most one
            # data stream open at a time.
            fp = None
            for d in ds_list:
                if fp:
                    fp.close()
                fp = d.get_data_stream()
                for row in d.get_iterator(fp):
                    yield row
            if fp:
                fp.close()

        train_iter = _chain_datasets(datasets)

        logging.info('DataSet files chosen for training: %s' % ', '.join([
            '{0} (id #{1})'.format(dataset.filename, dataset.id)
            for dataset in datasets
        ]))

        from memory_profiler import memory_usage
        train_begin_time = datetime.utcnow()

        trainer = transformer.train(train_iter)

        # Sample this process' memory right after training finishes.
        mem_usage = memory_usage(-1, interval=0, timeout=None)

        transformer.status = transformer.STATUS_TRAINED
        transformer.set_trainer(trainer)
        transformer.save()
        transformer.memory_usage = max(mem_usage)
        transformer.train_records_count = int(
            sum((d.records_count for d in transformer.datasets)))
        train_end_time = datetime.utcnow()
        # NOTE(review): timedelta.seconds ignores the days component, so
        # a run longer than 24h under-reports — preserved from the
        # original behavior; confirm whether total_seconds() is wanted.
        transformer.training_time = int(
            (train_end_time - train_begin_time).seconds)
        transformer.save()
    except Exception as exc:
        app.sql_db.session.rollback()
        logging.error(
            'Got exception when train transformer: {0} \n {1}'.format(
                exc.message, get_task_traceback(exc)))
        transformer.status = transformer.STATUS_ERROR
        # Model column is limited; truncate the error text to fit.
        transformer.error = str(exc)[:299]
        transformer.save()
        raise TaskException(exc.message, exc)