Example #1
def transform_dataset_for_download(model_id, dataset_id):
    model = Model.query.get(model_id)
    dataset = DataSet.query.get(dataset_id)

    init_logger('transform_for_download_log', obj=int(dataset_id))
    logging.info('Starting Transform For Download Task')
    try:
        transformed = model.transform_dataset(dataset)

        logging.info('Saving transformed data to disk')
        temp_file = tempfile.NamedTemporaryFile()
        numpy.savez_compressed(temp_file, **transformed)
        temp_file.flush()  # make sure buffered data reaches disk before the upload below

        s3_filename = "dataset_{0}_vectorized_for_model_{1}.npz".format(
            dataset.id, model.id)

        from api.amazon_utils import AmazonS3Helper
        s3 = AmazonS3Helper()
        logging.info('Uploading file {0} to s3 with name {1}...'.format(
            temp_file.name, s3_filename))
        s3.save_key(s3_filename, temp_file.name, {
            'model_id': model.id,
            'dataset_id': dataset.id}, compressed=False)
        s3.close()
        return s3.get_download_url(s3_filename, 60 * 60 * 24 * 7)
    except Exception as e:
        logging.error("Got exception when transforming dataset: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
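
A minimal standalone sketch of the numpy.savez_compressed step above, with made-up array names; it only illustrates writing a dict of arrays to a named temporary file and reading it back by name, not the project's S3 helper.

import tempfile

import numpy

vectors = {'X': numpy.arange(6).reshape(2, 3), 'y': numpy.array([0, 1])}
temp_file = tempfile.NamedTemporaryFile(suffix='.npz')
numpy.savez_compressed(temp_file, **vectors)
temp_file.flush()  # ensure the bytes are on disk before anything reads temp_file.name

with numpy.load(temp_file.name) as archive:
    print(sorted(archive.files))  # ['X', 'y']
    print(archive['X'].shape)     # (2, 3)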
Example #2
def calculate_model_parts_size(data_from_training, model_id, deep=7):
    """Calculates size of model trainer and records result json"""
    from pympler.asizeof import asized
    init_logger('trainmodel_log', obj=int(model_id))
    logging.info('Starting calculation size of model parts')

    model = Model.query.get(model_id)
    trainer = model.get_trainer()

    def walk_asized(asz):
        """Prepares output dict from Asized object"""
        res = {"name": asz.name, "size": asz.size, "properties": []}
        for r in asz.refs:
            res["properties"].append(
                {"name": r.name, "size": r.size,
                 "properties": [walk_asized(k) for k in r.refs]})
        return res

    try:
        result = walk_asized(asized(trainer, detail=deep))
        result["name"] = "trainer"
        model.model_parts_size = result
        model.save()
    except Exception as e:
        logging.error("Exception while calculating model parts size: {0} "
                      " \n {1}".format(e.message, get_task_traceback(e)))
    logging.info('Finished calculation')
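
A standalone sketch of walking a pympler Asized object (assuming pympler is installed); the dict being measured is made up and the walk is fully recursive, unlike the two-level variant above.

from pympler.asizeof import asized

def walk_asized(asz):
    # Turn an Asized object into a plain nested dict of name/size pairs
    return {"name": asz.name, "size": asz.size,
            "properties": [walk_asized(ref) for ref in asz.refs]}

sample = {"weights": list(range(1000)), "meta": {"name": "clf"}}
print(walk_asized(asized(sample, detail=2)))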
Example #3
def upload_dataset(dataset_id):
    """
    Upload dataset to S3.
    """
    dataset = DataSet.query.get(dataset_id)
    try:
        if not dataset:
            raise ValueError('DataSet not found')
        init_logger('importdata_log', obj=dataset.id)
        logging.info('Uploading dataset %s' % dataset.id)

        dataset.status = dataset.STATUS_UPLOADING
        dataset.save()

        logging.info('Saving file to Amazon S3')
        dataset.save_to_s3()
        logging.info('File saved to Amazon S3')

        dataset.status = dataset.STATUS_IMPORTED
        if dataset.records_count is None:
            dataset.records_count = 0
        dataset.save()

    except Exception as exc:
        logging.error('Got exception when uploading dataset: {0}\n {1}'.format(
                      exc.message, get_task_traceback(exc)))
        dataset.set_error(exc)
        raise TaskException(exc.message, exc)
Example #4
def upload_import_handler_to_server(server_id, handler_type, handler_id,
                                    user_id):
    """
    Upload importhandler to S3 for cloudml-predict.
    """
    init_logger('importdata_log', obj=int(handler_id))
    logging.info('Starting uploading to cloudml_predict')

    try:
        server = Server.query.get(server_id)
        user = User.query.get(user_id)
        handler = XmlImportHandler.query.get(handler_id)

        handler_files = server.list_keys(FOLDER_IMPORT_HANDLERS)
        for file_ in handler_files:
            if file_['name'] == handler.name:
                raise ValueError('Import Handler with name "{0}" already '
                                 'exists on the server {1}'.format(
                                     handler.name, server.name))

        uid = get_a_Uuid()
        # TODO: Shall we use another account?
        s3 = AmazonS3Helper(
            bucket_name=app.config['CLOUDML_PREDICT_BUCKET_NAME'])
        path = '{0}/{1}/{2}.{3}'.format(
            server.folder.strip('/'), FOLDER_IMPORT_HANDLERS, uid,
            'xml' if handler_type == XmlImportHandler.TYPE else 'json')
        meta = {
            'id': handler.id,
            'name': handler.name,
            'object_name': handler.name,
            'type': handler.TYPE,
            'user_id': user.id,
            'user_name': user.name,
            'hide': "False",
            'uploaded_on': str(datetime.now()),
            'crc32': handler.crc32
        }

        handler_data = handler.get_plan_config()
        handler.locked = True
        s_ids = list(handler.servers_ids) if (isinstance(
            handler.servers_ids, list)) else []
        s_ids.append(server.id)
        handler.servers_ids = list(s_ids)
        handler.save()
        s3.save_key_string(path, handler_data, meta)
        s3.close()

        logging.info('Import Handler has been uploaded: %s' % handler.name)

        return '{0}/{1}.{2}'.format(
            FOLDER_IMPORT_HANDLERS, uid,
            'xml' if handler_type == XmlImportHandler.TYPE else 'json')
    except Exception as e:
        logging.error("Got exception on uploading import handler to predict: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
Example #5
def get_classifier_parameters_grid(grid_params_id):
    """
    Exhaustive search over specified parameter values for an estimator.
    Fills parameters_grid field of the ClassifierGridParams object with
    grid search result.

    grid_params_id: int
        ID of the ClassifierGridParams object, which contains a link to the
        model, the datasets for training and testing, and the parameters
        chosen by the user.
    """
    init_logger('gridsearch_log', obj=int(grid_params_id))
    logging.info('Parameters grid search task started')

    from api.ml_models.models import ClassifierGridParams
    grid_params = ClassifierGridParams.query.get(grid_params_id)
    grid_params.status = 'Calculating'
    grid_params.save()
    try:
        feature_model = FeatureModel(
            grid_params.model.get_features_json(), is_file=False)
        trainer = Trainer(feature_model)

        def _get_iter(dataset):
            fp = dataset.get_data_stream()
            for row in dataset.get_iterator(fp):
                yield row
            if fp:
                fp.close()

        clfs = trainer.grid_search(
            grid_params.parameters,
            _get_iter(grid_params.train_dataset),
            _get_iter(grid_params.test_dataset),
            score=grid_params.scoring)
        grids = {}
        for segment, clf in clfs.iteritems():
            grids[segment] = [{
                'parameters': item.parameters,
                'mean': item.mean_validation_score,
                'std': np.std(item.cv_validation_scores)}
                for item in clf.grid_scores_]
        grid_params.parameters_grid = grids
        grid_params.status = 'Completed'
        grid_params.save()
    except Exception as e:
        logging.error('Got exception on grid params search: {0} \n {1}'
                      .format(e.message, get_task_traceback(e)))
        grid_params.status = 'Error'
        grid_params.save()
        raise TaskException(e.message, e)
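
As a plain scikit-learn illustration of what ends up in parameters_grid, the sketch below runs GridSearchCV on synthetic data and collects a mean/std per parameter combination; the cloudml Trainer.grid_search API used above is separate and not reproduced here.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=200, random_state=0)
search = GridSearchCV(LogisticRegression(max_iter=200),
                      {'C': [0.1, 1.0, 10.0]}, scoring='accuracy', cv=3)
search.fit(X, y)

grid = [{'parameters': params, 'mean': mean, 'std': std}
        for params, mean, std in zip(search.cv_results_['params'],
                                     search.cv_results_['mean_test_score'],
                                     search.cv_results_['std_test_score'])]
print(grid)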
Example #6
    def run(self, verification_id, count, *args, **kwargs):
        self.init_logger(verification_id)

        logging.info('Starting model verification')
        self.verification = self.get_verification(verification_id)
        try:
            self.process(count)
        except Exception as exc:
            self.verification.status = self.verification.STATUS_ERROR
            self.verification.error = str(exc)
            self.verification.save()
            logging.error(
                "Exception when verifying the model: {0} \n {1}".format(
                    exc.message, get_task_traceback(exc)))
            raise TaskException(exc.message, exc)
Example #7
def train_model(dataset_ids, model_id, user_id, delete_metadata=False):
    """
    Train or re-train the model.

    dataset_ids: list of integers
        List of dataset ids used for model training.
    model_id: int
        Id of the model to train.
    user_id: int
        Id of the user who initiated training of the model.
    delete_metadata: bool
        Whether to delete model-related db logs, test results and
        other model-related data before training.
    """
    init_logger('trainmodel_log', obj=int(model_id))
    logging.info('Start training task')

    user = User.query.get(user_id)
    model = Model.query.get(model_id)
    datasets = DataSet.query.filter(DataSet.id.in_(dataset_ids)).all()
    logging.info('Preparing the model `%s` for training.' % model.name)
    try:
        model.prepare_fields_for_train(user=user, datasets=datasets)
        logging.info(
            'DataSet files chosen for training: %s'
            % ', '.join(['{0} (id #{1})'.
                        format(dataset.filename, dataset.id)
                         for dataset in datasets]))
        logging.info('Perform model training')
        feature_model = FeatureModel(model.get_features_json(),
                                     is_file=False)
        trainer = Trainer(feature_model)
        get_or_create_data_folder()

        def _chain_datasets(ds_list):
            fp = None
            for d in ds_list:
                if fp:
                    fp.close()
                fp = d.get_data_stream()
                for row in d.get_iterator(fp):
                    yield row
            if fp:
                fp.close()

        train_iter = _chain_datasets(datasets)

        from memory_profiler import memory_usage
        # mem_usage = memory_usage((trainer.train,
        #                          (train_iter,)), interval=0)
        train_begin_time = datetime.datetime.utcnow()

        from api.ml_models.models import get_transformer
        trainer.set_transformer_getter(get_transformer)
        trainer.train(train_iter)

        mem_usage = memory_usage(-1, interval=0, timeout=None)
        trainer.clear_temp_data()

        logging.info('Store trainer to s3')
        model.set_trainer(trainer)
        model.save()
        logging.info('Set train records')
        model.memory_usage = max(mem_usage)
        model.train_records_count = int(sum((
            d.records_count for d in model.datasets)))
        train_end_time = datetime.datetime.utcnow()
        model.training_time = int((train_end_time - train_begin_time).seconds)
        model.save()
        logging.info('Get segment info')
        segments = trainer._get_segments_info()
        if not segments or not segments.keys():
            raise Exception('No segments in the model')

        model.create_segments(segments)

        chord((visualize_model.s(model.id, segment.id)
               for segment in model.segments),
              calculate_model_parts_size.s(model.id))()

    except Exception as exc:
        app.sql_db.session.rollback()
        logging.error('Got exception when training the model: {0} \n {1}'
                      .format(exc.message, get_task_traceback(exc)))
        model.status = model.STATUS_ERROR
        model.error = str(exc)[:299]
        model.save()
        raise TaskException(exc.message, exc)
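
The _chain_datasets helper above streams rows from several datasets while keeping at most one file handle open at a time; a self-contained version of the same pattern over plain csv files (the file names are hypothetical) looks like this:

import csv

def chain_csv_files(paths):
    fp = None
    for path in paths:
        if fp:
            fp.close()
        fp = open(path)
        for row in csv.DictReader(fp):
            yield row
    if fp:
        fp.close()

# for row in chain_csv_files(['part-1.csv', 'part-2.csv']):
#     print(row)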
Example #8
def generate_visualization_tree(model_id, deep):
    """
    Generates the visualization tree with the specified `deep` (tree depth).

    Parameters
    ----------
    model_id: integer
        Database id of the model

    deep : positive integer
        Maximum depth of the tree.

    Notes
    -----
    Decision Tree Classifier and Random Forest Classifier are supported.
    """
    from cloudml.trainer.classifier_settings import DECISION_TREE_CLASSIFIER, \
        RANDOM_FOREST_CLASSIFIER, EXTRA_TREES_CLASSIFIER, XGBOOST_CLASSIFIER
    from exceptions import VisualizationException

    init_logger('trainmodel_log', obj=int(model_id))
    model = Model.query.get(model_id)

    try:
        if model is None:
            raise ValueError('model not found: %s' % model_id)

        if model.classifier is None or 'type' not in model.classifier:
            raise ValueError('model has invalid classifier')

        clf_type = model.classifier['type']
        if clf_type not in (DECISION_TREE_CLASSIFIER,
                            RANDOM_FOREST_CLASSIFIER,
                            EXTRA_TREES_CLASSIFIER,
                            XGBOOST_CLASSIFIER):
            raise VisualizationException(
                "model with %s classifier doesn't support tree"
                " visualization" % clf_type,
                VisualizationException.CLASSIFIER_NOT_SUPPORTED)

        # Checking that all_weights had been stored while training model
        # For old models we need to retrain the model.
        if model.visualization_data is None:
            raise VisualizationException(
                "we don't support the visualization re-generation for "
                "models trained before may 2015."
                "please re-train the model to use this feature.",
                error_code=VisualizationException.ALL_WEIGHT_NOT_FILLED)
        
        trainer = model.get_trainer()
        from copy import deepcopy
        data = deepcopy(model.visualization_data)
        segments = [segment.name for segment in model.segments] \
            or [DEFAULT_SEGMENT]
        for segment in segments:
            if segment not in model.visualization_data \
                    or 'all_weights' not in model.visualization_data[segment]:
                raise VisualizationException(
                    "we don't support the visualization re-generation for "
                    "models trained before may 2015."
                    "please re-train the model to use this feature.",
                    error_code=VisualizationException.ALL_WEIGHT_NOT_FILLED)


            if clf_type == DECISION_TREE_CLASSIFIER:
                tree = trainer.model_visualizer.regenerate_tree(
                    segment, data[segment]['all_weights'], deep=deep)
                data[segment]['tree'] = tree
            elif clf_type == RANDOM_FOREST_CLASSIFIER or \
                    clf_type == EXTRA_TREES_CLASSIFIER:
                trees = trainer.model_visualizer.regenerate_trees(
                    segment, data[segment]['all_weights'], deep=deep)
                data[segment]['trees'] = trees
            elif clf_type == XGBOOST_CLASSIFIER:
                # TODO XGBOOST visualize
                trees = trainer.model_visualizer.regenerate_trees(
                    segment, data[segment]['all_weights'], deep=deep)
                data[segment]['trees'] = trees
            data[segment]['parameters'] = {'deep': deep, 'status': 'done'}

        model.visualize_model(data)

    except Exception as e:
        logging.error("Got exception on tree visualization: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)

    return "Tree visualization was completed"
Example #9
def visualize_model(model_id, segment_id=None):
    """
    Generates Trained model visualization. Fills weights and creates weights
    tree.

    model_id: int
        Id of the model.
    segment_id: int
        ID of the model segment.
    """
    init_logger('trainmodel_log', obj=int(model_id))
    model, segment = _get_model_and_segment_or_raise(model_id, segment_id)

    logging.info(
        "Starting to visualize trained model {0}. Segment: {1}".format(
            model.name, segment.name))
    trainer = model.get_trainer()

    # need to sync model weights
    count = len(segment.weights)
    if count > 0:
        from api.base.exceptions import InvalidOperationError
        raise InvalidOperationError(
            'Weights for model %s already filled: %s' % (model_id, count))

    weights_dict = None
    model.status = model.STATUS_FILLING_WEIGHTS
    model.save()

    try:
        visualization_data = trainer.get_visualization(segment.name)
        have_weights = 'weights' in visualization_data
        if have_weights:
            from utils import fill_model_weights
            weights_dict = visualization_data.pop('weights')
            weights_added = 0
            categories_added = 0
            classes_processed = 0

            for clazz in weights_dict.keys():
                c, w = fill_model_weights(model, segment, clazz, weights_dict[clazz])
                categories_added += c
                weights_added += w
                classes_processed += 1

            app.sql_db.session.commit()
            logging.info(
                'Model %s parameter weights were added to db: %s weights '
                'in %s categories for %s classes' %
                (model.name, weights_added, categories_added,
                 classes_processed))
        else:
            logging.info('Weights are unavailable for the classifier')

        # TODO:
        from cloudml.trainer.classifier_settings import CLASSIFIER_MODELS
        clf_type = model.classifier.get('type')
        if clf_type in CLASSIFIER_MODELS and not model.labels:
            model.labels = trainer._get_labels()
        model.visualize_model(segment=segment.name, data=visualization_data,
                              commit=False)
        model.status = model.STATUS_TRAINED
        model.weights_synchronized = have_weights
        model.save()

    except Exception as exc:
        logging.error('Got exception when visualizing the model: {0} \n {1}'
                      .format(exc.message, get_task_traceback(exc)))
        model.status = model.STATUS_ERROR
        model.error = str(exc)[:299]
        model.save()
        raise TaskException(exc.message, exc)
Example #10
def get_csv_results(model_id, test_id, fields):
    """
    Gets test classification results in csv format and saves the file
    to Amazon S3.

    model_id: int
        ID of the model
    test_id: int
        ID of the test whose examples are to be exported.
    fields: list of string
        List of field names from TestExample to export to csv file.
    """
    from api.amazon_utils import AmazonS3Helper

    def generate(test, name):
        from api.base.io_utils import get_or_create_data_folder
        path = get_or_create_data_folder()
        filename = os.path.join(path, name)
        header = list(fields)
        if 'prob' in header:
            prob_index = header.index('prob')
            for label in reversed(test.classes_set):
                header.insert(prob_index, 'prob_%s' % label)
            header.remove('prob')

        with open(filename, 'w') as fp:
            writer = csv.writer(fp, delimiter=',', quoting=csv.QUOTE_ALL)
            writer.writerow(header)
            for example in TestExample.get_data(test_id, fields):
                rows = []
                for field in fields:
                    if field == '_id':
                        field = 'id'
                    if field == 'id':
                        field = 'example_id'
                    val = example[field] if field in example else ''
                    if field == 'prob':
                        rows += val
                    else:
                        rows.append(val)
                writer.writerow(rows)
        return filename

    init_logger('runtest_log', obj=int(test_id))

    try:
        test = TestResult.query.filter_by(model_id=model_id,
                                          id=test_id).first()
        if test is None:
            logging.error('Test not found')
            return

        name = 'Examples-{0!s}.csv'.format(uuid.uuid1())
        expires = 60 * 60 * 24 * 7  # 7 days

        logging.info('Creating file {0}...'.format(name))

        s3 = AmazonS3Helper()
        filename = generate(test, name)
        logging.info('Uploading file {0} to s3...'.format(filename))
        s3.save_key(name,
                    filename, {
                        'model_id': model_id,
                        'test_id': test_id
                    },
                    compressed=False)
        s3.close()
        os.remove(filename)
        url = s3.get_download_url(name, expires)

        return url
    except Exception as e:
        logging.error("Got exception on getting test classification results: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
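
A self-contained sketch of the header/row expansion inside generate() above: the 'prob' field is replaced by one prob_<label> column per class. The field names, classes and example row here are made up.

import csv
import sys

fields = ['id', 'label', 'prob']
classes_set = ['0', '1']
examples = [{'id': 1, 'label': '1', 'prob': [0.2, 0.8]}]

header = list(fields)
prob_index = header.index('prob')
for label in reversed(classes_set):
    header.insert(prob_index, 'prob_%s' % label)
header.remove('prob')

writer = csv.writer(sys.stdout, quoting=csv.QUOTE_ALL)
writer.writerow(header)
for example in examples:
    row = []
    for field in fields:
        val = example.get(field, '')
        row += val if field == 'prob' else [val]
    writer.writerow(row)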
Example #11
def run_test(dataset_ids, test_id):
    """
    Runs a test for a trained model.

    dataset_ids: list
        List of dataset ids used for testing the model. Currently only the
        first one in the list is used (multi-dataset testing hasn't been
        implemented yet).
    test_id: int
        ID of the test to run.
    """
    init_logger('runtest_log', obj=int(test_id))

    test = TestResult.query.get(test_id)
    datasets = DataSet.query.filter(DataSet.id.in_(dataset_ids)).all()
    dataset = datasets[0]
    model = test.model

    try:
        if model.status != model.STATUS_TRAINED:
            raise InvalidOperationError("Train the model before")

        test.status = test.STATUS_IN_PROGRESS
        test.error = ""
        test.save()

        logging.info('Getting metrics and raw data')
        from memory_profiler import memory_usage
        logging.info('Using {0} (id #{1}) dataset'.format(
            dataset.filename, dataset.id))
        result = Model.run_test(model, dataset)

        metrics, raw_data = result
        logging.info("Memory usage: %f",
                     memory_usage(-1, interval=0, timeout=None)[0])
        metrics_dict = metrics.get_metrics_dict()
        if 'accuracy' in metrics_dict:
            test.accuracy = metrics.accuracy
            logging.info('Accuracy: %f', test.accuracy)

        # TODO: Refactor this. Here are possible issues with conformity
        # between labels and values
        if 'confusion_matrix' in metrics_dict:
            confusion_matrix = metrics_dict['confusion_matrix']
            confusion_matrix_ex = []
            for i, val in enumerate(metrics.classes_set):
                confusion_matrix_ex.append((str(val), confusion_matrix[i]))
            metrics_dict['confusion_matrix'] = confusion_matrix_ex

        if 'precision_recall_curve' in metrics_dict:
            n = len(metrics._labels) / 100 or 1
            if metrics.classes_count == 2:
                metrics_dict['precision_recall_curve'][1] = \
                    metrics_dict['precision_recall_curve'][1][0::n]
                metrics_dict['precision_recall_curve'][0] = \
                    metrics_dict['precision_recall_curve'][0][0::n]

        if 'roc_curve' in metrics_dict:
            n = len(metrics._labels) / 100 or 1
            calc_range = [1] if metrics.classes_count == 2 \
                else range(len(metrics.classes_set))
            for i in calc_range:
                label = metrics.classes_set[i]
                sub_fpr = metrics_dict['roc_curve'][label][0][0::n]
                sub_tpr = metrics_dict['roc_curve'][label][1][0::n]
                metrics_dict['roc_curve'][label] = [sub_fpr, sub_tpr]

        if 'roc_curve' in metrics_dict:
            test.roc_auc = metrics_dict.pop('roc_auc')
        test.metrics = metrics_dict
        test.classes_set = list(metrics.classes_set)
        test.status = TestResult.STATUS_STORING

        if not model.comparable:
            # TODO: fix this
            model = Model.query.get(model.id)
            model.comparable = True
            model.save()

        all_count = metrics._preds.size
        test.dataset = dataset
        # lock dataset used for testing
        dataset.locked = True
        dataset.save()

        test.examples_count = all_count
        test.memory_usage = memory_usage(-1, interval=0, timeout=None)[0]

        vect_data = metrics._true_data
        from bson import Binary
        test.save()

        def fill_weights(trainer, test, segment):
            for clazz, weights in trainer.get_weights(
                    segment.name).iteritems():
                positive = weights['positive']
                negative = weights['negative']

                def process_weights(weight_list):
                    for weight_dict in weight_list:
                        weight = Weight.query.filter_by(
                            model=model,
                            segment=segment,
                            class_label=str(clazz),
                            name=weight_dict['name']).one()
                        if weight.test_weights is not None:
                            test_weights = weight.test_weights.copy()
                        else:
                            test_weights = {}
                        test_weights[test_id] = weight_dict['feature_weight']
                        weight.test_weights = test_weights
                        weight.save(commit=False)
                        app.sql_db.session.add(weight)

                process_weights(positive)
                process_weights(negative)

        # Filling test weights
        if test.fill_weights:
            trainer = model.get_trainer()
            for segment in model.segments:
                logging.info('Storing test feature weights for segment %s',
                             segment.name)
                fill_weights(trainer, test, segment)

        data = []
        for segment, d in raw_data.iteritems():
            data = data + d

        logging.info('Storing test examples')
        if metrics._probs is None:
            probs = list(repeat(None, len(data)))
        else:
            probs = metrics._probs
        examples = izip(range(len(data)), data, metrics._labels,
                        metrics._preds, probs)
        logging.info("Memory usage: %f",
                     memory_usage(-1, interval=0, timeout=None)[0])

        # Guard against modulo by zero when there are fewer than 10 examples
        progress_step = (all_count / 10) or 1
        for n, row, label, pred, prob in examples:
            if n % progress_step == 0:
                app.sql_db.session.commit()
                logging.info('Processed %s rows so far' % n)

            example, new_row = _add_example_to_db(test, row, label, pred, prob,
                                                  n)
            # test.examples_size += (get_doc_size(example) / 1024.0 / 1024.0)
            # example_ids.append(str(example.id))
        app.sql_db.session.commit()

        test.status = TestResult.STATUS_COMPLETED
        test.save()
        logging.info('Test %s completed' % test.name)

    except Exception as exc:
        if isinstance(exc, SQLAlchemyError):
            app.sql_db.session.rollback()
        logging.error('Got exception when testing the model: {0} \n {1}'.format(
            exc.message, get_task_traceback(exc)))
        test.status = test.STATUS_ERROR
        error_column_size = TestResult.error.type.length
        str_exc = str(exc)
        msg = ' ... TRUNCATED'
        test.error = str_exc if len(str_exc) <= error_column_size else \
            (str_exc[:error_column_size - len(msg)] + msg)
        test.save()
        raise TaskException(exc.message, exc)
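
The precision/recall and ROC branches above thin long curves down to roughly 100 points with step slicing; the same idea in isolation (using // so it also runs on Python 3, while the code above relies on Python 2's integer /):

points = list(range(1000))   # stand-in for fpr/tpr/precision values
n = len(points) // 100 or 1
sampled = points[0::n]
print(len(sampled))          # 100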
Example #12
def calculate_confusion_matrix(test_id, weights):
    """
    Calculate confusion matrix for test with weights.

    test_id: int
        ID of the model test.
    weights: list of (label, weight) tuples
        label: class label (one of the model's labels)
        weight: positive float class weight

    Note:
    Calculating the confusion matrix is supported for
    binary and multi-class classifiers.
    """
    log_id = test_id
    from api.async_tasks.models import AsyncTask
    if calculate_confusion_matrix.request.id is not None:
        tasks = AsyncTask.query\
            .filter_by(task_id=calculate_confusion_matrix.request.id).limit(1)
        log_id = tasks[0].id

    init_logger('confusion_matrix_log', obj=int(log_id))
    logging.info('Starting re-calculating confusion matrix')
    logging.info('Checking confusion matrix weights')

    try:
        all_zero = True
        for weight in weights:
            if weight[1] != 0:
                all_zero = False
            if weight[1] < 0:
                logging.error("Negative weights found")
                raise ValueError('Negative weights are not allowed')

        if all_zero:
            logging.error("All weights are zero")
            raise ValueError('All weights can not be 0')

        test = TestResult.query.get(test_id)
        if test is None:
            logging.error('Test with id {0!s} not found!'.format(test_id))
            raise ValueError('Test with id {0!s} not found!'.format(test_id))

        if test.status != TestResult.STATUS_COMPLETED:
            logging.error('Test is not completed! Please wait and try again')
            raise ValueError('Test is not completed!')

        model = test.model
        if model is None:
            logging.error('Model with id {0!s} not found!'.format(
                test.model_id))
            raise ValueError('Model with id {0!s} not found!'.format(
                test.model_id))

        logging.info('Start calculating confusion matrix for test id {0!s}, '
                     'log id {1} with weights {2}'.format(
                         test_id, log_id, weights))

        dim = len(weights)
        matrix = [[0 for x in range(dim)] for x in range(dim)]

        i = 1
        for example in test.examples:
            true_value_idx = model.labels.index(example.label)

            weighted_sum = 0
            for weight in weights:
                index = model.labels.index(weight[0])
                weighted_sum += weight[1] * example.prob[index]

            if weighted_sum == 0:
                import json
                logging.error("Weighted sum is 0 on calculating test example "
                              "#{0} (probabilities: {1})".format(
                                  i, json.dumps(example.prob)))
                raise ValueError("Weighted sum is 0. Try another weights "
                                 "or retest model.")

            weighted_prob = []
            for weight in weights:
                index = model.labels.index(weight[0])
                weighted_prob.append(weight[1] * example.prob[index] /
                                     weighted_sum)

            predicted = weighted_prob.index(max(weighted_prob))
            matrix[true_value_idx][predicted] += 1
            if i % 500 == 0:
                logging.info("{0} test examples processed".format(i))
            i += 1

        logging.info("Confusion matrix calculation completed")

        return zip(model.labels, matrix)
    except Exception as e:
        logging.error("Got exception om matrix recalculation: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
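
A self-contained sketch of the weighted confusion matrix computed above, with made-up labels, per-example probabilities and class weights:

labels = ['no', 'yes']
weights = [('no', 1.0), ('yes', 2.0)]
examples = [('no', [0.7, 0.3]), ('yes', [0.4, 0.6]), ('no', [0.55, 0.45])]

dim = len(weights)
matrix = [[0] * dim for _ in range(dim)]
for true_label, prob in examples:
    true_idx = labels.index(true_label)
    weighted = [w * prob[labels.index(lbl)] for lbl, w in weights]
    total = sum(weighted)
    weighted_prob = [v / total for v in weighted]
    predicted = weighted_prob.index(max(weighted_prob))
    matrix[true_idx][predicted] += 1

print(list(zip(labels, matrix)))   # rows are true labels, columns are predictions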
Example #13
def upload_model_to_server(server_id, model_id, user_id):
    """
    Upload model to S3 for cloudml-predict.
    """
    init_logger('trainmodel_log', obj=int(model_id))
    logging.info('Starting uploading to cloudml_predict')

    try:
        server = Server.query.get(server_id)
        user = User.query.get(user_id)
        model = Model.query.get(model_id)

        # TODO: Checking name, whether it's enough of the memory, etc.
        model_files = server.list_keys(FOLDER_MODELS)
        for file_ in model_files:
            if file_['name'] == model.name:
                raise ValueError('Model with name "{0}" already exists on '
                                 'the server {1}'.format(
                                     model.name, server.name))

        uid = get_a_Uuid()

        # TODO: Shall we use another account?
        s3 = AmazonS3Helper(
            bucket_name=app.config['CLOUDML_PREDICT_BUCKET_NAME'])
        path = '{0}/{1}/{2}.model'.format(server.folder.strip('/'),
                                          FOLDER_MODELS, uid)
        meta = {
            'id': model.id,
            'object_name': model.name,
            'name': model.name,
            'user_id': user.id,
            'user_name': user.name,
            'hide': "False",
            'uploaded_on': str(datetime.now())
        }

        trainer = model.get_trainer()
        #from cloudml.trainer.store import load_trainer
        #trainer = load_trainer(trainer_data)
        from cloudml.trainer.store import TrainerStorage
        from bson import Binary
        import cPickle as pickle
        trainer_data = Binary(TrainerStorage(trainer).dumps())
        logging.info(len(trainer_data))
        #trainer.visualization = None
        #trainer_data = store_trainer(trainer)
        #trainer_data = model.trainer
        s3.save_key_string(path, trainer_data, meta)
        s3.close()
        model.locked = True
        s_ids = list(model.servers_ids) if (isinstance(model.servers_ids,
                                                       list)) else []
        s_ids.append(server.id)
        model.servers_ids = list(s_ids)
        model.save()
        feature_set = model.features_set
        feature_set.locked = True
        feature_set.save()
        logging.info('Creating grafana dashboard for model')
        update_grafana_dashboard(server, model)
        logging.info('Model has been uploaded: %s' % model.name)

        return '{0}/{1}.model'.format(FOLDER_MODELS, uid)
    except Exception as e:
        logging.error("Got exception on uploading model to predict: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
Example #14
def import_data(dataset_id, model_id=None, test_id=None, transformer_id=None):
    """
    Import data from database.
    """
    def get_parent_object():
        if model_id is not None:
            return Model.query.get(model_id)
        if test_id is not None:
            return TestResult.query.get(test_id)
        if transformer_id is not None:
            return Transformer.query.get(transformer_id)

    def set_error(err, ds=None, parent=None):
        if ds is not None:
            ds.set_error(err)
        if parent is not None:
            # ds can be None here, so fall back to the id passed to the task
            msg = 'Got exception when importing dataset #{0}. Error: {1}'\
                .format(ds.id if ds is not None else dataset_id,
                        err or 'Unknown')
            parent.set_error(msg)

    obj = get_parent_object()
    parent_handler = None
    dataset = DataSet.query.get(dataset_id)
    if dataset is None:
        set_error("Dataset with id %s not found" % dataset_id, parent=obj)
        return

    dataset.status = dataset.STATUS_IMPORTING
    dataset.save()

    import_handler = dataset.import_handler
    if import_handler is None:
        set_error("Import handler for dataset %s not found" %
                  dataset_id, ds=dataset, parent=obj)
        return
    if obj:
        obj.status = obj.STATUS_IMPORTING
        obj.save()

    try:
        logger = init_logger('importdata_log', obj=dataset.id)
        if obj:
            name = 'trainmodel_log' if isinstance(obj, Model) \
                else 'runtest_log'
            parent_handler = logger_handler(name, obj=obj.id)
            if parent_handler:
                logger.addHandler(parent_handler)

        logging.info('Loading dataset %s' % dataset.id)

        import_start_time = datetime.now()

        def callback(**kwargs):
            jobflow_id = kwargs.get('jobflow_id')  # required
            master_dns = kwargs.get('master_dns', None)
            s3_logs_folder = kwargs.get('s3_logs_folder', None)
            step_number = kwargs.get('step_number', None)
            pig_row = kwargs.get('pig_row', None)

            cluster = Cluster.query.filter(
                Cluster.jobflow_id == jobflow_id
            ).first()
            if cluster is None:
                cluster = Cluster(jobflow_id=jobflow_id)
                if master_dns is not None:
                    cluster.master_node_dns = master_dns
                if s3_logs_folder is not None:
                    cluster.logs_folder = s3_logs_folder
                cluster.save()
            dataset.cluster = cluster
            if step_number is not None:
                dataset.pig_step = step_number
            if pig_row is not None:
                dataset.pig_row = pig_row
            dataset.save()
            if master_dns is not None:
                logging.info('Master dns %s' % master_dns)
                cluster.create_ssh_tunnel()

        logging.info("Import dataset using import handler '%s' \
with%s compression", import_handler.name, '' if dataset.compress else 'out')

        if parent_handler:
            logger.removeHandler(parent_handler)

        dataset.import_handler_xml = import_handler.data
        handler_iterator = import_handler.get_iterator(
            dataset.import_params, callback)

        logging.info('The dataset will be stored to file %s', dataset.filename)

        if dataset.format == dataset.FORMAT_CSV:
            handler_iterator.store_data_csv(
                dataset.filename, dataset.compress)
        else:
            handler_iterator.store_data_json(
                dataset.filename, dataset.compress)

        logging.info('Import dataset completed')

        logging.info('Retrieving data fields')
        with dataset.get_data_stream() as fp:
            if dataset.format == dataset.FORMAT_CSV:
                reader = csv.DictReader(
                    fp,
                    quotechar="'",
                    quoting=csv.QUOTE_ALL
                )
                row = next(reader)
            else:
                row = next(fp)
                if row:
                    row = json.loads(row)
            if row:
                dataset.data_fields = row.keys()

        logging.info('Dataset fields: {0!s}'.format(dataset.data_fields))

        dataset.filesize = long(_get_uncompressed_filesize(dataset.filename))
        dataset.records_count = handler_iterator.count
        dataset.status = dataset.STATUS_UPLOADING
        dataset.save()

        logging.info('Saving file to Amazon S3')
        dataset.save_to_s3()
        logging.info('File saved to Amazon S3')
        dataset.time = (datetime.now() - import_start_time).seconds

        dataset.status = dataset.STATUS_IMPORTED
        if obj:
            obj.status = obj.STATUS_IMPORTED
            obj.save()

        dataset.save()

        logging.info('DataSet was loaded')
    except Exception as exc:
        if parent_handler:
            logger.addHandler(parent_handler)
        logging.error('Got exception when importing dataset: {0}\n {1}'.format(
                      exc.message, get_task_traceback(exc)))
        set_error(exc.message, ds=dataset, parent=obj)
        raise TaskException(exc.message, exc)
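
The data-fields step above reads a single row of the stored file to discover the column names; a tiny standalone version of that csv.DictReader call (the rows and quoting here are illustrative):

import csv

lines = ["'name','age'", "'alice','30'"]
reader = csv.DictReader(lines, quotechar="'", quoting=csv.QUOTE_ALL)
first_row = next(reader)
print(list(first_row.keys()))   # ['name', 'age'] (order may vary on Python 2)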
Example #15
def upload_segment_features_transformers(model_id, segment_id, fformat):
    model = Model.query.get(model_id)
    segment = Segment.query.get(segment_id)
    log_id = segment_id
    from api.async_tasks.models import AsyncTask
    if upload_segment_features_transformers.request.id is not None:
        tasks = AsyncTask.query\
            .filter_by(
                task_id=upload_segment_features_transformers.request.id
            ).limit(1)
        log_id = tasks[0].id

    init_logger('prepare_transformer_for_download_log',
                obj=int(log_id))
    logging.info('Start preparing segment features transformers for download')

    try:
        from zipfile import ZipFile, ZIP_DEFLATED
        from api.amazon_utils import AmazonS3Helper
        import os
        from tempfile import NamedTemporaryFile
        files = []
        arc_name = "{0}-{1}-{2}.zip".format(model.name, segment.name, fformat)

        def _save_content(content, feature_name, transformer_type):
            filename = "{0}-{1}-{2}-data.{3}".format(segment.name,
                                                     feature_name,
                                                     transformer_type,
                                                     fformat)
            logging.info("Creating %s" % filename)
            if fformat == 'csv':
                import csv
                import StringIO
                si = StringIO.StringIO()
                if len(content):
                    fieldnames = content[0].keys()
                    writer = csv.DictWriter(si, fieldnames=fieldnames)
                    writer.writeheader()
                    for c in content:
                        writer.writerow(c)
                response = si.getvalue()
            else:
                import json
                response = json.dumps(content, indent=2)

            with open(filename, 'w') as fh:
                fh.write(response)
            return filename

        trainer = model.get_trainer()
        if segment.name not in trainer.features:
            raise TaskException("Segment %s doesn't exists in trained model" %
                                segment.name)
        for name, feature in trainer.features[segment.name].iteritems():
            if "transformer" in feature and feature["transformer"] is not None:
                try:
                    data = feature["transformer"].load_vocabulary()
                    files.append(_save_content(data, name,
                                               feature["transformer-type"]))
                except AttributeError:
                    logging.warning(
                        "Can't load transformer data for segment {0} feature "
                        "{1} transformer {2}. Transformer doesn't have "
                        "vocabulary to return or feature haven't been "
                        "transformed on model training"
                        .format(segment.name, name,
                                feature["transformer-type"]))
                    continue

        logging.info("Add files to archive")
        with ZipFile(arc_name, "w") as z:
            for f in files:
                z.write(f, compress_type=ZIP_DEFLATED)

        s3 = AmazonS3Helper()
        logging.info('Uploading archive to s3 with name {0}'.format(arc_name))
        s3.save_key(arc_name, arc_name, {
            'model_id': model.id,
            'segment_id': segment_id}, compressed=False)
        s3.close()
        return s3.get_download_url(arc_name, 60 * 60 * 24 * 7)

    except Exception as e:
        logging.error("Got exception when preparing features transformers "
                      "of segment {0} for download: {1} \n {2}"
                      .format(segment.name, e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
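
A standalone sketch of the _save_content and ZipFile steps above: dump a list of dicts to a csv file and add it to an archive. The vocabulary content and file names are made up.

import csv
import os
from zipfile import ZipFile, ZIP_DEFLATED

content = [{'index': 0, 'term': 'foo'}, {'index': 1, 'term': 'bar'}]
with open('vocabulary.csv', 'w') as fh:
    writer = csv.DictWriter(fh, fieldnames=sorted(content[0].keys()))
    writer.writeheader()
    for row in content:
        writer.writerow(row)

with ZipFile('transformers.zip', 'w') as z:
    z.write('vocabulary.csv', compress_type=ZIP_DEFLATED)
os.remove('vocabulary.csv')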
Example #16
def export_results_to_db(model_id, test_id, datasource_id, table_name, fields):
    """
    Export model test examples data to PostgreSQL db.

    model_id: int
        ID of the model
    test_id: int
        ID of the test whose examples are to be exported.
    datasource_id: int
        ID of datasource, which would be used to connect to PostgreSQL db.
    table_name: string
        Name of the table
    fields: list of string
        List of field names from TestExample to export to the database.
    """
    import psycopg2
    import psycopg2.extras
    init_logger('runtest_log', obj=int(test_id))

    try:
        test = TestResult.query.filter_by(model_id=model_id,
                                          id=test_id).first()
        if not test:
            logging.error('Test not found')
            return

        datasource = PredefinedDataSource.query.get(datasource_id)
        if not datasource:
            logging.error('Datasource not found')
            return

        db_fields = []
        for field in fields:
            if 'prob' == field:
                for label in test.classes_set:
                    db_fields.append("prob_%s varchar(25)" % label)
            else:
                db_fields.append("%s varchar(25)" % field.replace('->', '__'))

        logging.info('Creating db connection %s' % datasource.db)
        conn = psycopg2.connect(datasource.db['conn'])
        query = "drop table if exists %s;" % table_name
        cursor = conn.cursor().execute(query)
        logging.info('Creating table %s' % table_name)
        query = "CREATE TABLE %s (%s);" % (table_name, ','.join(db_fields))
        cursor = conn.cursor().execute(query)

        count = 0
        for example in TestExample.get_data(test_id, fields):
            rows = []
            if count % 10 == 0:
                logging.info('Processed %s rows so far' % (count, ))
            for field in fields:
                if field == '_id':
                    field = 'id'
                if field == 'id':
                    field = 'example_id'
                val = example[field] if field in example else ''
                if field == 'prob':
                    rows += val
                else:
                    rows.append(val)
            rows = map(lambda x: "'%s'" % x, rows)
            query = "INSERT INTO %s VALUES (%s)" % (table_name, ",".join(rows))
            cursor = conn.cursor().execute(query)
            count += 1
        conn.commit()
        logging.info('Export completed.')
    except Exception as e:
        logging.error("Got exception on exporting to db: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)

    return
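
As an aside, the INSERT statements above interpolate values straight into the SQL text; psycopg2 can write the same rows with placeholders, which also takes care of quoting. The connection string and table name below are hypothetical.

import psycopg2
from psycopg2 import sql

conn = psycopg2.connect("dbname=results user=cloudml")  # hypothetical DSN
rows = [('1', 'yes', '0.83'), ('2', 'no', '0.12')]
insert = sql.SQL("INSERT INTO {} VALUES (%s, %s, %s)").format(
    sql.Identifier("test_examples"))
with conn, conn.cursor() as cur:
    cur.executemany(insert, rows)
conn.close()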
Example #17
def train_transformer(dataset_ids,
                      transformer_id,
                      user_id,
                      delete_metadata=False):
    """
    Train the transformer.

    dataset_ids: list of integers
        List of dataset ids used for transformer training.
    transformer_id: int
        Id of the transformer to train.
    user_id: int
        Id of the user who initiated training of the transformer.
    delete_metadata: bool
        Whether to delete transformer-related db logs before retraining.
    """
    init_logger(LogMessage.TRAIN_TRANSFORMER, obj=int(transformer_id))

    user = User.query.get(user_id)
    transformer = Transformer.query.get(transformer_id)
    datasets = DataSet.query.filter(DataSet.id.in_(dataset_ids)).all()
    logging.info('Prepare for transformer %s training.' % transformer.name)

    try:
        transformer.datasets = datasets
        transformer.status = transformer.STATUS_TRAINING
        transformer.error = ""
        transformer.trained_by = user
        transformer.save(commit=True)

        if delete_metadata:
            logging.info('Remove logs on retrain transformer')
            LogMessage.delete_related_logs(transformer.id,
                                           type_=LogMessage.TRAIN_TRANSFORMER)

        logging.info('Perform transformer training')
        from api.base.io_utils import get_or_create_data_folder
        get_or_create_data_folder()

        trainer = {}

        def _chain_datasets(ds_list):
            fp = None
            for d in ds_list:
                if fp:
                    fp.close()
                fp = d.get_data_stream()
                for row in d.get_iterator(fp):
                    yield row
            if fp:
                fp.close()

        train_iter = _chain_datasets(datasets)
        logging.info('DataSet files chosen for training: %s' % ', '.join([
            '{0} (id #{1})'.format(dataset.filename, dataset.id)
            for dataset in datasets
        ]))
        from memory_profiler import memory_usage
        train_begin_time = datetime.utcnow()
        trainer = transformer.train(train_iter)

        mem_usage = memory_usage(-1, interval=0, timeout=None)
        # trainer.clear_temp_data()

        transformer.status = transformer.STATUS_TRAINED
        transformer.set_trainer(trainer)
        transformer.save()
        transformer.memory_usage = max(mem_usage)
        transformer.train_records_count = int(
            sum((d.records_count for d in transformer.datasets)))
        train_end_time = datetime.utcnow()
        transformer.training_time = int(
            (train_end_time - train_begin_time).seconds)
        transformer.save()
    except Exception as exc:
        app.sql_db.session.rollback()
        logging.error(
            'Got exception when training the transformer: {0} \n {1}'.format(
                exc.message, get_task_traceback(exc)))
        transformer.status = transformer.STATUS_ERROR
        transformer.error = str(exc)[:299]
        transformer.save()
        raise TaskException(exc.message, exc)