Code Example #1
def get_blob_ids_of_model_generic_files(model_blob_id):
    oauth = get_oauth()
    fetch_token(oauth)

    file_info = get_file_info_from_blob(oauth, model_blob_id).json()
    if 'ModelInfo' in file_info['metadata'].keys():
        info_key = 'ModelInfo'
    elif 'modelInfo' in file_info['metadata'].keys():
        info_key = 'modelInfo'
    else:
        raise KeyError('No model info')

    model_id = json.loads(file_info['metadata'][info_key])['ModelId']

    with open('{}/{}'.format(TEMP_FOLDER, model_id), 'r') as file_to_read:
        lines = file_to_read.readlines()

    generic_files_ids = []
    for line in lines:
        # each line is a dict-like string; single quotes must become
        # double quotes before it parses as JSON
        generic_file_info = json.loads(line.replace('\'', '"'))
        if generic_file_info['parent_id'] != model_id:
            continue

        generic_files_ids.append(generic_file_info['blob_id'])

    return generic_files_ids
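
The helper above expects a temporary file named after the model id, where every line is a Python-style dict string with 'parent_id' and 'blob_id' keys; replacing single quotes with double quotes turns each line into valid JSON. A minimal self-contained sketch of that parsing step (the line content here is made up):

import json

# made-up line, in the format the helper expects to find in the temp file
line = "{'parent_id': 'model-123', 'blob_id': 'blob-456'}"
# the same conversion the helper applies before json.loads
record = json.loads(line.replace('\'', '"'))
assert record['blob_id'] == 'blob-456'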
Code Example #2
    def test_regression_properties_predicted(self):
        """
        Test for the 'classic regression properties successfully predicted' workflow
        """

        # logging test start
        LOGGER.info('Regression properties predicted test start')
        # create MassTransit emulation dict
        PROPERTIES_PREDICTED_TEST['event_callback'] = properties_predicted
        PROPERTIES_PREDICTED_TEST['command_callback'] = properties_predicted
        properties_predicted_publisher = MTPublisher(PROPERTIES_PREDICTED_TEST)

        # create a valid message to send
        message_body = {
            'ParentId': self.parent_id,
            'FolderId': self.user_id,
            'FolderName': 'test_regression_prediction',
            'DatasetFileName': 'test',
            'DatasetBlobId': self.source_blob_id,
            'DatasetBucket': self.user_id,
            'ModelBlobId': ELASTIC_NETWORK_MODEL_BLOB_ID,
            'ModelBucket': CLASSIC_CLASSIFICATION_MODEL_BUCKET,
            'Id': 'd397a097-68eb-464a-86b2-abd2315ddebf',
            'UserId': self.user_id,
            'CorrelationId': 'c1cc0000-5d8b-0015-16ec-08d52f4161fe'
        }
        # send message
        fetch_token(self.oauth)
        properties_predicted_publisher.publish(message_body)

        self.assertEqual(True, CLASSIC_CLASSIFICATION_PREDICTED_FLAG)
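
The event/command callbacks wired into the MassTransit emulation dicts are simple flag setters; Code Examples #13 and #15 below show the trained-model variants. A hedged sketch of what properties_predicted might look like, following that same pattern and assuming the flag name asserted above:

def properties_predicted(body):
    """
    Hypothetical sketch of the callback wired in above, modelled on the
    trained-model callbacks in Code Examples #13 and #15.

    :param body: MassTransit message
    """
    global CLASSIC_CLASSIFICATION_PREDICTED_FLAG
    CLASSIC_CLASSIFICATION_PREDICTED_FLAG = True

    return None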
Code Example #3
    def test_feature_vectors_calculator_failed(self):
        """
        Test for the feature vectors calculator failure case
        """

        # logging test start
        LOGGER.info('Feature vectors calculator failed case test start')
        # create MassTransit emulation dict
        FEATURE_VECTORS_CALCULATOR_FAIL_TEST['event_callback'] = feature_vectors_calculation_failed
        FEATURE_VECTORS_CALCULATOR_FAIL_TEST['command_callback'] = feature_vectors_calculation_failed
        feature_vectors_calculation_failed_publisher = MTPublisher(FEATURE_VECTORS_CALCULATOR_FAIL_TEST)

        # create a valid message to send
        message_body = {
            'CorrelationId': 'c1cc1111-5d8b-0015-16ec-08d52f4161fb',
            'Fingerprints': [
                {'Type': 'DESC'},
                {'Type': 'FCFC', 'Size': 512, 'Radius': 3}
            ],
            'FileType': 'sdf'
        }

        REDIS_CLIENT.set(
            '{}-file'.format(message_body['CorrelationId']),
            'empty'
        )

        # send message
        fetch_token(self.oauth)
        feature_vectors_calculation_failed_publisher.publish(message_body)

        self.assertEqual(True, FEATURE_VECTORS_CALCULATION_FAILED_FLAG)
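
The test seeds Redis under the key '{CorrelationId}-file' with the literal value 'empty', which is not a parsable dataset, so the calculator takes its failure path. A self-contained sketch of the same key convention (the local Redis connection is an assumption):

import redis

# assumes a local Redis instance; the key convention comes from the test above
redis_client = redis.StrictRedis()
correlation_id = 'c1cc1111-5d8b-0015-16ec-08d52f4161fb'
redis_client.set('{}-file'.format(correlation_id), 'empty')
# the consumer side would read the same key and fail to parse b'empty'
assert redis_client.get('{}-file'.format(correlation_id)) == b'empty'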
Code Example #4
    def test_classic_classification_report_generation(self):
        """
        Test for 'generate report' workflow
        """

        # logging test start
        LOGGER.info('Report generation test start')
        # create MassTransit emulation dict
        GENERATE_REPORT_TEST[
            'event_callback'] = classic_classification_report_generation
        GENERATE_REPORT_TEST[
            'command_callback'] = classic_classification_report_generation
        generate_report_publisher = MTPublisher(GENERATE_REPORT_TEST)
        models_blob_ids = [
            NAIVE_BAYES_MODEL_BLOB_ID,
            LOGISTIC_REGRESSION_MODEL_BLOB_ID
        ]
        # create a valid message to send
        message_body = {
            'CorrelationId': 'c1cc0000-5d8b-0015-16ac-08d52f4161fe',
            'ParentId': self.parent_id,
            'TimeStamp': 'time stamp there',
            'Models': create_models_data(models_blob_ids),
            'UserId': self.user_id
        }
        # send message
        fetch_token(self.oauth)
        generate_report_publisher.publish(message_body)

        # NOTE: placeholder assertion that always passes; the workflow is
        # exercised through the report generation callbacks above
        self.assertEqual(True, True)
Code Example #5
    def test_feature_vectors_calculator_cif(self):
        """
        Test for the feature vectors calculator with a cif file
        """

        # logging test start
        LOGGER.info('Feature vectors calculator test start')
        # create MassTransit emulation dict
        FEATURE_VECTORS_CALCULATOR_TEST['event_callback'] = feature_vectors_calculated
        FEATURE_VECTORS_CALCULATOR_TEST['command_callback'] = feature_vectors_calculated
        feature_vectors_calculated_publisher = MTPublisher(FEATURE_VECTORS_CALCULATOR_TEST)
        # create a valid message to send
        message_body = {
            'CorrelationId': 'c1cc1111-5d8b-0015-16ec-08d52f4261af',
            'Fingerprints': [
            {'Type': 'universal'}
            ],
            'FileType': 'cif'
        }
        with open(self.test_file_cif, 'rb') as file_to_send:
            REDIS_CLIENT.set(
                '{}-file'.format(message_body['CorrelationId']),
                file_to_send.read()
            )
        # send message
        fetch_token(self.oauth)
        feature_vectors_calculated_publisher.publish(message_body)
        time.sleep(10)
        self.assertEqual(True, FEATURE_VECTORS_CALCULATED_FLAG)
Code Example #6
    def test_posting_data_to_blob(self):
        """
        Test for the post_data_to_blob method, which is heavily used in the ML services
        """

        # logging test start
        LOGGER.info('Post data to blob test start')

        # create a test file in the file system
        with open(os.path.join(self.temp_folder, self.test_file_name),
                  'a') as make_test_file:
            make_test_file.write('123 312 222\n')

        # make metadata for POST
        message_object = {
            'ParentId': self.parent_id,
            'UserId': self.user_id
        }
        # make the file data for the POST
        data_file_path = os.path.join(self.temp_folder, self.test_file_name)
        data_file_type = 'text/txt'
        # make a multipart object, which simplifies the POST
        multipart_object = get_multipart_object(
            message_object, data_file_path, data_file_type)

        # POST data to blob storage
        fetch_token(self.oauth)
        response = post_data_to_blob(self.oauth, multipart_object)

        # a successful POST returns status code 200
        self.assertEqual(200, response.status_code)
コード例 #7
0
def conduct_training(model_trainer, method_code, training_parameters, oauth):
    """
    Method for train model, calculate metrics, plot graphs (and thumbnail) and
    POST all data to blob storage

    :param model_trainer: model trainer object
    :param method_code: code of training model using in OSDR
    :param training_parameters: model training parameters
    :param oauth: using in ml service OAuth2Session object
    :type model_trainer: ClassicClassifier, ClassicRegressor, DNNClassifier,
        DNNRegressor
    :type method_code: str
    :type training_parameters: dict
    :return: variables and blob ids using in OSDR
        (for rabbitmq model trained message)
    :rtype: dict
    """

    # add additional training parameter keys, depending on method_code
    update_current_model_data(training_parameters, method_code)

    # train the model using the trainer; models, metrics, plots etc
    # are added as trainer attributes
    model_trainer.train_model(method_code)

    # send an HTTP POST request with the model data to the blob storage API
    fetch_token(oauth)
    response_model = model_trainer.post_model(training_parameters, oauth)
    model_blob_id = response_model.json()[0]
    LOGGER.info('Send model to {}/{}'.format(BLOB_URL, CLIENT_ID))
    # set the model trained flag, used in the 'model trained' or
    # 'training failed' rabbitmq messages to OSDR
    training_parameters['IsModelTrained'] = True
    # post QMRF report generated for current model
    model_trainer.post_qmrf_report(training_parameters, oauth)

    # get the model file's variables used in OSDR
    file_data = get_file_info_from_blob(oauth, model_blob_id).json()
    file_length = file_data['length']
    file_md5 = file_data['mD5']

    # POST plots of the model metrics, and the metrics themselves, to blob storage
    path_to_csv_file = model_trainer.post_performance(training_parameters,
                                                      oauth)
    model_trainer.post_plots(training_parameters, oauth)
    post_thumbnail(model_trainer, training_parameters, oauth)

    return {
        'model_blob_id': model_blob_id,
        'model_file_length': file_length,
        'model_file_md5': file_md5,
        'path_to_csv_file': path_to_csv_file,
        'bins_number': model_trainer.bins
    }
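
conduct_training returns a plain dict; Code Example #21 (train_model) below is the real call site. As a hedged sketch, assuming model_trainer, method_code, body and oauth are prepared as in that function, a caller might consume the result like this:

model_training_data = conduct_training(model_trainer, method_code, body, oauth)
# read back the keys documented in the docstring above
LOGGER.info('Model blob id: {}'.format(model_training_data['model_blob_id']))
LOGGER.info('Model file: {} bytes, md5 {}'.format(
    model_training_data['model_file_length'],
    model_training_data['model_file_md5']))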
Code Example #8
    def test_classic_classification_model_training_naive_bayes(self):
        """
        Test for 'model train' workflow
        """

        # logging test start
        LOGGER.info('Model trained test start')
        # create MassTransit emulation dict
        MODEL_TRAINED_TEST['event_callback'] = classic_classification_train_naive_bayes
        MODEL_TRAINED_TEST['command_callback'] = classic_classification_train_naive_bayes
        model_trained_publisher = MTPublisher(MODEL_TRAINED_TEST)
        # create a valid message to send
        message_body = {
            'SourceBlobId': self.source_blob_id,
            'SourceBucket': self.user_id,
            'ParentId': self.parent_id,
            'Name': 'test_classic_classification_naive_bayes',
            'Method': CODES[NAIVE_BAYES],
            'ClassName': 'Soluble',
            'SubSampleSize': 1.0,
            'TestDatasetSize': 0.2,
            'KFold': 5,
            'Fingerprints': [
                {'Type': 'DESC'},
                {'Type': 'MACCS'},
                {'Type': 'AVALON', 'Size': 512},
                {'Type': 'FCFC', 'Size': 512, 'Radius': 3}
            ],
            'CorrelationId': 'c1cc0000-5d8b-0015-2aab-08d52f3ea201',
            'UserId': self.user_id,
            'Scaler': 'Standard',
            'HyperParameters': {
                'optimizationMethod': 'default',
                'numberOfIterations': 100
            },
            'DnnLayers': None,
            'DnnNeurons': None
        }
        # send message
        fetch_token(self.oauth)
        model_trained_publisher.publish(message_body)
        # wait for the model training to finish
        time.sleep(10)

        self.assertEqual(True, CLASSIC_CLASSIFICATION_NAIVE_TRAINED_FLAG)
Code Example #9
    def test_predictor_fail_event(self):
        """
        Test for 'properties prediction failed' workflow
        """

        # logging test start
        LOGGER.info('Predictor fail test start')
        # create MassTransit emulation dict
        PREDICTOR_FAIL_TEST['event_callback'] = predictor_fail
        PREDICTOR_FAIL_TEST['command_callback'] = predictor_fail
        fail_predictor_publisher = MTPublisher(PREDICTOR_FAIL_TEST)
        # create an invalid message to send
        message_body = {'123': 123}
        # send message
        fetch_token(self.oauth)
        fail_predictor_publisher.publish(message_body)

        self.assertEqual(True, PREDICTOR_FAIL_FLAG)
Code Example #10
    def test_modeler_fail_event(self):
        """
        Test for 'modeling failed' workflow
        """

        # logging test start
        LOGGER.info('Modeler fail test start')
        # create MassTransit emulation dict
        MODELER_FAIL_TEST['event_callback'] = modeler_fail
        MODELER_FAIL_TEST['command_callback'] = modeler_fail
        fail_modeler_publisher = MTPublisher(MODELER_FAIL_TEST)
        # create an invalid message to send
        message_body = {'123': 123}
        # send message
        fetch_token(self.oauth)
        fail_modeler_publisher.publish(message_body)

        self.assertEqual(True, MODELER_FAIL_FLAG)
Code Example #11
    def test_classic_classifier_single_properties_prediction(self):
        """
        Test for classic regressor training optimizer
        """

        # logging test start
        LOGGER.info('Classic regression training optimizer test start')
        # create MassTransit emulation dict
        PREDICT_SINGLE_STRUCTURE_TEST[
            'event_callback'] = classification_single_structure_predicted
        PREDICT_SINGLE_STRUCTURE_TEST[
            'command_callback'] = classification_single_structure_predicted
        single_structure_property_predicted_publisher = MTPublisher(
            PREDICT_SINGLE_STRUCTURE_TEST)

        # create a valid message to send
        message_body = {
            'CorrelationId': 'c1cc0000-5d8b-0015-16ec-08d52f4161fb',
            'Id': 'd397a097-68eb-464a-86b2-abd2315ddeba',
            'Structure': '\n  Ketcher  5 71818102D 1   1.00000     0.00000     0\n\n 13 13  0     0  0            999 V2000\n   -2.3818   -0.6252    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n   -1.5159   -0.1254    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n   -1.5157    0.8745    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0\n   -0.6501   -0.6254    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0\n    0.2159   -0.1256    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.2162    0.8747    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    1.0824    1.3752    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    1.9483    0.8753    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    1.9479   -0.1251    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    1.0818   -0.6256    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    1.0820   -1.6254    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    1.9479   -2.1252    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0\n    0.2161   -2.1254    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0\n  1  2  1  0  0  0  0\n  2  3  2  0  0  0  0\n  2  4  1  0  0  0  0\n  4  5  1  0  0  0  0\n  5  6  2  0  0  0  0\n  6  7  1  0  0  0  0\n  7  8  2  0  0  0  0\n  8  9  1  0  0  0  0\n  9 10  2  0  0  0  0\n  5 10  1  0  0  0  0\n 10 11  1  0  0  0  0\n 11 12  1  0  0  0  0\n 11 13  2  0  0  0  0\nM  END\n',
            'Format': 'MOL',
            'PropertyName': 'Soluble',
            'Models': [
                {
                    'Id': str(uuid.uuid1()),
                    'Blob': {
                        'id': LOGISTIC_REGRESSION_MODEL_BLOB_ID,
                        'bucket': CLIENT_ID
                    }
                },
                {
                    'Id': str(uuid.uuid1()),
                    'Blob': {
                        'id': NAIVE_BAYES_MODEL_BLOB_ID,
                        'bucket': CLIENT_ID
                    }
                }
            ]
        }
        # send message
        fetch_token(self.oauth)
        single_structure_property_predicted_publisher.publish(message_body)

        self.assertEqual(True, CLASSIFICATION_SINGLE_STRUCTURE_PREDICTED_FLAG)
Code Example #12
def preload_ssp_models():
    """
    Method to preload all SSP models to memory, model should have SSP target.
    Using to spped up predictions later
    Add preloaded SSP models to MODELS_IN_MEMORY_CACHE general_helper.py module
    """

    oauth = get_oauth()

    # set default GET request header and parameters values
    headers = {'Accept': 'application/json'}
    params = (
        ('$filter', 'Targets eq \'SSP\''),
        ('PageNumber', '1'),
        ('PageSize', '100'),
    )

    # GET all SSP models information from OSDR web api
    response = requests.get(API_MODELS_ENTITIES_URL,
                            headers=headers,
                            params=params,
                            verify=False)

    # loop to preload all SSP models
    for model_data in response.json():
        fetch_token(oauth)
        # get model blob id and bucket from response
        model_blob_id = model_data['blob']['id']
        model_bucket = model_data['blob']['bucket']
        # get model info
        blob_model_info = get_model_info(oauth, model_blob_id, model_bucket)
        blob_model_info['ModelBlobId'] = model_blob_id
        blob_model_info['ModelBucket'] = model_bucket
        # preset parameters used for prediction
        prediction_parameters = dict()
        prepare_prediction_files(oauth, prediction_parameters, blob_model_info)
        # add the model to the cache if it is not already there
        if model_blob_id not in MODELS_IN_MEMORY_CACHE.keys():
            try:
                MODELS_IN_MEMORY_CACHE[model_blob_id] = cache_model(
                    prediction_parameters['ModelsFolder'])
            except FileNotFoundError:
                LOGGER.error('Can\'t preload SSP model with blob id: {}'.format(
                    model_blob_id))
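
Code Example #22 below deletes entries from MODELS_IN_MEMORY_CACHE after a prediction, so the prediction path presumably checks this cache before loading model files from disk. A hypothetical sketch of that lookup, reusing cache_model from above:

# hypothetical cache-first lookup on the prediction path
cached_model = MODELS_IN_MEMORY_CACHE.get(model_blob_id)
if cached_model is None:
    # fall back to loading model files from disk, as preload_ssp_models does
    cached_model = cache_model(prediction_parameters['ModelsFolder'])
    MODELS_IN_MEMORY_CACHE[model_blob_id] = cached_model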
Code Example #13
def classic_classification_train_logistic_regression(body):
    """
    Callback function which work when modelling success

    :param body: MassTransit message
    """
    global CLASSIC_CLASSIFICATION_LOGISTIC_TRAINED_FLAG
    global CLASSIC_CLASSIFICATION_FILES_BLOB_IDS
    global LOGISTIC_REGRESSION_MODEL_BLOB_ID

    oauth = get_oauth()
    fetch_token(oauth)

    list_of_ids = get_blob_ids_of_model_generic_files(body['BlobId'])
    CLASSIC_CLASSIFICATION_FILES_BLOB_IDS.extend(list_of_ids)
    LOGISTIC_REGRESSION_MODEL_BLOB_ID = body['BlobId']
    CLASSIC_CLASSIFICATION_LOGISTIC_TRAINED_FLAG = True

    return None
Code Example #14
    def test_regression_model_trained_event(self):
        """
        Test for the 'classic regression model successfully trained' workflow
        """

        # logging test start
        LOGGER.info('Regression model trained test start')
        # create MassTransit emulation dict
        MODEL_TRAINED_TEST['event_callback'] = classic_regression_train
        MODEL_TRAINED_TEST['command_callback'] = classic_regression_train
        model_trained_publisher = MTPublisher(MODEL_TRAINED_TEST)
        # create a valid message to send
        message_body = {
            'CorrelationId': 'c1cc0000-5d8b-0015-2aab-08d52f3ea203',
            'SourceBlobId': self.source_blob_id,
            'SourceBucket': self.user_id,
            'ParentId': self.parent_id,
            'Name': 'test_regression',
            'Method': CODES[ELASTIC_NETWORK],
            'ClassName': 'logS',
            'SubSampleSize': 1.0,
            'TestDatasetSize': 0.2,
            'KFold': 2,
            'Fingerprints': [{'Type': 'ECFP', 'Radius': 4, 'Size': 512}],
            'UserId': self.user_id,
            'Scaler': 'Standard',
            'HyperParameters': {
                'OptimizationMethod': 'default',
                'NumberOfIterations': 100
            },
            'DnnLayers': None,
            'DnnNeurons': None
        }
        # send message
        fetch_token(self.oauth)
        model_trained_publisher.publish(message_body)
        # wait for the model training to finish
        time.sleep(10)

        self.assertEqual(True, CLASSIC_REGRESSION_TRAINED_FLAG)
Code Example #15
def classic_classification_train_naive_bayes(body):
    """
    Callback function which work when modelling success

    :param body: MassTransit message
    """
    global CLASSIC_CLASSIFICATION_NAIVE_TRAINED_FLAG
    global NAIVE_BAYES_MODEL_BLOB_ID
    global CLASSIC_CLASSIFICATION_MODEL_BUCKET
    global CLASSIC_CLASSIFICATION_FILES_BLOB_IDS

    oauth = get_oauth()
    fetch_token(oauth)

    list_of_ids = get_blob_ids_of_model_generic_files(body['BlobId'])
    NAIVE_BAYES_MODEL_BLOB_ID = body['BlobId']
    CLASSIC_CLASSIFICATION_MODEL_BUCKET = body['Bucket']
    CLASSIC_CLASSIFICATION_FILES_BLOB_IDS.extend(list_of_ids)

    CLASSIC_CLASSIFICATION_NAIVE_TRAINED_FLAG = True

    return None
Code Example #16
    def test_dnn_classifying_model_trained_event(self):
        """
        Test for the 'DNN classifying model successfully trained' workflow
        """

        # logging test start
        LOGGER.info('DNN classifying model trained test start')
        # create MassTransit emulation dict
        MODEL_TRAINED_TEST['event_callback'] = callback_models_trained
        MODEL_TRAINED_TEST['command_callback'] = callback_models_trained
        model_trained_publisher = MTPublisher(MODEL_TRAINED_TEST)
        # create a valid message to send
        message_body = {
            'SourceBlobId': self.source_blob_id,
            'SourceBucket': self.user_id,
            'ParentId': self.parent_id,
            'Name': 'test_dnn_classifying',
            'SourceFileName': 'test.sdf',
            'Method': CODES[DNN_CLASSIFIER],
            'ClassName': 'Soluble',
            'SubSampleSize': 1.0,
            'TestDatasetSize': 0.2,
            'KFold': 2,
            'Fingerprints': [{'Type': 'ECFP', 'Radius': 2, 'Size': 512}],
            'CorrelationId': 'c1cc0000-5d8b-0015-2aab-08d52f3ea221',
            'UserId': self.user_id,
            'Scaler': 'Standard',
            'HyperParameters': {
                'optimizationMethod': 'default',
                'numberOfIterations': 100
            },
            'DnnLayers': 2,
            'DnnNeurons': 128
        }

        # send message
        fetch_token(self.oauth)
        model_trained_publisher.publish(message_body)
Code Example #17
    def test_classifier_training_optimizer(self):
        """
        Test for classic classifier training optimizer
        """

        # logging test start
        LOGGER.info('Classic classification training optimizer test start')
        # create MassTransit emulation dict
        OPTIMIZE_TRAINING_TEST['event_callback'] = regression_training_optimized
        OPTIMIZE_TRAINING_TEST['command_callback'] = regression_training_optimized
        training_optimized_publisher = MTPublisher(OPTIMIZE_TRAINING_TEST)

        # create a valid message to send
        message_body = {
            'CorrelationId': 'c1ca0000-5d8b-0015-16ec-08d52f4161fe',
            'Id': 'd397a097-68eb-464a-86b2-abd2315ddebf',
            'UserId': self.user_id,
            'SourceFileName': 'FileNameThere',
            'TargetFolderId': self.parent_id,
            'TimeStamp': '00',
            'SourceBlobId': self.source_blob_id,
            'SourceBucket': self.user_id,
            'Methods': [CODES[NAIVE_BAYES], CODES[LOGISTIC_REGRESSION]],
            'ClassName': 'Soluble',
            'HyperParameters': {
                'NumberOfIterations': 100,
                'OptimizationMethod': 'default'
            },
            'DnnLayers': None,
            'DnnNeurons': None
        }
        # send message
        fetch_token(self.oauth)
        training_optimized_publisher.publish(message_body)

        self.assertEqual(True, REGRESSOR_TRAINING_OPTIMIZED)
Code Example #18
    def test_training_optimizer_fail(self):
        """
        Test for classic regressor training optimizer
        """

        # logging test start
        LOGGER.info('Training optimizer failed test start')
        # create MassTransit emulation dict
        OPTIMIZE_TRAINING_FAIL_TEST[
            'event_callback'] = training_optimizer_fail
        OPTIMIZE_TRAINING_FAIL_TEST[
            'command_callback'] = training_optimizer_fail
        training_optimizer_fail_publisher = MTPublisher(
            OPTIMIZE_TRAINING_FAIL_TEST)

        # create a message with an empty Methods list to trigger the failure
        message_body = {
            'Methods': []
        }
        # send message
        fetch_token(self.oauth)
        training_optimizer_fail_publisher.publish(message_body)

        self.assertEqual(True, TRAINING_OPTIMIZER_FAIL_FLAG)
Code Example #19
def get_blob_id(test_object):
    """
    Method for add file to blob storage and get file's blob id

    :param test_object: TestCase class object
    :return: saved file blob id
    """

    fetch_token(test_object.oauth)

    # make file's metadata
    metadata = {
        'ParentId': test_object.parent_id,
        'UserId': test_object.user_id
    }
    # path to the test file
    path_to_file = test_object.test_file
    # prepare multipart object to POST
    multipart_file = get_multipart_object(metadata, path_to_file, 'text/sdf')
    # POST file with metadata to blob storage
    response = post_data_to_blob(
        test_object.oauth, multipart_file, bucket_id=test_object.user_id)

    return response.json()[0]
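
The tests above read self.oauth, self.parent_id, self.user_id and self.source_blob_id, so get_blob_id is presumably called during test setup. A hypothetical sketch of that wiring (class name and fixture values are placeholders):

import unittest

class ExampleBlobTest(unittest.TestCase):
    """Hypothetical TestCase showing where get_blob_id might fit."""

    def setUp(self):
        self.oauth = get_oauth()
        self.parent_id = 'parent-folder-id'  # placeholder fixture value
        self.user_id = 'user-id'             # placeholder fixture value
        self.test_file = 'test.sdf'          # sdf file, matching 'text/sdf'
        # upload the dataset once; the blob id is reused in message bodies
        self.source_blob_id = get_blob_id(self)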
Code Example #20
def generate_training_report(body):
    """
    Pika callback function used by training report generator.
    Make plots files, general metrics csv file and report file if success.

    :param body: RabbitMQ MT message's body
    """

    oauth = get_oauth()
    fetch_token(oauth)

    # define variables used by the ml reporter
    model = body['Models'][0]
    model_blob_id = model['blobId']
    file_info = get_file_info_from_blob(oauth, model_blob_id).json()
    if 'ModelInfo' in file_info['metadata'].keys():
        info_key = 'ModelInfo'
    elif 'modelInfo' in file_info['metadata'].keys():
        info_key = 'modelInfo'
    else:
        raise KeyError('No model info')
    model_info = json.loads(file_info['metadata'][info_key])
    body['Bins'] = model_info['Bins']
    model_name = model_info['ModelName']
    model_type = model_info['ModelType']
    base_folder = '{}/general_training_report_{}'.format(
        TEMP_FOLDER, uuid.uuid1())
    make_directory(base_folder)
    LOGGER.info('MODEL INFO: {}'.format(model_info))
    LOGGER.info('MODEL NAME: {}'.format(model_name))
    LOGGER.info('MODEL TYPE: {}'.format(model_type))

    # generate general metrics dict
    all_models_metrics = []
    for model in body['Models']:
        # if something is wrong with the model file from blob storage
        if 'genericFiles' not in model.keys():
            raise TypeError('Empty model\'s generic files blob ids')

        for file_blob_id in model['genericFiles']:
            file_info = get_file_info_from_blob(oauth, file_blob_id).json()
            if 'fileInfo' in file_info['metadata'].keys():
                fileinfo_key = 'fileInfo'
            elif 'FileInfo' in file_info['metadata'].keys():
                fileinfo_key = 'FileInfo'
            else:
                raise KeyError('No fileInfo key in: {}'.format(
                    file_info['metadata'].keys()))

            file_info = json.loads(file_info['metadata'][fileinfo_key])

            if 'fileType' in file_info.keys():
                filetype_key = 'fileType'
            elif 'FileType' in file_info.keys():
                filetype_key = 'FileType'
            else:
                filetype_key = None

            if filetype_key and file_info[filetype_key] == TRAINING_CSV_METRICS:
                csv_blob_id = file_blob_id
                csv_model_metrics = get_file_from_blob(csv_blob_id,
                                                       oauth).content
                all_models_metrics.append(csv_model_metrics.decode())

            LOGGER.info('CURRENT MODEL INFO: {}'.format(file_info))

    LOGGER.info('ALL MODELS METRICS: {}'.format(all_models_metrics))
    # write general metrics data to csv file
    csv_files_names = write_csvs_files(all_models_metrics)
    general_csv_dict = merge_csv_files(csv_files_names)
    rows = make_general_csv_rows(general_csv_dict)
    general_csv_file_path = write_rows_to_csv_file(rows, base_folder)
    metrics = html_metrics_from_dict(general_csv_dict)

    fetch_token(oauth)
    # make csv info for blob storage
    general_csv_info = {
        'FileInfo':
        json.dumps({
            'modelName': model_name,
            'fileType': ALL_MODELS_TRAINING_CSV_METRICS
        }),
        'SkipOsdrProcessing':
        'true'
    }
    # make a multipart object, ready to POST to blob storage,
    # including the csv file and its file info
    multipart_general_csv = get_multipart_object(
        body,
        general_csv_file_path,
        'text/csv',
        additional_fields=general_csv_info)
    # POST the metrics csv file to blob storage
    post_data_to_blob(oauth, multipart_general_csv)

    # create general images
    body['NumberOfGenericFiles'] = 0
    path_to_radar_plot = None
    try:
        if model_type == CLASSIFIER:
            LOGGER.info('Creating radar_plot')
            nbits = body['Bins']
            path_to_radar_plot = radar_plot(general_csv_file_path,
                                            base_folder,
                                            nbits,
                                            titlename=model_name)
            # make radar plot multipart encoded object
            multipart_radar_plot = get_multipart_object(
                body,
                path_to_radar_plot,
                'image/png',
                additional_fields={'correlationId': body['CorrelationId']})

            # send, http POST request to blob storage api with radar plot
            post_data_to_blob(oauth, multipart_radar_plot)
            body['NumberOfGenericFiles'] += 1
    except:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Post generic data exception')

    optimizer_metrics = REDIS_CLIENT.get('optimizer_metrics_{}'.format(
        body['CorrelationId']))

    if optimizer_metrics:
        optimizer_metrics = html_optimal_metrics_from_dict(
            json.loads(optimizer_metrics), model_type)

    # add metrics and images to pdf report file
    context = {
        'metrics': metrics,
        'radar_plots': [path_to_radar_plot],
        'optimizer': optimizer_metrics
    }
    pdf_path = make_pdf_report(base_folder, context, model_name='general')
    fetch_token(oauth)
    multipart_general_csv = get_multipart_object(
        body,
        pdf_path,
        'application/pdf',
        additional_fields={'correlationId': body['CorrelationId']})
    post_data_to_blob(oauth, multipart_general_csv)
    body['NumberOfGenericFiles'] += 1

    # remove temporary directory
    shutil.rmtree(base_folder, ignore_errors=True)

    report_generated = training_report_generated_message(body)
    model_report_generated_message_publisher = PurePublisher(REPORT_GENERATED)
    model_report_generated_message_publisher.publish(report_generated)

    LOGGER.info('Report generated!')

    return None
Code Example #21
def train_model(body):
    """
    Pika callback function used by ml modeller. Make plots files, metrics files
    and model file if success.

    :param body: RabbitMQ MT message's body
    """

    # prepare variables needed for the calculations
    body['CurrentModelId'] = body['CorrelationId']
    oauth = get_oauth()

    if not body['Method']:
        raise ValueError('Empty method in model trainer')

    body['Method'] = body['Method'].lower()
    method_code = body['Method']
    body['Name'] = '_'.join(algorithm_name_by_code(method_code).split())

    # publish training started event to OSDR
    fetch_token(oauth)
    publish_start_training_event(body)

    # validate input data
    # raise error with invalid parameter
    validate_user_input(body)

    # make dataframe using input parameters
    model_type = model_type_by_code(method_code)
    try:
        dataframe = make_dataframe(model_type, body, oauth)
    except:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Make dataframe exception')

    # calculate scaler model
    model_trainer = make_model_trainer(method_code, body, dataframe)
    LOGGER.info('Sending scaler file to {}/{}'.format(BLOB_URL, CLIENT_ID))
    fetch_token(oauth)

    # make applicability domain files and calculate values
    LOGGER.info('Calculate applicability domain')
    model_trainer.make_applicability_domain()

    # count of generic files for the current model, excluding model (*.sav) files
    body['NumberOfGenericFiles'] = 0
    # train chosen model
    LOGGER.info('Start model training')
    try:
        model_training_data = conduct_training(model_trainer, method_code,
                                               body, oauth)
    except:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Model training exception')

    # POST model's files (images, csvs etc)
    try:
        # POST classifier model files
        if model_type == CLASSIFIER:
            post_classifier(model_trainer, body, oauth,
                            model_training_data['path_to_csv_file'])
        # POST regressor model files
        elif model_type == REGRESSOR:
            post_regressor(model_trainer, body, oauth)
    except:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Post generic data exception')

    # update template tags with all needed data for model's report
    model_trainer.make_report_text()

    # make pdf report for trained model
    pdf_path = make_pdf_report(model_trainer.sub_folder,
                               model_trainer.template_tags,
                               model_name=body['Name'])
    LOGGER.info('Sending pdf report to {}/{}'.format(BLOB_URL, CLIENT_ID))

    # POST pdf report to blob storage
    fetch_token(oauth)
    model_report_info = {
        'FileInfo':
        json.dumps({
            'modelName': model_trainer.cv_model.model_name,
            'fileType': MODEL_PDF_REPORT
        }),
        'ParentId':
        body['CurrentModelId']
    }
    multipart_pdf = get_multipart_object(body,
                                         pdf_path,
                                         'application/pdf',
                                         additional_fields=model_report_info)
    post_data_to_blob(oauth, multipart_pdf)
    body['NumberOfGenericFiles'] += 1

    # remove temporary directory
    shutil.rmtree(model_trainer.sub_folder, ignore_errors=True)

    # send model trained message to OSDR via rabbitmq
    model_trained = model_trained_message(body, model_training_data,
                                          model_trainer)
    model_trained_message_publisher = PurePublisher(MODEL_TRAINED)
    model_trained_message_publisher.publish(model_trained)

    LOGGER.info('Finished Calculations!')

    return None
Code Example #22
def callback(body):
    """
    Pika callback function used by ml predictor.
    Make file with predicted properties by picked model.
    Send file to blob storage for OSDR

    :param body: RabbitMQ MT message's body
    """

    oauth = get_oauth()
    prediction_parameters = dict()
    fetch_token(oauth)
    # update prediction parameters
    # add dataset as bytes
    # add dataset file name, just for report message
    prediction_parameters['Dataset'], prediction_parameters[
        'DatasetFileName'] = get_dataset(oauth, body)
    model_info = get_model_info(oauth, body['ModelBlobId'],
                                body['ModelBucket'])

    fetch_token(oauth)
    model_info['ModelBlobId'] = body['ModelBlobId']
    model_info['ModelBucket'] = body['ModelBucket']
    model_info['ModelCode'] = algorithm_code_by_name(model_info['Method'])
    # update prediction parameters with model parameters, such as the density
    # matrix, distance model, MODI, fingerprints etc
    prepare_prediction_parameters(oauth, prediction_parameters, model_info)

    # update prediction parameters
    # add list of molecules
    prediction_parameters['Molecules'] = get_molecules_from_sdf_bytes(
        prediction_parameters['Dataset'])

    # define predictor object using prediction parameters
    # make prediction for all molecules
    ml_predictor = MLPredictor(prediction_parameters)
    ml_predictor.make_prediction()

    # send prediction result to OSDR
    # write prediction to csv
    prediction_csv_path = ml_predictor.write_prediction_to_csv()
    fetch_token(oauth)
    # prepare multipart object
    multipart_csv = get_multipart_object(body, prediction_csv_path, 'text/csv')
    # POST data to blob storage
    response_csv = post_data_to_blob(oauth, multipart_csv)

    # get prediction blob id and publish message in properties predicted queue
    prediction_created_event = property_predicted(
        body, os.path.basename(prediction_csv_path),
        response_csv.json()[0])
    properties_predicted_publisher = PurePublisher(PROPERTIES_PREDICTED)
    properties_predicted_publisher.publish(prediction_created_event)

    # remove prediction file from temporary folder
    os.remove(prediction_csv_path)
    # remove temporary models folder
    shutil.rmtree(prediction_parameters['ModelsFolder'], ignore_errors=True)
    # clear memory
    del MODELS_IN_MEMORY_CACHE[model_info['ModelBlobId']]

    return None
Code Example #23
def callback(body):
    """
    Pika callback function used by single structure predictor.
    Make list of json with prediction data for each model prediction.

    :param body: RabbitMQ MT message's body
    """

    # start total prediction time counter
    start_prediction_total = time()
    models_data = []
    oauth = get_oauth()
    # reformat the molecule if the mol string uses "\\n" instead of "\n"
    if '\\n' in body['Structure']:
        body['Structure'] = body['Structure'].replace('\\n', '\n')

    # choose the molecules converter function depending on the input structure type
    molecules_converter_method = None
    if body['Format'] == 'MOL':
        molecules_converter_method = molecules_from_mol_strings
    elif body['Format'] == 'SMILES':
        molecules_converter_method = molecules_from_smiles

    # read molecules from input mol string or SMILES
    molecules = None
    exception_message = None
    try:
        molecules = molecules_converter_method([body['Structure']])
    except:
        # log error traceback
        logging_exception_message(LOGGER)
        exception_message = 'Get molecule from molstring exception'
    # get models ids, models blob ids, models buckets from input message
    fetch_token(oauth)
    models_ids, models_blob_ids, models_buckets = get_models_from_body_message(
        body)
    # make a prediction with each model
    for model_id, model_blob_id, model_bucket in zip(models_ids,
                                                     models_blob_ids,
                                                     models_buckets):
        # start current prediction counter
        start_current_prediction = time()

        start_timer = time()
        # exception_message keeps any molecule conversion error from above
        # (note it also persists across loop iterations)

        # initialize prediction parameters
        prediction_parameters = dict()
        prediction_parameters['DatasetFileName'] = 'Single Structure Predict'
        prediction_parameters['Molecules'] = molecules

        # get current model info
        fetch_token(oauth)
        blob_model_info = get_model_info(oauth, model_blob_id, model_bucket)
        blob_model_info['ModelBlobId'] = model_blob_id
        blob_model_info['ModelBucket'] = model_bucket
        blob_model_info['ModelCode'] = algorithm_code_by_name(
            blob_model_info['Method'])

        # add prediction data to json
        # add training parameters
        # add dataset parameters
        predicted_data = {
            'id': model_id,
            'trainingParameters': training_parameters(blob_model_info),
            'property': property_parameters(blob_model_info),
            'dataset': dataset(blob_model_info),
            'reportId': str(uuid.uuid1())
        }

        fetch_token(oauth)
        LOGGER.info('INITIAL PREPARING: {} sec'.format(time() - start_timer))
        try:
            # update prediction parameters
            prepare_prediction_parameters(oauth, prediction_parameters,
                                          blob_model_info)
        except:
            # log error traceback
            logging_exception_message(LOGGER)
            if not exception_message:
                exception_message = 'Prepare prediction parameters exception'

        start_timer = time()
        try:
            # make prediction
            ml_predictor = MLPredictor(prediction_parameters)
            ml_predictor.make_prediction()
            prediction = ml_predictor.prediction
        except:
            # log error traceback
            logging_exception_message(LOGGER)
            if not exception_message:
                exception_message = 'Predictor exception'
        LOGGER.info('PREDICTION TIME: {} sec'.format(time() - start_timer))

        # add error to json if something wrong
        if exception_message:
            predicted_data['error'] = error(exception_message)
        else:
            predicted_data['result'] = result(prediction)
            predicted_data['applicabilityDomain'] = applicability_domain(
                prediction)

        # stop current prediction counter
        current_prediction_time_seconds = time() - start_current_prediction
        predicted_data['predictionElapsedTime'] = int(
            current_prediction_time_seconds * 1000)

        models_data.append(predicted_data)

    # stop total predictions counter
    total_prediction_time_seconds = time() - start_prediction_total
    body['Data'] = {
        'predictionElapsedTime': int(total_prediction_time_seconds * 1000),
        'models': models_data
    }
    # add prediction consensus data to the outgoing message
    consensus_data = consensus(models_data)
    if consensus_data:
        body['Data'].update({'consensus': consensus_data})

    # send (publish) properties predicted message to OSDR
    prediction_created_event = single_structure_property_predicted(body)
    properties_predicted_publisher = PurePublisher(SINGLE_STRUCTURE_PREDICTED)
    properties_predicted_publisher.publish(prediction_created_event)

    return None
Code Example #24
File: optimizer.py Project: scidatasoft/ml-services
def find_optimal_parameters(body):
    """
    Pika callback function used by ml optimizer
    Find optimal training fingerprints set for input dataset
    Using only 1000 (by default) or less structures from input dataset
    Send overall optimizing result to Redis, to use it in ml training report

    :param body: RabbitMQ MT message's body
    :type body: dict
    """

    oauth = get_oauth()

    # check input methods
    if not body['Methods']:
        raise ValueError('Empty Methods')

    # calculate metrics for each fingerprints set
    metrics, target_metric = fingerprints_grid_search(
        oauth, body, BASE_FINGERPRINTS)
    # send all metrics to redis
    # they are added to the training report later
    REDIS_CLIENT.setex(
        'optimizer_metrics_{}'.format(body['CorrelationId']), EXPIRATION_TIME,
        json.dumps(metrics)
    )
    # find best fingerprints set
    optimal_fingerprints = sorted(
        metrics.values(), key=lambda value: value['metrics'][target_metric],
        reverse=True
    )[0]['fptype']
    # set other default 'optimal' parameters for training model
    body['SubSampleSize'] = 1.0
    body['TestDataSize'] = 0.3
    body['Scaler'] = 'MinMax'
    body['KFold'] = 5
    body['Fingerprints'] = optimal_fingerprints
    body['OptimizationMethod'] = 'default'
    body['NumberOfIterations'] = 100

    # make optimizer metrics csv and post it to blob storage
    formatted_metrics = TMP_TMP(
        metrics, model_type_by_code(body['Methods'][0].lower()))
    csv_path = '{}/ml_optimizer/{}/optimizing.csv'.format(
        TEMP_FOLDER, body['CorrelationId'])
    write_optimized_metrics_to_csv(formatted_metrics, csv_path)
    multipart_model = get_multipart_object(
        body, csv_path, 'application/x-spss-sav',
        additional_fields={'ParentId': body['TargetFolderId']}
    )

    # send optimizer metrics csv file to blob storage
    fetch_token(oauth)
    response = post_data_to_blob(oauth, multipart_model)
    LOGGER.info('Optimizer csv status code: {}'.format(response.status_code))

    # send best fingerprints set and 'optimal' parameters to training model
    training_optimized = model_training_optimized(body)
    training_optimized_message_publisher = PurePublisher(TRAINING_OPTIMIZED)
    training_optimized_message_publisher.publish(training_optimized)

    # clear current optimization folder
    shutil.rmtree(
        '{}/ml_optimizer/{}'.format(TEMP_FOLDER, body['CorrelationId']),
        ignore_errors=True
    )
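
The 'find best fingerprints set' step above is just a sort over the collected metrics; a self-contained toy run of the same expression (metric name and values are made up):

# toy metrics dict shaped like the optimizer's, with a made-up target metric
metrics = {
    'set_a': {'metrics': {'AUC': 0.81},
              'fptype': [{'Type': 'ECFP', 'Radius': 2, 'Size': 512}]},
    'set_b': {'metrics': {'AUC': 0.93},
              'fptype': [{'Type': 'MACCS'}]},
}
target_metric = 'AUC'
optimal_fingerprints = sorted(
    metrics.values(), key=lambda value: value['metrics'][target_metric],
    reverse=True
)[0]['fptype']
assert optimal_fingerprints == [{'Type': 'MACCS'}]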