Exemple #1
0
    def test_posting_data_to_blob(self):
        """
        Test for post_data_to_blob method. Used in ML services a lot

        Appends a line to the test file in temporary folder (file is created
        if it does not exist), POSTs that file with its metadata to blob
        storage and expects 200 status code in response
        """

        # logging test start
        LOGGER.info('Post data to blob test start')

        # create test file in file system
        # context manager closes the file even if writing fails
        data_file_path = os.path.join(self.temp_folder, self.test_file_name)
        with open(data_file_path, 'a') as test_file:
            test_file.write('123 312 222\n')

        # make metadata for POST
        message_object = {
            'ParentId': self.parent_id,
            'UserId': self.user_id
        }
        # make file data POST
        data_file_type = 'text/txt'
        # make multipart object, which used for simplifying POSTing
        multipart_object = get_multipart_object(
            message_object, data_file_path, data_file_type)

        # POST data to blob storage
        fetch_token(self.oauth)
        response = post_data_to_blob(self.oauth, multipart_object)

        # if post successful, 200 status code
        self.assertEqual(200, response.status_code)
def post_classifier(classifier, training_parameters, oauth, metrics_path):
    """
    Create radar plot for classifier model and POST it to blob storage.
    Plot path is added to trainer's template tags (used on pdf report
    creation), 'NumberOfGenericFiles' counter of training parameters
    is increased by one

    :param classifier: model trainer object
    :param training_parameters: model training parameters
    :param oauth: using in ml service OAuth2Session object
    :param metrics_path: path to model's metrics csv file
    :type classifier: ClassicClassifier, DNNClassifier
    :type training_parameters: dict
    """

    LOGGER.info('Creating radar_plot')
    # number of bits is taken from model trainer
    bits_number = classifier.bins

    # draw radar plot image using metrics csv, keep path to that image
    radar_plot_path = radar_plot(
        metrics_path, classifier.sub_folder, bits_number,
        titlename=training_parameters['Name'])
    # remember plot in template tags, pdf report creation reads them
    classifier.template_tags['radar_plots'].append(radar_plot_path)

    # additional fields which blob storage gets together with the image
    file_info = json.dumps({'modelName': classifier.cv_model.model_name})
    blob_fields = {
        'FileInfo': file_info,
        'ParentId': training_parameters['CurrentModelId'],
    }
    # multipart encoded object with image and its fields
    radar_plot_payload = get_multipart_object(
        training_parameters, radar_plot_path, 'image/png',
        additional_fields=blob_fields)

    LOGGER.info('Sending radar plot to {}/{}'.format(BLOB_URL, CLIENT_ID))
    # count generic file first, then POST it to blob storage api
    training_parameters['NumberOfGenericFiles'] += 1
    post_data_to_blob(oauth, radar_plot_payload)
def post_thumbnail(model_trainer, training_parameters, oauth):
    """
    POST model's thumbnail image to blob storage, then send (publish)
    message with thumbnail blob id to OSDR

    :param model_trainer: model trainer object
    :param training_parameters: model training parameters
    :param oauth: using in ml service OAuth2Session object
    :type model_trainer: ClassicClassifier, ClassicRegressor, DNNClassifier,
        DNNRegressor
    :type training_parameters: dict
    """

    # thumbnail image path is stored among 'mean' plots of model trainer
    mean_plots = model_trainer.plots['mean']
    thumbnail_path = mean_plots['thumbnail_plot_path']

    # thumbnail description for blob storage
    file_info = json.dumps({
        'modelName': model_trainer.cv_model.model_name,
        'fileType': THUMBNAIL_IMAGE,
    })
    blob_fields = {
        'FileInfo': file_info,
        'SkipOsdrProcessing': 'true',
        'ParentId': training_parameters['CurrentModelId'],
    }

    # multipart object with image and its description
    thumbnail_payload = get_multipart_object(
        training_parameters, thumbnail_path, 'image/png',
        additional_fields=blob_fields)
    # POST the object, blob storage answers with saved blob ids
    blob_response = post_data_to_blob(oauth, thumbnail_payload)
    saved_blob_ids = blob_response.json()

    # first blob id is thumbnail's one, publish it in OSDR via rabbitmq
    publish_thumbnail_generated(training_parameters, saved_blob_ids[0])
def post_regressor(regresor, training_parameters, oauth):
    """
    Create distribution plot for regressor model and POST it
    to blob storage. 'NumberOfGenericFiles' counter of training parameters
    is increased by one

    :param regresor: model trainer object
    :param training_parameters: model training parameters
    :param oauth: using in ml service OAuth2Session object
    :type regresor: ClassicRegressor, DNNRegressor
    :type training_parameters: dict
    """

    LOGGER.info('Creating distribution_plot')

    # draw distribution plot, keep path to plot's image
    distribution_path = distribution_plot(
        regresor, model_name=training_parameters['Name'])

    # additional fields which blob storage gets together with the image
    file_info = json.dumps({'modelName': regresor.cv_model.model_name})
    blob_fields = {
        'FileInfo': file_info,
        'ParentId': training_parameters['CurrentModelId'],
    }
    # multipart encoded object with image and its fields
    distribution_payload = get_multipart_object(
        training_parameters, distribution_path, 'image/png',
        additional_fields=blob_fields)

    LOGGER.info('Sending distribution plot to {}/{}'.format(
        BLOB_URL, CLIENT_ID))
    # count generic file first, then POST it to blob storage api
    training_parameters['NumberOfGenericFiles'] += 1
    post_data_to_blob(oauth, distribution_payload)
Exemple #5
0
def get_blob_id(test_object):
    """
    Save test object's file to blob storage and return blob id of that file

    :param test_object: TestCase class object
    :return: saved file blob id
    """

    fetch_token(test_object.oauth)

    # multipart object to POST: file metadata, path to file, file type
    file_payload = get_multipart_object(
        {
            'ParentId': test_object.parent_id,
            'UserId': test_object.user_id,
        },
        test_object.test_file,
        'text/sdf')
    # POST file with metadata to user's bucket of blob storage
    blob_response = post_data_to_blob(
        test_object.oauth, file_payload, bucket_id=test_object.user_id)
    saved_blob_ids = blob_response.json()

    return saved_blob_ids[0]
def _first_present_key(mapping, *candidates):
    """
    Return first of candidates which is contained in mapping

    :param mapping: container where candidates are looked up
    :param candidates: keys names, ordered by priority
    :return: first candidate found in mapping, None if nothing found
    """

    for candidate in candidates:
        if candidate in mapping:
            return candidate

    return None


def generate_training_report(body):
    """
    Pika callback function used by training report generator.
    Make plots files, general metrics csv file and report file if success.

    Training metrics csv files of all models from body['Models'] are merged
    to general csv file. General csv file, radar plot (classifier models
    only) and pdf report are POSTed to blob storage, after that report
    generated message is published.
    Temporary report folder is removed even if report generation fails.

    :param body: RabbitMQ MT message's body
    :raise KeyError: if blob metadata have no model info or file info key
    :raise TypeError: if some model have no 'genericFiles' key
    :raise Exception: if radar plot creation or POSTing failed
    """

    oauth = get_oauth()
    fetch_token(oauth)

    # define using variables for ml reporter
    # model info is taken from metadata of first model's blob
    model = body['Models'][0]
    model_blob_id = model['blobId']
    file_info = get_file_info_from_blob(oauth, model_blob_id).json()
    # metadata key may start with upper or lower case letter
    info_key = _first_present_key(
        file_info['metadata'], 'ModelInfo', 'modelInfo')
    if info_key is None:
        raise KeyError('No model info')
    model_info = json.loads(file_info['metadata'][info_key])
    body['Bins'] = model_info['Bins']
    model_name = model_info['ModelName']
    model_type = model_info['ModelType']
    base_folder = '{}/general_training_report_{}'.format(
        TEMP_FOLDER, uuid.uuid1())
    make_directory(base_folder)

    # everything below may fail after temporary folder creation,
    # so folder removing is placed to finally section
    try:
        LOGGER.info('MODEL INFO: {}'.format(model_info))
        LOGGER.info('MODEL NAME: {}'.format(model_name))
        LOGGER.info('MODEL TYPE: {}'.format(model_type))

        # generate general metrics dict
        all_models_metrics = []
        for current_model in body['Models']:
            # if something wrong with model file from blob storage
            if 'genericFiles' not in current_model:
                raise TypeError('Empty model\'s generic files blob ids')

            for file_blob_id in current_model['genericFiles']:
                blob_info = get_file_info_from_blob(
                    oauth, file_blob_id).json()
                metadata = blob_info['metadata']
                fileinfo_key = _first_present_key(
                    metadata, 'fileInfo', 'FileInfo')
                if fileinfo_key is None:
                    raise KeyError('No fileInfo key in: {}'.format(
                        metadata.keys()))

                file_info = json.loads(metadata[fileinfo_key])
                # file type is optional, files without it are skipped
                filetype_key = _first_present_key(
                    file_info, 'fileType', 'FileType')

                if (filetype_key
                        and file_info[filetype_key] == TRAINING_CSV_METRICS):
                    csv_model_metrics = get_file_from_blob(
                        file_blob_id, oauth).content
                    all_models_metrics.append(csv_model_metrics.decode())

                LOGGER.info('CURRENT MODEL INFO: {}'.format(file_info))

        LOGGER.info('ALL MODELS METRICS: {}'.format(all_models_metrics))
        # write general metrics data to csv file
        csv_files_names = write_csvs_files(all_models_metrics)
        general_csv_dict = merge_csv_files(csv_files_names)
        rows = make_general_csv_rows(general_csv_dict)
        general_csv_file_path = write_rows_to_csv_file(rows, base_folder)
        metrics = html_metrics_from_dict(general_csv_dict)

        fetch_token(oauth)
        # make csv info for blob storage
        general_csv_info = {
            'FileInfo':
            json.dumps({
                'modelName': model_name,
                'fileType': ALL_MODELS_TRAINING_CSV_METRICS
            }),
            'SkipOsdrProcessing':
            'true'
        }
        # make multipart object prepared to POST to blob storage
        # include csv file and file info
        multipart_general_csv = get_multipart_object(
            body,
            general_csv_file_path,
            'text/csv',
            additional_fields=general_csv_info)
        # POST metrics csv file to blob storage
        post_data_to_blob(oauth, multipart_general_csv)

        # create general images
        body['NumberOfGenericFiles'] = 0
        path_to_radar_plot = None
        try:
            if model_type == CLASSIFIER:
                LOGGER.info('Creating radar_plot')
                nbits = body['Bins']
                path_to_radar_plot = radar_plot(general_csv_file_path,
                                                base_folder,
                                                nbits,
                                                titlename=model_name)
                # make radar plot multipart encoded object
                multipart_radar_plot = get_multipart_object(
                    body,
                    path_to_radar_plot,
                    'image/png',
                    additional_fields={
                        'correlationId': body['CorrelationId']
                    })

                # send, http POST request to blob storage api
                # with radar plot
                post_data_to_blob(oauth, multipart_radar_plot)
                body['NumberOfGenericFiles'] += 1
        except Exception:
            # only regular errors are converted to generic exception,
            # KeyboardInterrupt and SystemExit are passed through untouched
            # log error traceback
            logging_exception_message(LOGGER)
            raise Exception('Post generic data exception')

        optimizer_metrics = REDIS_CLIENT.get('optimizer_metrics_{}'.format(
            body['CorrelationId']))

        if optimizer_metrics:
            optimizer_metrics = html_optimal_metrics_from_dict(
                json.loads(optimizer_metrics), model_type)

        # add metrics and images to pdf report file
        # NOTE(review): 'radar_plots' is [None] if model is not classifier,
        # confirm that report template expects it
        context = {
            'metrics': metrics,
            'radar_plots': [path_to_radar_plot],
            'optimizer': optimizer_metrics
        }
        pdf_path = make_pdf_report(
            base_folder, context, model_name='general')
        fetch_token(oauth)
        multipart_pdf = get_multipart_object(
            body,
            pdf_path,
            'application/pdf',
            additional_fields={'correlationId': body['CorrelationId']})
        post_data_to_blob(oauth, multipart_pdf)
        body['NumberOfGenericFiles'] += 1
    finally:
        # remove temporary directory, no matter report generated or not
        shutil.rmtree(base_folder, ignore_errors=True)

    report_generated = training_report_generated_message(body)
    model_report_generated_message_publisher = PurePublisher(REPORT_GENERATED)
    model_report_generated_message_publisher.publish(report_generated)

    LOGGER.info('Report generated!')

    return None
def callback(body):
    """
    Pika callback function used by ml predictor.
    Predict properties of dataset's molecules by picked model,
    write prediction to csv file and send that file to blob storage for OSDR

    :param body: RabbitMQ MT message's body
    """

    oauth = get_oauth()
    prediction_parameters = {}
    fetch_token(oauth)

    # dataset as bytes and dataset file name (name is just for report message)
    dataset, dataset_file_name = get_dataset(oauth, body)
    prediction_parameters['Dataset'] = dataset
    prediction_parameters['DatasetFileName'] = dataset_file_name
    model_info = get_model_info(
        oauth, body['ModelBlobId'], body['ModelBucket'])

    fetch_token(oauth)
    model_info['ModelBlobId'] = body['ModelBlobId']
    model_info['ModelBucket'] = body['ModelBucket']
    model_info['ModelCode'] = algorithm_code_by_name(model_info['Method'])
    # extend prediction parameters with model's ones, such as density matrix,
    # distance model, MODI, fingerprints etc
    prepare_prediction_parameters(oauth, prediction_parameters, model_info)

    # list of molecules is parsed from dataset bytes
    molecules = get_molecules_from_sdf_bytes(
        prediction_parameters['Dataset'])
    prediction_parameters['Molecules'] = molecules

    # predictor makes prediction for all molecules
    predictor = MLPredictor(prediction_parameters)
    predictor.make_prediction()

    # prediction result is written to csv file and POSTed to blob storage
    csv_path = predictor.write_prediction_to_csv()
    fetch_token(oauth)
    csv_payload = get_multipart_object(body, csv_path, 'text/csv')
    csv_response = post_data_to_blob(oauth, csv_payload)

    # publish message with prediction file name and blob id
    # in properties predicted queue
    csv_file_name = os.path.basename(csv_path)
    csv_blob_id = csv_response.json()[0]
    predicted_event = property_predicted(body, csv_file_name, csv_blob_id)
    PurePublisher(PROPERTIES_PREDICTED).publish(predicted_event)

    # cleanup: prediction file, temporary models folder, cached model
    os.remove(csv_path)
    shutil.rmtree(prediction_parameters['ModelsFolder'], ignore_errors=True)
    del MODELS_IN_MEMORY_CACHE[model_info['ModelBlobId']]
def train_model(body):
    """
    Pika callback function used by ml modeller. Make plots files, metrics files
    and model file if success.

    Publishes training started event, validates user input, trains chosen
    model, POSTs model's generic files and pdf report to blob storage,
    then publishes model trained message.
    Temporary folder of model trainer is removed even if training fails.

    :param body: RabbitMQ MT message's body
    :raise ValueError: if body['Method'] is empty
    :raise Exception: if dataframe making, model training
        or generic files POSTing failed
    """

    # prepared needed for calculations variables
    body['CurrentModelId'] = body['CorrelationId']
    oauth = get_oauth()

    if not body['Method']:
        raise ValueError('Empty method in model trainer')

    body['Method'] = body['Method'].lower()
    method_code = body['Method']
    body['Name'] = '_'.join(algorithm_name_by_code(method_code).split())

    # publish training started event to OSDR
    fetch_token(oauth)
    publish_start_training_event(body)

    # validate input data
    # raise error with invalid parameter
    validate_user_input(body)

    # make dataframe using input parameters
    model_type = model_type_by_code(method_code)
    try:
        dataframe = make_dataframe(model_type, body, oauth)
    except Exception:
        # only regular errors are converted to generic exception,
        # KeyboardInterrupt and SystemExit are passed through untouched
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Make dataframe exception')

    # calculate scaler model
    model_trainer = make_model_trainer(method_code, body, dataframe)

    # everything below works with trainer's temporary folder,
    # so folder removing is placed to finally section
    try:
        # NOTE(review): nothing is POSTed between this log message and
        # applicability domain calculation, message looks outdated
        LOGGER.info('Sending scaler file to {}/{}'.format(
            BLOB_URL, CLIENT_ID))
        fetch_token(oauth)

        # make applicability domain files and calculate values
        LOGGER.info('Calculate applicability domain')
        model_trainer.make_applicability_domain()

        # generic files for current training model,
        # except model (*.sav) files
        body['NumberOfGenericFiles'] = 0
        # train chosen model
        LOGGER.info('Start model training')
        try:
            model_training_data = conduct_training(
                model_trainer, method_code, body, oauth)
        except Exception:
            # log error traceback
            logging_exception_message(LOGGER)
            raise Exception('Model training exception')

        # POST model's files (images, csvs etc)
        try:
            # POST classifier model files
            if model_type == CLASSIFIER:
                post_classifier(model_trainer, body, oauth,
                                model_training_data['path_to_csv_file'])
            # POST regressor model files
            elif model_type == REGRESSOR:
                post_regressor(model_trainer, body, oauth)
        except Exception:
            # log error traceback
            logging_exception_message(LOGGER)
            raise Exception('Post generic data exception')

        # update template tags with all needed data for model's report
        model_trainer.make_report_text()

        # make pdf report for trained model
        pdf_path = make_pdf_report(model_trainer.sub_folder,
                                   model_trainer.template_tags,
                                   model_name=body['Name'])
        LOGGER.info('Sending pdf report to {}/{}'.format(
            BLOB_URL, CLIENT_ID))

        # POST pdf report to blob storage
        fetch_token(oauth)
        model_report_info = {
            'FileInfo':
            json.dumps({
                'modelName': model_trainer.cv_model.model_name,
                'fileType': MODEL_PDF_REPORT
            }),
            'ParentId':
            body['CurrentModelId']
        }
        multipart_pdf = get_multipart_object(
            body,
            pdf_path,
            'application/pdf',
            additional_fields=model_report_info)
        post_data_to_blob(oauth, multipart_pdf)
        body['NumberOfGenericFiles'] += 1
    finally:
        # remove temporary directory, no matter model trained or not
        shutil.rmtree(model_trainer.sub_folder, ignore_errors=True)

    # send model trained message to OSDR via rabbitmq
    model_trained = model_trained_message(body, model_training_data,
                                          model_trainer)
    model_trained_message_publisher = PurePublisher(MODEL_TRAINED)
    model_trained_message_publisher.publish(model_trained)

    LOGGER.info('Finished Calculations!')

    return None
Exemple #9
0
def find_optimal_parameters(body):
    """
    Pika callback function used by ml optimizer
    Find optimal training fingerprints set for input dataset
    Using only 1000 (by default) or less structures from input dataset
    Send overall optimizing result to Redis, to use it in ml training report

    Body is updated with best fingerprints set and default 'optimal'
    training parameters, optimizer metrics csv file is POSTed
    to blob storage, then training optimized message is published.
    Current optimization folder is removed even if optimization fails.

    :param body: RabbitMQ MT message's body
    :type body: dict
    :raise ValueError: if body['Methods'] is empty
    """

    oauth = get_oauth()

    # check input methods
    if not body['Methods']:
        raise ValueError('Empty Methods')

    # temporary folder of current optimization, csv file is written there
    optimizer_folder = '{}/ml_optimizer/{}'.format(
        TEMP_FOLDER, body['CorrelationId'])

    try:
        # calculate metrics for each fingerprints set
        metrics, target_metric = fingerprints_grid_search(
            oauth, body, BASE_FINGERPRINTS)
        # send all metrics to redis
        # later use it to add to training report
        REDIS_CLIENT.setex(
            'optimizer_metrics_{}'.format(body['CorrelationId']),
            EXPIRATION_TIME, json.dumps(metrics)
        )
        # find best fingerprints set
        # it is the one with greatest value of target metric
        optimal_fingerprints = sorted(
            metrics.values(),
            key=lambda value: value['metrics'][target_metric],
            reverse=True
        )[0]['fptype']
        # set other default 'optimal' parameters for training model
        body['SubSampleSize'] = 1.0
        body['TestDataSize'] = 0.3
        body['Scaler'] = 'MinMax'
        body['KFold'] = 5
        body['Fingerprints'] = optimal_fingerprints
        body['OptimizationMethod'] = 'default'
        body['NumberOfIterations'] = 100

        # make optimizer metrics csv and post it to blob storage
        formatted_metrics = TMP_TMP(
            metrics, model_type_by_code(body['Methods'][0].lower()))
        csv_path = '{}/optimizing.csv'.format(optimizer_folder)
        write_optimized_metrics_to_csv(formatted_metrics, csv_path)
        # NOTE(review): csv file is sent with 'application/x-spss-sav'
        # content type, looks like copied from model file POSTing.
        # Left unchanged, confirm that blob storage consumers rely on it
        multipart_csv = get_multipart_object(
            body, csv_path, 'application/x-spss-sav',
            additional_fields={'ParentId': body['TargetFolderId']}
        )

        # send optimizer metrics csv file to blob storage
        fetch_token(oauth)
        response = post_data_to_blob(oauth, multipart_csv)
        LOGGER.info(
            'Optimizer csv status code: {}'.format(response.status_code))

        # send best fingerprints set and 'optimal' parameters
        # to training model
        training_optimized = model_training_optimized(body)
        training_optimized_message_publisher = PurePublisher(
            TRAINING_OPTIMIZED)
        training_optimized_message_publisher.publish(training_optimized)
    finally:
        # clear current optimization folder, no matter optimized or not
        shutil.rmtree(optimizer_folder, ignore_errors=True)