Esempio n. 1
0
def get_blob_ids_of_model_generic_files(model_blob_id):
    """
    Collect blob ids of generic files which belong to the model.

    Model id is taken from the model blob metadata ('ModelInfo' or
    'modelInfo' value). Then the '<TEMP_FOLDER>/<model id>' file is read,
    each non-empty line of it is parsed as one record, and 'blob_id' values
    of records with 'parent_id' equal to the model id are returned.

    :param model_blob_id: blob id of the model file
    :return: list of generic files blob ids
    :raise KeyError: if blob metadata has no model info value
    """

    oauth = get_oauth()
    fetch_token(oauth)

    metadata = get_file_info_from_blob(oauth, model_blob_id).json()['metadata']
    # metadata key may be written in both cases
    if 'ModelInfo' in metadata:
        info_key = 'ModelInfo'
    elif 'modelInfo' in metadata:
        info_key = 'modelInfo'
    else:
        raise KeyError('No model info')

    model_id = json.loads(metadata[info_key])['ModelId']

    generic_files_ids = []
    # context manager closes the file even if some record can not be parsed
    with open('{}/{}'.format(TEMP_FOLDER, model_id), 'r') as file_to_read:
        for line in file_to_read:
            # blank line (trailing newline, for example) is not a record
            if not line.strip():
                continue

            # presumably records are written with single quotes, so make
            # them valid json first
            # NOTE(review): breaks if any value contains a quote character
            generic_file_ids = json.loads(line.replace('\'', '"'))
            if generic_file_ids['parent_id'] == model_id:
                generic_files_ids.append(generic_file_ids['blob_id'])

    return generic_files_ids
Esempio n. 2
0
def preload_ssp_models():
    """
    Load all models which have SSP target to memory before they are needed,
    to speed up later predictions.
    Loaded models are stored in MODELS_IN_MEMORY_CACHE by model blob id
    """

    oauth = get_oauth()

    # GET SSP models information from OSDR web api
    # (page number 1 with page size 100 is requested)
    models_response = requests.get(
        API_MODELS_ENTITIES_URL,
        headers={'Accept': 'application/json'},
        params=(
            ('$filter', 'Targets eq \'SSP\''),
            ('PageNumber', '1'),
            ('PageSize', '100'),
        ),
        verify=False
    )

    for model_entity in models_response.json():
        fetch_token(oauth)
        # where model file is stored
        blob = model_entity['blob']
        blob_id = blob['id']
        bucket = blob['bucket']
        # model info, extended with model file location
        model_info = get_model_info(oauth, blob_id, bucket)
        model_info['ModelBlobId'] = blob_id
        model_info['ModelBucket'] = bucket
        # files are prepared for each model, even for already cached one
        parameters = {}
        prepare_prediction_files(oauth, parameters, model_info)

        # model already in cache, nothing more to do
        if blob_id in MODELS_IN_MEMORY_CACHE:
            continue

        try:
            MODELS_IN_MEMORY_CACHE[blob_id] = cache_model(
                parameters['ModelsFolder'])
        except FileNotFoundError:
            LOGGER.error(
                'Cant preload SSP model with blob id: {}'.format(blob_id))
Esempio n. 3
0
def classic_classification_train_logistic_regression(body):
    """
    Callback function for successfully trained logistic regression model.
    Stores model blob id and blob ids of model's generic files to module
    level variables and sets "trained" flag

    :param body: MassTransit message
    """
    global CLASSIC_CLASSIFICATION_LOGISTIC_TRAINED_FLAG, \
        CLASSIC_CLASSIFICATION_FILES_BLOB_IDS, \
        LOGISTIC_REGRESSION_MODEL_BLOB_ID

    fetch_token(get_oauth())

    model_blob_id = body['BlobId']
    generic_files_blob_ids = get_blob_ids_of_model_generic_files(
        model_blob_id)

    CLASSIC_CLASSIFICATION_FILES_BLOB_IDS.extend(generic_files_blob_ids)
    LOGISTIC_REGRESSION_MODEL_BLOB_ID = model_blob_id
    # flag is set last, when all other values are ready
    CLASSIC_CLASSIFICATION_LOGISTIC_TRAINED_FLAG = True

    return None
Esempio n. 4
0
    def setUp(self):
        """
        Prepare environment and values shared by tests before test starts
        """

        environment = os.environ
        # let oauthlib work without https
        environment['OAUTHLIB_INSECURE_TRANSPORT'] = '1'

        # values which are taken from environment variables
        blob_service_url = environment['OSDR_BLOB_SERVICE_URL']
        self.blob_version_url = '{}/version'.format(blob_service_url)
        self.temp_folder = environment['OSDR_TEMP_FILES_FOLDER']

        self.test_file_name = 'ML_test.txt'
        self.oauth = get_oauth()

        # test files names and identifiers used by tests
        self.test_file = 'DNN_data_solubility.sdf'
        self.test_file_cif = 'test.cif'
        self.parent_id = 'c1cc0000-5d8b-0015-d72b-08d52f3ea2a9'
        self.user_id = '8d76f88c-fc99-45ca-8951-74d3a5fda263'

        # test instance is passed to helper, so keep this line after
        # all attributes defined above
        self.source_blob_id = get_blob_id(self)
        self.model_bucket = environment['OSDR_ML_MODELER_CLIENT_ID']
Esempio n. 5
0
def classic_classification_train_naive_bayes(body):
    """
    Callback function for successfully trained naive bayes model.
    Stores model blob id, model bucket and blob ids of model's generic files
    to module level variables and sets "trained" flag

    :param body: MassTransit message
    """
    global CLASSIC_CLASSIFICATION_NAIVE_TRAINED_FLAG, \
        NAIVE_BAYES_MODEL_BLOB_ID, \
        CLASSIC_CLASSIFICATION_MODEL_BUCKET, \
        CLASSIC_CLASSIFICATION_FILES_BLOB_IDS

    fetch_token(get_oauth())

    model_blob_id = body['BlobId']
    generic_files_blob_ids = get_blob_ids_of_model_generic_files(
        model_blob_id)

    NAIVE_BAYES_MODEL_BLOB_ID = model_blob_id
    CLASSIC_CLASSIFICATION_MODEL_BUCKET = body['Bucket']
    CLASSIC_CLASSIFICATION_FILES_BLOB_IDS.extend(generic_files_blob_ids)

    # flag is set last, when all other values are ready
    CLASSIC_CLASSIFICATION_NAIVE_TRAINED_FLAG = True

    return None
Esempio n. 6
0
def callback(body):
    """
    Pika callback function used by single structure predictor.
    Make list of json with prediction data for each model prediction
    and publish it as single structure predicted message.

    Input structure parsing error is reported for every model. Errors which
    happened while predicting by one model are reported for that model only.

    :param body: RabbitMQ MT message's body. 'Structure' and 'Format' values
        are read here, 'Data' value is added before publishing
    """

    # start total prediction time counter
    start_prediction_total = time()
    models_data = []
    oauth = get_oauth()
    # try to reformat molecule, if using "\\n" instead of "\n" in mol string
    if '\\n' in body['Structure']:
        body['Structure'] = body['Structure'].replace('\\n', '\n')

    # set molecules converter function depends on input structure type
    molecules_converter_method = None
    if body['Format'] == 'MOL':
        molecules_converter_method = molecules_from_mol_strings
    elif body['Format'] == 'SMILES':
        molecules_converter_method = molecules_from_smiles

    # read molecules from input mol string or SMILES
    molecules = None
    # error of input structure reading, it is common for all models
    molecules_exception_message = None
    try:
        molecules = molecules_converter_method([body['Structure']])
    except Exception:
        # unknown format (converter is None) comes here too
        # log error traceback
        logging_exception_message(LOGGER)
        molecules_exception_message = 'Get molecule from molstring exception'
    # get models ids, models blob ids, models buckets from input message
    fetch_token(oauth)
    models_ids, models_blob_ids, models_buckets = get_models_from_body_message(
        body)
    # make prediction for all models
    for model_id, model_blob_id, model_bucket in zip(models_ids,
                                                     models_blob_ids,
                                                     models_buckets):
        # start current prediction counter
        start_current_prediction = time()

        start_timer = time()
        # define exception message for current prediction
        # start from common error only, so failure of previous model
        # does not mark current model as failed
        exception_message = molecules_exception_message
        prediction = None

        # initialize prediction parameters
        prediction_parameters = dict()
        prediction_parameters['DatasetFileName'] = 'Single Structure Predict'
        prediction_parameters['Molecules'] = molecules

        # get current model info
        fetch_token(oauth)
        blob_model_info = get_model_info(oauth, model_blob_id, model_bucket)
        blob_model_info['ModelBlobId'] = model_blob_id
        blob_model_info['ModelBucket'] = model_bucket
        blob_model_info['ModelCode'] = algorithm_code_by_name(
            blob_model_info['Method'])

        # add prediction data to json
        # add training parameters
        # add dataset parameters
        predicted_data = {
            'id': model_id,
            'trainingParameters': training_parameters(blob_model_info),
            'property': property_parameters(blob_model_info),
            'dataset': dataset(blob_model_info),
            'reportId': str(uuid.uuid1())
        }

        fetch_token(oauth)
        LOGGER.info('INITIAL PREPARING: {} sec'.format(time() - start_timer))
        try:
            # update prediction parameters
            prepare_prediction_parameters(oauth, prediction_parameters,
                                          blob_model_info)
        except Exception:
            # log error traceback
            logging_exception_message(LOGGER)
            # keep first error message only
            if not exception_message:
                exception_message = 'Prepare prediction parameters exception'

        start_timer = time()
        try:
            # make prediction
            ml_predictor = MLPredictor(prediction_parameters)
            ml_predictor.make_prediction()
            prediction = ml_predictor.prediction
        except Exception:
            # log error traceback
            logging_exception_message(LOGGER)
            if not exception_message:
                exception_message = 'Predictor exception'
        LOGGER.info('PREDICTION TIME: {} sec'.format(time() - start_timer))

        # add error to json if something wrong
        if exception_message:
            predicted_data['error'] = error(exception_message)
        else:
            predicted_data['result'] = result(prediction)
            predicted_data['applicabilityDomain'] = applicability_domain(
                prediction)

        # stop current prediction counter
        current_prediction_time_seconds = time() - start_current_prediction
        predicted_data['predictionElapsedTime'] = int(
            current_prediction_time_seconds * 1000)

        models_data.append(predicted_data)

    # stop total predictions counter
    total_prediction_time_seconds = time() - start_prediction_total
    body['Data'] = {
        'predictionElapsedTime': int(total_prediction_time_seconds * 1000),
        'models': models_data
    }
    # add prediction consensus data to sending message
    consensus_data = consensus(models_data)
    if consensus_data:
        body['Data'].update({'consensus': consensus_data})

    # send (publish) properties predicted message to OSDR
    prediction_created_event = single_structure_property_predicted(body)
    properties_predicted_publisher = PurePublisher(SINGLE_STRUCTURE_PREDICTED)
    properties_predicted_publisher.publish(prediction_created_event)

    return None
def _collect_training_csv_metrics(oauth, models):
    """
    Download training metrics csv files of all models from blob storage

    :param oauth: object returned by get_oauth(), used for blob requests
    :param models: list of models dicts with 'genericFiles' blob ids
    :return: list of decoded csv files content
    :raise TypeError: if some model has no 'genericFiles' value
    :raise KeyError: if generic file metadata has no file info value
    """

    all_models_metrics = []
    for model in models:
        # if something wrong with model file from blob storage
        if 'genericFiles' not in model:
            raise TypeError('Empty model\'s generic files blob ids')

        for file_blob_id in model['genericFiles']:
            blob_info = get_file_info_from_blob(oauth, file_blob_id).json()
            metadata = blob_info['metadata']
            # metadata key may be written in both cases
            if 'fileInfo' in metadata:
                fileinfo_key = 'fileInfo'
            elif 'FileInfo' in metadata:
                fileinfo_key = 'FileInfo'
            else:
                raise KeyError('No fileInfo key in: {}'.format(
                    metadata.keys()))

            file_info = json.loads(metadata[fileinfo_key])

            if 'fileType' in file_info:
                filetype_key = 'fileType'
            elif 'FileType' in file_info:
                filetype_key = 'FileType'
            else:
                filetype_key = None

            # only training metrics csv files are needed for report
            if filetype_key and file_info[filetype_key] == TRAINING_CSV_METRICS:
                csv_model_metrics = get_file_from_blob(file_blob_id,
                                                       oauth).content
                all_models_metrics.append(csv_model_metrics.decode())

            LOGGER.info('CURRENT MODEL INFO: {}'.format(file_info))

    return all_models_metrics


def _post_training_report_files(oauth, body, model_name, model_type,
                                base_folder):
    """
    Make general metrics csv, radar plot (classifier only) and pdf report
    files in base folder and POST them to blob storage.
    Posted radar plot and pdf report are counted in
    body['NumberOfGenericFiles'], general metrics csv is not counted

    :param oauth: object returned by get_oauth(), used for blob requests
    :param body: RabbitMQ MT message's body, updated in place
    :param model_name: model name, taken from model info
    :param model_type: model type, taken from model info
    :param base_folder: temporary folder for report files
    :raise Exception: if radar plot can not be created or posted
    """

    # generate general metrics dict
    all_models_metrics = _collect_training_csv_metrics(oauth, body['Models'])
    LOGGER.info('ALL MODELS METRICS: {}'.format(all_models_metrics))
    # write general metrics data to csv file
    csv_files_names = write_csvs_files(all_models_metrics)
    general_csv_dict = merge_csv_files(csv_files_names)
    rows = make_general_csv_rows(general_csv_dict)
    general_csv_file_path = write_rows_to_csv_file(rows, base_folder)
    metrics = html_metrics_from_dict(general_csv_dict)

    fetch_token(oauth)
    # make csv info for blob storage
    general_csv_info = {
        'FileInfo':
        json.dumps({
            'modelName': model_name,
            'fileType': ALL_MODELS_TRAINING_CSV_METRICS
        }),
        'SkipOsdrProcessing':
        'true'
    }
    # make multipart object prepared to POST to blob storage
    # include csv file and file info
    multipart_general_csv = get_multipart_object(
        body,
        general_csv_file_path,
        'text/csv',
        additional_fields=general_csv_info)
    # POST metrics csv file to blob storage
    post_data_to_blob(oauth, multipart_general_csv)

    # create general images
    body['NumberOfGenericFiles'] = 0
    path_to_radar_plot = None
    try:
        if model_type == CLASSIFIER:
            LOGGER.info('Creating radar_plot')
            nbits = body['Bins']
            path_to_radar_plot = radar_plot(general_csv_file_path,
                                            base_folder,
                                            nbits,
                                            titlename=model_name)
            # make radar plot multipart encoded object
            multipart_radar_plot = get_multipart_object(
                body,
                path_to_radar_plot,
                'image/png',
                additional_fields={'correlationId': body['CorrelationId']})

            # send, http POST request to blob storage api with radar plot
            post_data_to_blob(oauth, multipart_radar_plot)
            body['NumberOfGenericFiles'] += 1
    except Exception as error:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Post generic data exception') from error

    optimizer_metrics = REDIS_CLIENT.get('optimizer_metrics_{}'.format(
        body['CorrelationId']))

    # optimizer metrics exist only if optimizer stored them in redis
    if optimizer_metrics:
        optimizer_metrics = html_optimal_metrics_from_dict(
            json.loads(optimizer_metrics), model_type)

    # add metrics and images to pdf report file
    # radar plot path is None for not classifier models
    context = {
        'metrics': metrics,
        'radar_plots': [path_to_radar_plot],
        'optimizer': optimizer_metrics
    }
    pdf_path = make_pdf_report(base_folder, context, model_name='general')
    fetch_token(oauth)
    multipart_pdf = get_multipart_object(
        body,
        pdf_path,
        'application/pdf',
        additional_fields={'correlationId': body['CorrelationId']})
    post_data_to_blob(oauth, multipart_pdf)
    body['NumberOfGenericFiles'] += 1


def generate_training_report(body):
    """
    Pika callback function used by training report generator.
    Make plots files, general metrics csv file and report file if success.
    Then publish report generated message.

    Values of first model from body['Models'] are used as report model
    info. 'Bins' and 'NumberOfGenericFiles' values are added to body.

    :param body: RabbitMQ MT message's body
    :raise KeyError: if model blob metadata has no model info value
    :raise TypeError: if some model has no 'genericFiles' value
    :raise Exception: if radar plot can not be created or posted
    """

    oauth = get_oauth()
    fetch_token(oauth)

    # define using variables for ml reporter
    model = body['Models'][0]
    model_blob_id = model['blobId']
    file_info = get_file_info_from_blob(oauth, model_blob_id).json()
    # metadata key may be written in both cases
    if 'ModelInfo' in file_info['metadata']:
        info_key = 'ModelInfo'
    elif 'modelInfo' in file_info['metadata']:
        info_key = 'modelInfo'
    else:
        raise KeyError('No model info')
    model_info = json.loads(file_info['metadata'][info_key])
    body['Bins'] = model_info['Bins']
    model_name = model_info['ModelName']
    model_type = model_info['ModelType']
    base_folder = '{}/general_training_report_{}'.format(
        TEMP_FOLDER, uuid.uuid1())
    make_directory(base_folder)

    try:
        LOGGER.info('MODEL INFO: {}'.format(model_info))
        LOGGER.info('MODEL NAME: {}'.format(model_name))
        LOGGER.info('MODEL TYPE: {}'.format(model_type))

        _post_training_report_files(oauth, body, model_name, model_type,
                                    base_folder)
    finally:
        # remove temporary directory, even if report generation failed
        shutil.rmtree(base_folder, ignore_errors=True)

    report_generated = training_report_generated_message(body)
    model_report_generated_message_publisher = PurePublisher(REPORT_GENERATED)
    model_report_generated_message_publisher.publish(report_generated)

    LOGGER.info('Report generated!')

    return None
def callback(body):
    """
    Pika callback function used by ml predictor.
    Make file with predicted properties by picked model.
    Send file to blob storage for OSDR and publish properties predicted
    message.

    Prediction csv file, temporary models folder and cached model are
    removed at the end, even if prediction failed.

    :param body: RabbitMQ MT message's body, 'ModelBlobId' and 'ModelBucket'
        values are read here
    """

    oauth = get_oauth()
    prediction_parameters = dict()
    # path is known only after prediction is written to csv
    prediction_csv_path = None
    try:
        fetch_token(oauth)
        # update prediction parameters
        # add dataset as bytes
        # add dataset file name, just for report message
        prediction_parameters['Dataset'], prediction_parameters[
            'DatasetFileName'] = get_dataset(oauth, body)
        model_info = get_model_info(oauth, body['ModelBlobId'],
                                    body['ModelBucket'])

        fetch_token(oauth)
        model_info['ModelBlobId'] = body['ModelBlobId']
        model_info['ModelBucket'] = body['ModelBucket']
        model_info['ModelCode'] = algorithm_code_by_name(model_info['Method'])
        # update prediction parameters with model parameters, such as
        # density matrix, distance model, MODI, fingerprints etc
        prepare_prediction_parameters(oauth, prediction_parameters,
                                      model_info)

        # update prediction parameters
        # add list of molecules
        prediction_parameters['Molecules'] = get_molecules_from_sdf_bytes(
            prediction_parameters['Dataset'])

        # define predictor object using prediction parameters
        # make prediction for all molecules
        ml_predictor = MLPredictor(prediction_parameters)
        ml_predictor.make_prediction()

        # send prediction result to OSDR
        # write prediction to csv
        prediction_csv_path = ml_predictor.write_prediction_to_csv()
        fetch_token(oauth)
        # prepare multipart object
        multipart_csv = get_multipart_object(body, prediction_csv_path,
                                             'text/csv')
        # POST data to blob storage
        response_csv = post_data_to_blob(oauth, multipart_csv)

        # get prediction blob id and publish message in properties predicted
        # queue
        prediction_created_event = property_predicted(
            body, os.path.basename(prediction_csv_path),
            response_csv.json()[0])
        properties_predicted_publisher = PurePublisher(PROPERTIES_PREDICTED)
        properties_predicted_publisher.publish(prediction_created_event)
    finally:
        # remove prediction file from temporary folder
        if prediction_csv_path and os.path.exists(prediction_csv_path):
            os.remove(prediction_csv_path)
        # remove temporary models folder
        # presumably 'ModelsFolder' value is set while preparing prediction
        # parameters, so it can be missing if something failed before
        models_folder = prediction_parameters.get('ModelsFolder')
        if models_folder:
            shutil.rmtree(models_folder, ignore_errors=True)
        # clear memory, do not fail if model was not cached
        MODELS_IN_MEMORY_CACHE.pop(body.get('ModelBlobId'), None)

    return None
Esempio n. 9
0
def train_model(body):
    """
    Pika callback function used by ml modeller. Make plots files, metrics files
    and model file if success. Then publish model trained message.

    'CurrentModelId', 'Name' and 'NumberOfGenericFiles' values are added to
    body, 'Method' value is replaced by its lowercase version.

    :param body: RabbitMQ MT message's body
    :raise ValueError: if 'Method' value of the message is empty
    :raise Exception: if dataframe making, model training or generic files
        posting failed, original error is chained as a cause
    """

    # prepared needed for calculations variables
    body['CurrentModelId'] = body['CorrelationId']
    oauth = get_oauth()

    if not body['Method']:
        raise ValueError('Empty method in model trainer')

    body['Method'] = body['Method'].lower()
    method_code = body['Method']
    body['Name'] = '_'.join(algorithm_name_by_code(method_code).split())

    # publish training started event to OSDR
    fetch_token(oauth)
    publish_start_training_event(body)

    # validate input data
    # raise error with invalid parameter
    validate_user_input(body)

    # make dataframe using input parameters
    model_type = model_type_by_code(method_code)
    try:
        dataframe = make_dataframe(model_type, body, oauth)
    except Exception as error:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Make dataframe exception') from error

    # calculate scaler model
    model_trainer = make_model_trainer(method_code, body, dataframe)
    # NOTE(review): nothing is sent right after this message, only token
    # is refreshed. Looks like the message is stale, confirm
    LOGGER.info('Sending scaler file to {}/{}'.format(BLOB_URL, CLIENT_ID))
    fetch_token(oauth)

    # make applicability domain files and calculate values
    LOGGER.info('Calculate applicability domain')
    model_trainer.make_applicability_domain()

    # generic files for current training model, except model (*.sav) files
    body['NumberOfGenericFiles'] = 0
    # train chosen model
    LOGGER.info('Start model training')
    try:
        model_training_data = conduct_training(model_trainer, method_code,
                                               body, oauth)
    except Exception as error:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Model training exception') from error

    # POST model's files (images, csvs etc)
    try:
        # POST classifier model files
        if model_type == CLASSIFIER:
            post_classifier(model_trainer, body, oauth,
                            model_training_data['path_to_csv_file'])
        # POST regressor model files
        elif model_type == REGRESSOR:
            post_regressor(model_trainer, body, oauth)
    except Exception as error:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Post generic data exception') from error

    # update template tags with all needed data for model's report
    model_trainer.make_report_text()

    # make pdf report for trained model
    pdf_path = make_pdf_report(model_trainer.sub_folder,
                               model_trainer.template_tags,
                               model_name=body['Name'])
    LOGGER.info('Sending pdf report to {}/{}'.format(BLOB_URL, CLIENT_ID))

    # POST pdf report to blob storage
    fetch_token(oauth)
    model_report_info = {
        'FileInfo':
        json.dumps({
            'modelName': model_trainer.cv_model.model_name,
            'fileType': MODEL_PDF_REPORT
        }),
        'ParentId':
        body['CurrentModelId']
    }
    multipart_pdf = get_multipart_object(body,
                                         pdf_path,
                                         'application/pdf',
                                         additional_fields=model_report_info)
    post_data_to_blob(oauth, multipart_pdf)
    body['NumberOfGenericFiles'] += 1

    # remove temporary directory
    shutil.rmtree(model_trainer.sub_folder, ignore_errors=True)

    # send model trained message to OSDR via rabbitmq
    model_trained = model_trained_message(body, model_training_data,
                                          model_trainer)
    model_trained_message_publisher = PurePublisher(MODEL_TRAINED)
    model_trained_message_publisher.publish(model_trained)

    LOGGER.info('Finished Calculations!')

    return None
Esempio n. 10
0
def find_optimal_parameters(body):
    """
    Pika callback function used by ml optimizer.
    Run fingerprints grid search for the message, store all calculated
    metrics in Redis, post metrics csv file to blob storage and publish
    training optimized message with the best fingerprints set and default
    values of other training parameters

    :param body: RabbitMQ MT message's body
    :type body: dict
    :raise ValueError: if 'Methods' value of the message is empty
    """

    oauth = get_oauth()

    # nothing to optimize without methods
    if not body['Methods']:
        raise ValueError('Empty Methods')

    # metrics of each fingerprints set and the metric used to compare them
    all_metrics, target_metric = fingerprints_grid_search(
        oauth, body, BASE_FINGERPRINTS)
    correlation_id = body['CorrelationId']

    # keep all metrics in redis for limited time
    redis_key = 'optimizer_metrics_{}'.format(correlation_id)
    REDIS_CLIENT.setex(redis_key, EXPIRATION_TIME, json.dumps(all_metrics))

    # fingerprints sets, ordered from the best target metric to the worst
    ranked_metrics = sorted(
        all_metrics.values(),
        key=lambda entry: entry['metrics'][target_metric],
        reverse=True
    )
    # best fingerprints set plus default 'optimal' training parameters
    body.update({
        'SubSampleSize': 1.0,
        'TestDataSize': 0.3,
        'Scaler': 'MinMax',
        'KFold': 5,
        'Fingerprints': ranked_metrics[0]['fptype'],
        'OptimizationMethod': 'default',
        'NumberOfIterations': 100,
    })

    # write optimizer metrics to csv file in current optimization folder
    model_type = model_type_by_code(body['Methods'][0].lower())
    formatted_metrics = TMP_TMP(all_metrics, model_type)
    optimizer_folder = '{}/ml_optimizer/{}'.format(
        TEMP_FOLDER, correlation_id)
    csv_path = '{}/optimizing.csv'.format(optimizer_folder)
    write_optimized_metrics_to_csv(formatted_metrics, csv_path)

    # NOTE(review): csv file is sent with 'application/x-spss-sav' content
    # type, confirm that it is intended
    multipart_csv = get_multipart_object(
        body, csv_path, 'application/x-spss-sav',
        additional_fields={'ParentId': body['TargetFolderId']}
    )

    # POST optimizer metrics csv file to blob storage
    fetch_token(oauth)
    post_response = post_data_to_blob(oauth, multipart_csv)
    LOGGER.info(
        'Optimizer csv status code: {}'.format(post_response.status_code))

    # publish best fingerprints set and 'optimal' parameters
    publisher = PurePublisher(TRAINING_OPTIMIZED)
    publisher.publish(model_training_optimized(body))

    # clear current optimization folder
    shutil.rmtree(optimizer_folder, ignore_errors=True)