Esempio n. 1
0
def calculate_feature_vectors(body):
    """
    Calculate feature vectors for a source file stored in Redis.
    Read the source file bytes from Redis (key '<CorrelationId>-file'),
    convert them to a csv file, store the csv content back in Redis
    (key '<CorrelationId>-csv') and publish the feature vectors calculated
    message.

    For 'cif' files the 'Structures', 'Columns' and 'Failed' entries of body
    are set here; 'sdf' files are delegated to process_sdf.

    :param body: RabbitMQ MT message's body. Must contain 'CorrelationId' and
        'FileType' ('sdf' or 'cif'); 'cif' also requires 'Fingerprints'
    :type body: dict
    :raises ValueError: if 'FileType' is neither 'sdf' nor 'cif'
    :raises KeyError: if cif fingerprints have neither 'type' nor 'Type' key
    :raises Exception: if the source file can not be read from Redis
        or the csv file can not be sent to Redis
    """

    try:
        data_file_as_bytes_array = REDIS_CLIENT.get('{}-file'.format(
            body['CorrelationId']))
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        error_message = 'Can\'t get sdf file from redis'
        raise Exception(error_message)

    # a missing (or expired) key gives no data, fail with the same
    # message instead of an unrelated error deeper in the processing
    if data_file_as_bytes_array is None:
        raise Exception('Can\'t get sdf file from redis')

    if body['FileType'] == 'sdf':
        csv_file_path = process_sdf(body, data_file_as_bytes_array)
    elif body['FileType'] == 'cif':
        path_to_cif_file = '{}/{}.cif'.format(TEMP_FOLDER,
                                              body['CorrelationId'])
        # context manager closes the file even if the write fails
        with open(path_to_cif_file, 'wb') as cif_file:
            cif_file.write(data_file_as_bytes_array)

        # fingerprint type key may be spelled in either case
        first_fingerprint = body['Fingerprints'][0]
        if 'type' in first_fingerprint:
            type_key = 'type'
        elif 'Type' in first_fingerprint:
            type_key = 'Type'
        else:
            raise KeyError('No type key in fingerprints: {}'.format(
                body['Fingerprints']))

        descriptors = [x[type_key].strip() for x in body['Fingerprints']]
        csv_file_path = '{}/{}.csv'.format(TEMP_FOLDER, body['CorrelationId'])
        shape = generate_csv([path_to_cif_file], descriptors, csv_file_path)

        body['Structures'] = shape[0]
        body['Columns'] = shape[1]
        body['Failed'] = 0
    else:
        raise ValueError('Unknown file type: {}'.format(body['FileType']))

    # the csv file handle is released whether or not redis accepts the data
    with open(csv_file_path, 'rb') as file_to_send:
        try:
            REDIS_CLIENT.setex('{}-csv'.format(body['CorrelationId']),
                               EXPIRATION_TIME, file_to_send.read())
        except Exception:
            # log error traceback
            logging_exception_message(LOGGER)
            error_message = 'Can\'t send csv file to redis'
            raise Exception(error_message)

    # NOTE: the csv file is left on disk, its removal is disabled
    # os.remove(csv_file_path)

    # send (publish) feature vectors calculated message
    feature_vectors_calculated = feature_vectors_calculated_message(body)
    feature_vectors_publisher = PurePublisher(FEATURE_VECTORS_CALCULATED)
    feature_vectors_publisher.publish(feature_vectors_calculated)

    return None
Esempio n. 2
0
def publish_start_training_event(body):
    """
    Publish the model training started event via rabbitmq.
    The publisher is created for the MODEL_TRAINING_STARTED MT constant,
    the message is built from the incoming message body.

    :param body: RabbitMQ MT message's body
    :type body: dict
    """

    # publisher is created first, then the message is built and sent
    PurePublisher(MODEL_TRAINING_STARTED).publish(
        model_training_start_message(body))
        def function_wrapper(body):
            """
            Run the wrapped function inside try/except blocks.
            If the function returns a truthy report message, publish it to
            the RabbitMQ queue defined by self.report_publisher.
            If the function raises, log the traceback and publish a failed
            message to the RabbitMQ queue defined by self.fail_publisher.
            When the corresponding publisher is not set, only an info record
            is written to self.logger.

            Only exceptions whose class is named 'Exception' expose their
            text in the failed message, all others are reported as
            unspecified server error.

            :param body: body of RabbitMQ message
            :return: None, results are delivered only via publishers
            """

            report_event_message = None
            report_failed_message = None

            try:
                # try to execute function code with arguments
                report_event_message = function(body)

            except Exception:
                # get type and value of the handled exception
                exception_type, exception, _ = sys.exc_info()
                # make error traceback message
                exception_message = 'Internal server error.'
                # if specified exception
                if exception_type.__name__ == Exception.__name__:
                    exception_message += ' {}'.format(exception)
                else:
                    # leading space keeps the two sentences separated,
                    # same as in the branch above
                    exception_message += ' Unspecified server error.'

                # log error traceback
                logging_exception_message(self.logger)

                # make failed message for publish in rabbitmq exchange
                report_failed_message = self.fail_message_constructor(
                    exception_message, body)

            if report_event_message and self.report_publisher:
                # publish report message to rabbitmq
                report_message_publisher = PurePublisher(self.report_publisher)
                report_message_publisher.publish(report_event_message)
            elif report_event_message:
                self.logger.info(
                    'Have not report publisher, but callback is work')

            if report_failed_message and self.fail_publisher:
                # publish failed message to rabbitmq
                fail_message_publisher = PurePublisher(self.fail_publisher)
                fail_message_publisher.publish(report_failed_message)
            elif report_failed_message:
                self.logger.info(
                    'Have not fail publisher, callback throw exception')
Esempio n. 4
0
def publish_thumbnail_generated(training_parameters, thumbnail_blob_id):
    """
    Publish the model thumbnail generated event via rabbitmq.
    The message is built from the training parameters and the thumbnail
    blob id. After publishing, the 'IsThumbnailGenerated' flag
    of training_parameters is set to True.

    :param training_parameters: model training parameters
    :param thumbnail_blob_id: model thumbnail image's blob id
    :type training_parameters: dict
    """

    # message body is built before the flag is changed
    event_body = thumbnail_generated_message(
        training_parameters, thumbnail_blob_id)
    # make publisher and send (publish) thumbnail generated event
    PurePublisher(MODEL_THUMBNAIL_GENERATED).publish(event_body)
    # mark thumbnail as generated in the caller's parameters
    training_parameters['IsThumbnailGenerated'] = True
Esempio n. 5
0
def callback(body):
    """
    Pika callback function used by single structure predictor.
    Make list of json with prediction data for each model prediction,
    add it (with elapsed time and consensus data, if any) to body['Data']
    and publish the single structure predicted message.

    A failure of one model is reported only in that model's entry. A failure
    to read the input structure is reported in the entry of every model.

    :param body: RabbitMQ MT message's body. Reads 'Structure' and 'Format'
        ('MOL' or 'SMILES'), writes 'Data'
    :type body: dict
    """

    # start total prediction time counter
    start_prediction_total = time()
    models_data = []
    oauth = get_oauth()
    # try to reformat molecule, if using "\\" instead of "\" in mol string
    if '\\n' in body['Structure']:
        body['Structure'] = body['Structure'].replace('\\n', '\n')

    # set molecules converter function depends on input structure type
    molecules_converter_method = None
    if body['Format'] == 'MOL':
        molecules_converter_method = molecules_from_mol_strings
    elif body['Format'] == 'SMILES':
        molecules_converter_method = molecules_from_smiles

    # read molecules from input mol string or SMILES
    # unknown format leaves converter as None, the call below then fails
    # and is reported the same way as an unreadable structure
    molecules = None
    molecules_error = None
    try:
        molecules = molecules_converter_method([body['Structure']])
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        molecules_error = 'Get molecule from molstring exception'
    # get models ids, models blob ids, models buckets from input message
    fetch_token(oauth)
    models_ids, models_blob_ids, models_buckets = get_models_from_body_message(
        body)
    # make prediction for all models
    for model_id, model_blob_id, model_bucket in zip(
            models_ids, models_blob_ids, models_buckets):
        # start current prediction counter
        start_current_prediction = time()

        start_timer = time()
        # define exception message for current prediction
        # only the structure reading error is shared between models,
        # errors of previous models must not leak into this one
        exception_message = molecules_error

        # initialize prediction parameters
        prediction_parameters = dict()
        prediction_parameters['DatasetFileName'] = 'Single Structure Predict'
        prediction_parameters['Molecules'] = molecules

        # get current model info
        fetch_token(oauth)
        blob_model_info = get_model_info(oauth, model_blob_id, model_bucket)
        blob_model_info['ModelBlobId'] = model_blob_id
        blob_model_info['ModelBucket'] = model_bucket
        blob_model_info['ModelCode'] = algorithm_code_by_name(
            blob_model_info['Method'])

        # add prediction data to json
        # add training parameters
        # add dataset parameters
        predicted_data = {
            'id': model_id,
            'trainingParameters': training_parameters(blob_model_info),
            'property': property_parameters(blob_model_info),
            'dataset': dataset(blob_model_info),
            'reportId': str(uuid.uuid1())
        }

        fetch_token(oauth)
        LOGGER.info('INITIAL PREPARING: {} sec'.format(time() - start_timer))
        try:
            # update prediction parameters
            prepare_prediction_parameters(oauth, prediction_parameters,
                                          blob_model_info)
        except Exception:
            # log error traceback
            logging_exception_message(LOGGER)
            # keep the earliest error, it is the root cause
            if not exception_message:
                exception_message = 'Prepare prediction parameters exception'

        start_timer = time()
        try:
            # make prediction
            ml_predictor = MLPredictor(prediction_parameters)
            ml_predictor.make_prediction()
            prediction = ml_predictor.prediction
        except Exception:
            # log error traceback
            logging_exception_message(LOGGER)
            if not exception_message:
                exception_message = 'Predictor exception'
        LOGGER.info('PREDICTION TIME: {} sec'.format(time() - start_timer))

        # add error to json if something wrong
        if exception_message:
            predicted_data['error'] = error(exception_message)
        else:
            # no error means prediction was assigned in this iteration
            predicted_data['result'] = result(prediction)
            predicted_data['applicabilityDomain'] = applicability_domain(
                prediction)

        # stop current prediction counter
        current_prediction_time_seconds = time() - start_current_prediction
        predicted_data['predictionElapsedTime'] = int(
            current_prediction_time_seconds * 1000)

        models_data.append(predicted_data)

    # stop total predictions counter
    total_prediction_time_seconds = time() - start_prediction_total
    body['Data'] = {
        'predictionElapsedTime': int(total_prediction_time_seconds * 1000),
        'models': models_data
    }
    # add prediction consensus data to sending message
    consensus_data = consensus(models_data)
    if consensus_data:
        body['Data'].update({'consensus': consensus_data})

    # send (publish) properties predicted message to OSDR
    prediction_created_event = single_structure_property_predicted(body)
    properties_predicted_publisher = PurePublisher(SINGLE_STRUCTURE_PREDICTED)
    properties_predicted_publisher.publish(prediction_created_event)

    return None
def _collect_models_metrics(body, oauth):
    """
    Download training metrics csv content for all models from body.

    :param body: RabbitMQ MT message's body, reads 'Models'
    :param oauth: oauth object used for blob storage requests
    :return: decoded content of each training metrics csv file
    :rtype: list
    :raises TypeError: if a model have not 'genericFiles' key
    :raises KeyError: if file metadata have not 'fileInfo'/'FileInfo' key
    """

    all_models_metrics = []
    for model in body['Models']:
        # if something wrong with model file from blob storage
        if 'genericFiles' not in model:
            raise TypeError('Empty model\'s generic files blob ids')

        for file_blob_id in model['genericFiles']:
            metadata = get_file_info_from_blob(
                oauth, file_blob_id).json()['metadata']
            # metadata key may be spelled in either case
            if 'fileInfo' in metadata:
                fileinfo_key = 'fileInfo'
            elif 'FileInfo' in metadata:
                fileinfo_key = 'FileInfo'
            else:
                raise KeyError('No fileInfo key in: {}'.format(
                    metadata.keys()))

            file_info = json.loads(metadata[fileinfo_key])

            if 'fileType' in file_info:
                filetype_key = 'fileType'
            elif 'FileType' in file_info:
                filetype_key = 'FileType'
            else:
                filetype_key = None

            # only training metrics csv files are used for the report
            if filetype_key and file_info[filetype_key] == TRAINING_CSV_METRICS:
                csv_model_metrics = get_file_from_blob(file_blob_id,
                                                       oauth).content
                all_models_metrics.append(csv_model_metrics.decode())

            LOGGER.info('CURRENT MODEL INFO: {}'.format(file_info))

    return all_models_metrics


def generate_training_report(body):
    """
    Pika callback function used by training report generator.
    Make plots files, general metrics csv file and report file if success.
    Post the general csv, the radar plot (classifier models only) and the
    pdf report to blob storage, then publish the report generated message.
    The temporary report folder is removed in any case.

    :param body: RabbitMQ MT message's body. Reads 'Models' and
        'CorrelationId', writes 'Bins' and 'NumberOfGenericFiles'
    :type body: dict
    :raises KeyError: if first model metadata have not model info
    :raises Exception: if radar plot can not be created or posted
    """

    oauth = get_oauth()
    fetch_token(oauth)

    # define using variables for ml reporter
    # model info is taken from the first model only
    model_blob_id = body['Models'][0]['blobId']
    file_info = get_file_info_from_blob(oauth, model_blob_id).json()
    if 'ModelInfo' in file_info['metadata']:
        info_key = 'ModelInfo'
    elif 'modelInfo' in file_info['metadata']:
        info_key = 'modelInfo'
    else:
        raise KeyError('No model info')
    model_info = json.loads(file_info['metadata'][info_key])
    body['Bins'] = model_info['Bins']
    model_name = model_info['ModelName']
    model_type = model_info['ModelType']
    base_folder = '{}/general_training_report_{}'.format(
        TEMP_FOLDER, uuid.uuid1())
    make_directory(base_folder)

    # temporary folder must not survive a failed report generation
    try:
        LOGGER.info('MODEL INFO: {}'.format(model_info))
        LOGGER.info('MODEL NAME: {}'.format(model_name))
        LOGGER.info('MODEL TYPE: {}'.format(model_type))

        # generate general metrics dict
        all_models_metrics = _collect_models_metrics(body, oauth)

        LOGGER.info('ALL MODELS METRICS: {}'.format(all_models_metrics))
        # write general metrics data to csv file
        csv_files_names = write_csvs_files(all_models_metrics)
        general_csv_dict = merge_csv_files(csv_files_names)
        rows = make_general_csv_rows(general_csv_dict)
        general_csv_file_path = write_rows_to_csv_file(rows, base_folder)
        metrics = html_metrics_from_dict(general_csv_dict)

        fetch_token(oauth)
        # make csv info for blob storage
        general_csv_info = {
            'FileInfo':
            json.dumps({
                'modelName': model_name,
                'fileType': ALL_MODELS_TRAINING_CSV_METRICS
            }),
            'SkipOsdrProcessing':
            'true'
        }
        # make multipart object prepared to POST to blob storage
        # include csv file and file info
        multipart_general_csv = get_multipart_object(
            body,
            general_csv_file_path,
            'text/csv',
            additional_fields=general_csv_info)
        # POST metrics csv file to blob storage
        post_data_to_blob(oauth, multipart_general_csv)

        # create general images
        body['NumberOfGenericFiles'] = 0
        path_to_radar_plot = None
        try:
            if model_type == CLASSIFIER:
                LOGGER.info('Creating radar_plot')
                nbits = body['Bins']
                path_to_radar_plot = radar_plot(general_csv_file_path,
                                                base_folder,
                                                nbits,
                                                titlename=model_name)
                # make radar plot multipart encoded object
                multipart_radar_plot = get_multipart_object(
                    body,
                    path_to_radar_plot,
                    'image/png',
                    additional_fields={
                        'correlationId': body['CorrelationId']
                    })

                # send, http POST request to blob storage api with radar plot
                post_data_to_blob(oauth, multipart_radar_plot)
                body['NumberOfGenericFiles'] += 1
        except Exception:
            # log error traceback
            logging_exception_message(LOGGER)
            raise Exception('Post generic data exception')

        optimizer_metrics = REDIS_CLIENT.get('optimizer_metrics_{}'.format(
            body['CorrelationId']))

        if optimizer_metrics:
            optimizer_metrics = html_optimal_metrics_from_dict(
                json.loads(optimizer_metrics), model_type)

        # add metrics and images to pdf report file
        # for not classifier models 'radar_plots' holds a single None
        context = {
            'metrics': metrics,
            'radar_plots': [path_to_radar_plot],
            'optimizer': optimizer_metrics
        }
        pdf_path = make_pdf_report(base_folder, context, model_name='general')
        fetch_token(oauth)
        multipart_pdf = get_multipart_object(
            body,
            pdf_path,
            'application/pdf',
            additional_fields={'correlationId': body['CorrelationId']})
        post_data_to_blob(oauth, multipart_pdf)
        body['NumberOfGenericFiles'] += 1
    finally:
        # remove temporary directory
        shutil.rmtree(base_folder, ignore_errors=True)

    report_generated = training_report_generated_message(body)
    model_report_generated_message_publisher = PurePublisher(REPORT_GENERATED)
    model_report_generated_message_publisher.publish(report_generated)

    LOGGER.info('Report generated!')

    return None
def callback(body):
    """
    Pika callback function used by ml predictor.
    Make file with predicted properties by picked model.
    Send file to blob storage for OSDR and publish the properties
    predicted message.
    The prediction csv file, the temporary models folder and the model's
    entry in MODELS_IN_MEMORY_CACHE are removed in any case.

    :param body: RabbitMQ MT message's body. Reads 'ModelBlobId' and
        'ModelBucket'
    :type body: dict
    """

    oauth = get_oauth()
    prediction_parameters = dict()
    fetch_token(oauth)
    # update prediction parameters
    # add dataset as bytes
    # add dataset file name, just for report message
    prediction_parameters['Dataset'], prediction_parameters[
        'DatasetFileName'] = get_dataset(oauth, body)
    model_info = get_model_info(oauth, body['ModelBlobId'],
                                body['ModelBucket'])

    fetch_token(oauth)
    model_info['ModelBlobId'] = body['ModelBlobId']
    model_info['ModelBucket'] = body['ModelBucket']
    model_info['ModelCode'] = algorithm_code_by_name(model_info['Method'])

    prediction_csv_path = None
    try:
        # update prediction parameters with model paramers, such as density
        # matrix, distance model, MODI, fingerprints etc
        prepare_prediction_parameters(oauth, prediction_parameters, model_info)

        # update prediction parameters
        # add list of molecules
        prediction_parameters['Molecules'] = get_molecules_from_sdf_bytes(
            prediction_parameters['Dataset'])

        # define predictor object using prediction parameters
        # make prediction for all molecules
        ml_predictor = MLPredictor(prediction_parameters)
        ml_predictor.make_prediction()

        # send prediction result to OSDR
        # write prediction to csv
        prediction_csv_path = ml_predictor.write_prediction_to_csv()
        fetch_token(oauth)
        # prepare multipart object
        multipart_csv = get_multipart_object(
            body, prediction_csv_path, 'text/csv')
        # POST data to blob storage
        response_csv = post_data_to_blob(oauth, multipart_csv)

        # get prediction blob id and publish message in properties predicted
        # queue
        prediction_created_event = property_predicted(
            body, os.path.basename(prediction_csv_path),
            response_csv.json()[0])
        properties_predicted_publisher = PurePublisher(PROPERTIES_PREDICTED)
        properties_predicted_publisher.publish(prediction_created_event)
    finally:
        # cleanup is guarded, it must not hide the original exception
        # remove prediction file from temporary folder
        if prediction_csv_path and os.path.exists(prediction_csv_path):
            os.remove(prediction_csv_path)
        # remove temporary models folder
        # NOTE(review): 'ModelsFolder' is presumably set by
        # prepare_prediction_parameters, so it may be absent on early failure
        models_folder = prediction_parameters.get('ModelsFolder')
        if models_folder:
            shutil.rmtree(models_folder, ignore_errors=True)
        # clear memory
        if model_info['ModelBlobId'] in MODELS_IN_MEMORY_CACHE:
            del MODELS_IN_MEMORY_CACHE[model_info['ModelBlobId']]

    return None
Esempio n. 8
0
def train_model(body):
    """
    Pika callback function used by ml modeller. Make plots files, metrics files
    and model file if success.
    Publish training started event, train the model, post model's generic
    files and pdf report to blob storage and publish model trained message.
    The trainer's temporary folder is removed in any case.

    :param body: RabbitMQ MT message's body. Reads 'CorrelationId' and
        'Method', writes 'CurrentModelId', 'Method' (lowercased), 'Name'
        and 'NumberOfGenericFiles'
    :type body: dict
    :raises ValueError: if 'Method' is empty
    :raises Exception: if dataframe creation, model training or posting of
        generic files failed
    """

    # prepared needed for calculations variables
    body['CurrentModelId'] = body['CorrelationId']
    oauth = get_oauth()

    if not body['Method']:
        raise ValueError('Empty method in model trainer')

    body['Method'] = body['Method'].lower()
    method_code = body['Method']
    body['Name'] = '_'.join(algorithm_name_by_code(method_code).split())

    # publish training started event to OSDR
    fetch_token(oauth)
    publish_start_training_event(body)

    # validate input data
    # raise error with invalid parameter
    validate_user_input(body)

    # make dataframe using input parameters
    model_type = model_type_by_code(method_code)
    try:
        dataframe = make_dataframe(model_type, body, oauth)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Make dataframe exception')

    # calculate scaler model
    model_trainer = make_model_trainer(method_code, body, dataframe)
    # temporary folder of the trainer must not survive a failed training
    try:
        LOGGER.info('Sending scaler file to {}/{}'.format(BLOB_URL, CLIENT_ID))
        fetch_token(oauth)

        # make applicability domain files and calculate values
        LOGGER.info('Calculate applicability domain')
        model_trainer.make_applicability_domain()

        # generic files for current training model, except model (*.sav) files
        body['NumberOfGenericFiles'] = 0
        # train chosen model
        LOGGER.info('Start model training')
        try:
            model_training_data = conduct_training(model_trainer, method_code,
                                                   body, oauth)
        except Exception:
            # log error traceback
            logging_exception_message(LOGGER)
            raise Exception('Model training exception')

        # POST model's files (images, csvs etc)
        try:
            # POST classifier model files
            if model_type == CLASSIFIER:
                post_classifier(model_trainer, body, oauth,
                                model_training_data['path_to_csv_file'])
            # POST regressor model files
            elif model_type == REGRESSOR:
                post_regressor(model_trainer, body, oauth)
        except Exception:
            # log error traceback
            logging_exception_message(LOGGER)
            raise Exception('Post generic data exception')

        # update template tags with all needed data for model's report
        model_trainer.make_report_text()

        # make pdf report for trained model
        pdf_path = make_pdf_report(model_trainer.sub_folder,
                                   model_trainer.template_tags,
                                   model_name=body['Name'])
        LOGGER.info('Sending pdf report to {}/{}'.format(BLOB_URL, CLIENT_ID))

        # POST pdf report to blob storage
        fetch_token(oauth)
        model_report_info = {
            'FileInfo':
            json.dumps({
                'modelName': model_trainer.cv_model.model_name,
                'fileType': MODEL_PDF_REPORT
            }),
            'ParentId':
            body['CurrentModelId']
        }
        multipart_pdf = get_multipart_object(
            body,
            pdf_path,
            'application/pdf',
            additional_fields=model_report_info)
        post_data_to_blob(oauth, multipart_pdf)
        body['NumberOfGenericFiles'] += 1
    finally:
        # remove temporary directory
        # guarded, cleanup must not hide the original exception
        sub_folder = getattr(model_trainer, 'sub_folder', None)
        if sub_folder:
            shutil.rmtree(sub_folder, ignore_errors=True)

    # send model trained message to OSDR via rabbitmq
    model_trained = model_trained_message(body, model_training_data,
                                          model_trainer)
    model_trained_message_publisher = PurePublisher(MODEL_TRAINED)
    model_trained_message_publisher.publish(model_trained)

    LOGGER.info('Finished Calculations!')

    return None
Esempio n. 9
0
def find_optimal_parameters(body):
    """
    Pika callback function used by ml optimizer
    Find optimal training fingerprints set for input dataset
    Using only 1000 (by default) or less structures from input dataset
    Send overall optimizing result to Redis, to use it in ml training report
    Post optimizer metrics csv to blob storage and publish training
    optimized message. The optimizer temporary folder is removed in any case.

    :param body: RabbitMQ MT message's body. Reads 'Methods',
        'CorrelationId' and 'TargetFolderId', writes 'optimal' training
        parameters ('Fingerprints', 'Scaler', 'KFold' etc)
    :type body: dict
    :raises ValueError: if 'Methods' is empty or no metrics were calculated
    """

    oauth = get_oauth()

    # check input methods
    if not body['Methods']:
        raise ValueError('Empty Methods')

    optimizer_folder = '{}/ml_optimizer/{}'.format(
        TEMP_FOLDER, body['CorrelationId'])
    # temporary folder must not survive a failed optimization
    try:
        # calculate metrics for each fingerprints set
        metrics, target_metric = fingerprints_grid_search(
            oauth, body, BASE_FINGERPRINTS)
        # send all metrics to redis
        # later use it to add to training report
        REDIS_CLIENT.setex(
            'optimizer_metrics_{}'.format(body['CorrelationId']),
            EXPIRATION_TIME, json.dumps(metrics)
        )
        # nothing to choose from, fail with explicit message
        if not metrics:
            raise ValueError('Empty optimizer metrics')
        # find best fingerprints set
        # first set with the highest target metric value
        optimal_fingerprints = max(
            metrics.values(),
            key=lambda value: value['metrics'][target_metric]
        )['fptype']
        # set other default 'optimal' parameters for training model
        body['SubSampleSize'] = 1.0
        body['TestDataSize'] = 0.3
        body['Scaler'] = 'MinMax'
        body['KFold'] = 5
        body['Fingerprints'] = optimal_fingerprints
        body['OptimizationMethod'] = 'default'
        body['NumberOfIterations'] = 100

        # make optimizer metrics csv and post it to blob storage
        formatted_metrics = TMP_TMP(
            metrics, model_type_by_code(body['Methods'][0].lower()))
        csv_path = '{}/optimizing.csv'.format(optimizer_folder)
        write_optimized_metrics_to_csv(formatted_metrics, csv_path)
        # NOTE(review): content type is 'application/x-spss-sav' although
        # the file is csv, looks copied from model file posting - confirm
        multipart_model = get_multipart_object(
            body, csv_path, 'application/x-spss-sav',
            additional_fields={'ParentId': body['TargetFolderId']}
        )

        # send optimizer metrics csv file to blob storage
        fetch_token(oauth)
        response = post_data_to_blob(oauth, multipart_model)
        LOGGER.info(
            'Optimizer csv status code: {}'.format(response.status_code))

        # send best fingerprints set and 'optimal' parameters to training
        # model
        training_optimized = model_training_optimized(body)
        training_optimized_message_publisher = PurePublisher(
            TRAINING_OPTIMIZED)
        training_optimized_message_publisher.publish(training_optimized)
    finally:
        # clear current optimization folder
        shutil.rmtree(optimizer_folder, ignore_errors=True)