Example #1
def calculate_feature_vectors(body):
    try:
        data_file_as_bytes_array = REDIS_CLIENT.get('{}-file'.format(
            body['CorrelationId']))
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        error_message = 'Can\'t get sdf file from redis'
        raise Exception(error_message)

    if body['FileType'] == 'sdf':
        csv_file_path = process_sdf(body, data_file_as_bytes_array)
    elif body['FileType'] == 'cif':
        path_to_cif_file = '{}/{}.cif'.format(TEMP_FOLDER,
                                              body['CorrelationId'])
        with open(path_to_cif_file, 'wb') as cif_file:
            cif_file.write(data_file_as_bytes_array)

        if 'type' in body['Fingerprints'][0]:
            type_key = 'type'
        elif 'Type' in body['Fingerprints'][0]:
            type_key = 'Type'
        else:
            raise KeyError('No type key in fingerprints: {}'.format(
                body['Fingerprints']))

        descriptors = [x[type_key].strip() for x in body['Fingerprints']]
        csv_file_path = '{}/{}.csv'.format(TEMP_FOLDER, body['CorrelationId'])
        shape = generate_csv([path_to_cif_file], descriptors, csv_file_path)

        body['Structures'] = shape[0]
        body['Columns'] = shape[1]
        body['Failed'] = 0
    else:
        raise ValueError('Unknown file type: {}'.format(body['FileType']))

    with open(csv_file_path, 'rb') as file_to_send:
        try:
            REDIS_CLIENT.setex('{}-csv'.format(body['CorrelationId']),
                               EXPIRATION_TIME, file_to_send.read())
        except Exception:
            # log error traceback
            logging_exception_message(LOGGER)
            error_message = 'Can\'t send csv file to redis'
            raise Exception(error_message)

    # os.remove(csv_file_path)

    feature_vectors_calculated = feature_vectors_calculated_message(body)
    feature_vectors_publisher = PurePublisher(FEATURE_VECTORS_CALCULATED)
    feature_vectors_publisher.publish(feature_vectors_calculated)

    return None
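The shape of the incoming RabbitMQ message is implicit in the key lookups above. A minimal sketch of a body this callback could accept, with illustrative values only (the real MT message contract may differ):

# illustrative only; the id and descriptor type below are made up
example_body = {
    'CorrelationId': '0b1c2d3e-hypothetical-id',
    'FileType': 'cif',  # or 'sdf'
    'Fingerprints': [{'Type': 'DESC'}],  # hypothetical descriptor type
}
calculate_feature_vectors(example_body)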
Example #2
def function_wrapper(body):
    """
    Wrapper which adds try/except handling around a callback function.
    If the function raises no exception, a report message is sent to the
    RabbitMQ queue defined by the report_publisher MT constant.
    If an exception is raised, the traceback is logged and a failure
    message is sent to the RabbitMQ queue defined by the fail_publisher
    MT constant.

    :param body: body of RabbitMQ message
    :return: None
    """

    report_event_message = None
    report_failed_message = None

    try:
        # try to execute the wrapped function with its arguments
        report_event_message = function(body)

    except Exception:
        # get all system info about the exception
        exception_type, exception, traceback_text = sys.exc_info()
        # make error traceback message
        exception_message = 'Internal server error.'
        # if the exception was raised explicitly, include its text
        if exception_type.__name__ == Exception.__name__:
            exception_message += ' {}'.format(exception)
        else:
            exception_message += ' Unspecified server error.'

        # log error traceback
        logging_exception_message(self.logger)

        # make failed message for publishing to the rabbitmq exchange
        report_failed_message = self.fail_message_constructor(
            exception_message, body)

    if report_event_message and self.report_publisher:
        # publish report message to rabbitmq
        report_message_publisher = PurePublisher(self.report_publisher)
        report_message_publisher.publish(report_event_message)
    elif report_event_message:
        self.logger.info(
            'No report publisher defined, but the callback succeeded')

    if report_failed_message and self.fail_publisher:
        # publish failed message to rabbitmq
        fail_message_publisher = PurePublisher(self.fail_publisher)
        fail_message_publisher.publish(report_failed_message)
    elif report_failed_message:
        self.logger.info(
            'No fail publisher defined, but the callback raised an exception')
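function_wrapper references self, function and the publisher attributes of an enclosing scope that this snippet does not show. A minimal sketch of what the surrounding decorator class might look like, assuming it is applied to callbacks such as those in the other examples (the class name and constructor arguments are assumptions):

class ExceptionHandler:  # hypothetical name for the enclosing class
    def __init__(self, logger, report_publisher=None, fail_publisher=None,
                 fail_message_constructor=None):
        self.logger = logger
        self.report_publisher = report_publisher
        self.fail_publisher = fail_publisher
        self.fail_message_constructor = fail_message_constructor

    def __call__(self, function):
        def function_wrapper(body):
            ...  # the body shown in Example #2
        return function_wrapper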
Example #3
def process_sdf(body, sdf_as_bytes_array):
    fingerprints = list()
    for fingerprint in body['Fingerprints']:
        new_fingerprint = dict()
        for key, value in fingerprint.items():
            new_fingerprint[key.capitalize()] = value

        fingerprints.append(new_fingerprint)

    body['Fingerprints'] = fingerprints
    molecules = get_molecules_from_sdf_bytes(sdf_as_bytes_array)
    errors_list = [[]]
    try:
        data_frame = sdf_to_csv('',
                                body['Fingerprints'],
                                find_classes=True,
                                find_values=True,
                                molecules=molecules,
                                processing_errors=errors_list)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        error_message = 'Can\'t make dataframe using this sdf file'
        raise Exception(error_message)

    smiles_list = list()
    inchi_keys = list()
    for molecule_number, molecule in enumerate(molecules):
        smiles = Chem.MolToSmiles(molecule, isomericSmiles=True)
        inchi_key = get_inchi_key(molecule)
        smiles_list.append(('SMILES', molecule_number, smiles))
        inchi_keys.append(('InChiKey', molecule_number, inchi_key))

    smiles_numpy_array = numpy.array(smiles_list,
                                     dtype=[('name', 'U17'),
                                            ('molecule_number', 'i4'),
                                            ('value', 'U40')])
    inchi_numpy_array = numpy.array(inchi_keys,
                                    dtype=[('name', 'U17'),
                                           ('molecule_number', 'i4'),
                                           ('value', 'U40')])
    errors_numpy_array = numpy.array(errors_list[0],
                                     dtype=[('name', 'U17'),
                                            ('molecule_number', 'i4'),
                                            ('value', 'U40')])
    data_frame = data_frame.astype([('name', 'U10'), ('molecule_number', 'i4'),
                                    ('value', 'U40')])

    data_frame = numpy.insert(data_frame, 0, inchi_numpy_array, axis=1)
    data_frame = numpy.insert(data_frame, 0, smiles_numpy_array, axis=1)
    data_frame = numpy.insert(data_frame,
                              data_frame.shape[1],
                              errors_numpy_array,
                              axis=1)

    csv_file_path = '{}/{}.csv'.format(TEMP_FOLDER, body['CorrelationId'])

    try:
        numpy_to_csv(data_frame, csv_file_path)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        error_message = 'Can\'t convert dataframe to csv file'
        raise Exception(error_message)

    body['Structures'] = data_frame.shape[0]
    body['Columns'] = data_frame.shape[1]
    body['Failed'] = 0

    return csv_file_path
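The numpy.insert calls above prepend and append whole columns to a 2-D structured array. A self-contained sketch of that pattern with made-up data:

import numpy

dtype = [('name', 'U17'), ('molecule_number', 'i4'), ('value', 'U40')]
# a 2-molecule x 1-feature frame standing in for the sdf_to_csv output
frame = numpy.array([[('ECFP_0', 0, '1')],
                     [('ECFP_0', 1, '0')]], dtype=dtype)
smiles_column = numpy.array([('SMILES', 0, 'CCO'),
                             ('SMILES', 1, 'c1ccccc1')], dtype=dtype)
# insert as the first column; shape goes from (2, 1) to (2, 2)
frame = numpy.insert(frame, 0, smiles_column, axis=1)
print(frame.shape)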
Example #4
def callback(body):
    """
    Pika callback function used by single structure predictor.
    Make list of json with prediction data for each model prediction.

    :param body: RabbitMQ MT message's body
    """

    # start total prediction time counter
    start_prediction_total = time()
    models_data = []
    oauth = get_oauth()
    # normalize the mol string if it uses escaped '\\n' instead of '\n'
    if '\\n' in body['Structure']:
        body['Structure'] = body['Structure'].replace('\\n', '\n')

    # set molecules converter function depending on input structure type
    molecules_converter_method = None
    if body['Format'] == 'MOL':
        molecules_converter_method = molecules_from_mol_strings
    elif body['Format'] == 'SMILES':
        molecules_converter_method = molecules_from_smiles

    # read molecules from input mol string or SMILES
    molecules = None
    conversion_error_message = None
    try:
        molecules = molecules_converter_method([body['Structure']])
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        conversion_error_message = 'Get molecule from molstring exception'
    # get models ids, models blob ids, models buckets from input message
    fetch_token(oauth)
    models_ids, models_blob_ids, models_buckets = get_models_from_body_message(
        body)
    # make prediction for all models
    for model_id, model_blob_id, model_bucket in zip(models_ids,
                                                     models_blob_ids,
                                                     models_buckets):
        # start current prediction counter
        start_current_prediction = time()

        start_timer = time()
        # reset the exception message for the current prediction,
        # keeping any molecule conversion error
        exception_message = conversion_error_message

        # initialize prediction parameters
        prediction_parameters = dict()
        prediction_parameters['DatasetFileName'] = 'Single Structure Predict'
        prediction_parameters['Molecules'] = molecules

        # get current model info
        fetch_token(oauth)
        blob_model_info = get_model_info(oauth, model_blob_id, model_bucket)
        blob_model_info['ModelBlobId'] = model_blob_id
        blob_model_info['ModelBucket'] = model_bucket
        blob_model_info['ModelCode'] = algorithm_code_by_name(
            blob_model_info['Method'])

        # add prediction data to json
        # add training parameters
        # add dataset parameters
        predicted_data = {
            'id': model_id,
            'trainingParameters': training_parameters(blob_model_info),
            'property': property_parameters(blob_model_info),
            'dataset': dataset(blob_model_info),
            'reportId': str(uuid.uuid1())
        }

        fetch_token(oauth)
        LOGGER.info('INITIAL PREPARING: {} sec'.format(time() - start_timer))
        try:
            # update prediction parameters
            prepare_prediction_parameters(oauth, prediction_parameters,
                                          blob_model_info)
        except Exception:
            # log error traceback
            logging_exception_message(LOGGER)
            if not exception_message:
                exception_message = 'Prepare prediction parameters exception'

        start_timer = time()
        try:
            # make prediction
            ml_predictor = MLPredictor(prediction_parameters)
            ml_predictor.make_prediction()
            prediction = ml_predictor.prediction
        except Exception:
            # log error traceback
            logging_exception_message(LOGGER)
            if not exception_message:
                exception_message = 'Predictor exception'
        LOGGER.info('PREDICTION TIME: {} sec'.format(time() - start_timer))

        # add error to json if something wrong
        if exception_message:
            predicted_data['error'] = error(exception_message)
        else:
            predicted_data['result'] = result(prediction)
            predicted_data['applicabilityDomain'] = applicability_domain(
                prediction)

        # stop current prediction counter
        current_prediction_time_seconds = time() - start_current_prediction
        predicted_data['predictionElapsedTime'] = int(
            current_prediction_time_seconds * 1000)

        models_data.append(predicted_data)

    # stop total predictions counter
    total_prediction_time_seconds = time() - start_prediction_total
    body['Data'] = {
        'predictionElapsedTime': int(total_prediction_time_seconds * 1000),
        'models': models_data
    }
    # add prediction consensus data to sending message
    consensus_data = consensus(models_data)
    if consensus_data:
        body['Data'].update({'consensus': consensus_data})

    # send (publish) properties predicted message to OSDR
    prediction_created_event = single_structure_property_predicted(body)
    properties_predicted_publisher = PurePublisher(SINGLE_STRUCTURE_PREDICTED)
    properties_predicted_publisher.publish(prediction_created_event)

    return None
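The Data payload assembled above has roughly the following structure (keys come from the code; values here are illustrative placeholders):

example_data = {
    'predictionElapsedTime': 2150,      # ms, total across all models
    'models': [{
        'id': 'hypothetical-model-id',
        'trainingParameters': {},       # from training_parameters(...)
        'property': {},                 # from property_parameters(...)
        'dataset': {},                  # from dataset(...)
        'reportId': 'uuid1-string',
        'result': {},                   # or 'error': {...} when a step failed
        'applicabilityDomain': {},
        'predictionElapsedTime': 430,   # ms for this model
    }],
    'consensus': {},                    # only if consensus() returned data
}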
Example #5
def generate_training_report(body):
    """
    Pika callback function used by the training report generator.
    Makes plot files, a general metrics csv file and a report file on
    success.

    :param body: RabbitMQ MT message's body
    """

    oauth = get_oauth()
    fetch_token(oauth)

    # define variables used by the ml reporter
    model = body['Models'][0]
    model_blob_id = model['blobId']
    file_info = get_file_info_from_blob(oauth, model_blob_id).json()
    if 'ModelInfo' in file_info['metadata']:
        info_key = 'ModelInfo'
    elif 'modelInfo' in file_info['metadata']:
        info_key = 'modelInfo'
    else:
        raise KeyError('No model info key in: {}'.format(
            file_info['metadata'].keys()))
    model_info = json.loads(file_info['metadata'][info_key])
    body['Bins'] = model_info['Bins']
    model_name = model_info['ModelName']
    model_type = model_info['ModelType']
    base_folder = '{}/general_training_report_{}'.format(
        TEMP_FOLDER, uuid.uuid1())
    make_directory(base_folder)
    LOGGER.info('MODEL INFO: {}'.format(model_info))
    LOGGER.info('MODEL NAME: {}'.format(model_name))
    LOGGER.info('MODEL TYPE: {}'.format(model_type))

    # generate general metrics dict
    all_models_metrics = []
    for model in body['Models']:
        # if something wrong with model file from blob storage
        if 'genericFiles' not in model:
            raise TypeError('Empty model\'s generic files blob ids')

        for file_blob_id in model['genericFiles']:
            file_info = get_file_info_from_blob(oauth, file_blob_id).json()
            if 'fileInfo' in file_info['metadata']:
                fileinfo_key = 'fileInfo'
            elif 'FileInfo' in file_info['metadata']:
                fileinfo_key = 'FileInfo'
            else:
                raise KeyError('No fileInfo key in: {}'.format(
                    file_info['metadata'].keys()))

            file_info = json.loads(file_info['metadata'][fileinfo_key])

            if 'fileType' in file_info:
                filetype_key = 'fileType'
            elif 'FileType' in file_info:
                filetype_key = 'FileType'
            else:
                filetype_key = None

            if filetype_key and file_info[filetype_key] == TRAINING_CSV_METRICS:
                csv_blob_id = file_blob_id
                csv_model_metrics = get_file_from_blob(csv_blob_id,
                                                       oauth).content
                all_models_metrics.append(csv_model_metrics.decode())

            LOGGER.info('CURRENT MODEL INFO: {}'.format(file_info))

    LOGGER.info('ALL MODELS METRICS: {}'.format(all_models_metrics))
    # write general metrics data to csv file
    csv_files_names = write_csvs_files(all_models_metrics)
    general_csv_dict = merge_csv_files(csv_files_names)
    rows = make_general_csv_rows(general_csv_dict)
    general_csv_file_path = write_rows_to_csv_file(rows, base_folder)
    metrics = html_metrics_from_dict(general_csv_dict)

    fetch_token(oauth)
    # make csv info for blob storage
    general_csv_info = {
        'FileInfo':
        json.dumps({
            'modelName': model_name,
            'fileType': ALL_MODELS_TRAINING_CSV_METRICS
        }),
        'SkipOsdrProcessing':
        'true'
    }
    # make multipart object prepared to POST to blob storage
    # include csv file and file info
    multipart_general_csv = get_multipart_object(
        body,
        general_csv_file_path,
        'text/csv',
        additional_fields=general_csv_info)
    # POST metrics csv file to blob storage
    post_data_to_blob(oauth, multipart_general_csv)

    # create general images
    body['NumberOfGenericFiles'] = 0
    path_to_radar_plot = None
    try:
        if model_type == CLASSIFIER:
            LOGGER.info('Creating radar_plot')
            nbits = body['Bins']
            path_to_radar_plot = radar_plot(general_csv_file_path,
                                            base_folder,
                                            nbits,
                                            titlename=model_name)
            # make radar plot multipart encoded object
            multipart_radar_plot = get_multipart_object(
                body,
                path_to_radar_plot,
                'image/png',
                additional_fields={'correlationId': body['CorrelationId']})

            # send, http POST request to blob storage api with radar plot
            post_data_to_blob(oauth, multipart_radar_plot)
            body['NumberOfGenericFiles'] += 1
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Post generic data exception')

    optimizer_metrics = REDIS_CLIENT.get('optimizer_metrics_{}'.format(
        body['CorrelationId']))

    if optimizer_metrics:
        optimizer_metrics = html_optimal_metrics_from_dict(
            json.loads(optimizer_metrics), model_type)

    # add metrics and images to pdf report file
    context = {
        'metrics': metrics,
        'radar_plots': [path_to_radar_plot],
        'optimizer': optimizer_metrics
    }
    pdf_path = make_pdf_report(base_folder, context, model_name='general')
    fetch_token(oauth)
    multipart_general_csv = get_multipart_object(
        body,
        pdf_path,
        'application/pdf',
        additional_fields={'correlationId': body['CorrelationId']})
    post_data_to_blob(oauth, multipart_general_csv)
    body['NumberOfGenericFiles'] += 1

    # remove temporary directory
    shutil.rmtree(base_folder, ignore_errors=True)

    report_generated = training_report_generated_message(body)
    model_report_generated_message_publisher = PurePublisher(REPORT_GENERATED)
    model_report_generated_message_publisher.publish(report_generated)

    LOGGER.info('Report generated!')

    return None
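get_multipart_object and post_data_to_blob are project helpers whose implementation is not shown here. A plausible sketch of the multipart construction using requests_toolbelt (the helper name, field names and defaults below are assumptions, not the actual OSDR blob API):

import os
from requests_toolbelt.multipart.encoder import MultipartEncoder

def build_multipart(body, file_path, mime_type, additional_fields=None):
    # hypothetical stand-in for get_multipart_object
    fields = {
        'parentId': body.get('ParentId', body['CorrelationId']),  # assumed
        'file': (os.path.basename(file_path),
                 open(file_path, 'rb'), mime_type),
    }
    fields.update(additional_fields or {})
    return MultipartEncoder(fields=fields)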
Example #6
def train_model(body):
    """
    Pika callback function used by ml modeller. Make plots files, metrics files
    and model file if success.

    :param body: RabbitMQ MT message's body
    """

    # prepare variables needed for the calculations
    body['CurrentModelId'] = body['CorrelationId']
    oauth = get_oauth()

    if not body['Method']:
        raise ValueError('Empty method in model trainer')

    body['Method'] = body['Method'].lower()
    method_code = body['Method']
    body['Name'] = '_'.join(algorithm_name_by_code(method_code).split())

    # publish training started event to OSDR
    fetch_token(oauth)
    publish_start_training_event(body)

    # validate input data
    # raise error with invalid parameter
    validate_user_input(body)

    # make dataframe using input parameters
    model_type = model_type_by_code(method_code)
    try:
        dataframe = make_dataframe(model_type, body, oauth)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Make dataframe exception')

    # calculate scaler model
    model_trainer = make_model_trainer(method_code, body, dataframe)
    LOGGER.info('Sending scaler file to {}/{}'.format(BLOB_URL, CLIENT_ID))
    fetch_token(oauth)

    # make applicability domain files and calculate values
    LOGGER.info('Calculate applicability domain')
    model_trainer.make_applicability_domain()

    # counter of generic files for this model, excluding the model (*.sav) file
    body['NumberOfGenericFiles'] = 0
    # train chosen model
    LOGGER.info('Start model training')
    try:
        model_training_data = conduct_training(model_trainer, method_code,
                                               body, oauth)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Model training exception')

    # POST model's files (images, csvs etc)
    try:
        # POST classifier model files
        if model_type == CLASSIFIER:
            post_classifier(model_trainer, body, oauth,
                            model_training_data['path_to_csv_file'])
        # POST regressor model files
        elif model_type == REGRESSOR:
            post_regressor(model_trainer, body, oauth)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Post generic data exception')

    # update template tags with all needed data for model's report
    model_trainer.make_report_text()

    # make pdf report for trained model
    pdf_path = make_pdf_report(model_trainer.sub_folder,
                               model_trainer.template_tags,
                               model_name=body['Name'])
    LOGGER.info('Sending pdf report to {}/{}'.format(BLOB_URL, CLIENT_ID))

    # POST pdf report to blob storage
    fetch_token(oauth)
    model_report_info = {
        'FileInfo':
        json.dumps({
            'modelName': model_trainer.cv_model.model_name,
            'fileType': MODEL_PDF_REPORT
        }),
        'ParentId':
        body['CurrentModelId']
    }
    multipart_pdf = get_multipart_object(body,
                                         pdf_path,
                                         'application/pdf',
                                         additional_fields=model_report_info)
    post_data_to_blob(oauth, multipart_pdf)
    body['NumberOfGenericFiles'] += 1

    # remove temporary directory
    shutil.rmtree(model_trainer.sub_folder, ignore_errors=True)

    # send model trained message to OSDR via rabbitmq
    model_trained = model_trained_message(body, model_training_data,
                                          model_trainer)
    model_trained_message_publisher = PurePublisher(MODEL_TRAINED)
    model_trained_message_publisher.publish(model_trained)

    LOGGER.info('Finished Calculations!')

    return None
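Each of these callbacks receives an already-deserialized message body. A minimal sketch of wiring one of them to a queue with pika (the queue name and JSON framing are assumptions about the surrounding consumer code):

import json
import pika

def on_message(channel, method, properties, raw_body):
    # deserialize the MT message and hand it to the callback
    train_model(json.loads(raw_body))
    channel.basic_ack(delivery_tag=method.delivery_tag)

connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
channel = connection.channel()
channel.basic_consume(queue='model_trainer',  # hypothetical queue name
                      on_message_callback=on_message)
channel.start_consuming()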