def calculate_feature_vectors(body):
    """
    Pika callback function used by the feature vectors calculator.
    Fetches the source data file from redis, builds a feature vectors csv
    (via process_sdf for sdf input or generate_csv for cif input), stores
    the csv back in redis and publishes a 'feature vectors calculated'
    message to RabbitMQ.

    :param body: RabbitMQ MT message's body
    :raises Exception: if the redis get/set fails
    :raises KeyError: if fingerprint entries carry no 'type'/'Type' key
    :raises ValueError: if body['FileType'] is neither 'sdf' nor 'cif'
    """
    try:
        data_file_as_bytes_array = REDIS_CLIENT.get(
            '{}-file'.format(body['CorrelationId']))
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Can\'t get sdf file from redis')

    if body['FileType'] == 'sdf':
        csv_file_path = process_sdf(body, data_file_as_bytes_array)
    elif body['FileType'] == 'cif':
        path_to_cif_file = '{}/{}.cif'.format(
            TEMP_FOLDER, body['CorrelationId'])
        # write the raw cif bytes to a temporary file;
        # 'with' guarantees the handle is closed even on error
        with open(path_to_cif_file, 'wb') as cif_file:
            cif_file.write(data_file_as_bytes_array)

        # fingerprint entries may spell the key as 'type' or 'Type'
        if 'type' in body['Fingerprints'][0].keys():
            type_key = 'type'
        elif 'Type' in body['Fingerprints'][0].keys():
            type_key = 'Type'
        else:
            raise KeyError('No type key in fingerprints: {}'.format(
                body['Fingerprints']))

        descriptors = [x[type_key].strip() for x in body['Fingerprints']]
        csv_file_path = '{}/{}.csv'.format(TEMP_FOLDER, body['CorrelationId'])
        shape = generate_csv([path_to_cif_file], descriptors, csv_file_path)
        body['Structures'] = shape[0]
        body['Columns'] = shape[1]
        body['Failed'] = 0
    else:
        raise ValueError('Unknown file type: {}'.format(body['FileType']))

    try:
        # 'with' closes the csv file even if the redis call raises
        with open(csv_file_path, 'rb') as file_to_send:
            REDIS_CLIENT.setex('{}-csv'.format(body['CorrelationId']),
                               EXPIRATION_TIME, file_to_send.read())
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Can\'t send csv file to redis')
    # os.remove(csv_file_path)

    # publish 'feature vectors calculated' event to RabbitMQ
    feature_vectors_calculated = feature_vectors_calculated_message(body)
    feature_vectors_publisher = PurePublisher(FEATURE_VECTORS_CALCULATED)
    feature_vectors_publisher.publish(feature_vectors_calculated)

    return None
def function_wrapper(body):
    """
    Method which contain try/except blocks for function.
    If have not exceptions send report message to RabbitMQ queue
    defined in report_publisher MT constant.
    If run except block logging traceback message to RabbitMQ queue
    defined in fail_publisher MT constant.

    :param body: body of RabbitMQ message
    :return: function with exception handler
    """
    success_report = None
    failure_report = None

    try:
        # run the wrapped callback with the message body
        success_report = function(body)
    except Exception:
        # collect exception details for the error report
        caught_type, caught_value, caught_traceback = sys.exc_info()
        exception_message = 'Internal server error.'
        if caught_type.__name__ == Exception.__name__:
            # a plain Exception was raised deliberately: include its text
            exception_message += ' {}'.format(caught_value)
        else:
            exception_message += 'Unspecified server error.'
        # log error traceback
        logging_exception_message(self.logger)
        # build the failed message for publishing to the rabbitmq exchange
        failure_report = self.fail_message_constructor(
            exception_message, body)

    if success_report:
        if self.report_publisher:
            # publish report message to rabbitmq
            report_message_publisher = PurePublisher(self.report_publisher)
            report_message_publisher.publish(success_report)
        else:
            self.logger.info(
                'Have not report publisher, but callback is work')

    if failure_report:
        if self.fail_publisher:
            # publish failed message to rabbitmq
            fail_message_publisher = PurePublisher(self.fail_publisher)
            fail_message_publisher.publish(failure_report)
        else:
            self.logger.info(
                'Have not fail publisher, callback throw exception')
def process_sdf(body, sdf_as_bytes_array):
    """
    Build a feature vectors csv file from an sdf file given as raw bytes.
    Normalises fingerprint keys, converts the sdf molecules to a structured
    numpy array, prepends SMILES and InChiKey columns, appends a processing
    errors column and writes the result to a csv file in TEMP_FOLDER.
    Mutates body: sets 'Fingerprints', 'Structures', 'Columns', 'Failed'.

    :param body: RabbitMQ MT message's body
    :param sdf_as_bytes_array: sdf file content as bytes
    :return: path to the generated csv file
    :raises Exception: if dataframe creation or csv conversion fails
    """
    # normalise fingerprint dict keys to capitalized form ('type' -> 'Type')
    body['Fingerprints'] = [
        {key.capitalize(): value for key, value in fingerprint.items()}
        for fingerprint in body['Fingerprints']
    ]

    molecules = get_molecules_from_sdf_bytes(sdf_as_bytes_array)
    # sdf_to_csv reports per-molecule failures into this nested list
    errors_list = [[]]
    try:
        data_frame = sdf_to_csv(
            '', body['Fingerprints'], find_classes=True, find_values=True,
            molecules=molecules, processing_errors=errors_list)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Can\'t make dataframe using this sdf file')

    smiles_list = []
    inchies = []
    for molecule_number, molecule in enumerate(molecules):
        smiles = Chem.MolToSmiles(molecule, isomericSmiles=True)
        inchi = get_inchi_key(molecule)
        smiles_list.append(('SMILES', molecule_number, smiles))
        inchies.append(('InChiKey', molecule_number, inchi))

    # one shared structured dtype for all extra columns
    extra_column_dtype = [
        ('name', 'U17'), ('molecule_number', 'i4'), ('value', 'U40')]
    smiles_numpy_array = numpy.array(smiles_list, dtype=extra_column_dtype)
    inchi_numpy_array = numpy.array(inchies, dtype=extra_column_dtype)
    errors_numpy_array = numpy.array(errors_list[0], dtype=extra_column_dtype)

    data_frame = data_frame.astype(
        [('name', 'U10'), ('molecule_number', 'i4'), ('value', 'U40')])
    # prepend InChiKey then SMILES columns, append the errors column
    data_frame = numpy.insert(data_frame, 0, inchi_numpy_array, axis=1)
    data_frame = numpy.insert(data_frame, 0, smiles_numpy_array, axis=1)
    data_frame = numpy.insert(
        data_frame, data_frame.shape[1], errors_numpy_array, axis=1)

    csv_file_path = '{}/{}.csv'.format(TEMP_FOLDER, body['CorrelationId'])
    try:
        numpy_to_csv(data_frame, csv_file_path)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Can\'t convert dataframe to csv file')

    body['Structures'] = data_frame.shape[0]
    body['Columns'] = data_frame.shape[1]
    body['Failed'] = 0

    return csv_file_path
def callback(body):
    """
    Pika callback function used by single structure predictor.
    Make list of json with prediction data for each model prediction.

    :param body: RabbitMQ MT message's body
    """
    # start total prediction time counter
    start_prediction_total = time()
    models_data = []
    oauth = get_oauth()

    # try to reformat molecule, if using "\\" instead of "\" in mol string
    if '\\n' in body['Structure']:
        body['Structure'] = body['Structure'].replace('\\n', '\n')

    # set molecules converter function depends on input structure type
    molecules_converter_method = None
    if body['Format'] == 'MOL':
        molecules_converter_method = molecules_from_mol_strings
    elif body['Format'] == 'SMILES':
        molecules_converter_method = molecules_from_smiles

    # read molecules from input mol string or SMILES;
    # a conversion failure applies to every model prediction below
    molecules = None
    conversion_error_message = None
    try:
        molecules = molecules_converter_method([body['Structure']])
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        conversion_error_message = 'Get molecule from molstring exception'

    # get models ids, models blob ids, models buckets from input message
    fetch_token(oauth)
    models_ids, models_blob_ids, models_buckets = \
        get_models_from_body_message(body)

    # make prediction for all models
    for model_id, model_blob_id, model_bucket in zip(
            models_ids, models_blob_ids, models_buckets):
        # start current prediction counter
        start_current_prediction = time()
        start_timer = time()
        # reset the per-model error each iteration (keeping a molecule
        # conversion failure sticky); previously an exception from one
        # model leaked into all subsequent models' results
        exception_message = conversion_error_message
        # initialize prediction parameters
        prediction_parameters = dict()
        prediction_parameters['DatasetFileName'] = 'Single Structure Predict'
        prediction_parameters['Molecules'] = molecules

        # get current model info
        fetch_token(oauth)
        blob_model_info = get_model_info(oauth, model_blob_id, model_bucket)
        blob_model_info['ModelBlobId'] = model_blob_id
        blob_model_info['ModelBucket'] = model_bucket
        blob_model_info['ModelCode'] = algorithm_code_by_name(
            blob_model_info['Method'])

        # add prediction data to json
        # add training parameters
        # add dataset parameters
        predicted_data = {
            'id': model_id,
            'trainingParameters': training_parameters(blob_model_info),
            'property': property_parameters(blob_model_info),
            'dataset': dataset(blob_model_info),
            'reportId': str(uuid.uuid1())
        }
        fetch_token(oauth)
        LOGGER.info('INITIAL PREPARING: {} sec'.format(time() - start_timer))

        try:
            # update prediction parameters
            prepare_prediction_parameters(
                oauth, prediction_parameters, blob_model_info)
        except Exception:
            # log error traceback
            logging_exception_message(LOGGER)
            if not exception_message:
                exception_message = 'Prepare prediction parameters exception'

        start_timer = time()
        try:
            # make prediction
            ml_predictor = MLPredictor(prediction_parameters)
            ml_predictor.make_prediction()
            prediction = ml_predictor.prediction
        except Exception:
            # log error traceback
            logging_exception_message(LOGGER)
            if not exception_message:
                exception_message = 'Predictor exception'
        LOGGER.info('PREDICTION TIME: {} sec'.format(time() - start_timer))

        # add error to json if something wrong
        if exception_message:
            predicted_data['error'] = error(exception_message)
        else:
            predicted_data['result'] = result(prediction)
            predicted_data['applicabilityDomain'] = applicability_domain(
                prediction)

        # stop current prediction counter
        current_prediction_time_seconds = time() - start_current_prediction
        predicted_data['predictionElapsedTime'] = int(
            current_prediction_time_seconds * 1000)

        models_data.append(predicted_data)

    # stop total predictions counter
    total_prediction_time_seconds = time() - start_prediction_total
    body['Data'] = {
        'predictionElapsedTime': int(total_prediction_time_seconds * 1000),
        'models': models_data
    }

    # add prediction consensus data to sending message
    consensus_data = consensus(models_data)
    if consensus_data:
        body['Data'].update({'consensus': consensus_data})

    # send (publish) properties predicted message to OSDR
    prediction_created_event = single_structure_property_predicted(body)
    properties_predicted_publisher = PurePublisher(SINGLE_STRUCTURE_PREDICTED)
    properties_predicted_publisher.publish(prediction_created_event)

    return None
def generate_training_report(body):
    """
    Pika callback function used by training report generator.
    Make plots files, general metrics csv file and report file if success.

    :param body: RabbitMQ MT message's body
    :raises KeyError: if model/file info metadata keys are missing
    :raises TypeError: if a model carries no generic files blob ids
    :raises Exception: if posting generic data to blob storage fails
    """
    oauth = get_oauth()
    fetch_token(oauth)

    # define using variables for ml reporter
    model = body['Models'][0]
    model_blob_id = model['blobId']
    file_info = get_file_info_from_blob(oauth, model_blob_id).json()
    # metadata key casing differs between producers
    if 'ModelInfo' in file_info['metadata'].keys():
        info_key = 'ModelInfo'
    elif 'modelInfo' in file_info['metadata'].keys():
        info_key = 'modelInfo'
    else:
        raise KeyError('No model info')
    model_info = json.loads(file_info['metadata'][info_key])
    body['Bins'] = model_info['Bins']
    model_name = model_info['ModelName']
    model_type = model_info['ModelType']
    base_folder = '{}/general_training_report_{}'.format(
        TEMP_FOLDER, uuid.uuid1())
    make_directory(base_folder)
    LOGGER.info('MODEL INFO: {}'.format(model_info))
    LOGGER.info('MODEL NAME: {}'.format(model_name))
    LOGGER.info('MODEL TYPE: {}'.format(model_type))

    # generate general metrics dict
    all_models_metrics = []
    for model in body['Models']:
        # if something wrong with model file from blob storage
        if 'genericFiles' not in model.keys():
            raise TypeError('Empty model\'s generic files blob ids')
        for file_blob_id in model['genericFiles']:
            blob_file_info = get_file_info_from_blob(
                oauth, file_blob_id).json()
            # metadata key casing differs between producers
            if 'fileInfo' in blob_file_info['metadata'].keys():
                fileinfo_key = 'fileInfo'
            elif 'FileInfo' in blob_file_info['metadata'].keys():
                fileinfo_key = 'FileInfo'
            else:
                raise KeyError('No fileInfo key in: {}'.format(
                    blob_file_info['metadata'].keys()))
            file_info = json.loads(blob_file_info['metadata'][fileinfo_key])
            if 'fileType' in file_info.keys():
                filetype_key = 'fileType'
            elif 'FileType' in file_info.keys():
                filetype_key = 'FileType'
            else:
                filetype_key = None
            # collect metrics only from the training metrics csv files
            if filetype_key and file_info[filetype_key] == TRAINING_CSV_METRICS:
                csv_blob_id = file_blob_id
                csv_model_metrics = get_file_from_blob(
                    csv_blob_id, oauth).content
                all_models_metrics.append(csv_model_metrics.decode())
            LOGGER.info('CURRENT MODEL INFO: {}'.format(file_info))

    LOGGER.info('ALL MODELS METRICS: {}'.format(all_models_metrics))

    # write general metrics data to csv file
    csv_files_names = write_csvs_files(all_models_metrics)
    general_csv_dict = merge_csv_files(csv_files_names)
    rows = make_general_csv_rows(general_csv_dict)
    general_csv_file_path = write_rows_to_csv_file(rows, base_folder)
    metrics = html_metrics_from_dict(general_csv_dict)
    fetch_token(oauth)

    # make csv info for blob storage
    general_csv_info = {
        'FileInfo': json.dumps({
            'modelName': model_name,
            'fileType': ALL_MODELS_TRAINING_CSV_METRICS
        }),
        'SkipOsdrProcessing': 'true'
    }
    # make multipart object prepared to POST to blob storage
    # include csv file and file info
    multipart_general_csv = get_multipart_object(
        body, general_csv_file_path, 'text/csv',
        additional_fields=general_csv_info)
    # POST metrcis csv file to blob storage
    post_data_to_blob(oauth, multipart_general_csv)

    # create general images
    body['NumberOfGenericFiles'] = 0
    path_to_radar_plot = None
    try:
        if model_type == CLASSIFIER:
            LOGGER.info('Creating radar_plot')
            nbits = body['Bins']
            path_to_radar_plot = radar_plot(
                general_csv_file_path, base_folder, nbits,
                titlename=model_name)
            # make radar plot multipart encoded object
            multipart_radar_plot = get_multipart_object(
                body, path_to_radar_plot, 'image/png',
                additional_fields={'correlationId': body['CorrelationId']})
            # send, http POST request to blob storage api with radar plot
            post_data_to_blob(oauth, multipart_radar_plot)
            body['NumberOfGenericFiles'] += 1
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Post generic data exception')

    optimizer_metrics = REDIS_CLIENT.get('optimizer_metrics_{}'.format(
        body['CorrelationId']))
    if optimizer_metrics:
        optimizer_metrics = html_optimal_metrics_from_dict(
            json.loads(optimizer_metrics), model_type)

    # add metrics and images to pdf report file
    context = {
        'metrics': metrics,
        'radar_plots': [path_to_radar_plot],
        'optimizer': optimizer_metrics
    }
    pdf_path = make_pdf_report(base_folder, context, model_name='general')
    fetch_token(oauth)
    multipart_general_csv = get_multipart_object(
        body, pdf_path, 'application/pdf',
        additional_fields={'correlationId': body['CorrelationId']})
    post_data_to_blob(oauth, multipart_general_csv)
    body['NumberOfGenericFiles'] += 1

    # remove temporary directory
    shutil.rmtree(base_folder, ignore_errors=True)

    report_generated = training_report_generated_message(body)
    model_report_generated_message_publisher = PurePublisher(REPORT_GENERATED)
    model_report_generated_message_publisher.publish(report_generated)

    LOGGER.info('Report generated!')

    return None
def train_model(body):
    """
    Pika callback function used by ml modeller.
    Make plots files, metrics files and model file if success.

    :param body: RabbitMQ MT message's body
    :raises ValueError: if body['Method'] is empty or user input is invalid
    :raises Exception: if dataframe creation, training or posting fails
    """
    # prepared needed for calculations variables
    body['CurrentModelId'] = body['CorrelationId']
    oauth = get_oauth()
    if not body['Method']:
        raise ValueError('Empty method in model trainer')
    body['Method'] = body['Method'].lower()
    method_code = body['Method']
    body['Name'] = '_'.join(algorithm_name_by_code(method_code).split())

    # publish training started event to OSDR
    fetch_token(oauth)
    publish_start_training_event(body)

    # validate input data
    # raise error with invalid parameter
    validate_user_input(body)

    # make dataframe using input parameters
    model_type = model_type_by_code(method_code)
    try:
        dataframe = make_dataframe(model_type, body, oauth)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Make dataframe exception')

    # calculate scaler model
    model_trainer = make_model_trainer(method_code, body, dataframe)
    LOGGER.info('Sending scaler file to {}/{}'.format(BLOB_URL, CLIENT_ID))
    fetch_token(oauth)

    # make applicability domain files and calculate values
    LOGGER.info('Calculate applicability domain')
    model_trainer.make_applicability_domain()

    # generic files for current training model, except model (*.sav) files
    body['NumberOfGenericFiles'] = 0

    # train chosen model
    LOGGER.info('Start model training')
    try:
        model_training_data = conduct_training(
            model_trainer, method_code, body, oauth)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Model training exception')

    # POST model's files (images, csvs etc)
    try:
        # POST classifier model files
        if model_type == CLASSIFIER:
            post_classifier(model_trainer, body, oauth,
                            model_training_data['path_to_csv_file'])
        # POST regressor model files
        elif model_type == REGRESSOR:
            post_regressor(model_trainer, body, oauth)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Post generic data exception')

    # update template tags with all needed data for model's report
    model_trainer.make_report_text()

    # make pdf report for trained model
    pdf_path = make_pdf_report(model_trainer.sub_folder,
                               model_trainer.template_tags,
                               model_name=body['Name'])
    LOGGER.info('Sending pdf report to {}/{}'.format(BLOB_URL, CLIENT_ID))

    # POST pdf report to blob storage
    fetch_token(oauth)
    model_report_info = {
        'FileInfo': json.dumps({
            'modelName': model_trainer.cv_model.model_name,
            'fileType': MODEL_PDF_REPORT
        }),
        'ParentId': body['CurrentModelId']
    }
    multipart_pdf = get_multipart_object(
        body, pdf_path, 'application/pdf',
        additional_fields=model_report_info)
    post_data_to_blob(oauth, multipart_pdf)
    body['NumberOfGenericFiles'] += 1

    # remove temporary directory
    shutil.rmtree(model_trainer.sub_folder, ignore_errors=True)

    # send model trained message to OSDR via rabbitmq
    model_trained = model_trained_message(body, model_training_data,
                                          model_trainer)
    model_trained_message_publisher = PurePublisher(MODEL_TRAINED)
    model_trained_message_publisher.publish(model_trained)

    LOGGER.info('Finished Calculations!')

    return None