def calculate_feature_vectors(body):
    """
    Pika callback function used by feature vectors calculator.
    Reads the source file from Redis, calculates feature vectors for it
    and writes the result csv file back to Redis.

    :param body: RabbitMQ MT message's body
    """
    try:
        data_file_as_bytes_array = REDIS_CLIENT.get(
            '{}-file'.format(body['CorrelationId']))
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        error_message = 'Can\'t get sdf file from redis'
        raise Exception(error_message)

    if body['FileType'] == 'sdf':
        csv_file_path = process_sdf(body, data_file_as_bytes_array)
    elif body['FileType'] == 'cif':
        path_to_cif_file = '{}/{}.cif'.format(
            TEMP_FOLDER, body['CorrelationId'])
        with open(path_to_cif_file, 'wb') as cif_file:
            cif_file.write(data_file_as_bytes_array)

        # fingerprints type key may come in either casing
        if 'type' in body['Fingerprints'][0]:
            type_key = 'type'
        elif 'Type' in body['Fingerprints'][0]:
            type_key = 'Type'
        else:
            raise KeyError('No type key in fingerprints: {}'.format(
                body['Fingerprints']))

        descriptors = [x[type_key].strip() for x in body['Fingerprints']]
        csv_file_path = '{}/{}.csv'.format(TEMP_FOLDER, body['CorrelationId'])
        shape = generate_csv([path_to_cif_file], descriptors, csv_file_path)
        body['Structures'] = shape[0]
        body['Columns'] = shape[1]
        body['Failed'] = 0
    else:
        raise ValueError('Unknown file type: {}'.format(body['FileType']))

    try:
        with open(csv_file_path, 'rb') as file_to_send:
            REDIS_CLIENT.setex('{}-csv'.format(body['CorrelationId']),
                               EXPIRATION_TIME, file_to_send.read())
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        error_message = 'Can\'t send csv file to redis'
        raise Exception(error_message)

    # os.remove(csv_file_path)

    # send (publish) feature vectors calculated event to OSDR
    feature_vectors_calculated = feature_vectors_calculated_message(body)
    feature_vectors_publisher = PurePublisher(FEATURE_VECTORS_CALCULATED)
    feature_vectors_publisher.publish(feature_vectors_calculated)

    return None

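# A minimal sketch of the message body calculate_feature_vectors expects,
# reconstructed from the keys read above. The concrete values are
# hypothetical, not taken from a real OSDR message; other per-fingerprint
# fields may be present but are not read here.
EXAMPLE_FEATURE_VECTORS_BODY = {
    'CorrelationId': '00000000-0000-0000-0000-000000000000',
    'FileType': 'cif',  # or 'sdf'
    'Fingerprints': [
        # both 'type' and 'Type' key casings occur in practice
        {'Type': 'DESC'},
        {'type': 'ECFP'},
    ],
}
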
def publish_start_training_event(body):
    """
    Publish (send) start training event to OSDR via RabbitMQ.

    :param body: RabbitMQ MT message's body
    :type body: dict
    """
    # make publisher object
    start_training_publisher = PurePublisher(MODEL_TRAINING_STARTED)
    # create valid message to send
    message_body = model_training_start_message(body)
    # send message
    start_training_publisher.publish(message_body)

def function_wrapper(body):
    """
    Wrapper which contains the try/except blocks for the wrapped function.
    If no exception was raised, sends a report message to the RabbitMQ
    queue defined by the report_publisher MT constant.
    If an exception was raised, logs the traceback and sends a failure
    message to the RabbitMQ queue defined by the fail_publisher MT constant.

    :param body: body of RabbitMQ message
    :return: None
    """
    report_event_message = None
    report_failed_message = None
    try:
        # try to execute function code with arguments
        report_event_message = function(body)
    except Exception:
        # get all system info about exception
        exception_type, exception, traceback_text = sys.exc_info()
        # make error traceback message
        exception_message = 'Internal server error.'
        if exception_type.__name__ == Exception.__name__:
            # exception was raised explicitly, its message is safe to expose
            exception_message += ' {}'.format(exception)
        else:
            exception_message += ' Unspecified server error.'
        # log error traceback
        logging_exception_message(self.logger)
        # make failed message to publish to rabbitmq exchange
        report_failed_message = self.fail_message_constructor(
            exception_message, body)

    if report_event_message and self.report_publisher:
        # publish report message to rabbitmq
        report_message_publisher = PurePublisher(self.report_publisher)
        report_message_publisher.publish(report_event_message)
    elif report_event_message:
        self.logger.info('No report publisher set, but callback succeeded')

    if report_failed_message and self.fail_publisher:
        # publish failed message to rabbitmq
        fail_message_publisher = PurePublisher(self.fail_publisher)
        fail_message_publisher.publish(report_failed_message)
    elif report_failed_message:
        self.logger.info('No fail publisher set, callback threw an exception')

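# function_wrapper references `self` and `function`, so it is presumably the
# inner function of a callable decorator class. A minimal sketch of such an
# enclosing class, assuming the name ConsumerCallback and the constructor
# signature are hypothetical, not part of the original source:
class ConsumerCallback:
    def __init__(self, logger, report_publisher=None, fail_publisher=None,
                 fail_message_constructor=None):
        self.logger = logger
        self.report_publisher = report_publisher
        self.fail_publisher = fail_publisher
        self.fail_message_constructor = fail_message_constructor

    def __call__(self, function):
        # close over `function` and `self`,
        # producing the function_wrapper defined above
        def function_wrapper(body):
            ...  # body as defined above
        return function_wrapper
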
def publish_thumbnail_generated(training_parameters, thumbnail_blob_id):
    """
    Publish (send) thumbnail generated event to OSDR.
    Sends the thumbnail blob id and model training parameters (model id etc).

    :param training_parameters: model training parameters
    :param thumbnail_blob_id: model thumbnail image's blob id
    :type training_parameters: dict
    """
    # generate message body
    thumbnail_generated = thumbnail_generated_message(
        training_parameters, thumbnail_blob_id)
    # make publisher object
    model_thumbnail_generated_publisher = PurePublisher(
        MODEL_THUMBNAIL_GENERATED)
    # send (publish) rabbitmq message, thumbnail generated event
    model_thumbnail_generated_publisher.publish(thumbnail_generated)
    # set thumbnail generated flag,
    # used in the thumbnail generated rabbitmq message to OSDR
    training_parameters['IsThumbnailGenerated'] = True

def callback(body): """ Pika callback function used by single structure predictor. Make list of json with prediction data for each model prediction. :param body: RabbitMQ MT message's body """ # start total prediction time counter start_prediction_total = time() models_data = [] oauth = get_oauth() # try to reformat molecule, if using "\\" instead of "\" in mol string if '\\n' in body['Structure']: body['Structure'] = body['Structure'].replace('\\n', '\n') # set molecules converter function depends on input structure type molecules_converter_method = None if body['Format'] == 'MOL': molecules_converter_method = molecules_from_mol_strings elif body['Format'] == 'SMILES': molecules_converter_method = molecules_from_smiles # read molecules from input mol string or SMILES molecules = None exception_message = None try: molecules = molecules_converter_method([body['Structure']]) except: # log error traceback logging_exception_message(LOGGER) exception_message = 'Get molecule from molstring exception' # get models ids, models blob ids, models buckets from input message fetch_token(oauth) model_id, models_blob_ids, models_buckets = get_models_from_body_message( body) # make prediction for all models for model_id, model_blob_id, model_bucket in zip(model_id, models_blob_ids, models_buckets): # start current prediction counter start_current_prediction = time() start_timer = time() # define exception message for current prediction if not exception_message: exception_message = None # initialize prediction parameters prediction_parameters = dict() prediction_parameters['DatasetFileName'] = 'Single Structure Predict' prediction_parameters['Molecules'] = molecules # get current model info fetch_token(oauth) blob_model_info = get_model_info(oauth, model_blob_id, model_bucket) blob_model_info['ModelBlobId'] = model_blob_id blob_model_info['ModelBucket'] = model_bucket blob_model_info['ModelCode'] = algorithm_code_by_name( blob_model_info['Method']) # add prediction data to json # add training parameters # add dataset parameters predicted_data = { 'id': model_id, 'trainingParameters': training_parameters(blob_model_info), 'property': property_parameters(blob_model_info), 'dataset': dataset(blob_model_info), 'reportId': str(uuid.uuid1()) } fetch_token(oauth) LOGGER.info('INITIAL PREPARING: {} sec'.format(time() - start_timer)) try: # update prediction parameters prepare_prediction_parameters(oauth, prediction_parameters, blob_model_info) except: # log error traceback logging_exception_message(LOGGER) if not exception_message: exception_message = 'Prepare prediction parameters exception' start_timer = time() try: # make prediction ml_predictor = MLPredictor(prediction_parameters) ml_predictor.make_prediction() prediction = ml_predictor.prediction except: # log error traceback logging_exception_message(LOGGER) if not exception_message: exception_message = 'Predictor exception' LOGGER.info('PREDICTION TIME: {} sec'.format(time() - start_timer)) # add error to json if something wrong if exception_message: predicted_data['error'] = error(exception_message) else: predicted_data['result'] = result(prediction) predicted_data['applicabilityDomain'] = applicability_domain( prediction) # stop current prediction counter current_prediction_time_seconds = time() - start_current_prediction predicted_data['predictionElapsedTime'] = int( current_prediction_time_seconds * 1000) models_data.append(predicted_data) # stop total predictions counter total_prediction_time_seconds = time() - start_prediction_total body['Data'] = { 
'predictionElapsedTime': int(total_prediction_time_seconds * 1000), 'models': models_data } # add prediction consensus data to sending message consensus_data = consensus(models_data) if consensus_data: body['Data'].update({'consensus': consensus_data}) # send (publish) properties predicted message to OSDR prediction_created_event = single_structure_property_predicted(body) properties_predicted_publisher = PurePublisher(SINGLE_STRUCTURE_PREDICTED) properties_predicted_publisher.publish(prediction_created_event) return None
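# A sketch of one per-model entry appended to models_data above, with the
# keys taken directly from the code; all concrete values are hypothetical
# placeholders, and the nested dicts stand in for the outputs of the
# training_parameters, property_parameters, dataset, result and
# applicability_domain helpers.
EXAMPLE_PREDICTED_DATA = {
    'id': 'model id from the incoming message',
    'trainingParameters': {},      # training_parameters(blob_model_info)
    'property': {},                # property_parameters(blob_model_info)
    'dataset': {},                 # dataset(blob_model_info)
    'reportId': 'uuid1 string',
    # either 'result' and 'applicabilityDomain' on success, or 'error'
    'result': {},
    'applicabilityDomain': {},
    'predictionElapsedTime': 42,   # milliseconds
}
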
def generate_training_report(body):
    """
    Pika callback function used by training report generator.
    Makes plots files, general metrics csv file and report file if success.

    :param body: RabbitMQ MT message's body
    """
    oauth = get_oauth()
    fetch_token(oauth)

    # define variables used by ml reporter
    model = body['Models'][0]
    model_blob_id = model['blobId']
    file_info = get_file_info_from_blob(oauth, model_blob_id).json()
    # model info key may come in either casing
    if 'ModelInfo' in file_info['metadata']:
        info_key = 'ModelInfo'
    elif 'modelInfo' in file_info['metadata']:
        info_key = 'modelInfo'
    else:
        raise KeyError('No model info')
    model_info = json.loads(file_info['metadata'][info_key])
    body['Bins'] = model_info['Bins']
    model_name = model_info['ModelName']
    model_type = model_info['ModelType']
    base_folder = '{}/general_training_report_{}'.format(
        TEMP_FOLDER, uuid.uuid1())
    make_directory(base_folder)

    LOGGER.info('MODEL INFO: {}'.format(model_info))
    LOGGER.info('MODEL NAME: {}'.format(model_name))
    LOGGER.info('MODEL TYPE: {}'.format(model_type))

    # generate general metrics dict
    all_models_metrics = []
    for model in body['Models']:
        # if something is wrong with the model file from blob storage
        if 'genericFiles' not in model:
            raise TypeError('Empty model\'s generic files blob ids')

        for file_blob_id in model['genericFiles']:
            file_info = get_file_info_from_blob(oauth, file_blob_id).json()
            if 'fileInfo' in file_info['metadata']:
                fileinfo_key = 'fileInfo'
            elif 'FileInfo' in file_info['metadata']:
                fileinfo_key = 'FileInfo'
            else:
                raise KeyError('No fileInfo key in: {}'.format(
                    file_info['metadata'].keys()))
            file_info = json.loads(file_info['metadata'][fileinfo_key])

            if 'fileType' in file_info:
                filetype_key = 'fileType'
            elif 'FileType' in file_info:
                filetype_key = 'FileType'
            else:
                filetype_key = None

            if filetype_key and file_info[filetype_key] == TRAINING_CSV_METRICS:
                csv_blob_id = file_blob_id
                csv_model_metrics = get_file_from_blob(
                    csv_blob_id, oauth).content
                all_models_metrics.append(csv_model_metrics.decode())

            LOGGER.info('CURRENT MODEL INFO: {}'.format(file_info))

    LOGGER.info('ALL MODELS METRICS: {}'.format(all_models_metrics))

    # write general metrics data to csv file
    csv_files_names = write_csvs_files(all_models_metrics)
    general_csv_dict = merge_csv_files(csv_files_names)
    rows = make_general_csv_rows(general_csv_dict)
    general_csv_file_path = write_rows_to_csv_file(rows, base_folder)
    metrics = html_metrics_from_dict(general_csv_dict)
    fetch_token(oauth)
    # make csv info for blob storage
    general_csv_info = {
        'FileInfo': json.dumps({
            'modelName': model_name,
            'fileType': ALL_MODELS_TRAINING_CSV_METRICS
        }),
        'SkipOsdrProcessing': 'true'
    }
    # make multipart object prepared to POST to blob storage,
    # includes csv file and file info
    multipart_general_csv = get_multipart_object(
        body, general_csv_file_path, 'text/csv',
        additional_fields=general_csv_info)
    # POST metrics csv file to blob storage
    post_data_to_blob(oauth, multipart_general_csv)

    # create general images
    body['NumberOfGenericFiles'] = 0
    path_to_radar_plot = None
    try:
        if model_type == CLASSIFIER:
            LOGGER.info('Creating radar_plot')
            nbits = body['Bins']
            path_to_radar_plot = radar_plot(
                general_csv_file_path, base_folder, nbits,
                titlename=model_name)
            # make radar plot multipart encoded object
            multipart_radar_plot = get_multipart_object(
                body, path_to_radar_plot, 'image/png',
                additional_fields={'correlationId': body['CorrelationId']})
            # send http POST request to blob storage api with radar plot
            post_data_to_blob(oauth, multipart_radar_plot)
            body['NumberOfGenericFiles'] += 1
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Post generic data exception')

    optimizer_metrics = REDIS_CLIENT.get(
        'optimizer_metrics_{}'.format(body['CorrelationId']))
    if optimizer_metrics:
        optimizer_metrics = html_optimal_metrics_from_dict(
            json.loads(optimizer_metrics), model_type)

    # add metrics and images to pdf report file
    context = {
        'metrics': metrics,
        'radar_plots': [path_to_radar_plot],
        'optimizer': optimizer_metrics
    }
    pdf_path = make_pdf_report(base_folder, context, model_name='general')
    fetch_token(oauth)
    multipart_general_csv = get_multipart_object(
        body, pdf_path, 'application/pdf',
        additional_fields={'correlationId': body['CorrelationId']})
    post_data_to_blob(oauth, multipart_general_csv)
    body['NumberOfGenericFiles'] += 1

    # remove temporary directory
    shutil.rmtree(base_folder, ignore_errors=True)

    report_generated = training_report_generated_message(body)
    model_report_generated_message_publisher = PurePublisher(REPORT_GENERATED)
    model_report_generated_message_publisher.publish(report_generated)

    LOGGER.info('Report generated!')

    return None

def callback(body): """ Pika callback function used by ml predictor. Make file with predicted properties by picked model. Send file to blob storage for OSDR :param body: RabbitMQ MT message's body """ oauth = get_oauth() prediction_parameters = dict() fetch_token(oauth) # update prediction parameters # add dataset as bytes # add dataset file name, just for report message prediction_parameters['Dataset'], prediction_parameters[ 'DatasetFileName'] = get_dataset(oauth, body) model_info = get_model_info(oauth, body['ModelBlobId'], body['ModelBucket']) fetch_token(oauth) model_info['ModelBlobId'] = body['ModelBlobId'] model_info['ModelBucket'] = body['ModelBucket'] model_info['ModelCode'] = algorithm_code_by_name(model_info['Method']) # update prediction parameters with model paramers, such as density matrix, # distance model, MODI, fingerprints etc prepare_prediction_parameters(oauth, prediction_parameters, model_info) # update prediction parameters # add list of molecules prediction_parameters['Molecules'] = get_molecules_from_sdf_bytes( prediction_parameters['Dataset']) # define predictor object using prediction parameters # make prediction for all molecules ml_predictor = MLPredictor(prediction_parameters) ml_predictor.make_prediction() # send prediction result to OSDR # write prediction to csv prediction_csv_path = ml_predictor.write_prediction_to_csv() fetch_token(oauth) # prepare multipart object multipart_csv = get_multipart_object(body, prediction_csv_path, 'text/csv') # POST data to blob storage response_csv = post_data_to_blob(oauth, multipart_csv) # get prediction blob id and publish message in properties predicted queue prediction_created_event = property_predicted( body, os.path.basename(prediction_csv_path), response_csv.json()[0]) properties_predicted_publisher = PurePublisher(PROPERTIES_PREDICTED) properties_predicted_publisher.publish(prediction_created_event) # remove prediction file from temporary folder os.remove(prediction_csv_path) # remove temporary models folder shutil.rmtree(prediction_parameters['ModelsFolder'], ignore_errors=True) # clear memory del MODELS_IN_MEMORY_CACHE[model_info['ModelBlobId']] return None
def train_model(body):
    """
    Pika callback function used by ml modeller.
    Makes plots files, metrics files and model file if success.

    :param body: RabbitMQ MT message's body
    """
    # prepare variables needed for calculations
    body['CurrentModelId'] = body['CorrelationId']
    oauth = get_oauth()
    if not body['Method']:
        raise ValueError('Empty method in model trainer')
    body['Method'] = body['Method'].lower()
    method_code = body['Method']
    body['Name'] = '_'.join(algorithm_name_by_code(method_code).split())
    # publish training started event to OSDR
    fetch_token(oauth)
    publish_start_training_event(body)
    # validate input data, raise error on invalid parameter
    validate_user_input(body)
    # make dataframe using input parameters
    model_type = model_type_by_code(method_code)
    try:
        dataframe = make_dataframe(model_type, body, oauth)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Make dataframe exception')
    # calculate scaler model
    model_trainer = make_model_trainer(method_code, body, dataframe)
    LOGGER.info('Sending scaler file to {}/{}'.format(BLOB_URL, CLIENT_ID))
    fetch_token(oauth)
    # make applicability domain files and calculate values
    LOGGER.info('Calculate applicability domain')
    model_trainer.make_applicability_domain()
    # count of generic files for current training model,
    # except model (*.sav) files
    body['NumberOfGenericFiles'] = 0
    # train chosen model
    LOGGER.info('Start model training')
    try:
        model_training_data = conduct_training(model_trainer, method_code,
                                               body, oauth)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Model training exception')
    # POST model's files (images, csvs etc)
    try:
        if model_type == CLASSIFIER:
            # POST classifier model files
            post_classifier(model_trainer, body, oauth,
                            model_training_data['path_to_csv_file'])
        elif model_type == REGRESSOR:
            # POST regressor model files
            post_regressor(model_trainer, body, oauth)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Post generic data exception')
    # update template tags with all data needed for model's report
    model_trainer.make_report_text()
    # make pdf report for trained model
    pdf_path = make_pdf_report(model_trainer.sub_folder,
                               model_trainer.template_tags,
                               model_name=body['Name'])
    LOGGER.info('Sending pdf report to {}/{}'.format(BLOB_URL, CLIENT_ID))
    # POST pdf report to blob storage
    fetch_token(oauth)
    model_report_info = {
        'FileInfo': json.dumps({
            'modelName': model_trainer.cv_model.model_name,
            'fileType': MODEL_PDF_REPORT
        }),
        'ParentId': body['CurrentModelId']
    }
    multipart_pdf = get_multipart_object(body, pdf_path, 'application/pdf',
                                         additional_fields=model_report_info)
    post_data_to_blob(oauth, multipart_pdf)
    body['NumberOfGenericFiles'] += 1
    # remove temporary directory
    shutil.rmtree(model_trainer.sub_folder, ignore_errors=True)
    # send model trained message to OSDR via rabbitmq
    model_trained = model_trained_message(body, model_training_data,
                                          model_trainer)
    model_trained_message_publisher = PurePublisher(MODEL_TRAINED)
    model_trained_message_publisher.publish(model_trained)
    LOGGER.info('Finished Calculations!')

    return None

def find_optimal_parameters(body):
    """
    Pika callback function used by ml optimizer.
    Finds the optimal training fingerprints set for the input dataset,
    using only 1000 (by default) or fewer structures from it.
    Sends the overall optimizing result to Redis,
    to use it in the ml training report.

    :param body: RabbitMQ MT message's body
    :type body: dict
    """
    oauth = get_oauth()
    # check input methods
    if not body['Methods']:
        raise ValueError('Empty Methods')

    # calculate metrics for each fingerprints set
    metrics, target_metric = fingerprints_grid_search(
        oauth, body, BASE_FINGERPRINTS)
    # send all metrics to redis, later used in the training report
    REDIS_CLIENT.setex(
        'optimizer_metrics_{}'.format(body['CorrelationId']),
        EXPIRATION_TIME, json.dumps(metrics)
    )
    # find best fingerprints set
    optimal_fingerprints = sorted(
        metrics.values(),
        key=lambda value: value['metrics'][target_metric],
        reverse=True
    )[0]['fptype']

    # set other default 'optimal' parameters for training model
    body['SubSampleSize'] = 1.0
    body['TestDataSize'] = 0.3
    body['Scaler'] = 'MinMax'
    body['KFold'] = 5
    body['Fingerprints'] = optimal_fingerprints
    body['OptimizationMethod'] = 'default'
    body['NumberOfIterations'] = 100

    # make optimizer metrics csv and post it to blob storage
    formatted_metrics = TMP_TMP(
        metrics, model_type_by_code(body['Methods'][0].lower()))
    csv_path = '{}/ml_optimizer/{}/optimizing.csv'.format(
        TEMP_FOLDER, body['CorrelationId'])
    write_optimized_metrics_to_csv(formatted_metrics, csv_path)
    multipart_model = get_multipart_object(
        body, csv_path, 'application/x-spss-sav',
        additional_fields={'ParentId': body['TargetFolderId']}
    )
    # send optimizer metrics csv file to blob storage
    fetch_token(oauth)
    response = post_data_to_blob(oauth, multipart_model)
    LOGGER.info('Optimizer csv status code: {}'.format(response.status_code))

    # send best fingerprints set and 'optimal' parameters to training model
    training_optimized = model_training_optimized(body)
    training_optimized_message_publisher = PurePublisher(TRAINING_OPTIMIZED)
    training_optimized_message_publisher.publish(training_optimized)

    # clear current optimization folder
    shutil.rmtree(
        '{}/ml_optimizer/{}'.format(TEMP_FOLDER, body['CorrelationId']),
        ignore_errors=True
    )
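
# A sketch of the metrics structure the best-fingerprints selection above
# assumes, reconstructed from the sorted(...) call: each entry carries a
# per-metric dict (indexed by target_metric) and the fingerprint set
# ('fptype') that produced it. Metric names and values are hypothetical.
EXAMPLE_OPTIMIZER_METRICS = {
    'fingerprints_set_0': {
        'metrics': {'AUC': 0.81, 'ACC': 0.77},
        'fptype': [{'Type': 'DESC'}],
    },
    'fingerprints_set_1': {
        'metrics': {'AUC': 0.86, 'ACC': 0.80},
        'fptype': [{'Type': 'ECFP'}],
    },
}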