def test_posting_data_to_blob(self):
    """
    Test for the post_data_to_blob method, which the ML services
    use heavily
    """
    # log test start
    LOGGER.info('Post data to blob test start')
    # create test file in the file system
    data_file_path = os.path.join(self.temp_folder, self.test_file_name)
    with open(data_file_path, 'a') as test_file:
        test_file.write('123 312 222\n')
    # make metadata for POST
    message_object = {
        'ParentId': self.parent_id,
        'UserId': self.user_id
    }
    # content type of the file data to POST
    data_file_type = 'text/txt'
    # make multipart object, which simplifies POSTing
    multipart_object = get_multipart_object(
        message_object, data_file_path, data_file_type)
    # POST data to blob storage
    fetch_token(self.oauth)
    response = post_data_to_blob(self.oauth, multipart_object)
    # a successful POST returns status code 200
    self.assertEqual(200, response.status_code)
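# A minimal usage sketch of the flow verified by the test above, outside
# the TestCase context. The oauth session, parent/user ids and file path
# are assumptions here; in the services they come from configuration.
def _example_post_file_to_blob(oauth, parent_id, user_id, file_path):
    metadata = {'ParentId': parent_id, 'UserId': user_id}
    multipart_object = get_multipart_object(metadata, file_path, 'text/txt')
    fetch_token(oauth)
    # returns a requests-style response; 200 means the blob was stored
    return post_data_to_blob(oauth, multipart_object)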
def post_classifier(classifier, training_parameters, oauth, metrics_path):
    """
    Method that POSTs classifier-specific model files, such as the
    radar plot, to blob storage

    :param classifier: model trainer object
    :param training_parameters: model training parameters
    :param oauth: OAuth2Session object used in the ML service
    :param metrics_path: path to the model's metrics csv file
    :type classifier: ClassicClassifier, DNNClassifier
    :type training_parameters: dict
    """
    LOGGER.info('Creating radar_plot')
    # get nbits number from the model trainer
    nbits = classifier.bins
    # generate the radar plot image and get the path to it
    path_to_radar_plot = radar_plot(
        metrics_path, classifier.sub_folder, nbits,
        titlename=training_parameters['Name'])
    # update model trainer template tags, used when creating the pdf report
    classifier.template_tags['radar_plots'].append(path_to_radar_plot)
    # generate radar plot info for blob storage
    radar_plot_info = {
        'FileInfo': json.dumps({
            'modelName': classifier.cv_model.model_name,
        }),
        'ParentId': training_parameters['CurrentModelId']
    }
    # make radar plot multipart encoded object
    multipart_radar_plot = get_multipart_object(
        training_parameters, path_to_radar_plot, 'image/png',
        additional_fields=radar_plot_info)
    # log sending of the multipart encoded object
    LOGGER.info('Sending radar plot to {}/{}'.format(BLOB_URL, CLIENT_ID))
    training_parameters['NumberOfGenericFiles'] += 1
    # send http POST request with the radar plot to the blob storage api
    post_data_to_blob(oauth, multipart_radar_plot)
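# A hypothetical sketch of the training_parameters fields post_classifier
# relies on; the keys are taken from the code above, the values are
# placeholders only.
_EXAMPLE_CLASSIFIER_TRAINING_PARAMETERS = {
    'Name': 'naive_bayes',             # used as the radar plot title
    'CurrentModelId': '<model-guid>',  # becomes the blob's ParentId
    'NumberOfGenericFiles': 0,         # incremented per posted file
}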
def post_thumbnail(model_trainer, training_parameters, oauth):
    """
    Method that sends (publishes) a message with the model's thumbnail
    data to OSDR

    :param model_trainer: model trainer object
    :param training_parameters: model training parameters
    :param oauth: OAuth2Session object used in the ML service
    :type model_trainer: ClassicClassifier, ClassicRegressor, DNNClassifier, DNNRegressor
    :type training_parameters: dict
    """
    # get path to the thumbnail image from the model trainer
    path_to_thumbnail = model_trainer.plots['mean']['thumbnail_plot_path']
    # generate thumbnail info (used by blob storage)
    thumbnail_info = {
        'FileInfo': json.dumps({
            'modelName': model_trainer.cv_model.model_name,
            'fileType': THUMBNAIL_IMAGE
        }),
        'SkipOsdrProcessing': 'true',
        'ParentId': training_parameters['CurrentModelId']
    }
    # prepare the thumbnail multipart object to send
    multipart_thumbnail = get_multipart_object(
        training_parameters, path_to_thumbnail, 'image/png',
        additional_fields=thumbnail_info)
    # POST the prepared thumbnail object (image and info) to blob storage
    # and get the blob storage response
    response = post_data_to_blob(oauth, multipart_thumbnail)
    # send (publish) the thumbnail blob id to OSDR via rabbitmq
    thumbnail_blob_id = response.json()[0]
    publish_thumbnail_generated(training_parameters, thumbnail_blob_id)
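# A defensive variant of the final POST step above, sketched under the
# assumption (used elsewhere in this module) that post_data_to_blob
# returns a requests-style response whose JSON body is a list of the
# created blob ids.
def _example_post_thumbnail_checked(oauth, multipart_thumbnail):
    response = post_data_to_blob(oauth, multipart_thumbnail)
    if response.status_code != 200:
        raise RuntimeError(
            'Thumbnail POST failed with status {}'.format(
                response.status_code))
    # blob id of the stored thumbnail image
    return response.json()[0]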
def post_regressor(regressor, training_parameters, oauth):
    """
    Method that POSTs regressor-specific model files, such as the
    distribution plot, to blob storage

    :param regressor: model trainer object
    :param training_parameters: model training parameters
    :param oauth: OAuth2Session object used in the ML service
    :type regressor: ClassicRegressor, DNNRegressor
    :type training_parameters: dict
    """
    LOGGER.info('Creating distribution_plot')
    # generate the distribution plot and get the path to it
    path_to_distribution = distribution_plot(
        regressor, model_name=training_parameters['Name'])
    # generate distribution plot info for blob storage
    distribution_plot_info = {
        'FileInfo': json.dumps({
            'modelName': regressor.cv_model.model_name,
        }),
        'ParentId': training_parameters['CurrentModelId']
    }
    # make distribution plot multipart encoded object
    multipart_distribution_plot = get_multipart_object(
        training_parameters, path_to_distribution, 'image/png',
        additional_fields=distribution_plot_info)
    # log sending of the multipart encoded object
    LOGGER.info('Sending distribution plot to {}/{}'.format(
        BLOB_URL, CLIENT_ID))
    training_parameters['NumberOfGenericFiles'] += 1
    # send http POST request with the distribution plot to the blob storage api
    post_data_to_blob(oauth, multipart_distribution_plot)
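# The 'FileInfo' metadata field is stored as a JSON string and decoded
# again when the report generator reads the blob metadata back (see
# generate_training_report below). A minimal sketch of that round trip
# with a placeholder model name:
def _example_file_info_round_trip():
    encoded = json.dumps({'modelName': 'example_model'})
    decoded = json.loads(encoded)
    return decoded['modelName']  # -> 'example_model'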
def get_blob_id(test_object):
    """
    Method that adds a file to blob storage and returns the file's blob id

    :param test_object: TestCase class object
    :return: saved file's blob id
    """
    fetch_token(test_object.oauth)
    # make the file's metadata
    metadata = {
        'ParentId': test_object.parent_id,
        'UserId': test_object.user_id
    }
    # path to the test file
    path_to_file = test_object.test_file
    # prepare multipart object to POST
    multipart_file = get_multipart_object(metadata, path_to_file, 'text/sdf')
    # POST file with metadata to blob storage
    response = post_data_to_blob(
        test_object.oauth, multipart_file, bucket_id=test_object.user_id)

    return response.json()[0]
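# Hypothetical usage: upload the test file via get_blob_id, then read its
# metadata back from blob storage. Both helpers exist in this module; the
# test_object attributes mirror the ones get_blob_id reads.
def _example_fetch_uploaded_file_info(test_object):
    blob_id = get_blob_id(test_object)
    fetch_token(test_object.oauth)
    return get_file_info_from_blob(test_object.oauth, blob_id).json()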
def generate_training_report(body):
    """
    Pika callback function used by the training report generator.
    Makes plot files, a general metrics csv file and a report file
    on success.

    :param body: RabbitMQ MT message's body
    """
    oauth = get_oauth()
    fetch_token(oauth)
    # define variables used by the ml reporter
    model = body['Models'][0]
    model_blob_id = model['blobId']
    file_info = get_file_info_from_blob(oauth, model_blob_id).json()
    if 'ModelInfo' in file_info['metadata']:
        info_key = 'ModelInfo'
    elif 'modelInfo' in file_info['metadata']:
        info_key = 'modelInfo'
    else:
        raise KeyError('No model info')
    model_info = json.loads(file_info['metadata'][info_key])
    body['Bins'] = model_info['Bins']
    model_name = model_info['ModelName']
    model_type = model_info['ModelType']
    base_folder = '{}/general_training_report_{}'.format(
        TEMP_FOLDER, uuid.uuid1())
    make_directory(base_folder)
    LOGGER.info('MODEL INFO: {}'.format(model_info))
    LOGGER.info('MODEL NAME: {}'.format(model_name))
    LOGGER.info('MODEL TYPE: {}'.format(model_type))
    # generate general metrics dict
    all_models_metrics = []
    for model in body['Models']:
        # fail if the model file from blob storage lacks generic files
        if 'genericFiles' not in model:
            raise TypeError('Empty model\'s generic files blob ids')
        for file_blob_id in model['genericFiles']:
            file_info = get_file_info_from_blob(oauth, file_blob_id).json()
            if 'fileInfo' in file_info['metadata']:
                fileinfo_key = 'fileInfo'
            elif 'FileInfo' in file_info['metadata']:
                fileinfo_key = 'FileInfo'
            else:
                raise KeyError('No fileInfo key in: {}'.format(
                    file_info['metadata'].keys()))
            file_info = json.loads(file_info['metadata'][fileinfo_key])
            if 'fileType' in file_info:
                filetype_key = 'fileType'
            elif 'FileType' in file_info:
                filetype_key = 'FileType'
            else:
                filetype_key = None
            if filetype_key and file_info[filetype_key] == TRAINING_CSV_METRICS:
                csv_blob_id = file_blob_id
                csv_model_metrics = get_file_from_blob(csv_blob_id, oauth).content
                all_models_metrics.append(csv_model_metrics.decode())
            LOGGER.info('CURRENT MODEL INFO: {}'.format(file_info))
    LOGGER.info('ALL MODELS METRICS: {}'.format(all_models_metrics))
    # write general metrics data to a csv file
    csv_files_names = write_csvs_files(all_models_metrics)
    general_csv_dict = merge_csv_files(csv_files_names)
    rows = make_general_csv_rows(general_csv_dict)
    general_csv_file_path = write_rows_to_csv_file(rows, base_folder)
    metrics = html_metrics_from_dict(general_csv_dict)
    fetch_token(oauth)
    # make csv info for blob storage
    general_csv_info = {
        'FileInfo': json.dumps({
            'modelName': model_name,
            'fileType': ALL_MODELS_TRAINING_CSV_METRICS
        }),
        'SkipOsdrProcessing': 'true'
    }
    # make multipart object prepared to POST to blob storage;
    # it includes the csv file and the file info
    multipart_general_csv = get_multipart_object(
        body, general_csv_file_path, 'text/csv',
        additional_fields=general_csv_info)
    # POST metrics csv file to blob storage
    post_data_to_blob(oauth, multipart_general_csv)
    # create general images
    body['NumberOfGenericFiles'] = 0
    path_to_radar_plot = None
    try:
        if model_type == CLASSIFIER:
            LOGGER.info('Creating radar_plot')
            nbits = body['Bins']
            path_to_radar_plot = radar_plot(
                general_csv_file_path, base_folder, nbits,
                titlename=model_name)
            # make radar plot multipart encoded object
            multipart_radar_plot = get_multipart_object(
                body, path_to_radar_plot, 'image/png',
                additional_fields={'correlationId': body['CorrelationId']})
            # send http POST request with the radar plot to the blob storage api
            post_data_to_blob(oauth, multipart_radar_plot)
            body['NumberOfGenericFiles'] += 1
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Post generic data exception')
    optimizer_metrics = REDIS_CLIENT.get(
        'optimizer_metrics_{}'.format(body['CorrelationId']))
    if optimizer_metrics:
        optimizer_metrics = html_optimal_metrics_from_dict(
            json.loads(optimizer_metrics), model_type)
    # add metrics and images to the pdf report file
    context = {
        'metrics': metrics,
        'radar_plots': [path_to_radar_plot],
        'optimizer': optimizer_metrics
    }
    pdf_path = make_pdf_report(base_folder, context, model_name='general')
    fetch_token(oauth)
    # prepare the general pdf report multipart object and POST it
    multipart_general_pdf = get_multipart_object(
        body, pdf_path, 'application/pdf',
        additional_fields={'correlationId': body['CorrelationId']})
    post_data_to_blob(oauth, multipart_general_pdf)
    body['NumberOfGenericFiles'] += 1
    # remove temporary directory
    shutil.rmtree(base_folder, ignore_errors=True)
    # send report generated message to OSDR via rabbitmq
    report_generated = training_report_generated_message(body)
    model_report_generated_message_publisher = PurePublisher(REPORT_GENERATED)
    model_report_generated_message_publisher.publish(report_generated)
    LOGGER.info('Report generated!')

    return None
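# The repeated casing checks above ('ModelInfo'/'modelInfo',
# 'FileInfo'/'fileInfo') could be factored into a single helper; a sketch
# only, not currently used by generate_training_report:
def _get_metadata_value(metadata, key):
    # try the PascalCase key first, then its camelCase variant
    for candidate in (key, key[0].lower() + key[1:]):
        if candidate in metadata:
            return json.loads(metadata[candidate])
    raise KeyError('No {} key in: {}'.format(key, list(metadata.keys())))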
def callback(body):
    """
    Pika callback function used by the ml predictor.
    Makes a file with properties predicted by the picked model and sends
    the file to blob storage for OSDR.

    :param body: RabbitMQ MT message's body
    """
    oauth = get_oauth()
    prediction_parameters = dict()
    fetch_token(oauth)
    # update prediction parameters:
    # add the dataset as bytes and the dataset file name
    # (the latter is used only for the report message)
    prediction_parameters['Dataset'], prediction_parameters[
        'DatasetFileName'] = get_dataset(oauth, body)
    model_info = get_model_info(oauth, body['ModelBlobId'], body['ModelBucket'])
    fetch_token(oauth)
    model_info['ModelBlobId'] = body['ModelBlobId']
    model_info['ModelBucket'] = body['ModelBucket']
    model_info['ModelCode'] = algorithm_code_by_name(model_info['Method'])
    # update prediction parameters with model parameters, such as the
    # density matrix, distance model, MODI, fingerprints etc
    prepare_prediction_parameters(oauth, prediction_parameters, model_info)
    # update prediction parameters: add the list of molecules
    prediction_parameters['Molecules'] = get_molecules_from_sdf_bytes(
        prediction_parameters['Dataset'])
    # define the predictor object using prediction parameters
    # and make predictions for all molecules
    ml_predictor = MLPredictor(prediction_parameters)
    ml_predictor.make_prediction()
    # send the prediction result to OSDR:
    # write the prediction to csv
    prediction_csv_path = ml_predictor.write_prediction_to_csv()
    fetch_token(oauth)
    # prepare multipart object
    multipart_csv = get_multipart_object(body, prediction_csv_path, 'text/csv')
    # POST data to blob storage
    response_csv = post_data_to_blob(oauth, multipart_csv)
    # get the prediction blob id and publish a message
    # in the properties predicted queue
    prediction_created_event = property_predicted(
        body, os.path.basename(prediction_csv_path), response_csv.json()[0])
    properties_predicted_publisher = PurePublisher(PROPERTIES_PREDICTED)
    properties_predicted_publisher.publish(prediction_created_event)
    # remove the prediction file from the temporary folder
    os.remove(prediction_csv_path)
    # remove the temporary models folder
    shutil.rmtree(prediction_parameters['ModelsFolder'], ignore_errors=True)
    # clear memory
    del MODELS_IN_MEMORY_CACHE[model_info['ModelBlobId']]

    return None
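# A minimal sketch of the fields callback assembles in
# prediction_parameters before handing them to MLPredictor. Keys are taken
# from the code above; values are placeholders, and
# prepare_prediction_parameters adds further model-specific entries
# (including 'ModelsFolder').
_EXAMPLE_PREDICTION_PARAMETERS = {
    'Dataset': b'<sdf file bytes>',
    'DatasetFileName': 'dataset.sdf',
    'Molecules': ['<molecule>', '<molecule>'],
}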
def train_model(body):
    """
    Pika callback function used by the ml modeller.
    Makes plot files, metrics files and a model file on success.

    :param body: RabbitMQ MT message's body
    """
    # prepare variables needed for the calculations
    body['CurrentModelId'] = body['CorrelationId']
    oauth = get_oauth()
    if not body['Method']:
        raise ValueError('Empty method in model trainer')
    body['Method'] = body['Method'].lower()
    method_code = body['Method']
    body['Name'] = '_'.join(algorithm_name_by_code(method_code).split())
    # publish training started event to OSDR
    fetch_token(oauth)
    publish_start_training_event(body)
    # validate input data; raises an error on an invalid parameter
    validate_user_input(body)
    # make dataframe using input parameters
    model_type = model_type_by_code(method_code)
    try:
        dataframe = make_dataframe(model_type, body, oauth)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Make dataframe exception')
    # make the model trainer, which calculates the scaler model
    model_trainer = make_model_trainer(method_code, body, dataframe)
    LOGGER.info('Sending scaler file to {}/{}'.format(BLOB_URL, CLIENT_ID))
    fetch_token(oauth)
    # make applicability domain files and calculate values
    LOGGER.info('Calculate applicability domain')
    model_trainer.make_applicability_domain()
    # count of generic files for the current training model,
    # excluding the model (*.sav) files
    body['NumberOfGenericFiles'] = 0
    # train the chosen model
    LOGGER.info('Start model training')
    try:
        model_training_data = conduct_training(model_trainer, method_code,
                                               body, oauth)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Model training exception')
    # POST the model's files (images, csvs etc)
    try:
        # POST classifier model files
        if model_type == CLASSIFIER:
            post_classifier(model_trainer, body, oauth,
                            model_training_data['path_to_csv_file'])
        # POST regressor model files
        elif model_type == REGRESSOR:
            post_regressor(model_trainer, body, oauth)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Post generic data exception')
    # update template tags with all data needed for the model's report
    model_trainer.make_report_text()
    # make pdf report for the trained model
    pdf_path = make_pdf_report(model_trainer.sub_folder,
                               model_trainer.template_tags,
                               model_name=body['Name'])
    LOGGER.info('Sending pdf report to {}/{}'.format(BLOB_URL, CLIENT_ID))
    # POST pdf report to blob storage
    fetch_token(oauth)
    model_report_info = {
        'FileInfo': json.dumps({
            'modelName': model_trainer.cv_model.model_name,
            'fileType': MODEL_PDF_REPORT
        }),
        'ParentId': body['CurrentModelId']
    }
    multipart_pdf = get_multipart_object(body, pdf_path, 'application/pdf',
                                         additional_fields=model_report_info)
    post_data_to_blob(oauth, multipart_pdf)
    body['NumberOfGenericFiles'] += 1
    # remove temporary directory
    shutil.rmtree(model_trainer.sub_folder, ignore_errors=True)
    # send model trained message to OSDR via rabbitmq
    model_trained = model_trained_message(body, model_training_data,
                                          model_trainer)
    model_trained_message_publisher = PurePublisher(MODEL_TRAINED)
    model_trained_message_publisher.publish(model_trained)
    LOGGER.info('Finished Calculations!')

    return None
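# A minimal sketch of the RabbitMQ message body train_model expects. Only
# the keys read directly above are shown with placeholder values; real
# messages carry more fields, consumed by validate_user_input,
# make_dataframe and the POST helpers.
_EXAMPLE_TRAIN_MODEL_BODY = {
    'CorrelationId': '<guid>',  # reused as CurrentModelId
    'Method': 'NaiveBayes',     # algorithm code, lower-cased by train_model
}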
def find_optimal_parameters(body):
    """
    Pika callback function used by the ml optimizer.
    Finds the optimal training fingerprints set for the input dataset,
    using only 1000 (by default) or fewer structures from it.
    Sends the overall optimizing result to Redis, to be used in the
    ml training report.

    :param body: RabbitMQ MT message's body
    :type body: dict
    """
    oauth = get_oauth()
    # check input methods
    if not body['Methods']:
        raise ValueError('Empty Methods')
    # calculate metrics for each fingerprints set
    metrics, target_metric = fingerprints_grid_search(
        oauth, body, BASE_FINGERPRINTS)
    # send all metrics to redis; used later in the training report
    REDIS_CLIENT.setex(
        'optimizer_metrics_{}'.format(body['CorrelationId']),
        EXPIRATION_TIME, json.dumps(metrics)
    )
    # find the best fingerprints set
    optimal_fingerprints = sorted(
        metrics.values(),
        key=lambda value: value['metrics'][target_metric],
        reverse=True
    )[0]['fptype']
    # set the other default 'optimal' parameters for model training
    body['SubSampleSize'] = 1.0
    body['TestDataSize'] = 0.3
    body['Scaler'] = 'MinMax'
    body['KFold'] = 5
    body['Fingerprints'] = optimal_fingerprints
    body['OptimizationMethod'] = 'default'
    body['NumberOfIterations'] = 100
    # make the optimizer metrics csv and POST it to blob storage
    formatted_metrics = TMP_TMP(
        metrics, model_type_by_code(body['Methods'][0].lower()))
    csv_path = '{}/ml_optimizer/{}/optimizing.csv'.format(
        TEMP_FOLDER, body['CorrelationId'])
    write_optimized_metrics_to_csv(formatted_metrics, csv_path)
    multipart_model = get_multipart_object(
        body, csv_path, 'application/x-spss-sav',
        additional_fields={'ParentId': body['TargetFolderId']}
    )
    # send the optimizer metrics csv file to blob storage
    fetch_token(oauth)
    response = post_data_to_blob(oauth, multipart_model)
    LOGGER.info('Optimizer csv status code: {}'.format(response.status_code))
    # send the best fingerprints set and 'optimal' parameters to model training
    training_optimized = model_training_optimized(body)
    training_optimized_message_publisher = PurePublisher(TRAINING_OPTIMIZED)
    training_optimized_message_publisher.publish(training_optimized)
    # clear the current optimization folder
    shutil.rmtree(
        '{}/ml_optimizer/{}'.format(TEMP_FOLDER, body['CorrelationId']),
        ignore_errors=True
    )
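# The best-fingerprints selection above, factored into a stand-alone
# helper for clarity (a sketch; find_optimal_parameters keeps its inline
# sorted() version). metrics maps a fingerprint set id to a dict with
# 'metrics' and 'fptype' entries, as produced by fingerprints_grid_search.
def _select_optimal_fingerprints(metrics, target_metric):
    best = max(
        metrics.values(),
        key=lambda value: value['metrics'][target_metric])
    return best['fptype']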