def test_svm_classifier_manual_test_set(self):
    """
    Test for classic classification (SVM classifier) manual test set
    """

    classname = 'Soluble'
    dataframe = sdf_to_csv(self.sdf_file_path, self.fingerprints,
                           class_name_list=classname)
    manual_test_dataframe = sdf_to_csv(self.manual_test_file_path,
                                       self.fingerprints,
                                       class_name_list=classname)
    classic_classifier = ALGORITHM[TRAINER_CLASS][
        SUPPORT_VECTOR_MACHINE_CLASSIFIER](
            self.sdf_file_path, classname, dataframe, subsample_size=1.0,
            test_set_size=self.test_set_size, seed=0,
            fptype=self.fingerprints, scale='standard',
            output_path=self.temporary_folder, n_split=self.n_split,
            manual_test_set=manual_test_dataframe)
    classic_classifier.train_model(
        CODES[SUPPORT_VECTOR_MACHINE_CLASSIFIER])

    metrics = classic_classifier.metrics[
        SUPPORT_VECTOR_MACHINE_CLASSIFIER]['mean']
    true_metrics = {
        ('train', 'AUC'): 0.99,
        ('train', 'ACC'): 0.99,
        ('train', 'f1-score'): 0.99,
        ('train', 'Cohen_Kappa'): 0.95,
        ('train', 'Matthews_corr'): 0.96,
        ('train', 'Precision'): 0.99,
        ('train', 'Recall'): 0.99,
        ('test', 'AUC'): 0.95,
        ('test', 'ACC'): 0.93,
        ('test', 'f1-score'): 0.96,
        ('test', 'Cohen_Kappa'): 0.64,
        ('test', 'Matthews_corr'): 0.66,
        ('test', 'Precision'): 0.93,
        ('test', 'Recall'): 0.98,
        ('validation', 'AUC'): 0.94,
        ('validation', 'ACC'): 0.93,
        ('validation', 'f1-score'): 0.96,
        ('validation', 'Cohen_Kappa'): 0.59,
        ('validation', 'Matthews_corr'): 0.63,
        ('validation', 'Precision'): 0.93,
        ('validation', 'Recall'): 0.99
    }

    self.assertDictAlmostEqual(metrics, true_metrics, delta=0.1)
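# The metric tests in this module rely on an assertDictAlmostEqual helper that
# is defined on the shared test base class (not shown here). A minimal sketch
# of its assumed behaviour (same keys, each value compared within delta); the
# actual helper may differ:
def assertDictAlmostEqual(self, actual, expected, delta=0.01):
    """Assert that two metric dicts share keys and have close values."""
    self.assertEqual(set(actual.keys()), set(expected.keys()))
    for key, expected_value in expected.items():
        self.assertAlmostEqual(actual[key], expected_value, delta=delta,
                               msg='metric {} differs'.format(key))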
def test_classifier_applicability_domain(self):
    """
    Test for classifier applicability domain (naive Bayes)
    """

    valuename = 'Soluble'
    dataframe = sdf_to_csv(self.sdf_file_path, self.fingerprints,
                           class_name_list=valuename)
    classic_classifier = ALGORITHM[TRAINER_CLASS][NAIVE_BAYES](
        self.sdf_file_path, valuename, dataframe, subsample_size=1.0,
        test_set_size=self.test_set_size, seed=0,
        fptype=self.fingerprints, scale='minmax',
        output_path=self.temporary_folder, n_split=self.n_split)
    classic_classifier.make_applicability_domain()

    self.assertAlmostEqual(
        classic_classifier.distance_mean, 5.5480266, delta=0.01)
    self.assertAlmostEqual(
        classic_classifier.distance_std, 2.28519, delta=0.01)
    self.assertAlmostEqual(
        classic_classifier.density_mean, 1916.112310059458, delta=0.01)
    self.assertAlmostEqual(
        classic_classifier.density_std, 0.08123426215633249, delta=0.01)
    self.assertAlmostEqual(classic_classifier.modi, 0.79, delta=0.01)
    self.assertEqual(classic_classifier.train_shape, 1295)
def test_elastic_net_metrics(self):
    """
    Test for classic regression metrics (elastic net)
    """

    valuename = 'logS'
    dataframe = sdf_to_csv(self.sdf_file_path, self.fingerprints,
                           value_name_list=valuename)
    classic_regressor = ALGORITHM[TRAINER_CLASS][ELASTIC_NETWORK](
        self.sdf_file_path, valuename, dataframe, scale='minmax', seed=0,
        test_set_size=self.test_set_size, fptype=self.fingerprints,
        output_path=self.temporary_folder, n_split=self.n_split,
        subsample_size=1.0)
    classic_regressor.train_model(CODES[ELASTIC_NETWORK])

    metrics = classic_regressor.metrics[ELASTIC_NETWORK]['mean']
    true_metrics = {
        ('train', 'RMSE'): 0.51,
        ('train', 'MAE'): 0.39,
        ('train', 'R2'): 0.93,
        ('test', 'MAE'): 0.55,
        ('test', 'R2'): 0.87,
        ('test', 'RMSE'): 0.74,
        ('validation', 'R2'): 0.86,
        ('validation', 'RMSE'): 0.75,
        ('validation', 'MAE'): 0.58
    }

    self.assertDictAlmostEqual(metrics, true_metrics, delta=0.1)
def make_dataframe(self, dataframe):
    """
    Method which makes an ndarray dataframe from the molecules dataset
    :return: ndarray dataframe with dataset molecules
    """

    # make initial dataframe
    if dataframe is None:
        dataframe = sdf_to_csv(self.dataset_file_name, self.fptype,
                               molecules=self.molecules)

    # drop rows containing NaN values together with their molecules
    rows_to_delete = numpy.where(
        numpy.isnan(dataframe['value']).any(axis=1))
    dataframe = numpy.delete(dataframe, rows_to_delete, axis=0)
    for index in sorted(rows_to_delete[0], reverse=True):
        LOGGER.info(index)
        del self.molecules[index]

    # apply scaler to dataframe if it was used on training
    if self.scaler:
        dataframe = self.scaler.transform(dataframe['value'])
    else:
        dataframe = dataframe['value']

    return dataframe
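# A small self-contained illustration (not project code) of the NaN-row removal
# pattern used in make_dataframe above, on a toy structured array with a
# 'value' field; the shapes and values here are assumptions for demonstration:
import numpy

toy = numpy.zeros(3, dtype=[('value', 'f8', (2,))])
toy['value'] = [[0.1, 0.2], [numpy.nan, 0.4], [0.5, 0.6]]
rows_to_delete = numpy.where(numpy.isnan(toy['value']).any(axis=1))
toy = numpy.delete(toy, rows_to_delete, axis=0)
print(toy['value'])  # the record containing NaN has been dropped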
def test_sdf_processor(self):
    """
    Test for sdf processor (dataframe built from mol strings)
    """

    molecules = molecules_from_mol_strings([self.molstring])
    dataframe = sdf_to_csv('', self.fingerprints, find_classes=True,
                           find_values=True, molecules=molecules)
def test_random_forest_regressor_test_size_zero(self):
    """
    Test for classic regression (random forest regressor) test size zero
    """

    valuename = 'logS'
    dataframe = sdf_to_csv(self.sdf_file_path, self.fingerprints,
                           value_name_list=valuename)
    classic_regressor = ALGORITHM[TRAINER_CLASS][RANDOM_FOREST_REGRESSOR](
        self.sdf_file_path, valuename, dataframe, scale='minmax', seed=0,
        test_set_size=self.test_set_size, fptype=self.fingerprints,
        output_path=self.temporary_folder, n_split=self.n_split,
        subsample_size=1.0)
    classic_regressor.train_model(CODES[RANDOM_FOREST_REGRESSOR])

    metrics = classic_regressor.metrics[RANDOM_FOREST_REGRESSOR]['mean']
    true_metrics = {
        ('train', 'RMSE'): 0.42,
        ('train', 'MAE'): 0.32,
        ('train', 'R2'): 0.95,
        ('validation', 'R2'): 0.90,
        ('validation', 'RMSE'): 0.63,
        ('validation', 'MAE'): 0.48
    }

    self.assertDictAlmostEqual(metrics, true_metrics, delta=0.1)
def test_regressor_applicability_domain(self):
    """
    Test for regressor applicability domain (elastic net)
    """

    valuename = 'logS'
    dataframe = sdf_to_csv(self.sdf_file_path, self.fingerprints,
                           value_name_list=valuename)
    classic_regressor = ALGORITHM[TRAINER_CLASS][ELASTIC_NETWORK](
        self.sdf_file_path, valuename, dataframe, scale='minmax', seed=0,
        test_set_size=self.test_set_size, fptype=self.fingerprints,
        output_path=self.temporary_folder, n_split=self.n_split,
        subsample_size=1.0)
    classic_regressor.make_applicability_domain()

    self.assertAlmostEqual(
        classic_regressor.distance_mean, 5.5480266, delta=0.01)
    self.assertAlmostEqual(
        classic_regressor.distance_std, 2.28519, delta=0.01)
    self.assertAlmostEqual(
        classic_regressor.density_mean, 1916.112310059458, delta=0.01)
    self.assertAlmostEqual(
        classic_regressor.density_std, 0.08123426215633249, delta=0.01)
    self.assertAlmostEqual(classic_regressor.modi, 0.75, delta=0.01)
    self.assertEqual(classic_regressor.train_shape, 1295)
def test_ada_boost_skopt_hyperparameters(self):
    """
    Test for classic classification skopt hyperparameters (Ada boost)
    """

    classname = 'Soluble'
    dataframe = sdf_to_csv(self.sdf_file_path, self.fingerprints,
                           class_name_list=classname)
    ada_boost = ALGORITHM[TRAINER_CLASS][ADA_BOOST_DECISION_TREE](
        self.sdf_file_path, classname, dataframe, subsample_size=1.0,
        test_set_size=self.test_set_size, seed=0,
        fptype=self.fingerprints, scale='minmax',
        output_path=self.temporary_folder, n_split=self.n_split,
        opt_method='gauss')
    ada_boost.model_name = ADA_BOOST_DECISION_TREE
    parameters = ada_boost.make_training_parameters_grid()

    self.assertEqual(ada_boost_classifier_hyperparameters_skopt, parameters)
def make_dataframe(model_type, training_parameters, oauth):
    """
    Method for making a dataframe using training parameters values.
    Downloads the sdf file from blob storage and converts it to a dataframe

    :param model_type: type of training model, classifier or regressor
    :param training_parameters: model training parameters
    :param oauth: OAuth2Session object used in the ml service
    :type model_type: str
    :type training_parameters: dict
    :return: prepared dataframe with needed training target column
    """

    # download sdf file from blob storage
    # make stream from downloaded object
    stream = make_stream_from_sdf(training_parameters, oauth)
    # define used local variables
    filename = training_parameters['SourceFileName']
    classname = training_parameters['ClassName']
    fptype = training_parameters['Fingerprints']

    # create dataframe using training parameters and stream
    LOGGER.info('Creating Fingerprints for molecules...')
    if model_type == CLASSIFIER:
        dataframe = sdf_to_csv(filename, fptype, class_name_list=classname,
                               stream=stream)
    elif model_type == REGRESSOR:
        dataframe = sdf_to_csv(filename, fptype, value_name_list=classname,
                               stream=stream)
    else:
        # raise error if using unknown model type
        # you should use defined (known) model types global variables only
        LOGGER.error('Unknown model type: {}'.format(model_type))
        raise TypeError('Unknown model type: {}'.format(model_type))
    LOGGER.info('Fingerprints created.')

    return dataframe
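# A minimal sketch of calling make_dataframe above; the parameter values are
# illustrative assumptions, only the dict keys ('SourceFileName', 'ClassName',
# 'Fingerprints') and the CLASSIFIER/REGRESSOR model types come from the
# function itself. `oauth` is assumed to be the OAuth2Session already created
# elsewhere in the service:
training_parameters = {
    'SourceFileName': 'solubility.sdf',  # assumed example file name
    'ClassName': 'Soluble',              # training target column
    'Fingerprints': [{'Type': 'DESC'}, {'Type': 'MACCS'}],
}
dataframe = make_dataframe(CLASSIFIER, training_parameters, oauth)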
def test_svm_regressor_manual_test_set(self):
    """
    Test for classic regression (SVM regressor) manual test set
    """

    valuename = 'logS'
    dataframe = sdf_to_csv(self.sdf_file_path, self.fingerprints,
                           value_name_list=valuename)
    manual_test_dataframe = sdf_to_csv(self.manual_test_file_path,
                                       self.fingerprints,
                                       value_name_list=valuename)
    classic_regressor = ALGORITHM[TRAINER_CLASS][
        SUPPORT_VECTOR_MACHINE_REGRESSOR](
            self.sdf_file_path, valuename, dataframe, seed=0,
            test_set_size=self.test_set_size, fptype=self.fingerprints,
            output_path=self.temporary_folder, n_split=self.n_split,
            manual_test_set=manual_test_dataframe, subsample_size=1.0)
    classic_regressor.train_model(CODES[SUPPORT_VECTOR_MACHINE_REGRESSOR])

    metrics = classic_regressor.metrics[SUPPORT_VECTOR_MACHINE_REGRESSOR][
        'mean']
    true_metrics = {
        ('train', 'RMSE'): 0.54,
        ('train', 'MAE'): 0.38,
        ('train', 'R2'): 0.93,
        ('test', 'MAE'): 0.64,
        ('test', 'R2'): 0.78,
        ('test', 'RMSE'): 0.88,
        ('validation', 'R2'): 0.75,
        ('validation', 'RMSE'): 1.0,
        ('validation', 'MAE'): 0.68
    }

    self.assertDictAlmostEqual(metrics, true_metrics, delta=0.1)
def test_dnn_classification_metrics(self):
    """
    Test for DNN classification metrics
    """

    valuename = 'Soluble'
    dataframe = sdf_to_csv(self.sdf_file_path, self.fingerprints,
                           class_name_list=valuename)
    dnn_classifier = ALGORITHM[TRAINER_CLASS][DNN_CLASSIFIER](
        self.sdf_file_path, valuename, dataframe, subsample_size=1.0,
        test_set_size=self.test_set_size, seed=0,
        fptype=self.fingerprints, scale='minmax',
        output_path=self.temporary_folder, n_split=self.n_split)
    dnn_classifier.train_model(CODES[DNN_CLASSIFIER])

    metrics = dnn_classifier.metrics[DNN_CLASSIFIER]['mean']
    true_metrics = {
        ('train', 'AUC'): 0.99,
        ('train', 'ACC'): 0.97,
        ('train', 'f1-score'): 0.98,
        ('train', 'Cohen_Kappa'): 0.84,
        ('train', 'Matthews_corr'): 0.85,
        ('train', 'Precision'): 0.96,
        ('train', 'Recall'): 0.99,
        ('test', 'AUC'): 0.96,
        ('test', 'ACC'): 0.94,
        ('test', 'f1-score'): 0.96,
        ('test', 'Cohen_Kappa'): 0.67,
        ('test', 'Matthews_corr'): 0.69,
        ('test', 'Precision'): 0.94,
        ('test', 'Recall'): 0.99,
        ('validation', 'AUC'): 0.89,
        ('validation', 'ACC'): 0.92,
        ('validation', 'f1-score'): 0.95,
        ('validation', 'Cohen_Kappa'): 0.55,
        ('validation', 'Matthews_corr'): 0.57,
        ('validation', 'Precision'): 0.93,
        ('validation', 'Recall'): 0.98
    }

    self.assertDictAlmostEqual(metrics, true_metrics, delta=0.3)
def test_naive_bayes_metrics(self):
    """
    Test for classic classification metrics (naive Bayes)
    """

    valuename = 'Soluble'
    dataframe = sdf_to_csv(self.sdf_file_path, self.fingerprints,
                           class_name_list=valuename)
    classic_classifier = ALGORITHM[TRAINER_CLASS][NAIVE_BAYES](
        self.sdf_file_path, valuename, dataframe, subsample_size=1.0,
        test_set_size=self.test_set_size, seed=0,
        fptype=self.fingerprints, scale='robust',
        output_path=self.temporary_folder, n_split=self.n_split)
    classic_classifier.train_model(CODES[NAIVE_BAYES])

    metrics = classic_classifier.metrics[NAIVE_BAYES]['mean']
    true_metrics = {
        ('train', 'AUC'): 0.85,
        ('train', 'ACC'): 0.81,
        ('train', 'f1-score'): 0.88,
        ('train', 'Cohen_Kappa'): 0.41,
        ('train', 'Matthews_corr'): 0.47,
        ('train', 'Precision'): 0.97,
        ('train', 'Recall'): 0.8,
        ('test', 'AUC'): 0.89,
        ('test', 'ACC'): 0.81,
        ('test', 'f1-score'): 0.88,
        ('test', 'Cohen_Kappa'): 0.46,
        ('test', 'Matthews_corr'): 0.53,
        ('test', 'Precision'): 0.98,
        ('test', 'Recall'): 0.79,
        ('validation', 'AUC'): 0.83,
        ('validation', 'ACC'): 0.81,
        ('validation', 'f1-score'): 0.88,
        ('validation', 'Cohen_Kappa'): 0.40,
        ('validation', 'Matthews_corr'): 0.45,
        ('validation', 'Precision'): 0.97,
        ('validation', 'Recall'): 0.81
    }

    self.assertDictAlmostEqual(metrics, true_metrics, delta=0.1)
def test_logistic_regression_test_size_zero(self):
    """
    Test for classic classification (logistic regression) test size zero
    """

    classname = 'Soluble'
    dataframe = sdf_to_csv(self.sdf_file_path, self.fingerprints,
                           class_name_list=classname)
    classic_classifier = ALGORITHM[TRAINER_CLASS][LOGISTIC_REGRESSION](
        self.sdf_file_path, classname, dataframe, subsample_size=1.0,
        test_set_size=self.test_set_size, seed=0,
        fptype=self.fingerprints, scale='minmax',
        output_path=self.temporary_folder, n_split=self.n_split)
    classic_classifier.train_model(CODES[LOGISTIC_REGRESSION])

    metrics = classic_classifier.metrics[LOGISTIC_REGRESSION]['mean']
    true_metrics = {
        ('train', 'AUC'): 0.99,
        ('train', 'ACC'): 0.97,
        ('train', 'f1-score'): 0.98,
        ('train', 'Cohen_Kappa'): 0.84,
        ('train', 'Matthews_corr'): 0.84,
        ('train', 'Precision'): 0.97,
        ('train', 'Recall'): 0.99,
        ('validation', 'AUC'): 0.97,
        ('validation', 'ACC'): 0.95,
        ('validation', 'f1-score'): 0.97,
        ('validation', 'Cohen_Kappa'): 0.70,
        ('validation', 'Matthews_corr'): 0.73,
        ('validation', 'Precision'): 0.94,
        ('validation', 'Recall'): 0.99
    }

    self.assertDictAlmostEqual(metrics, true_metrics, delta=0.1)
def test_dnn_regression_metrics(self):
    """
    Test for DNN regression metrics
    """

    valuename = 'logS'
    dataframe = sdf_to_csv(self.sdf_file_path, self.fingerprints,
                           value_name_list=valuename)
    dnn_regressor = ALGORITHM[TRAINER_CLASS][DNN_REGRESSOR](
        self.sdf_file_path, valuename, dataframe, scale='minmax', seed=0,
        test_set_size=self.test_set_size, fptype=self.fingerprints,
        output_path=self.temporary_folder, n_split=self.n_split,
        subsample_size=1.0)
    dnn_regressor.train_model(CODES[DNN_REGRESSOR])

    metrics = dnn_regressor.metrics[DNN_REGRESSOR]['mean']
    true_metrics = {
        ('train', 'RMSE'): 0.60,
        ('train', 'MAE'): 0.46,
        ('train', 'R2'): 0.91,
        ('test', 'MAE'): 0.62,
        ('test', 'R2'): 0.84,
        ('test', 'RMSE'): 0.82,
        ('validation', 'R2'): 0.81,
        ('validation', 'RMSE'): 0.87,
        ('validation', 'MAE'): 0.67
    }

    self.assertDictAlmostEqual(metrics, true_metrics, delta=0.3)
def test_elastic_network_skopt_hyperparameters(self):
    """
    Test for classic regression skopt hyperparameters (elastic network)
    """

    valuename = 'logS'
    dataframe = sdf_to_csv(self.sdf_file_path, self.fingerprints,
                           value_name_list=valuename)
    elastic_network_regressor = ALGORITHM[TRAINER_CLASS][ELASTIC_NETWORK](
        self.sdf_file_path, valuename, dataframe, subsample_size=1.0,
        test_set_size=self.test_set_size, seed=0,
        fptype=self.fingerprints, scale='minmax',
        output_path=self.temporary_folder, n_split=self.n_split,
        opt_method='forest')
    elastic_network_regressor.model_name = ELASTIC_NETWORK
    parameters = elastic_network_regressor.make_training_parameters_grid()

    self.assertEqual(elastic_network_hyperparameters_skopt, parameters)
layers = [64, 64]
input_drop_out = 0.0
drop_out = 0.0
n_split = 10
optimizer = 'Nadam'
activation = 'selu'
l_rate = 0.01
beta = 0.0001
k_constraint = 4
mc_train_cut_off = 0.65
output_path = 'C:\\PycharmProjects\\ml.services\\Source\\callers and models'

dataframe = sdf_to_csv(filepath, fptype, value_name_list=valuename,
                       cut_off=0.1)
# dataframe = pd.read_csv(filename)
# x = dataframe.values  # returns a numpy array
# min_max_scaler = preprocessing.MinMaxScaler()
# x_scaled = min_max_scaler.fit_transform(x)
# headers = [x for x in range(797)]
# headers.append('Tox')
# dataframe = pd.DataFrame(x_scaled, columns=headers)
# print(dataframe)

classifier = DNNClassifier(ntpath.basename(filepath), classname, dataframe,
                           test_set_size=test_set_size,
def process_sdf(body, sdf_as_bytes_array):
    """
    Process an sdf byte array: build a fingerprints dataframe, prepend SMILES
    and InChiKey columns, append processing errors and write the result to a
    csv file. Returns the path to the created csv file
    """

    # normalize fingerprints keys to capitalized form
    fingerprints = list()
    for fingerprint in body['Fingerprints']:
        new_fingerprint = dict()
        for key, value in fingerprint.items():
            new_fingerprint[key.capitalize()] = value
        fingerprints.append(new_fingerprint)
    body['Fingerprints'] = fingerprints

    molecules = get_molecules_from_sdf_bytes(sdf_as_bytes_array)
    errors_list = [[]]
    try:
        data_frame = sdf_to_csv('', body['Fingerprints'], find_classes=True,
                                find_values=True, molecules=molecules,
                                processing_errors=errors_list)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        error_message = 'Can\'t make dataframe using this sdf file'
        raise Exception(error_message)

    # collect SMILES and InChiKey values for every molecule
    smiles_list = list()
    inchies = list()
    for molecule_number, molecule in enumerate(molecules):
        smiles = Chem.MolToSmiles(molecule, isomericSmiles=True)
        inchi = get_inchi_key(molecule)
        smiles_list.append(('SMILES', molecule_number, smiles))
        inchies.append(('InChiKey', molecule_number, inchi))

    smiles_numpy_array = numpy.array(smiles_list,
                                     dtype=[('name', 'U17'),
                                            ('molecule_number', 'i4'),
                                            ('value', 'U40')])
    inchi_numpy_array = numpy.array(inchies,
                                    dtype=[('name', 'U17'),
                                           ('molecule_number', 'i4'),
                                           ('value', 'U40')])
    errors_numpy_array = numpy.array(errors_list[0],
                                     dtype=[('name', 'U17'),
                                            ('molecule_number', 'i4'),
                                            ('value', 'U40')])
    data_frame = data_frame.astype([('name', 'U10'),
                                    ('molecule_number', 'i4'),
                                    ('value', 'U40')])
    data_frame = numpy.insert(data_frame, 0, inchi_numpy_array, axis=1)
    data_frame = numpy.insert(data_frame, 0, smiles_numpy_array, axis=1)
    data_frame = numpy.insert(data_frame, data_frame.shape[1],
                              errors_numpy_array, axis=1)

    csv_file_path = '{}/{}.csv'.format(TEMP_FOLDER, body['CorrelationId'])
    try:
        numpy_to_csv(data_frame, csv_file_path)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        error_message = 'Can\'t convert dataframe to csv file'
        raise Exception(error_message)

    body['Structures'] = data_frame.shape[0]
    body['Columns'] = data_frame.shape[1]
    body['Failed'] = 0

    return csv_file_path
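# A hedged usage sketch for process_sdf above; the file name and correlation id
# are illustrative assumptions, the dict keys mirror what the function reads,
# and TEMP_FOLDER is assumed to exist as in the rest of the service:
with open('solubility.sdf', 'rb') as sdf_file:  # assumed example input file
    sdf_as_bytes_array = sdf_file.read()
body = {
    'Fingerprints': [{'type': 'DESC'}, {'type': 'MACCS'}],  # keys get capitalized inside
    'CorrelationId': 'example-correlation-id',
}
csv_file_path = process_sdf(body, sdf_as_bytes_array)
print(csv_file_path, body['Structures'], body['Columns'])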
        shuffle=False, verbose=0)
    plot_train_history(model_history_tmp, 'compressor_0_1', '')
    # load the best model based on validation results for this fold
    autoencoder = load_model('checkpoint.h5')
    latent_to_map = Model(input, neck_out)
    latent_to_map.save('smi2lat.h5')

    return latent_to_map, var_thresh, scaler


fptype = [{'Type': 'DESC'}, {'Type': 'SEQ'}]
dataframe = sdf_to_csv(path_to_sdf, fptype=fptype, class_name_list=classname)
dataframe = dataframe.apply(pd.to_numeric, errors='coerce').replace(
    [np.inf, -np.inf], np.nan).dropna(axis=0, how='any').reset_index(drop=True)
x_features = dataframe.iloc[:, :-1]
print(x_features)

lat2coord, var_thresh, scaler = get_mapper(x_features)
data_list = []
for id, row in dataframe.iterrows():
    lis = []
    lis.extend(
        list(
            lat2coord.predict(
                scaler.transform(
                    var_thresh.transform(np.reshape(row[:-1].values,
def test_neighbors_regressor_report(self):
    """
    Test for classic regression report files (k neighbors regressor)
    """

    valuename = 'logS'
    dataframe = sdf_to_csv(self.sdf_file_path, self.fingerprints,
                           value_name_list=valuename)
    classic_regressor = ALGORITHM[TRAINER_CLASS][NEIGHBORS_REGRESSOR](
        self.sdf_file_path, valuename, dataframe, scale='minmax', seed=0,
        test_set_size=self.test_set_size, fptype=self.fingerprints,
        output_path=self.temporary_folder, n_split=self.n_split,
        subsample_size=0.2)
    classic_regressor.train_model(CODES[NEIGHBORS_REGRESSOR])
    classic_regressor.make_applicability_domain()

    plots = classic_regressor.cv_model.make_plots()
    path_to_csv = classic_regressor.make_perfomance_csv().split('/')[-1]
    path_to_qmrf_report = classic_regressor.make_qmrf_report(
        None, None).split('/')[-1]
    path_to_archive = classic_regressor.cv_model.compress_models()
    path_to_archive = classic_regressor.compress_additional_files(
        path_to_archive).split('/')[-1]
    path_to_distribution_plot = distribution_plot(
        classic_regressor, model_name='Nearest Neighbors').split('/')[-1]
    prepare_plots(plots)

    true_plots = {
        '1': {
            'regression_results_test':
                'K Neighbors Regressor_logS_test_fold_1_regression_plot.png',
            'regression_results_train':
                'K Neighbors Regressor_logS_train_fold_1_regression_plot.png',
            'regression_results_valid':
                'K Neighbors Regressor_logS_validation_fold_1_regression_plot.png',
            'thumbnail_plot_path': None
        },
        '2': {
            'regression_results_test':
                'K Neighbors Regressor_logS_test_fold_2_regression_plot.png',
            'regression_results_train':
                'K Neighbors Regressor_logS_train_fold_2_regression_plot.png',
            'regression_results_valid':
                'K Neighbors Regressor_logS_validation_fold_2_regression_plot.png',
            'thumbnail_plot_path': None
        },
        'mean': {
            'regression_results_test':
                'K Neighbors Regressor_logS_test_fold_mean_regression_plot.png',
            'regression_results_train':
                'K Neighbors Regressor_logS_train_fold_mean_regression_plot.png',
            'regression_results_valid':
                'K Neighbors Regressor_logS_validation_fold_mean_regression_plot.png',
            'thumbnail_plot_path':
                'K Neighbors Regressor_logS_thumbnail_image.jpg'
        }
    }

    self.assertDictEqual(plots, true_plots)
    self.assertEqual(path_to_distribution_plot,
                     'Nearest Neighbors_train_test_distribution.png')
    self.assertEqual(path_to_csv,
                     'K Neighbors Regressor_DNN_data_solubility.csv')
    self.assertEqual(path_to_qmrf_report,
                     'K_Neighbors_Regressor_QMRF_report.pdf')
    self.assertEqual(path_to_archive, 'K_Neighbors_Regressor.zip')
def test_random_forest_classifier_report(self):
    """
    Test for classic classification report files (random forest classifier)
    """

    valuename = 'Soluble'
    dataframe = sdf_to_csv(self.sdf_file_path, self.fingerprints,
                           class_name_list=valuename)
    classic_classifier = ALGORITHM[TRAINER_CLASS][RANDOM_FOREST_CLASSIFIER](
        self.sdf_file_path, valuename, dataframe, subsample_size=0.2,
        test_set_size=self.test_set_size, seed=0,
        fptype=self.fingerprints, scale='robust',
        output_path=self.temporary_folder, n_split=self.n_split)
    classic_classifier.train_model(CODES[RANDOM_FOREST_CLASSIFIER])
    classic_classifier.make_applicability_domain()

    plots = classic_classifier.cv_model.make_plots()
    path_to_csv = classic_classifier.make_perfomance_csv()
    path_to_qmrf_report = classic_classifier.make_qmrf_report(
        None, None).split('/')[-1]
    path_to_archive = classic_classifier.cv_model.compress_models()
    path_to_archive = classic_classifier.compress_additional_files(
        path_to_archive).split('/')[-1]
    path_to_radar_plot = radar_plot(
        path_to_csv, classic_classifier.sub_folder, classic_classifier.bins,
        titlename='Random Forest').split('/')[-1]
    path_to_csv = path_to_csv.split('/')[-1]
    prepare_plots(plots)

    true_plots = {
        '1': {
            'roc_plot_path': 'Random_Forest_Classifier_fold_1_ROC_plot.png',
            'cm_plot_path': 'Random_Forest_Classifier_fold_1_confusion.png',
            'thumbnail_plot_path': None
        },
        '2': {
            'roc_plot_path': 'Random_Forest_Classifier_fold_2_ROC_plot.png',
            'cm_plot_path': 'Random_Forest_Classifier_fold_2_confusion.png',
            'thumbnail_plot_path': None
        },
        'mean': {
            'roc_plot_path':
                'Random_Forest_Classifier_fold_mean_ROC_plot.png',
            'cm_plot_path':
                'Random_Forest_Classifier_fold_mean_confusion.png',
            'thumbnail_plot_path':
                'Random_Forest_Classifier_thumbnail_image.jpg'
        }
    }

    self.assertDictEqual(plots, true_plots)
    self.assertEqual(path_to_radar_plot, 'radar_plot.png')
    self.assertEqual(path_to_csv,
                     'Random Forest Classifier_DNN_data_solubility.csv')
    self.assertEqual(path_to_qmrf_report,
                     'Random_Forest_Classifier_QMRF_report.pdf')
    self.assertEqual(path_to_archive, 'Random_Forest_Classifier.zip')
layers = [64, 64]
input_drop_out = 0.1
drop_out = 0.0
n_split = 10
optimizer = 'Nadam'
activation = 'relu'
l_rate = 0.005
beta = 0.00001
k_constraint = 4
mc_train_cut_off = 0.65
output_path = 'C:\\PycharmProjects\\ml.services\\Source\\callers and models'

dataframe = sdf_to_csv(filepath, fptype, value_name_list=valuename)
dataframe_test = sdf_to_csv(filepath_test, fptype,
                            value_name_list=valuename)

regressor = ALGORITHM[TRAINER_CLASS][DNN_REGRESSOR](
    ntpath.basename(filepath), valuename, dataframe,
    test_set_size=test_set_size, fptype=fptype, n_split=n_split,
    output_path=output_path, scale='standard',
    manual_test_set=dataframe_test)
dnn = regressor.train_model(CODES[DNN_REGRESSOR])
dnn.make_plots()
def fingerprints_grid_search(oauth, body, fingerprints, subsample_size=1000):
    """
    Function for searching for the optimal combination of fingerprints.
    subsample_size molecules are extracted from the initial dataset and used
    to train multiple models with varying combinations of fingerprints.

    :param oauth: OAuth2Session object used in the ml service
    :param body: message body with the training parameters
    :param fingerprints: list of fingerprints' combinations
    :param subsample_size: number of objects that will be used to train model
    :return: dict with fingerprints' metrics and statistics
    """

    # make folder for current optimization
    optimizer_folder = '{}/ml_optimizer/{}'.format(
        TEMP_FOLDER, body['CorrelationId'])
    make_directory(optimizer_folder)

    # download and save sdf file
    stream = make_stream_from_sdf(body, oauth)
    filename = body['SourceFileName']
    temporary_sdf_filename = '{}/tmp_{}.sdf'.format(optimizer_folder,
                                                    filename)
    temporary_sdf_file = open(temporary_sdf_filename, 'wb')
    temporary_sdf_file.write(stream.getvalue())
    temporary_sdf_file.close()

    # extract sample (which has subsample_size) from source dataset
    prediction_target = body['ClassName']
    mode = model_type_by_code(body['Methods'][0].lower())
    sample_file_name = extract_sample_dataset(
        input_file_name=temporary_sdf_filename,
        subsample_size=subsample_size,
        prediction_target=prediction_target,
        mode=mode
    )

    # define classifier and regressor models for optimizing
    if mode == CLASSIFIER:
        model_code = NAIVE_BAYES
        target_metric = 'test__AUC'
    elif mode == REGRESSOR:
        model_code = ELASTIC_NETWORK
        target_metric = 'test__R2'
    else:
        raise ValueError('Unknown mode: {}'.format(mode))

    # loop all base fingerprints sets to find best set
    metrics = dict()
    for fingerprint_number, fptype in enumerate(fingerprints):
        # make dataframe depending on fingerprint set
        # and model type (classifier or regressor)
        start_fps_processing = time()
        if mode == CLASSIFIER:
            dataframe = sdf_to_csv(
                sample_file_name, fptype=fptype,
                class_name_list=prediction_target
            )
        elif mode == REGRESSOR:
            dataframe = sdf_to_csv(
                sample_file_name, fptype=fptype,
                value_name_list=prediction_target
            )
        else:
            raise ValueError('Unknown mode: {}'.format(mode))
        fps_processing_time_seconds = time() - start_fps_processing

        # train model
        start_current_training = time()
        classic_classifier = ALGORITHM[TRAINER_CLASS][model_code](
            sample_file_name, prediction_target, dataframe,
            subsample_size=1.0, test_set_size=0.2, seed=0, fptype=fptype,
            scale='minmax', n_split=1, output_path=optimizer_folder
        )
        classic_classifier.train_model(CODES[model_code])
        current_training_time_seconds = time() - start_current_training

        # add formatted model's metrics and times to heap
        formatted_metrics = format_metrics(
            classic_classifier.metrics[model_code]['mean'])
        metrics.update({
            fingerprint_number: {
                'fptype': fptype,
                'metrics': formatted_metrics,
                'fingerprint_processing_time': fps_processing_time_seconds,
                'prediction_time': current_training_time_seconds
            }
        })

    return metrics, target_metric
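# A hedged sketch of calling fingerprints_grid_search above; `oauth` and `body`
# are assumed to be the same OAuth2Session and message body used by the regular
# training flow, and the fingerprint combinations are illustrative:
fingerprint_combinations = [
    [{'Type': 'DESC'}],
    [{'Type': 'DESC'}, {'Type': 'MACCS'}],
    [{'Type': 'DESC'}, {'Type': 'AVALON', 'Size': 512}],
]
metrics, target_metric = fingerprints_grid_search(
    oauth, body, fingerprint_combinations, subsample_size=1000)
for number, result in metrics.items():
    print(number, result['fptype'],
          result['fingerprint_processing_time'],
          result['prediction_time'])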
import sklearn

print(sklearn.__version__)

suppl = Chem.SDMolSupplier(
    'C:\\PycharmProjects\\ml-data-qsar\\TEST\\LC50\\LC50_training.sdf')
molecules = [x for x in suppl if x is not None]
fptype = [{'Type': 'DESC'}, {'Type': 'MACCS'},
          {'Type': 'FCFC', 'Size': 512, 'Radius': 3},
          {'Type': 'AVALON', 'Size': 512}]
dataframe = sdf_to_csv('LC50_prediction', fptype=fptype, molecules=molecules)

folder_path = ('C:\\PycharmProjects\\ml-models\\UBC\\'
               'Half_LIfe_U_2018_03_18__14_24_16_DESC_MACCS_FCFC_512_3_AVALON_512_scaled___')
models_paths = [os.path.join(folder_path, x) for x in listdir(folder_path)
                if x.split('.')[-1] == 'h5']
transformers = [os.path.join(folder_path, x) for x in listdir(folder_path)
                if x.split('.')[-1] == 'sav']

predicted_test_y_vectors = []
df_predict_clf = pd.DataFrame()
for transformer in transformers:
    trans = joblib.load(transformer)
    for path_to_model in models_paths:
        model_base = load_model(
            path_to_model,