def main(args):
    """Run the classification filtering step.

    Loads config and key, runs the reference-set consistency check, reads the
    classification weight table and the classification activity file, filters
    them with ``filter_clf`` and hands the results to ``write_tmp_output``.

    Args:
        args (dict): Dictionary of arguments from argparser. Keys read here:
            "non_interactive", "classification_weight_table" and
            "classification_activity_file".

    Returns:
        None
    """
    t_begin = time.time()
    # Only a literal ``True`` switches overwriting on.
    overwriting = args["non_interactive"] is True

    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)

    print("Start classification filtering.")
    output_dir = prepare(args, overwriting)
    T3c = read_input_file(args["classification_weight_table"])
    T4c = read_input_file(args["classification_activity_file"])

    training_quorum = ConfigDict.get_parameters()["training_quorum"]["classification"]
    evaluation_quorum = ConfigDict.get_parameters()["evaluation_quorum"]["classification"]
    initial_task_weights = ConfigDict.get_parameters()["initial_task_weights"]
    filtered = filter_clf(
        T3c,
        T4c,
        training_quorum,
        evaluation_quorum,
        initial_task_weights,
    )
    T10c, T8c, T4c_filtered_out, T4c_dedup = filtered
    write_tmp_output(output_dir, T10c, T8c, T4c_filtered_out, T4c_dedup)

    print(f"Classification filtering took {time.time() - t_begin:.08} seconds.")
    print("Classification filtering done.")
def prepare(args: dict, overwriting: bool):
    """Set up directories, log files and the transformer for LSH folding.

    Args:
        args (dict): argparser arguments; "number_cpu" is read here for the
            number of transformer processes.
        overwriting (bool): overwriting flag forwarded to ``make_dir``.

    Returns:
        tuple: ``(output_dir_lsh, mapping_table_dir, dt)`` - the two values
        returned by ``make_dir`` and the ``DfTransformer`` wrapping the
        ``LSHFoldingCalculator``.
    """
    lsh_dir = make_dir(args, "results_tmp", "lsh_folding", overwriting)
    mapping_dir = make_dir(args, "mapping_table", None, overwriting)
    create_log_files(lsh_dir)
    create_log_files(mapping_dir)

    load_config(args)
    load_key(args)

    # LSH parameters are layered on top of the fingerprint parameters, so on
    # a key clash the LSH value wins.
    fingerprint_params = ConfigDict.get_parameters()["fingerprint"]
    lsh_params = ConfigDict.get_parameters()["lsh"]
    combined_params = {**fingerprint_params, **lsh_params}

    secret_key = SecretDict.get_secrets()["key"]
    calculator = LSHFoldingCalculator.from_param_dict(
        secret=secret_key,
        method_param_dict=combined_params,
        verbosity=0,
    )

    transformer = DfTransformer(
        calculator,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=["fp_feat", "fp_val", "fold_id", "success", "error_message"],
        output_types=["object", "object", "object", "bool", "object"],
        success_column="success",
        nproc=args["number_cpu"],
        verbosity=0,
    )
    return lsh_dir, mapping_dir, transformer
def prepare(args):
    """Create the reference-set output directory and its three transformers.

    Overwriting is always enabled for the reference set, and every
    transformer runs with a single process (``nproc=1``).

    Args:
        args (dict): argparser arguments, forwarded to ``load_config``,
            ``load_key`` and ``make_dir``.

    Returns:
        tuple: ``(output_dir, dt_standarizer, dt_fold, dt_descriptor)`` - the
        value returned by ``make_dir`` followed by the ``DfTransformer``
        objects for standardization, scaffold fold assignment and descriptor
        calculation.
    """
    load_config(args)
    load_key(args)
    output_dir = make_dir(args, "reference_set", None, True)
    key = SecretDict.get_secrets()["key"]

    # Keyword arguments shared by all three transformers.
    shared_options = {"success_column": "success", "nproc": 1, "verbosity": 0}

    # Standardization: raw "smiles" column in, canonical SMILES out.
    standardizer = Standardizer.from_param_dict(
        method_param_dict=ConfigDict.get_parameters()["standardization"],
        verbosity=0,
    )
    dt_standarizer = DfTransformer(
        standardizer,
        input_columns={"smiles": "smiles"},
        output_columns=["canonical_smiles", "success", "error_message"],
        output_types=["object", "bool", "object"],
        **shared_options,
    )

    # Scaffold-based fold assignment on the canonical SMILES.
    fold_assigner = ScaffoldFoldAssign.from_param_dict(
        secret=key,
        method_param_dict=ConfigDict.get_parameters()["scaffold_folding"],
        verbosity=0,
    )
    dt_fold = DfTransformer(
        fold_assigner,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=[
            "murcko_smiles",
            "sn_smiles",
            "fold_id",
            "success",
            "error_message",
        ],
        output_types=["object", "object", "int", "bool", "object"],
        **shared_options,
    )

    # Fingerprint / descriptor calculation on the canonical SMILES.
    descriptor_calc = DescriptorCalculator.from_param_dict(
        secret=key,
        method_param_dict=ConfigDict.get_parameters()["fingerprint"],
        verbosity=0,
    )
    dt_descriptor = DfTransformer(
        descriptor_calc,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=["fp_feat", "fp_val", "success", "error_message"],
        output_types=["object", "object", "bool", "object"],
        **shared_options,
    )

    return output_dir, dt_standarizer, dt_fold, dt_descriptor
def setUp(self):
    """Load the example parameters and the example key from reference_files."""
    reference_dir = os.path.join(curDir, "reference_files")
    parameters_file = Path(os.path.join(reference_dir, "example_parameters.json"))
    key_file = Path(os.path.join(reference_dir, "example_key.json"))

    self.config = ConfigDict(config_path=parameters_file).get_parameters()
    self.keys = SecretDict(key_path=key_file).get_secrets()
def test_standardizer_different_configs(self):
    """Check that two different standardization configs give different output.

    The same input SMILES are standardized once with the parameters held in
    ``self.config`` and once with ``input/example_parameters_2.json``. The
    test fails when the two resulting dataframes compare equal.
    """
    smiles_frame = read_csv(curDir / "input/test_standardizer.csv")
    column_names = ["canonical_smiles", "success", "error_message"]
    column_types = ["object", "bool", "object"]

    def standardize_with(std_params, n_workers):
        # Build a standardizer from one parameter set and return the first
        # element of what process_dataframe yields for the shared input.
        standardizer = Standardizer(
            max_num_atoms=std_params["max_num_atoms"],
            max_num_tautomers=std_params["max_num_tautomers"],
            include_stereoinfo=std_params["include_stereoinfo"],
            verbosity=0,
        )
        transformer = DfTransformer(
            standardizer,
            input_columns={"smiles": "smiles"},
            output_columns=column_names,
            output_types=column_types,
            success_column="success",
            nproc=n_workers,
            verbosity=0,
        )
        return transformer.process_dataframe(smiles_frame)[0]

    # Reference run with the default test configuration.
    response_ref = standardize_with(self.config["standardization"], 4)

    # Second run with the alternative configuration, loaded only after the
    # reference run has finished.
    alt_config_file = Path(
        os.path.join(curDir, "input/", "example_parameters_2.json"))
    config_2 = ConfigDict(config_path=alt_config_file).get_parameters()
    response_tmp = standardize_with(config_2["standardization"], 2)

    frames_equal = True
    try:
        assert_frame_equal(response_ref, response_tmp)
    except AssertionError:
        # Differing frames are the expected outcome.
        frames_equal = False
    if frames_equal:
        raise AssertionError
def main(args):
    """Run the replicate aggregation step.

    Loads config and key, runs the reference-set consistency check, performs
    the sanity checks on the assay (T0) and activity (T1) files, aggregates
    replicates with the mapping table (T5) and hands the results to
    ``write_tmp_output``.

    Args:
        args (dict): Dictionary of arguments from argparser. Keys read here:
            "non_interactive", "assay_file", "activity_file",
            "mapping_table" and "number_cpu".

    Returns:
        None
    """
    t_begin = time.time()
    # Only a literal ``True`` switches overwriting on.
    overwriting = args["non_interactive"] is True

    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)

    print("Start aggregation.")
    output_dir = prepare(args, overwriting)
    T0 = read_input_file(args["assay_file"])
    T1 = read_input_file(args["activity_file"])

    print("Check assay types in T0.")
    sanity_check_assay_type(T0)
    print("Check consistency of input_assay_id between T0 and T1.")
    sanity_check_assay_sizes(T0, T1)
    print("Check uniqueness of T0.")
    sanity_check_uniqueness(
        T0, colname="input_assay_id", filename=args["assay_file"])
    print(f"Sanity checks took {time.time() - t_begin:.08} seconds.")
    print("Sanity checks passed.")

    T5 = read_input_file(args["mapping_table"])
    credibility_range = ConfigDict.get_parameters()["credibility_range"]
    aggregation_result = aggregate_replicates(
        T0, T1, T5, credibility_range, args["number_cpu"])
    (
        aggregated,
        failed_range,
        failed_aggr,
        failed_std,
        duplicates,
        T0_updated,
    ) = aggregation_result

    write_tmp_output(
        output_dir,
        aggregated,
        failed_range,
        failed_aggr,
        failed_std,
        duplicates,
        T0_updated,
    )
    print(f"Replicate aggregation took {time.time() - t_begin:.08} seconds.")
    print("Replicate aggregation done.")
def prepare(args):
    """Set up directories, log files and the transformer for scaffold folding.

    Args:
        args (dict): argparser arguments. "non_interactive" is forwarded to
            ``make_dir`` as its last argument and "number_cpu" sets the
            number of transformer processes.

    Returns:
        tuple: ``(output_dir, mapping_table_dir, dt)`` - the two values
        returned by ``make_dir`` and the ``DfTransformer`` wrapping the
        ``ScaffoldFoldAssign`` instance.
    """
    non_interactive = args["non_interactive"]
    folding_dir = make_dir(args, "results_tmp", "folding", non_interactive)
    mapping_dir = make_dir(args, "mapping_table", None, non_interactive)
    create_log_files(folding_dir)
    create_log_files(mapping_dir)

    load_config(args)
    load_key(args)

    secret_key = SecretDict.get_secrets()["key"]
    folding_params = ConfigDict.get_parameters()["scaffold_folding"]
    fold_assigner = ScaffoldFoldAssign.from_param_dict(
        secret=secret_key,
        method_param_dict=folding_params,
        verbosity=0,
    )

    transformer = DfTransformer(
        fold_assigner,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=[
            "murcko_smiles",
            "sn_smiles",
            "fold_id",
            "success",
            "error_message",
        ],
        output_types=["object", "object", "int", "bool", "object"],
        success_column="success",
        nproc=args["number_cpu"],
        verbosity=0,
    )
    return folding_dir, mapping_dir, transformer
def do_prepare_training(args):
    """Wrapper to run the entire pipeline for training.

    Stages, in order: load config/key and run the consistency check, read and
    sanity-check T0/T1/T2, standardize structures, compute descriptors and
    folds (scaffold or LSH), aggregate replicates, apply thresholding, filter
    classification and regression data, and write the sparse matrices.

    Args:
        args (Namespace): Subparser arguments. Attributes read here:
            "non_interactive", "number_cpu", "tag", "weight_table",
            "activity_file", "structure_file" and "folding_method".

    Raises:
        SystemExit: With exit status 1 if "folding_method" is neither
            "scaffold" nor "lsh".
    """
    start_total = time.time()
    _args = vars(args)
    # Only a literal ``True`` switches overwriting on.
    overwriting = _args["non_interactive"] is True
    num_cpu = _args["number_cpu"]

    # Load parameters and key
    load_config(_args)
    load_key(_args)
    bit_size = melloddy_tuner.utils.config.parameters.get_parameters(
    )["fingerprint"]["fold_size"]

    # Consistency check
    print("Consistency checks of config and key files.")
    hash_reference_set.main(_args)

    start = time.time()
    tag = _args["tag"]

    print("Reading input data.")
    df_T0 = read_input_file(_args["weight_table"])
    df_T1 = read_input_file(_args["activity_file"])
    df_T2 = read_input_file(_args["structure_file"])
    print("Data loaded.")

    print("Start sanity checks of input data.")
    print("Check assay types in T0.")
    sanity_check_assay_type(df_T0)
    print("Check consistency of input_assay_id between T0 and T1.")
    sanity_check_assay_sizes(df_T0, df_T1)
    print("Check consistency of input_compound_id between T1 and T2.")
    sanity_check_compound_sizes(df_T1, df_T2)
    print("Check uniqueness of T0 and T2.")
    sanity_check_uniqueness(df_T0, colname="input_assay_id", filename="T0")
    sanity_check_uniqueness(df_T2, colname="input_compound_id", filename="T2")
    print(f"Sanity checks took {time.time() - start:.08} seconds.")
    print("Sanity checks passed.")

    start = time.time()
    print("Start standardizing structures.")
    # Make directories, load input files
    results_dir = make_dir(_args, "results", None, overwriting)
    output_dir_std, dt_std = standardize_smiles.prepare(_args)
    df_smi, sd_smi_failed = standardize_smiles.run(df_T2, dt_std)
    save_df_as_csv(output_dir_std, df_smi, "T2_standardized")
    save_df_as_csv(output_dir_std, sd_smi_failed, "T2_standardized.FAILED")
    del sd_smi_failed, df_T2
    print(f"Standardization took {time.time() - start:.08} seconds.")
    print("Standardization done.")

    df_T5 = pd.DataFrame()
    df_T6 = pd.DataFrame()
    if _args["folding_method"] == "scaffold":
        print("Using scaffold-based fold assignment.")
        output_dir_desc, dt_desc = calculate_descriptors.prepare(
            _args, overwriting)

        start = time.time()
        print("Start calculating descriptors.")
        df_desc, df_desc_failed = calculate_descriptors.run(df_smi, dt_desc)
        save_df_as_csv(output_dir_desc, df_desc, "T2_descriptors")
        save_df_as_csv(output_dir_desc, df_desc_failed,
                       "T2_descriptors.FAILED")
        del df_smi, df_desc_failed
        print(
            f"Fingerprint calculation took {time.time() - start:.08} seconds.")
        print("Descriptor calculation done.")

        start = time.time()
        print("Start computing folds.")
        output_dir_fold, mapping_table_dir, dt_fold = (
            calculate_scaffold_folds.prepare(_args))
        df_fold, df_fold_failed = calculate_scaffold_folds.run(
            df_desc, dt_fold)
        save_df_as_csv(output_dir_fold, df_fold, "T2_folds")
        save_df_as_csv(output_dir_fold, df_fold_failed, "T2_folds.FAILED")
        del df_fold_failed, df_desc
        df_T5, df_T6, df_duplicates = helper.format_dataframe(df_fold)
        save_df_as_csv(mapping_table_dir, df_T5, "T5")
        save_df_as_csv(mapping_table_dir, df_T6, "T6")
        save_df_as_csv(output_dir_desc, df_duplicates,
                       "T2_descriptor_vector_id.DUPLICATES")
        del df_duplicates
        print(f"Fold calculation took {time.time() - start:.08} seconds.")
        print("Fold calculation done.")
    elif _args["folding_method"] == "lsh":
        print("Using LSH based fold assignment.")
        output_dir_lsh, mapping_table_dir, dt_lsh = (
            calculate_lsh_folds.prepare(_args, overwriting))
        # Restart the timer so the duration reported below covers only this
        # stage and not the preceding standardization.
        start = time.time()
        output_file = os.path.join(output_dir_lsh, "T2_descriptors_lsh.csv")
        error_file = os.path.join(output_dir_lsh,
                                  "T2_descriptors_lsh.FAILED.csv")
        dupl_file = os.path.join(output_dir_lsh,
                                 "T2_descriptors_lsh.DUPLICATES.csv")
        mapping_file_T5 = os.path.join(mapping_table_dir, "T5.csv")
        mapping_file_T6 = os.path.join(mapping_table_dir, "T6.csv")
        df_desc_lsh, df_desc_lsh_failed = dt_lsh.process_dataframe(df_smi)
        df_desc_lsh.to_csv(output_file, index=False)
        df_desc_lsh_failed.to_csv(error_file, index=False)
        df_T5, df_T6, df_duplicates = helper.format_dataframe(df_desc_lsh)
        df_duplicates.to_csv(dupl_file, index=False)
        df_T5.to_csv(mapping_file_T5, index=False)
        df_T6.to_csv(mapping_file_T6, index=False)
        # Release intermediates that are not used by any later stage.
        del df_duplicates, df_smi, df_desc_lsh, df_desc_lsh_failed
        end = time.time()
        print(
            f"Fingerprint calculation and LSH folding took {end - start:.08} seconds."
        )
        print("Descriptor calculation and LSH folding done.")
    else:
        print("Please use scaffold or lsh as folding method.")
        # ``quit()`` is an interactive helper installed by the ``site``
        # module; raise SystemExit directly and report failure through a
        # non-zero exit status.
        raise SystemExit(1)

    start = time.time()
    print("Start aggregating values.")
    output_dir_agg = aggregate_values.prepare(_args, overwriting)
    (
        df_T4r,
        df_failed_range,
        df_failed_aggr,
        df_failed_std,
        df_dup,
        df_T0_upd,
    ) = aggregate_values.aggregate_replicates(
        df_T0, df_T1, df_T5,
        ConfigDict.get_parameters()["credibility_range"], num_cpu)
    df_T4r = df_T4r[[
        "input_assay_id",
        "descriptor_vector_id",
        "fold_id",
        "standard_qualifier",
        "standard_value",
    ]]
    save_df_as_csv(
        output_dir_agg,
        df_T4r,
        "T4r",
        [
            "input_assay_id",
            "descriptor_vector_id",
            "fold_id",
            "standard_qualifier",
            "standard_value",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_range,
        "failed_range_T1",
        [
            "input_compound_id", "input_assay_id", "standard_qualifier",
            "standard_value"
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_aggr,
        "failed_aggr_T1",
        [
            "descriptor_vector_id",
            "input_assay_id",
            "standard_qualifier",
            "standard_value",
            "fold_id",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_std,
        "failed_std_T1",
        [
            "descriptor_vector_id",
            "input_assay_id",
            "standard_qualifier",
            "standard_value",
            "fold_id",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_dup,
        "duplicates_T1",
        [
            "input_assay_id",
            "input_compound_id",
            "descriptor_vector_id",
            "fold_id",
            "standard_qualifier",
            "standard_value",
        ],
    )
    save_df_as_csv(output_dir_agg, df_T0_upd, "T0_upd")
    del df_T5, df_failed_range, df_failed_aggr, df_failed_std, df_dup, df_T1
    print(f"Replicate aggregation took {time.time() - start:.08} seconds.")
    print("Replicate aggregation done.")

    start = time.time()
    print("Start thresholding.")
    output_dir_thres = apply_thresholding.prepare(_args, overwriting)
    df_T0_upd = df_T0_upd.astype({"input_assay_id": "int"})
    df_T4r = df_T4r.astype({"input_assay_id": "int"})
    df_T4c, df_T3c = apply_thresholding.run(df_T0_upd, df_T4r, num_cpu)

    # Write final dataframes (T4c, T3c)
    columns_T3c = [
        "classification_task_id",
        "input_assay_id",
        "assay_type",
        "variance_quorum_OK",
        "use_in_regression",
        "is_auxiliary",
        "threshold",
        "threshold_method",
        "direction",
    ]
    columns_T4c = [
        "classification_task_id",
        "descriptor_vector_id",
        "fold_id",
        "input_assay_id",
        "standard_qualifier",
        "standard_value",
        "threshold",
        "class_label",
    ]
    df_T4c.sort_values("classification_task_id", inplace=True)
    df_T3c.sort_values("classification_task_id", inplace=True)

    # Filter ambiguous class labels: rows without a class label are written
    # to the FAILED file and dropped from T4c.
    df_T4c_failed = df_T4c[df_T4c.class_label.isna()]
    df_T4c = df_T4c[~df_T4c.class_label.isna()]
    df_T4c = df_T4c[columns_T4c]
    df_T3c = df_T3c[columns_T3c]
    save_df_as_csv(output_dir_thres, df_T4c_failed, "T4c.FAILED")
    save_df_as_csv(output_dir_thres, df_T4c, "T4c")
    save_df_as_csv(output_dir_thres, df_T3c, "T3c")
    print(f"Thresholding took {time.time() - start:.08} seconds.")
    print("Thresholding done.")

    print("Start filter classification data.")
    start = time.time()
    output_dir_filter_clf = filter_classification.prepare(_args, overwriting)
    T10c, T8c, T4c_filtered_out, T4c_dedup = filter_classification.filter_clf(
        df_T3c,
        df_T4c,
        ConfigDict.get_parameters()["training_quorum"]["classification"],
        ConfigDict.get_parameters()["evaluation_quorum"]["classification"],
        ConfigDict.get_parameters()["initial_task_weights"],
    )
    filter_classification.write_tmp_output(output_dir_filter_clf, T10c, T8c,
                                           T4c_filtered_out, T4c_dedup)
    del df_T4c, df_T3c, T4c_filtered_out, T4c_dedup
    print(f"Classification filtering took {time.time() - start:.08} seconds.")
    print("Classification filtering done.")

    print("Start filter regression data.")
    start = time.time()
    out_dir_filter_reg = filter_regression.prepare(_args, overwriting)
    T10r, T8r, T4r_filtered_out, T4r_dedup = (
        filter_regression.filter_regression_tasks(
            df_T0_upd,
            df_T4r,
            ConfigDict.get_parameters()["training_quorum"]["regression"],
            ConfigDict.get_parameters()["evaluation_quorum"]["regression"],
            ConfigDict.get_parameters()["initial_task_weights"],
            ConfigDict.get_parameters()["censored_downweighting"],
        ))
    filter_regression.write_tmp_output(out_dir_filter_reg, T10r, T8r,
                                       T4r_filtered_out, T4r_dedup)
    del df_T0, df_T4r, T4r_filtered_out, T4r_dedup
    print(f"Filtering regression data took {time.time() - start:.08} seconds.")
    print("Filtering regression data done.")

    print("Start creating sparse matrices.")
    start = time.time()
    out_dir_matrices, results_dir = csv_2_mtx.prepare(_args, overwriting)
    df_T6_cont, T10c_cont, T10r_cont = csv_2_mtx.get_cont_id(
        df_T6, T10c, T10r)
    df_T11 = df_T6_cont[["cont_descriptor_vector_id", "fold_id", "fp_feat"]]
    save_df_as_csv(results_dir, T10c_cont, "T10c_cont")
    save_df_as_csv(results_dir, T10r_cont, "T10r_cont")
    save_df_as_csv(results_dir, df_T6_cont, "T6_cont")
    csv_2_mtx.save_csv_output(out_dir_matrices, tag, T8c, T8r)
    del df_T6, df_T6_cont, T10r, T10c
    (
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    ) = csv_2_mtx.make_matrices(df_T11, T10c_cont, T10r_cont, bit_size)
    del df_T11, T10c_cont, T10r_cont
    # Replace NaN entries in the stored classification values by zero and
    # then drop the explicit zeros from the matrix.
    y_matrix_clf.data = np.nan_to_num(y_matrix_clf.data, copy=False)
    y_matrix_clf.eliminate_zeros()
    csv_2_mtx.save_npy_matrices(
        out_dir_matrices,
        tag,
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    )
    print(f"Formatting to matrices took {time.time() - start:.08} seconds.")
    end = time.time()
    print(f"Overall processing took {end - start_total:.08} seconds.")
    print("Files are ready for SparseChem.")
def setUp(self):
    """Load the example parameters from reference_files into self.config."""
    parameters_file = Path(
        os.path.join(curDir, 'reference_files', 'example_parameters.json'))
    config = ConfigDict(config_path=parameters_file)
    self.config = config.get_parameters()
def calculate_single_assay(df_tuple) -> tuple:
    """Derive the classification tasks and class labels for one assay.

    One threshold list is determined from the assay type, and
    ``apply_thresholding`` is called once per threshold. The per-threshold
    results are stacked into one T4c and one T3c dataframe.

    Args:
        df_tuple: Pair ``(input_assay_id, assay_dataframe)``. The first item
            is written into the "input_assay_id" column of the dataframe; the
            dataframe must provide the "assay_type" and "direction" columns.
            (Presumably one item of a groupby over the activity data - TODO
            confirm against the caller.)

    Returns:
        tuple: ``(df_T4c, df_T3c)``. Both are empty dataframes with the
        expected columns when the assay type yields no thresholds.
    """
    assay_id = df_tuple[0]
    tmp_assay = df_tuple[1]

    # Load HTS threshold if HTS data present
    l_assay_types = tmp_assay["assay_type"].unique()
    thresh_HTS = 0
    if "AUX_HTS" in l_assay_types:
        thresh_HTS = ConfigDict.get_parameters(
        )["global_thresholds"]["AUX_HTS"]

    # Load data quora for fixed-adaptive threshold
    quorum_num_active = ConfigDict.get_parameters(
    )["training_quorum"]["classification"]["OTHER"]["num_active_total"]
    quorum_num_inactive = ConfigDict.get_parameters(
    )["training_quorum"]["classification"]["OTHER"]["num_inactive_total"]

    columns_T3c = [
        "input_assay_id",
        "assay_type",
        "variance_quorum_OK",
        "use_in_regression",
        "is_auxiliary",
        "threshold",
        "threshold_method",
        "direction",
    ]
    columns_T4c = [
        "descriptor_vector_id",
        "fold_id",
        "input_assay_id",
        "standard_qualifier",
        "standard_value",
        "threshold",
        "class_label",
    ]

    # The assay type of the first row decides how thresholds are chosen.
    assay_type = tmp_assay["assay_type"].iloc[0]
    l_thresh = []
    if assay_type in ["OTHER", "PANEL"]:
        l_thresh = get_thresholds_dose_response(tmp_assay, quorum_num_active,
                                                quorum_num_inactive)
    elif assay_type == "ADME":
        l_thresh = get_thresholds_ADME(tmp_assay)
    elif assay_type == "AUX_HTS":
        l_thresh = [(thresh_HTS, "fixed")]

    # Collect the per-threshold frames and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0. The leading empty frames
    # keep the expected columns even when there are no thresholds.
    frames_T4c = [pd.DataFrame(columns=columns_T4c)]
    frames_T3c = [pd.DataFrame(columns=columns_T3c)]
    for thresh in l_thresh:
        # Generate new assay instance
        tmp_assay.loc[:, "input_assay_id"] = assay_id
        df_assay = tmp_assay.copy()

        # Convert standard_value to class_label. For every type except
        # AUX_HTS any direction other than "low" is treated as "high";
        # AUX_HTS passes its direction through unchanged.
        direction = df_assay["direction"].iloc[0]
        if assay_type != "AUX_HTS" and direction != "low":
            direction = "high"
        T4c_assay, T3c_assay = apply_thresholding(df_assay, assay_type,
                                                  thresh, direction,
                                                  columns_T4c, columns_T3c)
        frames_T4c.append(T4c_assay)
        frames_T3c.append(T3c_assay)

    df_T4c = pd.concat(frames_T4c)
    df_T3c = pd.concat(frames_T3c)
    return df_T4c, df_T3c