def main(args):
    """General wrapper function for classification data filtering.

    Args:
        args (dict): Dictionary of arguments from argparser
    """
    start = time.time()
    if args["non_interactive"] is True:
        overwriting = True
    else:
        overwriting = False
    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)

    print("Start classification filtering.")
    output_dir = prepare(args, overwriting)
    T3c = read_input_file(args["classification_weight_table"])
    T4c = read_input_file(args["classification_activity_file"])
    T10c, T8c, T4c_filtered_out, T4c_dedup = filter_clf(
        T3c,
        T4c,
        ConfigDict.get_parameters()["training_quorum"]["classification"],
        ConfigDict.get_parameters()["evaluation_quorum"]["classification"],
        ConfigDict.get_parameters()["initial_task_weights"],
    )
    write_tmp_output(output_dir, T10c, T8c, T4c_filtered_out, T4c_dedup)

    print(f"Classification filtering took {time.time() - start:.08} seconds.")
    print("Classification filtering done.")
def prepare(args: dict, overwriting: bool):
    """Set up the run by creating directories and log files.

    Args:
        args (dict): argparser arguments
        overwriting (bool): overwriting flag

    Returns:
        Tuple(Path, Path, DfTransformer): Paths to the output and mapping_table
            subdirectories, and the DfTransformer for LSH fold assignment.
    """
    output_dir_lsh = make_dir(args, "results_tmp", "lsh_folding", overwriting)
    mapping_table_dir = make_dir(args, "mapping_table", None, overwriting)
    create_log_files(output_dir_lsh)
    create_log_files(mapping_table_dir)
    load_config(args)
    load_key(args)
    method_params_fp = ConfigDict.get_parameters()["fingerprint"]
    method_params_lsh = ConfigDict.get_parameters()["lsh"]
    method_params = {**method_params_fp, **method_params_lsh}
    key = SecretDict.get_secrets()["key"]
    lshf = LSHFoldingCalculator.from_param_dict(
        secret=key, method_param_dict=method_params, verbosity=0
    )
    outcols = ["fp_feat", "fp_val", "fold_id", "success", "error_message"]
    out_types = ["object", "object", "object", "bool", "object"]
    dt = DfTransformer(
        lshf,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=args["number_cpu"],
        verbosity=0,
    )
    return output_dir_lsh, mapping_table_dir, dt
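# Illustrative usage sketch (assumed, not part of the module): the DfTransformer
# returned by prepare() is applied with its process_dataframe method, exactly as the
# LSH folding main() below does. The toy dataframe and its column names are
# assumptions based on the input_columns mapping above; args must hold valid
# config/key/output arguments.
def _example_lsh_prepare_usage(args: dict):
    import pandas as pd

    output_dir_lsh, mapping_table_dir, dt = prepare(args, overwriting=True)
    df_structures = pd.DataFrame(
        {"input_compound_id": [1, 2], "canonical_smiles": ["CCO", "c1ccccc1O"]}
    )
    # process_dataframe splits rows into successful and failed calculations
    df_ok, df_failed = dt.process_dataframe(df_structures)
    return df_ok, df_failed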
def main(args: dict = None):
    """
    Main function reading input files, executing functions and writing output files.
    """
    start = time.time()
    if args is None:
        args = vars(init_arg_parser())
    if args["non_interactive"] is True:
        overwriting = True
    else:
        overwriting = False
    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)

    print("Generate sparse matrices from given dataframes.")
    fp_param = melloddy_tuner.utils.config.parameters.get_parameters()["fingerprint"]
    bit_size = fp_param["fold_size"]
    output_dir, results_dir = prepare(args, overwriting)
    tag = args["tag"]
    if (tag != "cls") and (tag != "clsaux"):
        print("Please choose a different tag. Only cls or clsaux are allowed.")
        exit()
    df_T6 = read_input_file(args["structure_file"])
    df_T10c = read_input_file(args["activity_file_clf"])
    df_T10r = read_input_file(args["activity_file_reg"])
    df_T6_cont, T10c_cont, T10r_cont = get_cont_id(df_T6, df_T10c, df_T10r)
    df_T11 = df_T6_cont[["cont_descriptor_vector_id", "fold_id", "fp_feat"]]
    df_T9c = read_input_file(args["weight_table_clf"])
    df_T9r = read_input_file(args["weight_table_reg"])
    save_df_as_csv(results_dir, T10c_cont, "T10c_cont")
    save_df_as_csv(results_dir, T10r_cont, "T10r_cont")
    save_df_as_csv(results_dir, df_T6_cont, "T6_cont")
    save_csv_output(output_dir, tag, df_T9c, df_T9r)
    x_matrix, fold_vector, y_matrix_clf, y_matrix_reg, censored_mask = make_matrices(
        df_T11, T10c_cont, T10r_cont, bit_size
    )
    y_matrix_clf.data = np.nan_to_num(y_matrix_clf.data, copy=False)
    y_matrix_clf.eliminate_zeros()
    save_npy_matrices(
        output_dir,
        tag,
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    )
    end = time.time()
    print(f"Formatting to matrices took {end - start:.08} seconds.")
    print("Files are ready for SparseChem.")
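# Illustrative non-interactive invocation (assumed, not part of the module): the args
# keys below mirror the ones main() reads above; the file paths are placeholders, and
# further keys consumed by load_config, load_key and prepare (config/key file paths,
# output locations) are required as well and come from the argument parser.
def _example_csv_2_mtx_call():
    example_args = {
        "non_interactive": True,
        "tag": "cls",
        "structure_file": "path/to/T6.csv",
        "activity_file_clf": "path/to/T10c.csv",
        "activity_file_reg": "path/to/T10r.csv",
        "weight_table_clf": "path/to/T8c.csv",
        "weight_table_reg": "path/to/T8r.csv",
        # ... plus config/key/output arguments expected by the argument parser
    }
    main(example_args)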
def main(args: dict = None):
    """Main wrapper to execute descriptor calculation and LSH fold assignment.

    Args:
        args (dict): argparser dict containing the relevant arguments
    """
    start = time.time()
    if args is None:
        args = vars(init_arg_parser())
    if args["non_interactive"] is True:
        overwriting = True
    else:
        overwriting = False
    num_cpu = args["number_cpu"]
    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)

    print("Start calculating descriptors and assigning LSH folds.")
    output_dir_lsh, mapping_table_dir, dt = prepare(args, overwriting)
    input_file = args["structure_file"]
    output_file = os.path.join(output_dir_lsh, "T2_descriptors_lsh.csv")
    error_file = os.path.join(output_dir_lsh, "T2_descriptors_lsh.FAILED.csv")
    dupl_file = os.path.join(output_dir_lsh, "T2_descriptors_lsh.DUPLICATES.csv")
    mapping_file_T5 = os.path.join(mapping_table_dir, "T5.csv")
    mapping_file_T6 = os.path.join(mapping_table_dir, "T6.csv")
    df = pd.read_csv(input_file)
    df_processed, df_failed = dt.process_dataframe(df)
    df_processed.to_csv(output_file, index=False)
    df_failed.to_csv(error_file, index=False)
    df_grouped, df_desc_dupl = format_dataframe(df_processed)
    # col_T5 = ["input_compound_id", "fold_id"]
    # df_T5 = pd.merge(
    #     df_processed[col_T5],
    #     df_grouped[["input_compound_id", "descriptor_vector_id", "fold_id"]],
    #     on=["input_compound_id", "fold_id"],
    #     how="left",
    # )
    df_T5 = pd.merge(
        df_processed[["input_compound_id", "fp_feat", "fp_val", "fold_id"]],
        df_grouped[["fp_feat", "fp_val", "descriptor_vector_id", "fold_id"]],
        on=["fp_feat", "fp_val", "fold_id"],
        how="left",
    )[["input_compound_id", "fold_id", "descriptor_vector_id"]].reset_index(drop=True)
    df_T6 = df_grouped[["descriptor_vector_id", "fp_feat", "fp_val", "fold_id"]]
    df_desc_dupl.to_csv(dupl_file, index=False)
    df_T5.to_csv(mapping_file_T5, index=False)
    df_T6.to_csv(mapping_file_T6, index=False)
    end = time.time()
    print(f"Fingerprint calculation and LSH folding took {end - start:.08} seconds.")
    print("Descriptor calculation and LSH folding done.")
def main(args):
    """General wrapper function for replicate aggregation.

    Args:
        args (dict): Dictionary of arguments from argparser
    """
    start = time.time()
    if args["non_interactive"] is True:
        overwriting = True
    else:
        overwriting = False
    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)

    print("Start aggregation.")
    output_dir = prepare(args, overwriting)
    T0 = read_input_file(args["assay_file"])
    T1 = read_input_file(args["activity_file"])

    print("Check assay types in T0.")
    sanity_check_assay_type(T0)
    print("Check consistency of input_assay_id between T0 and T1.")
    sanity_check_assay_sizes(T0, T1)
    print("Check uniqueness of T0.")
    sanity_check_uniqueness(T0, colname="input_assay_id", filename=args["assay_file"])
    print(f"Sanity checks took {time.time() - start:.08} seconds.")
    print("Sanity checks passed.")

    T5 = read_input_file(args["mapping_table"])
    (
        df_aggr,
        df_failed_range,
        df_failed_aggr,
        df_failed_std,
        df_dup,
        T0_upd,
    ) = aggregate_replicates(
        T0, T1, T5, ConfigDict.get_parameters()["credibility_range"], args["number_cpu"]
    )
    write_tmp_output(
        output_dir,
        df_aggr,
        df_failed_range,
        df_failed_aggr,
        df_failed_std,
        df_dup,
        T0_upd,
    )
    print(f"Replicate aggregation took {time.time() - start:.08} seconds.")
    print("Replicate aggregation done.")
def prepare(args):
    """Set up the reference-set run: create the output directory and instantiate
    the standardization, scaffold-folding and descriptor DfTransformer objects.

    Args:
        args (dict): argparser arguments

    Returns:
        Tuple(Path, DfTransformer, DfTransformer, DfTransformer): output directory
            and the three transformer objects.
    """
    overwriting = True

    load_config(args)
    load_key(args)
    output_dir = make_dir(args, "reference_set", None, overwriting)
    key = SecretDict.get_secrets()["key"]

    method_params_standardizer = ConfigDict.get_parameters()["standardization"]
    st = Standardizer.from_param_dict(
        method_param_dict=method_params_standardizer, verbosity=0
    )
    outcols_st = ["canonical_smiles", "success", "error_message"]
    out_types_st = ["object", "bool", "object"]
    dt_standarizer = DfTransformer(
        st,
        input_columns={"smiles": "smiles"},
        output_columns=outcols_st,
        output_types=out_types_st,
        success_column="success",
        nproc=1,
        verbosity=0,
    )

    method_params_folding = ConfigDict.get_parameters()["scaffold_folding"]
    sa = ScaffoldFoldAssign.from_param_dict(
        secret=key, method_param_dict=method_params_folding, verbosity=0
    )
    outcols_sa = ["murcko_smiles", "sn_smiles", "fold_id", "success", "error_message"]
    out_types_sa = ["object", "object", "int", "bool", "object"]
    dt_fold = DfTransformer(
        sa,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=outcols_sa,
        output_types=out_types_sa,
        success_column="success",
        nproc=1,
        verbosity=0,
    )

    method_params_descriptor = ConfigDict.get_parameters()["fingerprint"]
    dc = DescriptorCalculator.from_param_dict(
        secret=key, method_param_dict=method_params_descriptor, verbosity=0
    )
    outcols_dc = ["fp_feat", "fp_val", "success", "error_message"]
    out_types_dc = ["object", "object", "bool", "object"]
    dt_descriptor = DfTransformer(
        dc,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=outcols_dc,
        output_types=out_types_dc,
        success_column="success",
        nproc=1,
        verbosity=0,
    )
    return output_dir, dt_standarizer, dt_fold, dt_descriptor
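# Illustrative sketch (assumed, not part of the module): the three transformers
# returned by prepare() are chained in the order the reference-set check uses them,
# standardization -> scaffold folding / descriptor calculation, each step via
# DfTransformer.process_dataframe. The toy dataframe and column names are assumptions
# based on the input_columns mappings above.
def _example_reference_set_chain(args: dict):
    import pandas as pd

    output_dir, dt_standarizer, dt_fold, dt_descriptor = prepare(args)
    df_ref = pd.DataFrame({"smiles": ["CCO", "c1ccccc1O"]})
    # standardization adds a canonical_smiles column consumed by the next two steps
    df_std, df_std_failed = dt_standarizer.process_dataframe(df_ref)
    df_fold, df_fold_failed = dt_fold.process_dataframe(df_std)
    df_desc, df_desc_failed = dt_descriptor.process_dataframe(df_std)
    return df_fold, df_desc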
def do_prepare_prediction(args):
    """Wrapper to run the entire pipeline for prediction.

    Args:
        args (Namespace): Subparser arguments
    """
    start = time.time()
    _args = vars(args)
    if _args["non_interactive"] is True:
        overwriting = True
    else:
        overwriting = False

    num_cpu = _args["number_cpu"]
    # load parameters and key
    load_config(_args)
    load_key(_args)
    bit_size = melloddy_tuner.utils.config.parameters.get_parameters()["fingerprint"][
        "fold_size"
    ]
    #########
    # Consistency check
    print("Consistency checks of config and key files.")
    hash_reference_set.main(_args)
    #########
    print("Prepare for prediction.")
    ######
    df = read_input_file(_args["structure_file"])
    # Make directories, load input files
    output_dir_std, dt_std = standardize_smiles.prepare(_args)
    df_smi, df_smi_failed = standardize_smiles.run(df, dt_std)

    output_dir_desc, dt_desc = calculate_descriptors.prepare(_args, overwriting)
    df_desc, df_desc_failed = calculate_descriptors.run(df_smi, dt_desc)

    df_desc_c = df_desc.copy()
    df_desc_c.loc[:, "descriptor_vector_id"] = (
        df_desc_c.groupby("input_compound_id").ngroup().replace(-1, np.nan).add(1)
    )
    df_T6 = df_desc_c[["descriptor_vector_id", "fp_feat", "fp_val"]]

    out_dir_matrices, results_dir = csv_2_mtx.prepare(_args, overwriting)
    df_T11 = map_2_cont_id(df_T6, "descriptor_vector_id").sort_values(
        "cont_descriptor_vector_id"
    )
    save_df_as_csv(results_dir, df_T11, "T11_pred")

    x_matrix = csv_2_mtx.matrix_from_strucutres(df_T11, bit_size)
    save_mtx_as_npy(x_matrix, out_dir_matrices, "pred_x")
    print(f"Preparation took {time.time() - start:.08} seconds.")
    print("Prediction preparation done.")
def main(args: dict = None):
    """
    Main function reading input files, executing functions and writing output files.
    """
    start = time.time()
    if args is None:
        args = vars(init_arg_parser())
    if args["non_interactive"] is True:
        overwriting = True
    else:
        overwriting = False
    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)

    print("Start activity data formatting.")
    output_dir, mapping_table_dir = prepare(args, overwriting)
    results_dir = make_results_dir(args, overwriting)

    # read input files (mapping tables T5, T6, T10), activity data T4, and weight table T3
    df_activity_data = read_input_file(args["activity_file"])
    df_weight_table = read_input_file(args["weight_table"])
    mapping_table_T5, mapping_table_T6, mapping_table_T10 = load_mapping_tables(
        args["dir_mapping_tables"]
    )

    pd.options.mode.chained_assignment = "raise"
    df_activity_data_formatted = do_actvity_formattting(
        df_activity_data, mapping_table_T5, mapping_table_T10
    )
    data_failed, data_duplicated_id_pairs, data_excluded = output_tmp_results(
        df_activity_data_formatted
    )
    write_tmp_output(output_dir, data_failed, data_duplicated_id_pairs, data_excluded)
    del data_failed, data_duplicated_id_pairs, data_excluded

    df_T11, df_T10, df_T3_mapped = output_results(
        df_activity_data_formatted, df_weight_table, mapping_table_T6
    )
    write_mappting_tables(mapping_table_dir, df_T3_mapped)
    write_output(results_dir, df_T11, df_T10)
    del df_activity_data_formatted, df_T11, df_T10, df_T3_mapped

    end = time.time()
    print(f"Formatting of activity data took {end - start:.08} seconds.")
    print("Activity data processing done.")
def main(args: dict = None):
    """Main wrapper to execute descriptor calculation.

    Args:
        args (dict): argparser dict containing the relevant arguments
    """
    start = time.time()
    if args is None:
        args = vars(init_arg_parser())
    if args["non_interactive"] is True:
        overwriting = True
    else:
        overwriting = False
    load_config(args)
    load_key(args)
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)

    output_dir, dt = prepare(args, overwriting)
    print("Start calculating descriptors.")
    input_file = args["structure_file"]
    output_file = os.path.join(output_dir, "T2_descriptors.csv")
    error_file = os.path.join(output_dir, "T2_descriptors.FAILED.csv")
    # dupl_file = os.path.join(output_dir, "T2_descriptors.DUPLICATES.csv")
    # mapping_file_T5 = os.path.join(mapping_table_dir, "T5.csv")
    # mapping_file_T6 = os.path.join(mapping_table_dir, "T6.csv")
    df = pd.read_csv(input_file)
    df_processed, df_failed = dt.process_dataframe(df)
    df_processed.to_csv(output_file, index=False)
    df_failed.to_csv(error_file, index=False)
    # df_T5, df_T6, df_duplicates = format_dataframe(df_processed)
    # df_duplicates.to_csv(dupl_file, index=False)
    # df_T5.to_csv(mapping_file_T5, index=False)
    # df_T6.to_csv(mapping_file_T6, index=False)
    end = time.time()
    print(f"Fingerprint calculation took {end - start:.08} seconds.")
    print("Descriptor calculation done.")
def prepare(args):
    """
    Prepare output directories and instantiate the DfTransformer object for
    scaffold-based fold assignment.

    Args:
        args (dict): argparser arguments

    Returns:
        Tuple(Path, Path, DfTransformer): Paths to the output and mapping_table
            directories and the instantiated DfTransformer for scaffold folding
    """
    output_dir = make_dir(args, "results_tmp", "folding", args["non_interactive"])
    mapping_table_dir = make_dir(args, "mapping_table", None, args["non_interactive"])
    create_log_files(output_dir)
    create_log_files(mapping_table_dir)
    load_config(args)
    load_key(args)
    key = SecretDict.get_secrets()["key"]
    method_params = ConfigDict.get_parameters()["scaffold_folding"]
    sa = ScaffoldFoldAssign.from_param_dict(
        secret=key, method_param_dict=method_params, verbosity=0
    )
    outcols = ["murcko_smiles", "sn_smiles", "fold_id", "success", "error_message"]
    out_types = ["object", "object", "int", "bool", "object"]
    dt = DfTransformer(
        sa,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=args["number_cpu"],
        verbosity=0,
    )
    return output_dir, mapping_table_dir, dt
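# Illustrative usage sketch (assumed, not part of the module): the scaffold-fold
# DfTransformer expects standardized structures in a "canonical_smiles" column (see
# input_columns above) and yields murcko_smiles, sn_smiles and fold_id per row. The
# args dict and toy dataframe are hypothetical.
def _example_scaffold_fold_usage(args: dict):
    import pandas as pd

    output_dir, mapping_table_dir, dt = prepare(args)
    df_std = pd.DataFrame(
        {"input_compound_id": [1, 2], "canonical_smiles": ["CCOc1ccccc1", "c1ccccc1O"]}
    )
    df_folds, df_failed = dt.process_dataframe(df_std)
    return df_folds, df_failed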
def do_prepare_training(args):
    """Wrapper to run the entire pipeline for training.

    Args:
        args (Namespace): Subparser arguments
    """
    start_total = time.time()
    start = time.time()
    _args = vars(args)
    if _args["non_interactive"] is True:
        overwriting = True
    else:
        overwriting = False

    num_cpu = _args["number_cpu"]
    # load parameters and key
    load_config(_args)
    load_key(_args)
    bit_size = melloddy_tuner.utils.config.parameters.get_parameters()["fingerprint"][
        "fold_size"
    ]
    #########
    # Consistency check
    print("Consistency checks of config and key files.")
    hash_reference_set.main(_args)
    #########

    start = time.time()
    tag = _args["tag"]
    print("Reading input data.")
    df_T0 = read_input_file(_args["weight_table"])
    df_T1 = read_input_file(_args["activity_file"])
    df_T2 = read_input_file(_args["structure_file"])
    print("Data loaded.")

    print("Start sanity checks of input data.")
    print("Check assay types in T0.")
    sanity_check_assay_type(df_T0)
    print("Check consistency of input_assay_id between T0 and T1.")
    sanity_check_assay_sizes(df_T0, df_T1)
    print("Check consistency of input_compound_id between T1 and T2.")
    sanity_check_compound_sizes(df_T1, df_T2)
    print("Check uniqueness of T0 and T2.")
    sanity_check_uniqueness(df_T0, colname="input_assay_id", filename="T0")
    sanity_check_uniqueness(df_T2, colname="input_compound_id", filename="T2")
    print(f"Sanity checks took {time.time() - start:.08} seconds.")
    print("Sanity checks passed.")

    start = time.time()
    print("Start standardizing structures.")
    # Make directories, load input files
    results_dir = make_dir(_args, "results", None, overwriting)
    output_dir_std, dt_std = standardize_smiles.prepare(_args)
    df_smi, sd_smi_failed = standardize_smiles.run(df_T2, dt_std)
    save_df_as_csv(output_dir_std, df_smi, "T2_standardized")
    save_df_as_csv(output_dir_std, sd_smi_failed, "T2_standardized.FAILED")
    del sd_smi_failed, df_T2
    print(f"Standardization took {time.time() - start:.08} seconds.")
    print("Standardization done.")

    df_T5 = pd.DataFrame()
    df_T6 = pd.DataFrame()
    if _args["folding_method"] == "scaffold":
        print("Using scaffold-based fold assignment.")
        output_dir_desc, dt_desc = calculate_descriptors.prepare(_args, overwriting)

        start = time.time()
        print("Start calculating descriptors.")
        df_desc, df_desc_failed = calculate_descriptors.run(df_smi, dt_desc)
        save_df_as_csv(output_dir_desc, df_desc, "T2_descriptors")
        save_df_as_csv(output_dir_desc, df_desc_failed, "T2_descriptors.FAILED")
        del df_smi, df_desc_failed
        print(f"Fingerprint calculation took {time.time() - start:.08} seconds.")
        print("Descriptor calculation done.")

        start = time.time()
        print("Start computing folds.")
        output_dir_fold, mapping_table_dir, dt_fold = calculate_scaffold_folds.prepare(
            _args
        )
        df_fold, df_fold_failed = calculate_scaffold_folds.run(df_desc, dt_fold)
        save_df_as_csv(output_dir_fold, df_fold, "T2_folds")
        save_df_as_csv(output_dir_fold, df_fold_failed, "T2_folds.FAILED")
        del df_fold_failed, df_desc
        df_T5, df_T6, df_duplicates = helper.format_dataframe(df_fold)
        save_df_as_csv(mapping_table_dir, df_T5, "T5")
        save_df_as_csv(mapping_table_dir, df_T6, "T6")
        save_df_as_csv(
            output_dir_desc, df_duplicates, "T2_descriptor_vector_id.DUPLICATES"
        )
        del df_duplicates
        print(f"Fold calculation took {time.time() - start:.08} seconds.")
        print("Fold calculation done.")
    elif _args["folding_method"] == "lsh":
        print("Using LSH based fold assignment.")
        output_dir_lsh, mapping_table_dir, dt_lsh = calculate_lsh_folds.prepare(
            _args, overwriting
        )
        output_file = os.path.join(output_dir_lsh, "T2_descriptors_lsh.csv")
        error_file = os.path.join(output_dir_lsh, "T2_descriptors_lsh.FAILED.csv")
        dupl_file = os.path.join(output_dir_lsh, "T2_descriptors_lsh.DUPLICATES.csv")
        mapping_file_T5 = os.path.join(mapping_table_dir, "T5.csv")
        mapping_file_T6 = os.path.join(mapping_table_dir, "T6.csv")
        df_desc_lsh, df_desc_lsh_failed = dt_lsh.process_dataframe(df_smi)
        df_desc_lsh.to_csv(output_file, index=False)
        df_desc_lsh_failed.to_csv(error_file, index=False)
        df_T5, df_T6, df_duplicates = helper.format_dataframe(df_desc_lsh)
        df_duplicates.to_csv(dupl_file, index=False)
        df_T5.to_csv(mapping_file_T5, index=False)
        df_T6.to_csv(mapping_file_T6, index=False)
        del df_duplicates
        end = time.time()
        print(
            f"Fingerprint calculation and LSH folding took {end - start:.08} seconds."
        )
        print("Descriptor calculation and LSH folding done.")
    else:
        print("Please use scaffold or lsh as folding method.")
        quit()

    start = time.time()
    print("Start aggregating values.")
    output_dir_agg = aggregate_values.prepare(_args, overwriting)
    (
        df_T4r,
        df_failed_range,
        df_failed_aggr,
        df_failed_std,
        df_dup,
        df_T0_upd,
    ) = aggregate_values.aggregate_replicates(
        df_T0, df_T1, df_T5, ConfigDict.get_parameters()["credibility_range"], num_cpu
    )
    df_T4r = df_T4r[
        [
            "input_assay_id",
            "descriptor_vector_id",
            "fold_id",
            "standard_qualifier",
            "standard_value",
        ]
    ]
    save_df_as_csv(
        output_dir_agg,
        df_T4r,
        "T4r",
        [
            "input_assay_id",
            "descriptor_vector_id",
            "fold_id",
            "standard_qualifier",
            "standard_value",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_range,
        "failed_range_T1",
        ["input_compound_id", "input_assay_id", "standard_qualifier", "standard_value"],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_aggr,
        "failed_aggr_T1",
        [
            "descriptor_vector_id",
            "input_assay_id",
            "standard_qualifier",
            "standard_value",
            "fold_id",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_failed_std,
        "failed_std_T1",
        [
            "descriptor_vector_id",
            "input_assay_id",
            "standard_qualifier",
            "standard_value",
            "fold_id",
        ],
    )
    save_df_as_csv(
        output_dir_agg,
        df_dup,
        "duplicates_T1",
        [
            "input_assay_id",
            "input_compound_id",
            "descriptor_vector_id",
            "fold_id",
            "standard_qualifier",
            "standard_value",
        ],
    )
    save_df_as_csv(output_dir_agg, df_T0_upd, "T0_upd")
    del df_T5, df_failed_range, df_failed_aggr, df_dup, df_T1
    print(f"Replicate aggregation took {time.time() - start:.08} seconds.")
    print("Replicate aggregation done.")

    start = time.time()
    print("Start thresholding.")
    output_dir_thres = apply_thresholding.prepare(_args, overwriting)
    df_T0_upd = df_T0_upd.astype({"input_assay_id": "int"})
    df_T4r = df_T4r.astype({"input_assay_id": "int"})
    df_T4c, df_T3c = apply_thresholding.run(df_T0_upd, df_T4r, num_cpu)

    # Write final dataframes (T4c, T3c)
    columns_T3c = [
        "classification_task_id",
        "input_assay_id",
        "assay_type",
        "variance_quorum_OK",
        "use_in_regression",
        "is_auxiliary",
        "threshold",
        "threshold_method",
        "direction",
    ]
    columns_T4c = [
        "classification_task_id",
        "descriptor_vector_id",
        "fold_id",
        "input_assay_id",
        "standard_qualifier",
        "standard_value",
        "threshold",
        "class_label",
    ]
    df_T4c.sort_values("classification_task_id", inplace=True)
    df_T3c.sort_values("classification_task_id", inplace=True)

    # Filter ambiguous class labels
    df_T4c_failed = df_T4c[df_T4c.class_label.isna()]
    df_T4c = df_T4c[~df_T4c.class_label.isna()]
    df_T4c = df_T4c[columns_T4c]
    df_T3c = df_T3c[columns_T3c]
    save_df_as_csv(output_dir_thres, df_T4c_failed, "T4c.FAILED")
    save_df_as_csv(output_dir_thres, df_T4c, "T4c")
    save_df_as_csv(output_dir_thres, df_T3c, "T3c")
    print(f"Thresholding took {time.time() - start:.08} seconds.")
    print("Thresholding done.")

    print("Start filter classification data.")
    start = time.time()
    output_dir_filter_clf = filter_classification.prepare(_args, overwriting)
    T10c, T8c, T4c_filtered_out, T4c_dedup = filter_classification.filter_clf(
        df_T3c,
        df_T4c,
        ConfigDict.get_parameters()["training_quorum"]["classification"],
        ConfigDict.get_parameters()["evaluation_quorum"]["classification"],
        ConfigDict.get_parameters()["initial_task_weights"],
    )
    filter_classification.write_tmp_output(
        output_dir_filter_clf, T10c, T8c, T4c_filtered_out, T4c_dedup
    )
    del df_T4c, df_T3c, T4c_filtered_out, T4c_dedup
    print(f"Classification filtering took {time.time() - start:.08} seconds.")
    print("Classification filtering done.")

    print("Start filter regression data.")
    #####
    start = time.time()
    out_dir_filter_reg = filter_regression.prepare(_args, overwriting)
    T10r, T8r, T4r_filtered_out, T4r_dedup = filter_regression.filter_regression_tasks(
        df_T0_upd,
        df_T4r,
        ConfigDict.get_parameters()["training_quorum"]["regression"],
        ConfigDict.get_parameters()["evaluation_quorum"]["regression"],
        ConfigDict.get_parameters()["initial_task_weights"],
        ConfigDict.get_parameters()["censored_downweighting"],
    )
    filter_regression.write_tmp_output(
        out_dir_filter_reg, T10r, T8r, T4r_filtered_out, T4r_dedup
    )
    del df_T0, df_T4r, T4r_filtered_out, T4r_dedup
    print(f"Filtering regression data took {time.time() - start:.08} seconds.")
    print("Filtering regression data done.")

    print("Start creating sparse matrices.")
    start = time.time()
    out_dir_matrices, results_dir = csv_2_mtx.prepare(_args, overwriting)
    df_T6_cont, T10c_cont, T10r_cont = csv_2_mtx.get_cont_id(df_T6, T10c, T10r)
    df_T11 = df_T6_cont[["cont_descriptor_vector_id", "fold_id", "fp_feat"]]
    save_df_as_csv(results_dir, T10c_cont, "T10c_cont")
    save_df_as_csv(results_dir, T10r_cont, "T10r_cont")
    save_df_as_csv(results_dir, df_T6_cont, "T6_cont")
    csv_2_mtx.save_csv_output(out_dir_matrices, tag, T8c, T8r)
    del df_T6, df_T6_cont, T10r, T10c
    (
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    ) = csv_2_mtx.make_matrices(df_T11, T10c_cont, T10r_cont, bit_size)
    del df_T11, T10c_cont, T10r_cont
    y_matrix_clf.data = np.nan_to_num(y_matrix_clf.data, copy=False)
    y_matrix_clf.eliminate_zeros()
    csv_2_mtx.save_npy_matrices(
        out_dir_matrices,
        tag,
        x_matrix,
        fold_vector,
        y_matrix_clf,
        y_matrix_reg,
        censored_mask,
    )
    print(f"Formatting to matrices took {time.time() - start:.08} seconds.")
    end = time.time()
    print(f"Overall processing took {end - start_total:.08} seconds.")
    print("Files are ready for SparseChem.")
def main(args):
    """General wrapper function for thresholding.

    Args:
        args (dict): Dictionary of arguments from argparser
    """
    start = time.time()
    if args["non_interactive"] is True:
        overwriting = True
    else:
        overwriting = False
    load_config(args)
    load_key(args)
    num_cpu = args["number_cpu"]
    print("Consistency checks of config and key files.")
    hash_reference_set.main(args)

    print("Start thresholding.")
    # Load files
    output_dir = prepare(args, overwriting)
    T0 = read_input_file(args["assay_file"])
    # T0 = T0.astype({'input_assay_id': 'str'})
    T4r = read_input_file(args["activity_file"])
    # T4r = T4r.astype({'input_assay_id': 'str'})

    # Merge T0 and T4r on input_assay_id
    df_T4c, df_T3c = run(T0, T4r, num_cpu)

    # Write final dataframes (T4c, T3c)
    columns_T3c = [
        "classification_task_id",
        "input_assay_id",
        "assay_type",
        "variance_quorum_OK",
        "use_in_regression",
        "is_auxiliary",
        "threshold",
        "threshold_method",
        "direction",
    ]
    columns_T4c = [
        "classification_task_id",
        "descriptor_vector_id",
        "fold_id",
        "input_assay_id",
        "standard_qualifier",
        "standard_value",
        "threshold",
        "class_label",
    ]
    df_T4c.sort_values("classification_task_id", inplace=True)
    df_T3c.sort_values("classification_task_id", inplace=True)

    # Filter ambiguous class labels
    df_T4c_failed = df_T4c[df_T4c.class_label.isna()]
    df_T4c = df_T4c[~df_T4c.class_label.isna()]
    write_failed_output(output_dir, df_T4c_failed, columns_T4c)
    write_tmp_output(output_dir, df_T4c, df_T3c, columns_T4c, columns_T3c)
    print(f"Thresholding took {time.time() - start:.08} seconds.")
    print("Thresholding done.")
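# Illustrative call sketch (assumed, not part of the module): the args keys below are
# the ones main() reads directly; additional keys needed by load_config, load_key and
# prepare (config/key file paths, output locations) come from the argument parser and
# are omitted here. File paths are placeholders.
def _example_thresholding_call():
    example_args = {
        "non_interactive": True,
        "number_cpu": 2,
        "assay_file": "path/to/T0_upd.csv",
        "activity_file": "path/to/T4r.csv",
        # ... plus config/key/output arguments expected by the argument parser
    }
    main(example_args)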