def test_scramble_desc_multiple_key(self):
    """Verify that descriptor scrambling depends on the secret key.

    Fingerprints calculated with a different key ("melloddy_2") must NOT
    match the reference file produced with the default key.
    """
    new_key = "melloddy_2"
    temp_file_path_feat = curDir / "output/tmp/ecfp_feat_scrambled_new_key.csv"
    df_smiles = read_csv(curDir / "input/chembl/chembl_23_example_T2.csv", nrows=10)
    dc = DescriptorCalculator.from_param_dict(
        secret=new_key,
        method_param_dict=self.config["fingerprint"],
        verbosity=0,
    )
    outcols = ["fp_feat", "fp_val", "success", "error_message"]
    out_types = ["object", "object", "bool", "object"]
    dt = DfTransformer(
        dc,
        input_columns={"smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=1,
        verbosity=0,
    )
    df_test = dt.process_dataframe(df_smiles)[0]
    df_test.to_csv(temp_file_path_feat, index=False)  # write test fingerprints
    # Fix: reference file addressed relative to curDir (was a hard-coded
    # "unit_test/..." path that broke when tests ran from another CWD).
    result = filecmp.cmp(
        curDir / "output/test_calculate_desc_y2.csv",
        temp_file_path_feat,
        shallow=False,
    )
    # Different key => different scrambled features => files must differ.
    self.assertFalse(result)
def test_calculate_desc_multiple(self):
    """Descriptor calculation over several records must reproduce the
    stored reference fingerprint file exactly.
    """
    temp_file_path = curDir / "output/tmp/ecfp_feat_multiple.csv"
    df_smiles = read_csv(curDir / "input/chembl/chembl_23_example_T2.csv", nrows=10)
    dc = DescriptorCalculator.from_param_dict(
        secret=self.keys["key"],
        method_param_dict=self.config["fingerprint"],
        verbosity=0,
    )
    outcols = ["fp_feat", "fp_val", "success", "error_message"]
    out_types = ["object", "object", "bool", "object"]
    dt = DfTransformer(
        dc,
        input_columns={"smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=1,
        verbosity=0,
    )
    # To (re)build the reference file, run once:
    # df_ref = dt.process_dataframe(df_smiles)[0]
    # df_ref.to_csv(curDir / "output/test_calculate_desc_y2.csv", index=False)
    df_test = dt.process_dataframe(df_smiles)[0]
    df_test.to_csv(temp_file_path, index=False)  # write test fingerprints
    # Fix: reference file addressed relative to curDir (was a hard-coded
    # "unit_test/..." path that broke when tests ran from another CWD).
    result = filecmp.cmp(
        curDir / "output/test_calculate_desc_y2.csv",
        temp_file_path,
        shallow=False,
    )
    self.assertTrue(result)
def test_calculate_sn_fold_multiple(self):
    """Scaffold-network fold assignment over a multi-record input file.

    Runs ScaffoldFoldAssign through a DfTransformer on a CSV and compares
    the produced file against the stored reference output.
    """
    infile = os.path.join(curDir, "input", "test_sn_fold_input.csv")
    outfile = os.path.join(curDir, "output", "tmp", "sn_fold_output.csv")
    output_columns = [
        "murcko_smiles",
        "sn_smiles",
        "fold_id",
        "success",
        "error_message",
    ]
    output_types = ["object", "object", "int", "bool", "object"]
    sa = ScaffoldFoldAssign(
        nfolds=self.config["scaffold_folding"]["nfolds"],
        secret=self.keys["key"],
    )
    dt = DfTransformer(
        sa,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=output_columns,
        output_types=output_types,
    )
    dt.process_file(infile, outfile)
    # Fix: dropped the redundant os.path.join(outfile) no-op wrapper.
    result = filecmp.cmp(
        os.path.join(curDir, "output", "test_sn_fold_output.csv"),
        outfile,
        shallow=False,
    )
    self.assertTrue(result)
def prepare(args):
    """Build the reference-set output directory and the three transformers.

    Args:
        args: argparser arguments (dict-like)

    Returns:
        tuple: (output_dir, standardizer DfTransformer,
        fold-assignment DfTransformer, descriptor DfTransformer)
    """
    load_config(args)
    load_key(args)
    # Reference-set preparation always overwrites previous output.
    output_dir = make_dir(args, "reference_set", None, True)
    key = SecretDict.get_secrets()["key"]
    params = ConfigDict.get_parameters()

    # Shared DfTransformer settings: serial, quiet, standard success column.
    common = dict(success_column="success", nproc=1, verbosity=0)

    # 1) SMILES standardization.
    standardizer = Standardizer.from_param_dict(
        method_param_dict=params["standardization"], verbosity=0
    )
    dt_standardizer = DfTransformer(
        standardizer,
        input_columns={"smiles": "smiles"},
        output_columns=["canonical_smiles", "success", "error_message"],
        output_types=["object", "bool", "object"],
        **common,
    )

    # 2) Scaffold-based fold assignment.
    fold_assigner = ScaffoldFoldAssign.from_param_dict(
        secret=key, method_param_dict=params["scaffold_folding"], verbosity=0
    )
    dt_fold = DfTransformer(
        fold_assigner,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=[
            "murcko_smiles",
            "sn_smiles",
            "fold_id",
            "success",
            "error_message",
        ],
        output_types=["object", "object", "int", "bool", "object"],
        **common,
    )

    # 3) Fingerprint / descriptor calculation.
    descriptor_calc = DescriptorCalculator.from_param_dict(
        secret=key, method_param_dict=params["fingerprint"], verbosity=0
    )
    dt_descriptor = DfTransformer(
        descriptor_calc,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=["fp_feat", "fp_val", "success", "error_message"],
        output_types=["object", "object", "bool", "object"],
        **common,
    )

    return output_dir, dt_standardizer, dt_fold, dt_descriptor
def prepare(args: dict, overwriting: bool):
    """Setup run by creating directories and log files.

    Args:
        args (dict): argparser arguments
        overwriting (bool): overwriting flag

    Returns:
        Tuple[Path, Path, DfTransformer]: the lsh_folding output directory,
        the mapping_table directory, and the configured DfTransformer.
        (Docstring fixed: previously claimed Tuple(DataFrame, DataFrame),
        but the function returns two paths and a transformer.)
    """
    output_dir_lsh = make_dir(args, "results_tmp", "lsh_folding", overwriting)
    mapping_table_dir = make_dir(args, "mapping_table", None, overwriting)
    create_log_files(output_dir_lsh)
    create_log_files(mapping_table_dir)
    load_config(args)
    load_key(args)
    method_params_fp = ConfigDict.get_parameters()["fingerprint"]
    method_params_lsh = ConfigDict.get_parameters()["lsh"]
    # LSH folding needs both fingerprint and lsh parameters merged together.
    method_params = {**method_params_fp, **method_params_lsh}
    key = SecretDict.get_secrets()["key"]
    lshf = LSHFoldingCalculator.from_param_dict(
        secret=key, method_param_dict=method_params, verbosity=0
    )
    outcols = ["fp_feat", "fp_val", "fold_id", "success", "error_message"]
    out_types = ["object", "object", "object", "bool", "object"]
    dt = DfTransformer(
        lshf,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=args["number_cpu"],
        verbosity=0,
    )
    return output_dir_lsh, mapping_table_dir, dt
def test_standardizer_pipeline(self):
    """
    Testing standardization of a larger set of smiles from Chembl
    using parallel execution (nproc=2).
    Compared are resulting output files (both OK and failed records).
    """
    infile = os.path.join(curDir, "input", "test_standardizer.csv")
    outfile = os.path.join(curDir, "output", "sn_fold_output.OK.csv")
    errfile = os.path.join(curDir, "output", "sn_fold_output.failed.csv")
    outfile_tmp = os.path.join(curDir, "output", "tmp", "sn_fold_output.OK.csv")
    errfile_tmp = os.path.join(curDir, "output", "tmp", "sn_fold_output.failed.csv")
    st = Standardizer.from_param_dict(
        method_param_dict=self.config["standardization"], verbosity=0
    )
    outcols = ["canonical_smiles", "success", "error_message"]
    out_types = ["object", "bool", "object"]
    dt = DfTransformer(
        st,
        input_columns={"smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=2,
        verbosity=0,
    )
    # To (re)build the reference files, run once:
    # dt.process_file(infile, outfile, errfile)
    # run test with tmp files
    dt.process_file(infile, outfile_tmp, errfile_tmp)
    result = filecmp.cmp(outfile, outfile_tmp, shallow=False)
    error = filecmp.cmp(errfile, errfile_tmp, shallow=False)
    os.remove(outfile_tmp)
    os.remove(errfile_tmp)
    # Bug fix: the original `assertEqual(result, error, True)` passed True as
    # the assertion *message*, so it only checked result == error (both could
    # be False and the test would pass). Both comparisons must succeed.
    self.assertTrue(result)
    self.assertTrue(error)
def test_standardizer_different_configs(self):
    """Standardization with two different parameter sets must yield
    different results.

    (Docstring fixed: the old one was copy-pasted from the threading test.)
    """
    df_smiles = read_csv(curDir / "input/test_standardizer.csv")
    outcols = ["canonical_smiles", "success", "error_message"]
    out_types = ["object", "bool", "object"]
    ## Load ref standardizer
    st_ref = Standardizer(
        max_num_atoms=self.config["standardization"]["max_num_atoms"],
        max_num_tautomers=self.config["standardization"]["max_num_tautomers"],
        include_stereoinfo=self.config["standardization"]["include_stereoinfo"],
        verbosity=0,
    )
    dt_ref = DfTransformer(
        st_ref,
        input_columns={"smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=4,
        verbosity=0,
    )
    response_ref = dt_ref.process_dataframe(df_smiles)[0]
    config_2 = ConfigDict(
        config_path=Path(os.path.join(curDir, "input/", "example_parameters_2.json"))
    ).get_parameters()
    ## load test standardizer
    st_tmp = Standardizer(
        max_num_atoms=config_2["standardization"]["max_num_atoms"],
        max_num_tautomers=config_2["standardization"]["max_num_tautomers"],
        include_stereoinfo=config_2["standardization"]["include_stereoinfo"],
        verbosity=0,
    )
    dt_tmp = DfTransformer(
        st_tmp,
        input_columns={"smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=2,
        verbosity=0,
    )
    response_tmp = dt_tmp.process_dataframe(df_smiles)[0]
    # Idiom fix: replaced the try/except-pass/else-raise inversion with an
    # explicit assertRaises — the frames are expected to DIFFER, so
    # assert_frame_equal must raise.
    with self.assertRaises(AssertionError):
        assert_frame_equal(response_ref, response_tmp)
def test_standardizer_parameter_atom_count(self):
    """Testing standardization with different number of max atom count.

    A tiny max_num_atoms (5) must produce output different from the
    reference configuration.
    """
    df_smiles = read_csv(curDir / "input/test_standardizer.csv")
    outcols = ["canonical_smiles", "success", "error_message"]
    out_types = ["object", "bool", "object"]
    ## Load ref standardizer
    st_ref = Standardizer(
        max_num_atoms=self.config["standardization"]["max_num_atoms"],
        max_num_tautomers=self.config["standardization"]["max_num_tautomers"],
        include_stereoinfo=self.config["standardization"]["include_stereoinfo"],
        verbosity=0,
    )
    dt_ref = DfTransformer(
        st_ref,
        input_columns={"smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=4,
        verbosity=0,
    )
    response_ref = dt_ref.process_dataframe(df_smiles)[0]
    ## load test standardizer with a restrictive atom-count cap
    st_tmp = Standardizer(
        max_num_atoms=5,
        max_num_tautomers=self.config["standardization"]["max_num_tautomers"],
        include_stereoinfo=self.config["standardization"]["include_stereoinfo"],
        verbosity=0,
    )
    dt_tmp = DfTransformer(
        st_tmp,
        input_columns={"smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=2,
        verbosity=0,
    )
    response_tmp = dt_tmp.process_dataframe(df_smiles)[0]
    # Idiom fix: replaced the try/except-pass/else-raise inversion with an
    # explicit assertRaises — the frames are expected to DIFFER.
    with self.assertRaises(AssertionError):
        assert_frame_equal(response_ref, response_tmp)
def prepare(args):
    """
    Prepare output directories and instantiate the df transformer object
    for scaffold-based folding.

    Args:
        args (dict): argparser arguments

    Returns:
        Tuple(Path, Path, DfTransformer): output directory, mapping_table
        directory, and the instantiated DfTransformer for scaffold folding.
        (Docstring fixed: the old one had typos and claimed a 2-tuple,
        but three values are returned.)
    """
    output_dir = make_dir(args, "results_tmp", "folding", args["non_interactive"])
    mapping_table_dir = make_dir(
        args, "mapping_table", None, args["non_interactive"]
    )
    create_log_files(output_dir)
    create_log_files(mapping_table_dir)
    load_config(args)
    load_key(args)
    key = SecretDict.get_secrets()["key"]
    method_params = ConfigDict.get_parameters()["scaffold_folding"]
    sa = ScaffoldFoldAssign.from_param_dict(
        secret=key, method_param_dict=method_params, verbosity=0
    )
    outcols = ["murcko_smiles", "sn_smiles", "fold_id", "success", "error_message"]
    out_types = ["object", "object", "int", "bool", "object"]
    dt = DfTransformer(
        sa,
        input_columns={"canonical_smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=args["number_cpu"],
        verbosity=0,
    )
    return output_dir, mapping_table_dir, dt
def test_standardizer_multiprocessing(self):
    """Standardization results must be identical regardless of the
    number of worker processes used."""
    df_smiles = read_csv(curDir / "input/chembl/chembl_23_example_T2.csv", nrows=10)
    std_cfg = self.config["standardization"]
    st = Standardizer(
        max_num_atoms=std_cfg["max_num_atoms"],
        max_num_tautomers=std_cfg["max_num_tautomers"],
        include_stereoinfo=std_cfg["include_stereoinfo"],
        verbosity=0,
    )
    outcols = ["canonical_smiles", "success", "error_message"]
    out_types = ["object", "bool", "object"]

    def build_transformer(worker_count):
        # Identical transformer configuration; only the worker count varies.
        return DfTransformer(
            st,
            input_columns={"smiles": "smiles"},
            output_columns=outcols,
            output_types=out_types,
            success_column="success",
            nproc=worker_count,
            verbosity=0,
        )

    response_two_workers = build_transformer(2).process_dataframe(df_smiles)[0]
    response_four_workers = build_transformer(4).process_dataframe(df_smiles)[0]
    assert_frame_equal(response_two_workers, response_four_workers)