def compare(self):
    """Compare the CCS values of a user file with the reference datasets.

    Options are read from ``argv[2:]``: ``-i`` (input file, required),
    ``-o`` (output file name prefix) and ``-d`` (comma-separated dataset
    names). For every requested dataset the reference SMILES/adducts/CCS
    columns are handed to ``output_results`` (skipped when the prefix is
    ``none``, case-insensitive), then inner-joined with the user table on
    ``SMILES`` and ``Adducts``; global statistics are printed for the
    matching rows. No prediction is performed.

    Raises:
        IOError: if the file given with ``-i`` does not exist.
        SystemExit: if no option is given (help is printed) or argparse
            rejects the command line.
    """
    parser = argparse.ArgumentParser(
        prog='DeepCCS compare',
        description=(
            "Compare the CCS values in a file with the value used to create "
            "train this predictive model. No predictions involved in the "
            "process, only comparaison."))
    parser.add_argument("-i", help="Input file name", required=True)
    parser.add_argument(
        "-o",
        help=("Prefix of output file name (ex: MyFile_). If not specified, stdout will be "
              "used. If 'none', onlyt the stats will be shown."),
        default="")
    parser.add_argument(
        "-d",
        help="List of datasets to compare to separated by coma (dtA,dtB,dtC)",
        default=None)

    if len(argv) <= 2:
        parser.print_help()
        exit()

    args = parser.parse_args(argv[2:])
    logging.debug("Starting comparaison tool with the following args:" +
                  str(args))

    # Validate the input path before importing the project helpers so that a
    # wrong path is reported immediately.
    if not path.isfile(args.i):
        raise IOError("Reference file cannot be found")

    from DeepCCS.utils import read_dataset, read_reference_table, output_results, output_global_stats

    # Empty prefix: no file name is passed to output_results.
    # Prefix "none" (any case): the per-dataset table is not output at all.
    out_file_name_prefix = args.o if args.o != "" else None
    write_tables = (out_file_name_prefix is None
                    or out_file_name_prefix.lower() != "none")

    # Data used to create algorithm
    if args.d is not None:
        # The help text documents "dtA,dtB,dtC": split on the bare comma and
        # strip each piece so both "a,b" and "a, b" are accepted. Empty
        # pieces (e.g. a trailing comma) are ignored.
        dt_list = [name.strip() for name in args.d.split(",") if name.strip()]
    else:
        dt_list = [
            "MetCCS_train_pos", "MetCCS_train_neg", "MetCCS_test_pos",
            "MetCCS_test_neg", "Astarita_pos", "Astarita_neg", "Baker",
            "McLean", "CBM"
        ]

    # The user table does not depend on the dataset: read it once instead of
    # once per loop iteration (assumes read_reference_table only reads).
    smiles_u, adducts_u, ccs_u = read_reference_table(args.i)
    df_user = pd.DataFrame({
        "SMILES": smiles_u,
        "Adducts": adducts_u,
        "CCS": ccs_u
    })

    # Get a pandas dataframe for each dataset asked for comparison, output
    # the reference values, then print general stats on the comparison.
    logging.debug("Starting iterating on the dataset list of comparaison")
    for dataset_name in dt_list:
        df_dt = read_dataset("../DATASETS.h5", dataset_name)
        smiles = df_dt["SMILES"]
        adducts = df_dt["Adducts"]
        ccs = df_dt["CCS"]

        if write_tables:
            out_file_name = None
            if out_file_name_prefix is not None:
                out_file_name = out_file_name_prefix + dataset_name + ".txt"
            output_results(args.i, smiles, adducts, ccs, out_file_name)

        df_ref = pd.DataFrame({
            "SMILES": smiles,
            "Adducts": adducts,
            "CCS_DeepCCS": ccs
        })
        merged_df = pd.merge(left=df_user,
                             right=df_ref,
                             on=["SMILES", "Adducts"],
                             how='inner')

        if len(merged_df["CCS"]) == 0:
            print(dataset_name)
            print("No corresponding molecule, moving to next dataset.")
            continue

        print("{} dataset :".format(dataset_name))
        print("=> {} molecules used for comparaison".format(
            len(merged_df["CCS"])))
        print("--------Comparaison stats--------")
        output_global_stats(merged_df["CCS_DeepCCS"], merged_df["CCS"])
def predict(self):
    """Predict CCS values for the SMILES/adduct pairs of an input file.

    Options are read from ``argv[2:]``: ``-mp``, ``-ap`` and ``-sp`` give the
    directories holding ``model.h5``, ``adducts_encoder.json`` and
    ``smiles_encoder.json``; ``-i`` is the input table (required); ``-o`` is
    the output file name. The flattened predictions are handed to
    ``output_results`` together with the inputs; when ``-o`` is not given no
    file name is passed.

    Raises:
        IOError: if the input file, one of the three directories, or one of
            the expected files inside them is missing.
        SystemExit: if no option is given (help is printed) or argparse
            rejects the command line.
    """
    parser = argparse.ArgumentParser(
        prog='DeepCCS predict',
        description=(
            "Predict CCS for some SMILES and adducts using a pre-trained "
            "model."))
    parser.add_argument("-mp",
                        help="Path to model directory",
                        default="../saved_models/default/")
    parser.add_argument("-ap",
                        help="Path to adducts_encoder directory",
                        default="../saved_models/default/")
    parser.add_argument("-sp",
                        help="Path to smiles_encoder directory",
                        default="../saved_models/default/")
    parser.add_argument(
        "-i",
        help="Input file name with SMILES and adduct columns",
        required=True)
    parser.add_argument(
        "-o",
        help="Output file name (ex: MyFile.csv). If not specified, stdout will be used",
        default="")

    if len(argv) <= 2:
        parser.print_help()
        exit()

    args = parser.parse_args(argv[2:])

    # Same input validation as the evaluate sub-command; done before the
    # project imports so that a wrong path is reported immediately.
    if not path.isfile(args.i):
        raise IOError("Input file cannot be found")

    from DeepCCS.utils import read_input_table, output_results
    from DeepCCS.model import DeepCCS

    print("Starting prediction tool with the following args:" + str(args))

    if not path.isdir(args.mp):
        raise IOError("Model directory cannot be found")
    if not path.isdir(args.ap):
        raise IOError("adducts_encoder directory cannot be found")
    if not path.isdir(args.sp):
        raise IOError("smiles_encoder directory cannot be found")

    model_file = path.join(args.mp, "model.h5")
    adducts_encoder_file = path.join(args.ap, "adducts_encoder.json")
    smiles_encoder_file = path.join(args.sp, "smiles_encoder.json")

    if not path.isfile(model_file):
        raise IOError("Model file is missing from model directory")
    if not path.isfile(adducts_encoder_file):
        # The message names the file that is actually checked
        # (adducts_encoder.json), as the evaluate sub-command does.
        raise IOError(
            "adducts_encoder.json is missing from the adducts_encoder directory directory"
        )
    if not path.isfile(smiles_encoder_file):
        raise IOError(
            "smiles_encoder.json is missing from the smiles_encoder directory directory"
        )

    model = DeepCCS.DeepCCSModel()
    model.load_model_from_file(filename=model_file,
                               adduct_encoder_file=adducts_encoder_file,
                               smiles_encoder_file=smiles_encoder_file)

    X_smiles, X_adducts = read_input_table(args.i)
    ccs_pred = model.predict(X_smiles, X_adducts).flatten()

    # An empty -o means no output file name is passed on.
    out_file_name = args.o if args.o != "" else None
    output_results(args.i, X_smiles, X_adducts, ccs_pred, out_file_name)
def evaluate(self):
    """Run the ``evaluate`` sub-command.

    Options come from ``argv[2:]``. The model and both encoders are loaded
    from the ``-mp``/``-ap``/``-sp`` directories, a CCS is predicted for each
    row of the ``-i`` table, and global statistics comparing the measured
    and predicted values are printed. Unless ``-o`` is ``none``
    (case-insensitive), the predictions are also handed to
    ``output_results``; with no ``-o`` no file name is passed.

    Raises:
        IOError: if a directory, the input file, or one of the expected
            files inside the directories cannot be found.
        SystemExit: if no option is given (help is printed) or argparse
            rejects the command line.
    """
    default_dir = "../saved_models/default/"

    cli = argparse.ArgumentParser(
        prog='DeepCCS evaluate',
        description=(
            "Evaluate the model performances using SMILES and adducts for "
            "which the CCS was measured."))
    cli.add_argument("-mp", help="Path to model directory", default=default_dir)
    cli.add_argument("-ap",
                     help="Path to adducts_encoder directory",
                     default=default_dir)
    cli.add_argument("-sp",
                     help="Path to smiles_encoder directory",
                     default=default_dir)
    cli.add_argument(
        "-i",
        help="Input file name. Must contain columns SMILES, adducts and CCS",
        required=True)
    cli.add_argument(
        "-o",
        help=("Output file name (ex: MyFile.csv). If not specified, stdout will be used."
              " If 'none', only global stats will be shown."),
        default="")

    # Nothing after the sub-command name: show usage and stop.
    if len(argv) <= 2:
        cli.print_help()
        exit()

    opts = cli.parse_args(argv[2:])

    from DeepCCS.utils import read_reference_table, output_results, output_global_stats
    from DeepCCS.model import DeepCCS

    print("Starting evaluation tool with the following args:" + str(opts))

    model_file = path.join(opts.mp, "model.h5")
    adducts_file = path.join(opts.ap, "adducts_encoder.json")
    smiles_file = path.join(opts.sp, "smiles_encoder.json")

    # Checked in order; the first missing entry aborts with its message.
    required = (
        (path.isdir, opts.mp, "Model directory cannot be found"),
        (path.isdir, opts.ap, "adducts_encoder directory cannot be found"),
        (path.isdir, opts.sp, "smiles_encoder directory cannot be found"),
        (path.isfile, opts.i, "Input file cannot be found"),
        (path.isfile, model_file, "Model file is missing from model directory"),
        (path.isfile, adducts_file,
         "adducts_encoder.json is missing from the adducts_encoder directory directory"),
        (path.isfile, smiles_file,
         "smiles_encoder.json is missing from the smiles_encoder directory directory"),
    )
    for exists, target, message in required:
        if not exists(target):
            raise IOError(message)

    model = DeepCCS.DeepCCSModel()
    model.load_model_from_file(filename=model_file,
                               adduct_encoder_file=adducts_file,
                               smiles_encoder_file=smiles_file)

    X_smiles, X_adducts, X_ccs = read_reference_table(opts.i)
    ccs_pred = model.predict(X_smiles, X_adducts).flatten()

    # "" -> no file name; "none" (any case) -> skip the per-row output.
    out_file_name = opts.o if opts.o != "" else None
    stats_only = out_file_name is not None and out_file_name.lower() == "none"
    if not stats_only:
        output_results(opts.i, X_smiles, X_adducts, ccs_pred, out_file_name)

    print("-----------Model stats-----------")
    output_global_stats(X_ccs, ccs_pred)