Example #1
0
    def compare(self):
        """Compare user-supplied CCS values with the reference datasets.

        Options are parsed from ``argv[2:]``:

        * ``-i``: input file, read with ``read_reference_table`` (required).
        * ``-o``: prefix of the per-dataset output file names. Empty (the
          default) passes ``None`` as the output name; ``none`` (any case)
          skips the per-dataset table and only prints the statistics.
        * ``-d``: comma-separated dataset names. Defaults to the full list
          hard-coded below.

        For every dataset, the rows read from ``../DATASETS.h5`` are
        inner-joined with the user's table on ``SMILES`` and ``Adducts`` and
        the statistics of the matching rows are printed.

        Raises:
            IOError: if the input file does not exist.
            SystemExit: after printing the help when no option is supplied,
                or when ``-d`` contains no dataset name.
        """
        parser = argparse.ArgumentParser(
            prog='DeepCCS compare',
            description=
            "Compare the CCS values in a file with the value used to create " +
            "train this predictive model. No predictions involved in the " +
            "process, only comparaison.")

        parser.add_argument("-i", help="Input file name", required=True)
        parser.add_argument(
            "-o",
            help=
            "Prefix of output file name (ex: MyFile_). If not specified, stdout will be "
            + "used. If 'none', onlyt the stats will be shown.",
            default="")
        parser.add_argument(
            "-d",
            help=
            "List of datasets to compare to separated by coma (dtA,dtB,dtC)",
            default=None)

        if len(argv) <= 2:
            parser.print_help()
            # parser.exit() raises SystemExit without depending on the
            # exit() helper that the site module injects.
            parser.exit()

        args = parser.parse_args(argv[2:])

        logging.debug("Starting comparaison tool with the following args:" +
                      str(args))
        # Validate the input before importing the project helpers so that a
        # wrong path is reported immediately.
        if not path.isfile(args.i):
            raise IOError("Reference file cannot be found")

        # Datasets to compare against.
        if args.d is not None:
            # The help text advertises "dtA,dtB,dtC": split on the bare comma
            # and tolerate optional surrounding spaces.
            dt_list = [
                name.strip() for name in args.d.split(",") if name.strip()
            ]
            if not dt_list:
                parser.error("-d requires at least one dataset name")
        else:
            dt_list = [
                "MetCCS_train_pos", "MetCCS_train_neg", "MetCCS_test_pos",
                "MetCCS_test_neg", "Astarita_pos", "Astarita_neg", "Baker",
                "McLean", "CBM"
            ]

        from DeepCCS.utils import (read_dataset, read_reference_table,
                                   output_results, output_global_stats)

        # Empty prefix -> no output file name; "none" -> statistics only.
        out_file_name_prefix = args.o if args.o != "" else None
        write_tables = (out_file_name_prefix is None
                        or out_file_name_prefix.lower() != "none")

        # The user's table does not depend on the dataset: read it once
        # instead of once per loop iteration.
        smiles_u, adducts_u, ccs_u = read_reference_table(args.i)
        df_user = pd.DataFrame({
            "SMILES": smiles_u,
            "Adducts": adducts_u,
            "CCS": ccs_u
        })

        logging.debug("Starting iterating on the dataset list of comparaison")
        for i in dt_list:
            # NOTE(review): the dataset path is relative to the current
            # working directory - confirm this is intended.
            df_dt = read_dataset("../DATASETS.h5", i)
            smiles = df_dt["SMILES"]
            adducts = df_dt["Adducts"]
            ccs = df_dt["CCS"]

            if write_tables:
                out_file_name = None
                if out_file_name_prefix is not None:
                    out_file_name = out_file_name_prefix + i + ".txt"
                output_results(args.i, smiles, adducts, ccs, out_file_name)

            df_ref = pd.DataFrame({
                "SMILES": smiles,
                "Adducts": adducts,
                "CCS_DeepCCS": ccs
            })

            # Keep only the (SMILES, adduct) pairs present in both tables.
            merged_df = pd.merge(left=df_user,
                                 right=df_ref,
                                 on=["SMILES", "Adducts"],
                                 how='inner')

            n_matched = len(merged_df)
            if n_matched == 0:
                print(i)
                print("No corresponding molecule, moving to next dataset.")
                continue

            print("{} dataset :".format(i))
            print("=> {} molecules used for comparaison".format(n_matched))
            print("--------Comparaison stats--------")
            output_global_stats(merged_df["CCS_DeepCCS"], merged_df["CCS"])
Example #2
0
    def predict(self):
        """Predict CCS values for the SMILES/adduct pairs of an input file.

        Options are parsed from ``argv[2:]``:

        * ``-mp`` / ``-ap`` / ``-sp``: directories holding ``model.h5``,
          ``adducts_encoder.json`` and ``smiles_encoder.json`` (all default
          to ``../saved_models/default/``).
        * ``-i``: input file, read with ``read_input_table`` (required).
        * ``-o``: output file name; empty (the default) passes ``None`` to
          ``output_results``.

        The model is loaded with ``DeepCCSModel.load_model_from_file``, the
        predictions are flattened and handed to ``output_results``.

        Raises:
            IOError: if a directory, the input file or one of the model
                files is missing.
            SystemExit: after printing the help when no option is supplied.
        """
        parser = argparse.ArgumentParser(
            prog='DeepCCS predict',
            description=
            "Predict CCS for some SMILES and adducts using a pre-trained " +
            "model.")
        parser.add_argument("-mp",
                            help="Path to model directory",
                            default="../saved_models/default/")
        parser.add_argument("-ap",
                            help="Path to adducts_encoder directory",
                            default="../saved_models/default/")
        parser.add_argument("-sp",
                            help="Path to smiles_encoder directory",
                            default="../saved_models/default/")
        parser.add_argument(
            "-i",
            help="Input file name with SMILES and adduct columns",
            required=True)
        parser.add_argument(
            "-o",
            help=
            "Output file name (ex: MyFile.csv). If not specified, stdout will be used",
            default="")

        if len(argv) <= 2:
            parser.print_help()
            # parser.exit() raises SystemExit without depending on the
            # exit() helper that the site module injects.
            parser.exit()

        args = parser.parse_args(argv[2:])

        print("Starting prediction tool with the following args:" + str(args))

        model_file = path.join(args.mp, "model.h5")
        adducts_encoder_file = path.join(args.ap, "adducts_encoder.json")
        smiles_encoder_file = path.join(args.sp, "smiles_encoder.json")

        # Every path is validated before the model stack is imported so that
        # a wrong argument is reported immediately.
        required_dirs = (
            (args.mp, "Model directory cannot be found"),
            (args.ap, "adducts_encoder directory cannot be found"),
            (args.sp, "smiles_encoder directory cannot be found"),
        )
        for directory, message in required_dirs:
            if not path.isdir(directory):
                raise IOError(message)

        required_files = (
            # Same input check as the evaluate sub-command performs.
            (args.i, "Input file cannot be found"),
            (model_file, "Model file is missing from model directory"),
            (adducts_encoder_file,
             "adducts_encoder.json is missing from the adducts_encoder directory"
             ),
            (smiles_encoder_file,
             "smiles_encoder.json is missing from the smiles_encoder directory"
             ),
        )
        for file_name, message in required_files:
            if not path.isfile(file_name):
                raise IOError(message)

        from DeepCCS.utils import read_input_table, output_results
        from DeepCCS.model import DeepCCS

        model = DeepCCS.DeepCCSModel()
        model.load_model_from_file(filename=model_file,
                                   adduct_encoder_file=adducts_encoder_file,
                                   smiles_encoder_file=smiles_encoder_file)
        X_smiles, X_adducts = read_input_table(args.i)
        ccs_pred = model.predict(X_smiles, X_adducts).flatten()

        # An empty -o means "no output file": output_results receives None.
        out_file_name = args.o if args.o != "" else None
        output_results(args.i, X_smiles, X_adducts, ccs_pred, out_file_name)
Example #3
0
    def evaluate(self):
        """Evaluate the model on an input file that also holds measured CCS.

        Options are parsed from ``argv[2:]``:

        * ``-mp`` / ``-ap`` / ``-sp``: directories holding ``model.h5``,
          ``adducts_encoder.json`` and ``smiles_encoder.json`` (all default
          to ``../saved_models/default/``).
        * ``-i``: input file, read with ``read_reference_table`` (required).
        * ``-o``: output file name. Empty (the default) passes ``None`` to
          ``output_results``; ``none`` (any case) skips ``output_results``
          so that only the global statistics are printed.

        Raises:
            IOError: if a directory, the input file or one of the model
                files is missing.
            SystemExit: after printing the help when no option is supplied.
        """
        parser = argparse.ArgumentParser(
            prog='DeepCCS evaluate',
            description=
            "Evaluate the model performances using SMILES and adducts for " +
            "which the CCS was measured.")

        parser.add_argument("-mp",
                            help="Path to model directory",
                            default="../saved_models/default/")
        parser.add_argument("-ap",
                            help="Path to adducts_encoder directory",
                            default="../saved_models/default/")
        parser.add_argument("-sp",
                            help="Path to smiles_encoder directory",
                            default="../saved_models/default/")
        parser.add_argument(
            "-i",
            help=
            "Input file name. Must contain columns SMILES, adducts and CCS",
            required=True)
        parser.add_argument(
            "-o",
            help=
            "Output file name (ex: MyFile.csv). If not specified, stdout will be used."
            + " If 'none', only global stats will be shown.",
            default="")

        if len(argv) <= 2:
            parser.print_help()
            # parser.exit() raises SystemExit without depending on the
            # exit() helper that the site module injects.
            parser.exit()

        args = parser.parse_args(argv[2:])

        print("Starting evaluation tool with the following args:" + str(args))

        model_file = path.join(args.mp, "model.h5")
        adducts_encoder_file = path.join(args.ap, "adducts_encoder.json")
        smiles_encoder_file = path.join(args.sp, "smiles_encoder.json")

        # Every path is validated before the model stack is imported so that
        # a wrong argument is reported immediately.
        required_dirs = (
            (args.mp, "Model directory cannot be found"),
            (args.ap, "adducts_encoder directory cannot be found"),
            (args.sp, "smiles_encoder directory cannot be found"),
        )
        for directory, message in required_dirs:
            if not path.isdir(directory):
                raise IOError(message)

        required_files = (
            (args.i, "Input file cannot be found"),
            (model_file, "Model file is missing from model directory"),
            (adducts_encoder_file,
             "adducts_encoder.json is missing from the adducts_encoder directory"
             ),
            (smiles_encoder_file,
             "smiles_encoder.json is missing from the smiles_encoder directory"
             ),
        )
        for file_name, message in required_files:
            if not path.isfile(file_name):
                raise IOError(message)

        from DeepCCS.utils import (read_reference_table, output_results,
                                   output_global_stats)
        from DeepCCS.model import DeepCCS

        model = DeepCCS.DeepCCSModel()
        model.load_model_from_file(filename=model_file,
                                   adduct_encoder_file=adducts_encoder_file,
                                   smiles_encoder_file=smiles_encoder_file)

        X_smiles, X_adducts, X_ccs = read_reference_table(args.i)
        ccs_pred = model.predict(X_smiles, X_adducts).flatten()

        # An empty -o means "no output file"; the literal "none" disables
        # the per-molecule output altogether.
        out_file_name = args.o if args.o != "" else None
        if out_file_name is None or out_file_name.lower() != "none":
            output_results(args.i, X_smiles, X_adducts, ccs_pred,
                           out_file_name)

        print("-----------Model stats-----------")
        output_global_stats(X_ccs, ccs_pred)