Example #1
    def train(self):
        parser = argparse.ArgumentParser(prog='DeepCCS train',
                                         description="Train a new model.")
        parser.add_argument("-ap",
                            help="path to adducts_encoder directory",
                            default=None)
        parser.add_argument("-sp",
                            help="path to smiles_encoder directory",
                            default=None)

        parser.add_argument(
            "-mtrain",
            help="Use MetCCS train datasets to create the model",
            default=False,
            action="store_true",
            dest="mtrain")
        parser.add_argument(
            "-mtest",
            help="Use MetCCS test datasets to create the model",
            default=False,
            action="store_true",
            dest="mtest")
        parser.add_argument("-ast",
                            help="Astarita test datasets to create the model",
                            default=False,
                            action="store_true",
                            dest="ast")

        parser.add_argument("-baker",
                            help="Baker dataset to create the model",
                            default=False,
                            action="store_true",
                            dest="baker")
        parser.add_argument("-cbm",
                            help="CBM2018 dataset to create the model",
                            default=False,
                            action="store_true",
                            dest="cbm")
        parser.add_argument("-mclean",
                            help="McLean dataset to create the model",
                            default=False,
                            action="store_true",
                            dest="mclean")

        parser.add_argument(
            "-nd",
            help="New data to create the model: comma-separated list of "
                 "template files (file1.csv,file2.csv,...)",
            default=None)
        parser.add_argument("-nepochs", help="Number of epochs", default=150)
        parser.add_argument("-verbose",
                            help="Keras verbosity (1 or 0)",
                            default=1)
        parser.add_argument(
            "-test",
            help="Proportion of the datasets to put in the testing set",
            default=0.2,
            type=float)
        parser.add_argument("-o",
                            help="Output directory for model and mappers",
                            default="./")
        parser.set_defaults(func=self.train)

        if len(argv) <= 2:
            parser.print_help()
            exit()

        args = parser.parse_args(argv[2:])

        if args.test <= 0 or args.test >= 1:
            raise ValueError(
                "Proportion in test set must be between 0 and 1. Recommended: 0.2"
            )

        logging.debug("\nCondition is: {}".format(not (
            args.mtrain or args.mtest or args.ast or args.baker or args.cbm
            or args.mclean)))
        if not (args.mtrain or args.mtest or args.ast or args.baker or args.cbm
                or args.mclean or args.nd is not None):
            raise ValueError(
                "At least one datafile must be used to train a model.")

        from DeepCCS.model import DeepCCS
        from DeepCCS.utils import read_dataset, read_reference_table, output_global_stats
        from keras.callbacks import ModelCheckpoint

        logging.debug("Starting prediction tool with the following args:" +
                      str(args))
        if not path.isdir(args.o):
            raise IOError("Directory for output model cannot be found")

        # Initialize lists
        training_datasets = []
        testing_datasets = []
        dt_list = []
        name_test_dataset = []

        date = datetime.datetime.now().strftime(
            "%Y-%m-%d_%Hh%M")  # ex:2018-05-25_14h40

        model_directory = args.o + "/" + date
        if not path.exists(model_directory):
            makedirs(model_directory)

        # Exception: the MetCCS datasets are the only ones that can bypass the
        # train/test split below. MetCCS train data is always used whole for
        # training; MetCCS test data, when not used for training, is kept
        # whole for testing.

        # Load source datasets according to args
        source_data = "../DATASETS.h5"
        if args.mtrain:
            for d in ["MetCCS_train_pos", "MetCCS_train_neg"]:
                df_dt = read_dataset(source_data, d)
                smiles = df_dt["SMILES"]
                adducts = df_dt["Adducts"]
                ccs = df_dt["CCS"]
                training_datasets.append([smiles, adducts, ccs])

        if args.mtest:
            dt_list.extend(["MetCCS_test_pos", "MetCCS_test_neg"])
            name_test_dataset.extend(["MetCCS_test_pos", "MetCCS_test_neg"])
        else:
            for d in ["MetCCS_test_pos", "MetCCS_test_neg"]:
                df_dt = read_dataset(source_data, d)
                smiles = df_dt["SMILES"]
                adducts = df_dt["Adducts"]
                ccs = df_dt["CCS"]
                testing_datasets.append([smiles, adducts, ccs])
            name_test_dataset.extend(["MetCCS_test_pos", "MetCCS_test_neg"])

        if args.ast:
            dt_list.extend(["Astarita_pos", "Astarita_neg"])
            name_test_dataset.extend(["Astarita_pos", "Astarita_neg"])
        else:
            for d in ["Astarita_pos", "Astarita_neg"]:
                df_dt = read_dataset(source_data, d)
                smiles = df_dt["SMILES"]
                adducts = df_dt["Adducts"]
                ccs = df_dt["CCS"]
                testing_datasets.append([smiles, adducts, ccs])
            name_test_dataset.extend(["Astarita_pos", "Astarita_neg"])

        if args.baker:
            dt_list.append("Baker")

        if args.mclean:
            dt_list.append("McLean")

        if args.cbm:
            dt_list.append("CBM")

        logging.debug("Number of training dataset: {}".format(
            len(training_datasets)))

        logging.debug("Training dataset list: {}".format(dt_list))

        # Divide source dataset(s) using the specified proportions
        train_fraction = 1 - args.test
        for d in dt_list:
            name_test_dataset.append(d)
            data = read_dataset(source_data, d)
            train = data.sample(frac=train_fraction)
            test = data.drop(train.index)

            train_smiles = train["SMILES"]
            train_adducts = train["Adducts"]
            train_ccs = train["CCS"]

            test_smiles = test["SMILES"]
            test_adducts = test["Adducts"]
            test_ccs = test["CCS"]

            training_datasets.append([train_smiles, train_adducts, train_ccs])
            testing_datasets.append([test_smiles, test_adducts, test_ccs])
            logging.debug("\tTrain set shape: {}".format(train.shape))
            logging.debug("\tTest set shape: {}".format(test.shape))

        logging.debug("Number of training datasets: {}".format(
            len(training_datasets)))
        logging.debug("Number of testing datasets: {}".format(
            len(testing_datasets)))

        # Load personal dataset(s) given by the -nd arg
        if args.nd is not None:
            new_datasets = args.nd.split(",")
        else:
            new_datasets = []

        # Divide new dataset(s) by the same rule as before
        if len(new_datasets) > 0:
            for f in new_datasets:
                name_test_dataset.append(f.split("/")[-1].split(".")[0])
                smiles, adducts, ccs = read_reference_table(f)

                mask_train = np.zeros(len(smiles), dtype=int)
                mask_train[:int(len(smiles) * train_fraction)] = 1
                np.random.shuffle(mask_train)
                mask_test = 1 - mask_train
                mask_train = mask_train.astype(bool)
                mask_test = mask_test.astype(bool)

                train_smiles = smiles[mask_train]
                train_adducts = adducts[mask_train]
                train_ccs = ccs[mask_train]

                test_smiles = smiles[mask_test]
                test_adducts = adducts[mask_test]
                test_ccs = ccs[mask_test]

                training_datasets.append(
                    [train_smiles, train_adducts, train_ccs])
                testing_datasets.append([test_smiles, test_adducts, test_ccs])

        # Format training_dataset arrays for learning
        training_datasets = np.concatenate(training_datasets, axis=1)

        smiles = training_datasets[0]
        adducts = training_datasets[1]
        ccs = training_datasets[2]

        # Split the training data: 90% into the train set, 10% into the validation set
        mask_t = np.zeros(len(smiles), dtype=int)
        mask_t[:int(len(smiles) * 0.9)] = 1
        np.random.shuffle(mask_t)
        mask_v = 1 - mask_t
        mask_t = mask_t.astype(bool)
        mask_v = mask_v.astype(bool)

        X1_train = smiles[mask_t]
        X2_train = adducts[mask_t]
        Y_train = ccs[mask_t]

        X1_valid = smiles[mask_v]
        X2_valid = adducts[mask_v]
        Y_valid = ccs[mask_v]

        logging.debug("len X1_train  {}".format(len(X1_train)))
        logging.debug("len X1_valid  {}".format(len(X1_valid)))

        # Format testing_datasets (SMILES and adducts) to include them in the mappers' creation
        test_concat = np.concatenate(testing_datasets, axis=1)
        X1_test = test_concat[0]
        X2_test = test_concat[1]

        logging.debug("len X1_test  {}".format(len(X1_test)))

        # Initialize a new DeepCCS model
        new_model = DeepCCS.DeepCCSModel()

        if args.ap is None:
            new_model.fit_adduct_encoder(
                np.concatenate([X2_train, X2_valid, X2_test]))
        elif args.ap == "d":
            if not path.isfile(
                    path.join("../saved_models/default/",
                              "adducts_encoder.json")):
                raise IOError(
                    "adducts_encoder.json is missing from the default saved_models directory"
                )
            new_model.adduct_encoder.load_encoder(
                "../saved_models/default/adducts_encoder.json")
        else:
            if not path.isfile(path.join(args.ap, "adducts_encoder.json")):
                raise IOError(
                    "adducts_encoder.json is missing from the adducts_encoder directory"
                )
            new_model.adduct_encoder.load_encoder(
                path.join(args.ap, "adducts_encoder.json"))

        if args.sp is None:
            new_model.fit_smiles_encoder(
                np.concatenate([X1_train, X1_valid, X1_test]))
        elif args.sp == "d":
            if not path.isfile(
                    path.join("../saved_models/default/",
                              "smiles_encoder.json")):
                raise IOError(
                    "smiles_encoder.json is missing from the default saved_models directory"
                )
            new_model.smiles_encoder.load_encoder(
                "../saved_models/default/smiles_encoder.json")
        else:
            if not path.isfile(path.join(args.sp, "smiles_encoder.json")):
                raise IOError(
                    "smiles_encoder.json is missing from the smiles_encoder directory"
                )
            logging.debug("Loading SMILES encoder")
            new_model.smiles_encoder.load_encoder(
                path.join(args.sp, "smiles_encoder.json"))

        logging.debug(new_model.smiles_encoder.converter)

        # Encode smiles and adducts
        X1_train_encoded = new_model.smiles_encoder.transform(X1_train)
        X1_valid_encoded = new_model.smiles_encoder.transform(X1_valid)

        X2_train_encoded = new_model.adduct_encoder.transform(X2_train)
        X2_valid_encoded = new_model.adduct_encoder.transform(X2_valid)

        # Create model structure
        new_model.create_model()

        model_file = model_directory + "/" + "model_checkpoint.model"
        model_checkpoint = ModelCheckpoint(model_file,
                                           save_best_only=True,
                                           save_weights_only=True)

        if args.verbose:
            new_model.model.summary()

        # Train model
        new_model.train_model(X1_train_encoded, X2_train_encoded, Y_train,
                              X1_valid_encoded,
                              X2_valid_encoded, Y_valid, model_checkpoint,
                              int(args.nepochs), args.verbose)

        new_model.model.load_weights(model_file)

        # Save model
        new_model.save_model_to_file(
            model_directory + "/" + "model.h5",
            model_directory + "/" + "adducts_encoder.json",
            model_directory + "/" + "smiles_encoder.json")

        # Test the new model on each testing dataset independently and output metrics on the model's performance
        if not new_model._is_fit:
            raise ValueError("Model must be loaded or fitted first")

        for i, dt in enumerate(testing_datasets):
            dt_name = name_test_dataset[i]
            X1 = dt[0]
            X2 = dt[1]
            Y = dt[2]

            Y_pred = new_model.predict(X1, X2)
            Y_pred = Y_pred.flatten()

            print(" ")
            print("> Testing on " + dt_name + " dataset:")
            print("-----------Model stats-----------")
            output_global_stats(Y, Y_pred)
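
A minimal sketch of reusing the artifacts written by train() above; load_model_from_file is assumed as the counterpart of the save_model_to_file call in the example (its exact name and signature may differ in the DeepCCS API), and the SMILES/adduct values are placeholders.

from DeepCCS.model import DeepCCS

# Load the saved model and encoders from the dated output directory
model = DeepCCS.DeepCCSModel()
model.load_model_from_file("model.h5",                 # assumed counterpart of save_model_to_file
                           "adducts_encoder.json",
                           "smiles_encoder.json")

# Predict CCS values for placeholder molecules; predict(smiles, adducts) as used above
ccs_pred = model.predict(["CCO", "CC(=O)O"], ["M+H", "M+Na"]).flatten()
print(ccs_pred)
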
Example #2
    def compare(self):
        parser = argparse.ArgumentParser(
            prog='DeepCCS compare',
            description="Compare the CCS values in a file with the values "
                        "used to train this predictive model. No predictions "
                        "are involved in the process, only comparison.")

        parser.add_argument("-i", help="Input file name", required=True)
        parser.add_argument(
            "-o",
            help="Prefix of the output file name (ex: MyFile_). If not "
                 "specified, stdout will be used. If 'none', only the stats "
                 "will be shown.",
            default="")
        parser.add_argument(
            "-d",
            help="List of datasets to compare to, separated by commas "
                 "(dtA,dtB,dtC)",
            default=None)

        if len(argv) <= 2:
            parser.print_help()
            exit()

        args = parser.parse_args(argv[2:])

        from DeepCCS.utils import read_dataset, read_reference_table, output_results, output_global_stats

        logging.debug("Starting comparaison tool with the following args:" +
                      str(args))
        if not path.isfile(args.i):
            raise IOError("Reference file cannot be found")

        # Output prefix; if not specified, output goes to stdout
        out_file_name_prefix = None
        if args.o != "":
            out_file_name_prefix = args.o

        # Datasets used to create the algorithm
        if args.d is not None:
            dt_list = args.d.split(", ")
        else:
            dt_list = [
                "MetCCS_train_pos", "MetCCS_train_neg", "MetCCS_test_pos",
                "MetCCS_test_neg", "Astarita_pos", "Astarita_neg", "Baker",
                "McLean", "CBM"
            ]

        # Get a pandas dataframe for each dataset requested for comparison,
        # output another table with all the original values plus the CCS given by the user in an extra column,
        # and print general stats on the comparison
        logging.debug("Iterating over the dataset list for comparison")
        for i in dt_list:
            df_dt = read_dataset("../DATASETS.h5", i)
            smiles = df_dt["SMILES"]
            adducts = df_dt["Adducts"]
            ccs = df_dt["CCS"]

            out_file_name = None
            if out_file_name_prefix is not None:
                out_file_name = out_file_name_prefix + i + ".txt"

            if out_file_name is None or out_file_name_prefix.lower() != "none":
                output_results(args.i, smiles, adducts, ccs, out_file_name)

            smiles_u, adducts_u, ccs_u = read_reference_table(args.i)

            df_user = pd.DataFrame({
                "SMILES": smiles_u,
                "Adducts": adducts_u,
                "CCS": ccs_u
            })

            df_ref = pd.DataFrame({
                "SMILES": smiles,
                "Adducts": adducts,
                "CCS_DeepCCS": ccs
            })

            merged_df = pd.merge(left=df_user,
                                 right=df_ref,
                                 on=["SMILES", "Adducts"],
                                 how='inner')

            if len(merged_df["CCS"]) == 0:
                print("{} dataset:".format(i))
                print("No corresponding molecule, moving to the next dataset.")
                continue
            else:
                print("{} dataset:".format(i))
                print("=> {} molecules used for comparison".format(
                    len(merged_df["CCS"])))
                print("--------Comparison stats--------")
                output_global_stats(merged_df["CCS_DeepCCS"], merged_df["CCS"])
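
A minimal sketch of what an input table for the -i argument might look like, assuming read_reference_table expects the same SMILES / Adducts / CCS columns that the comparison above relies on; the file name, CSV format and CCS numbers are placeholders.

import pandas as pd

# Hypothetical reference table: one row per (molecule, adduct) pair
pd.DataFrame({
    "SMILES": ["CCO", "CC(=O)O"],     # placeholder molecules
    "Adducts": ["M+H", "M-H"],        # placeholder adducts
    "CCS": [120.5, 118.3],            # placeholder CCS values
}).to_csv("my_measurements.csv", index=False)
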
Example #3
# -*- coding: utf-8 -*-
"""
Created on Thu Nov  7 10:09:53 2019

@author: hcji
"""

import pandas as pd
from DeepCCS.utils import read_dataset

datasets = ['Baker', 'CBM', 'McLean', 'Astarita_pos', 'Astarita_neg', 'MetCCS_train_pos', 'MetCCS_train_neg', 'MetCCS_test_pos', 'MetCCS_test_neg']

frames = []
for d in datasets:
    # Read each dataset from the HDF5 store and tag its rows with their source
    ndata = read_dataset('DATASETS.h5', d)
    ndata['Source'] = d
    frames.append(ndata)

data = pd.concat(frames)

data.to_csv('Data/data.csv')
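
A quick sanity check on the combined table written above, assuming the script has been run and Data/data.csv exists:

import pandas as pd

# Reload the merged table and count how many rows came from each source dataset
df = pd.read_csv('Data/data.csv', index_col=0)
print(df.groupby('Source').size())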