Example #1
def run_fnn_training_multi(opts: opt.TrainOptions) -> None:

    logging.basicConfig(format="DFPL-%(levelname)s: %(message)s",
                        level=logging.INFO)
    logging.info("Adding fingerprint to dataset")

    df = fp.importDataFile(opts.inputFile,
                           import_function=fp.importSmilesCSV,
                           fp_size=opts.fpSize)

    # prepend the output directory to the autoencoder weights file name
    # (assumes opts.outputDir ends with a path separator)
    opts.ecWeightsFile = opts.outputDir + opts.ecWeightsFile

    if opts.trainAC:
        logging.info("Training autoencoder")
        encoder = ac.train_full_ac(df, opts)
        # encoder.save_weights(opts.acFile)
    else:
        logging.info("Using trained autoencoder")
        (_, encoder) = ac.define_ac_model(input_size=opts.fpSize,
                                          encoding_dim=opts.encFPSize)
        # load the weights of the previously trained autoencoder
        encoder.load_weights(opts.ecWeightsFile)

    df = ac.compress_fingerprints(df, encoder)

    # train FNNs with compressed features
    logging.info("Training the FNN using compressed input data.")
    fNN.train_nn_models_multi(df=df, opts=opts, use_compressed=True)

    # train FNNs with uncompressed features
    logging.info("Training the FNN using un-compressed input data.")
    fNN.train_nn_models_multi(df=df, opts=opts, use_compressed=False)

    logging.info("Done")
Example #2
def test_predictions():
    opts = test_predict_args

    logging.basicConfig(format="DFPL-%(levelname)s: %(message)s", level=logging.INFO)
    logging.info(f"Predicting compounds in the input file {opts.inputFile} for association with target {opts.target}")

    df = fp.importDataFile(opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize)

    use_compressed = False
    if opts.acFile:
        use_compressed = True
        # load trained model for autoencoder
        (_, encoder) = ac.define_ac_model(input_size=opts.fpSize, encoding_dim=opts.encFPSize)
        encoder.load_weights(opts.acFile)
        # compress the fingerprints using the autoencoder
        df = ac.compress_fingerprints(df, encoder)

    # predict
    df2 = p.predict_values(df=df,
                           opts=opts,
                           use_compressed=use_compressed)

    names_columns = [c for c in df2.columns if c not in ['fp', 'fpcompressed']]

    output_file = path.join(opts.outputDir,
                            path.basename(path.splitext(opts.inputFile)[0]) + ".predictions.csv")
    df2[names_columns].to_csv(path_or_buf=output_file)

    logging.info(f"Predictions done.\nResults written to '{output_file}'.")
Example #3
def predict(opts: options.PredictOptions) -> None:
    """
    Run prediction given specific options
    :param opts: Options defining the details of the prediction
    """
    df = fp.importDataFile(opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize)

    # Create output dir if it doesn't exist
    createDirectory(opts.outputDir)

    use_compressed = False
    if opts.ecWeightsFile:
        logging.info(f"Using fingerprint compression with AC {opts.ecWeightsFile}")
        use_compressed = True
        # load trained model for autoencoder
        (_, encoder) = ac.define_ac_model(input_size=opts.fpSize, encoding_dim=opts.encFPSize)
        encoder.load_weights(opts.ecWeightsFile)
        # compress the fingerprints using the autoencoder
        df = ac.compress_fingerprints(df, encoder)

    # predict
    df2 = predictions.predict_values(df=df,
                                     opts=opts,
                                     use_compressed=use_compressed)

    names_columns = [c for c in df2.columns if c not in ['fp', 'fpcompressed']]

    output_file = path.join(opts.outputDir,
                            path.basename(path.splitext(opts.inputFile)[0]) + ".predictions.csv")
    df2[names_columns].to_csv(path_or_buf=output_file)
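A hedged usage sketch for the function above; the PredictOptions fields are assumptions based on the attributes it accesses, not a documented constructor.

# Hypothetical invocation sketch; field names inferred from the usage above.
predict_opts = options.PredictOptions(
    inputFile="data/compounds.csv",        # placeholder input path
    outputDir="predictions/",
    ecWeightsFile="output/ac_weights.h5",  # setting this enables compression
    fpSize=2048,
    encFPSize=256
)
predict(predict_opts)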
Example #4
def train(opts: options.TrainOptions):
    """
    Run the main training procedure
    :param opts: Options defining the details of the training
    """
    df = fp.importDataFile(opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize)

    # Create output dir if it doesn't exist
    createDirectory(opts.outputDir)

    if opts.compressFeatures:  # compress features

        if opts.trainAC:
            # train an autoencoder on the full feature matrix
            encoder = ac.train_full_ac(df, opts)
        else:
            # load trained model for autoencoder
            (_, encoder) = ac.define_ac_model(input_size=opts.fpSize, encoding_dim=opts.encFPSize)
            encoder.load_weights(makePathAbsolute(opts.ecWeightsFile))

        # compress the fingerprints using the autoencoder
        df = ac.compress_fingerprints(df, encoder)

    if opts.trainFNN:
        # train single label models
        fNN.train_nn_models(df=df, opts=opts)

    if opts.enableMultiLabel:
        # train multi-label models
        fNN.train_nn_models_multi(df=df, opts=opts)
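The flags above select which parts of the pipeline run. A hedged sketch of one possible call, assuming TrainOptions accepts these names as keyword arguments (partial keyword construction is shown in Example #6 below):

# Hypothetical invocation sketch; flag names taken from the branches above.
train_opts = options.TrainOptions(
    inputFile="data/Sun_etal_dataset.csv",
    outputDir="output/",
    compressFeatures=True,    # run fingerprints through the autoencoder
    trainAC=True,             # train the autoencoder rather than loading weights
    trainFNN=True,            # train the single-label FNN models
    enableMultiLabel=False    # skip the multi-label models
)
train(train_opts)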
Example #5
def runAutoencoder(opts: opt.TrainOptions) -> None:
    """
    Run and test auto-encoder
    """
    logging.basicConfig(format="DFPL-%(levelname)s: %(message)s", level=logging.INFO)
    logging.info("Adding fingerprint to dataset")
    df = fp.importDataFile(opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize)
    logging.info("Training autoencoder")
    ac.train_full_ac(df, opts)
    logging.info("Done")
Example #6
def test_prepare_nn_training_data():
    project_directory = pathlib.Path(__file__).parent.parent.absolute()
    df = fp.importDataFile(
        os.path.join(project_directory, "data", "Sun_etal_dataset.csv"))

    targets = ["AR", "ER", "GR", "Aromatase", "TR", "PPARg"]
    fractions = [0.5, 1.0, 2.0, 3.0]
    for f in fractions:
        o = opts.TrainOptions(compressFeatures=False, sampleFractionOnes=f)
        for t in targets:
            x, y = fNN.prepare_nn_training_data(df, t, o)
            unique, counts = np.unique(y, return_counts=True)
            assert abs(counts[1] / counts[0] - f) < 0.01
            print(
                f"Wanted \"{t}\" fraction: {f}, got sampling: {dict(zip(unique, counts))}, Result fraction: {counts[1]/counts[0]}"
            )
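The assertion checks that the resampled labels reproduce the requested ones/zeros ratio. A toy illustration of that ratio computation with np.unique:

# Toy example of the ratio check used in the assert above.
import numpy as np
y = np.array([0, 0, 0, 0, 1, 1])                   # 4 negatives, 2 positives
unique, counts = np.unique(y, return_counts=True)
print(dict(zip(unique, counts)))                    # counts per label value
print(counts[1] / counts[0])                        # 0.5, the ones/zeros fraction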
Example #7
import dfpl.fingerprint as fp
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import Draw

import pandas as pd
import numpy as np

from dfpl import autoencoder as ac
from dfpl import feedforwardNN as fNN
from dfpl import predictions
from dfpl import options as opt

# read both datasets
dfS = fp.importDataFile("data/S_dataset_extended.pkl",
                        import_function=fp.importSmilesCSV,
                        fp_size=2048)
dfS.dropna(axis=0, subset=['cid'], inplace=True)
dfS['cid'] = dfS['cid'].apply(int).astype(str)
dfD = fp.importDataFile("data/dsstox_20160701.pkl",
                        import_function=fp.importSmilesCSV,
                        fp_size=2048)

# ids and structures of interest
cid_of_interest = ["87587", "77328", "2734118", "2736548", "154257"]
toxid_of_interest = [
    "DTXSID3027798", "DTXSID7041461", "DTXSID9048067", "DTXSID7049344",
    "DTXSID70173593"
]
df = pd.DataFrame(list(zip(cid_of_interest, toxid_of_interest)),
                  columns=["cid", "toxid"])