def run_fnn_training_multi(opts: opt.TrainOptions) -> None:
    """
    Train multi-label FNN models on compressed and on un-compressed input data

    The input file is imported together with its fingerprints. An autoencoder
    is then either trained on that data (``opts.trainAC``) or re-defined and
    restored from the weights file, and used to compress the fingerprints.
    Finally the multi-label FNN training is run twice: once with
    ``use_compressed=True`` and once with ``use_compressed=False``.

    NOTE: ``opts.ecWeightsFile`` is overwritten in place with
    ``opts.outputDir + opts.ecWeightsFile`` (plain string concatenation), so
    ``opts.outputDir`` needs a trailing path separator for the weights file
    to end up inside that directory.

    :param opts: Options defining the details of the training
    """
    logging.basicConfig(format="DFPL-%(levelname)s: %(message)s", level=logging.INFO)
    logging.info("Adding fingerprint to dataset")

    df = fp.importDataFile(opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize)

    # Relocate the weights file next to the other outputs (mutates opts).
    opts.ecWeightsFile = opts.outputDir + opts.ecWeightsFile

    if opts.trainAC:
        logging.info("Training autoencoder")
        encoder = ac.train_full_ac(df, opts)
    else:
        logging.info("Using trained autoencoder")
        (_, encoder) = ac.define_ac_model(input_size=opts.fpSize, encoding_dim=opts.encFPSize)
        # Restore the stored weights: without this the freshly defined model
        # would be used for compression although no training took place.
        encoder.load_weights(opts.ecWeightsFile)

    df = ac.compress_fingerprints(df, encoder)

    # train FNNs with compressed features
    logging.info("Training the FNN using compressed input data.")
    fNN.train_nn_models_multi(df=df, opts=opts, use_compressed=True)

    # train FNNs with uncompressed features
    logging.info("Training the FNN using un-compressed input data.")
    fNN.train_nn_models_multi(df=df, opts=opts, use_compressed=False)

    logging.info("Done")
def test_predictions():
    """
    Run a prediction for the module-level ``test_predict_args`` options

    Imports the input file with fingerprints, optionally compresses them with
    the autoencoder whose weights are stored in ``opts.acFile``, predicts the
    values and writes every column except the fingerprint columns to
    ``<outputDir>/<input file name without extension>.predictions.csv``.
    """
    opts = test_predict_args

    logging.basicConfig(format="DFPL-%(levelname)s: %(message)s", level=logging.INFO)
    logging.info(f"Predicting compounds in the input file {opts.inputFile} for association with target {opts.target}")

    compounds = fp.importDataFile(opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize)

    # Compression is only applied when an autoencoder weights file is given.
    use_compressed = bool(opts.acFile)
    if use_compressed:
        # rebuild the autoencoder and restore its trained weights
        _, encoder = ac.define_ac_model(input_size=opts.fpSize, encoding_dim=opts.encFPSize)
        encoder.load_weights(opts.acFile)
        # replace the data with its compressed counterpart
        compounds = ac.compress_fingerprints(compounds, encoder)

    # predict
    predicted = p.predict_values(df=compounds, opts=opts, use_compressed=use_compressed)

    # The fingerprint columns are not part of the written result.
    fingerprint_columns = ('fp', 'fpcompressed')
    kept_columns = [column for column in predicted.columns if column not in fingerprint_columns]

    input_stem = path.basename(path.splitext(opts.inputFile)[0])
    output_file = path.join(opts.outputDir, input_stem + ".predictions.csv")
    predicted[kept_columns].to_csv(path_or_buf=output_file)

    logging.info(f"Predictions done.\nResults written to '{output_file}'.")
def predict(opts: options.PredictOptions) -> None:
    """
    Run prediction given specific options

    The input file is imported with fingerprints; if ``opts.ecWeightsFile`` is
    set, the fingerprints are compressed with the autoencoder restored from
    that file. All result columns except the fingerprint columns are written
    to ``<outputDir>/<input file name without extension>.predictions.csv``.

    :param opts: Options defining the details of the prediction
    """
    data = fp.importDataFile(opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize)

    # Create output dir if it doesn't exist
    createDirectory(opts.outputDir)

    # An autoencoder weights file switches on fingerprint compression.
    use_compressed = bool(opts.ecWeightsFile)
    if use_compressed:
        logging.info(f"Using fingerprint compression with AC {opts.ecWeightsFile}")
        # rebuild the autoencoder and restore its trained weights
        _, encoder = ac.define_ac_model(input_size=opts.fpSize, encoding_dim=opts.encFPSize)
        encoder.load_weights(opts.ecWeightsFile)
        # compress the fingerprints using the autoencoder
        data = ac.compress_fingerprints(data, encoder)

    # predict
    results = predictions.predict_values(df=data, opts=opts, use_compressed=use_compressed)

    # Leave the fingerprint columns out of the written table.
    skipped = ('fp', 'fpcompressed')
    columns_to_write = [name for name in results.columns if name not in skipped]

    base_name = path.basename(path.splitext(opts.inputFile)[0])
    target_file = path.join(opts.outputDir, base_name + ".predictions.csv")
    results[columns_to_write].to_csv(path_or_buf=target_file)
def train(opts: options.TrainOptions):
    """
    Run the main training procedure

    Steps, each controlled by a flag in ``opts``:

    * ``compressFeatures``: compress the fingerprints with an autoencoder that
      is either trained here (``trainAC``) or restored from ``ecWeightsFile``
    * ``trainFNN``: train the single-label models
    * ``enableMultiLabel``: train the multi-label models

    :param opts: Options defining the details of the training
    """
    fingerprints = fp.importDataFile(opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize)

    # Create output dir if it doesn't exist
    createDirectory(opts.outputDir)

    if opts.compressFeatures:
        if not opts.trainAC:
            # re-create the autoencoder and restore previously trained weights
            _, encoder = ac.define_ac_model(input_size=opts.fpSize, encoding_dim=opts.encFPSize)
            encoder.load_weights(makePathAbsolute(opts.ecWeightsFile))
        else:
            # train an autoencoder on the full feature matrix
            encoder = ac.train_full_ac(fingerprints, opts)

        # compress the fingerprints using the autoencoder
        fingerprints = ac.compress_fingerprints(fingerprints, encoder)

    # single-label models
    if opts.trainFNN:
        fNN.train_nn_models(df=fingerprints, opts=opts)

    # multi-label models
    if opts.enableMultiLabel:
        fNN.train_nn_models_multi(df=fingerprints, opts=opts)
def runAutoencoder(opts: opt.TrainOptions) -> None:
    """
    Run and test auto-encoder

    Imports the input file with fingerprints and trains the autoencoder on it.
    The value returned by the training call is not used.

    :param opts: Options defining the details of the training
    """
    log_format = "DFPL-%(levelname)s: %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)

    logging.info("Adding fingerprint to dataset")
    dataset = fp.importDataFile(
        opts.inputFile,
        import_function=fp.importSmilesCSV,
        fp_size=opts.fpSize,
    )

    logging.info("Training autoencoder")
    ac.train_full_ac(dataset, opts)

    logging.info("Done")
def test_prepare_nn_training_data():
    """
    Check the label sampling of ``fNN.prepare_nn_training_data``

    For every combination of requested ``sampleFractionOnes`` and target, the
    training data is prepared and the label values are counted. The ratio of
    the second to the first unique label count must be within 0.01 of the
    requested fraction (presumably ones vs. zeros - confirm against the data).
    """
    repo_root = pathlib.Path(__file__).parent.parent.absolute()
    dataset_file = os.path.join(repo_root, "data", "Sun_etal_dataset.csv")
    df = fp.importDataFile(dataset_file)

    fractions = [0.5, 1.0, 2.0, 3.0]
    targets = ["AR", "ER", "GR", "Aromatase", "TR", "PPARg"]

    for f in fractions:
        train_options = opts.TrainOptions(compressFeatures=False, sampleFractionOnes=f)
        for t in targets:
            # only the labels are needed for the check
            _, y = fNN.prepare_nn_training_data(df, t, train_options)
            unique, counts = np.unique(y, return_counts=True)

            assert abs(counts[1] / counts[0] - f) < 0.01
            print(
                f"Wanted \"{t}\" fraction: {f}, got sampling: {dict(zip(unique, counts))}, Result fraction: {counts[1]/counts[0]}"
            )
import dfpl.fingerprint as fp
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import Draw
import pandas as pd
import numpy as np
from dfpl import autoencoder as ac
from dfpl import feedforwardNN as fNN
from dfpl import predictions
from dfpl import options as opt

# read both datasets
dfS = fp.importDataFile(
    "data/S_dataset_extended.pkl",
    import_function=fp.importSmilesCSV,
    fp_size=2048,
)
# rows without a cid cannot be matched; the remaining ids become integer strings
dfS.dropna(axis=0, subset=['cid'], inplace=True)
dfS['cid'] = dfS['cid'].apply(int).astype(str)

dfD = fp.importDataFile(
    "data/dsstox_20160701.pkl",
    import_function=fp.importSmilesCSV,
    fp_size=2048,
)

# ids and structures of interest
cid_of_interest = ["87587", "77328", "2734118", "2736548", "154257"]
toxid_of_interest = [
    "DTXSID3027798",
    "DTXSID7041461",
    "DTXSID9048067",
    "DTXSID7049344",
    "DTXSID70173593",
]

# one row per compound, pairing the two id lists position by position
df = pd.DataFrame({"cid": cid_of_interest, "toxid": toxid_of_interest})