def run(args):
    """Aggregate per-drug feature files (Mordred descriptors, fingerprints,
    images) for the given drug set and dump each feature type to a parquet
    file under ``outdir``.

    Relies on module-level names: ``filepath``, ``FEA_DIR``, ``Logger``,
    ``get_print_func``, ``load_mordred_descriptors``, ``load_fps``.
    """
    t0 = time()
    ID = 'TITLE'    # column that identifies drugs across feature files
    fea_sep = '_'

    # BUG FIX: message said 'int >1' while the check is '> 0'; one worker is
    # valid, so keep the check and make the message agree with it.
    assert args.par_jobs > 0, f"The arg 'par_jobs' must be int >0 (got {args.par_jobs})"

    # BUG FIX: the original built Path(args.outdir) first and then tested the
    # Path object for None -- Path(None) raises TypeError and the fallback was
    # unreachable. Test the raw argument instead.
    if args.outdir is not None:
        outdir = Path(args.outdir)
    else:
        # NOTE(review): the fallback previously referenced a bare `drg_set`
        # name that is not defined in this function; args.drg_set is what the
        # rest of the function uses -- confirm against the module globals.
        outdir = Path(filepath, '../out', FEA_DIR.name, args.drg_set).resolve()
    os.makedirs(outdir, exist_ok=True)

    # Logger
    lg = Logger(outdir / 'gen.fea.dfs.log')
    print_fn = get_print_func(lg.logger)
    print_fn(f'File path: {filepath}')
    print_fn(f'\n{pformat(vars(args))}')

    # ========================================================
    # Aggregate features from files
    # -----------------------------
    drug_names = None  # None -> keep all drugs
    # N = 20
    N = None           # cap on the number of loaded files (None -> all)

    for fea_name in args.fea_type:
        if 'descriptors' == fea_name:
            dd_fea = load_mordred_descriptors(drg_set=args.drg_set,
                                              fea_name=fea_name,
                                              col_name=ID,
                                              drug_names=drug_names,
                                              fea_sep=fea_sep,
                                              n_jobs=args.par_jobs,
                                              N=N)
            print_fn('dd_fea.shape {}'.format(dd_fea.shape))
            dd_fea.to_parquet(outdir / 'descriptors.mordred.parquet')
            del dd_fea
        else:
            dd_fea = None

        if 'fps' == fea_name:
            fps_fea = load_fps(drg_set=args.drg_set,
                               fea_name=fea_name,
                               col_name=ID,
                               drug_names=drug_names,
                               fea_sep=fea_sep,
                               n_jobs=args.par_jobs,
                               N=N)
            print_fn('fps_fea.shape {}'.format(fps_fea.shape))
            fps_fea.to_parquet(outdir / 'fps.ecfp2.parquet')
            del fps_fea
        else:
            fps_fea = None

        if 'images' == fea_name:
            # Image aggregation is not implemented yet; a commented-out
            # prototype (pickle loading + Invert/ToTensor conversion) was
            # removed from here -- recover it from version control if needed.
            pass
        else:
            img_fea = None

    # ========================================================
    print_fn('\nRuntime {:.1f} mins'.format((time() - t0) / 60))
    print_fn('Done.')
    lg.kill_logger()
def run(args):
    """Train a learning curve (LC) for one data split.

    ``args`` is a dict of run parameters (datapath, split settings, model
    selection and hyperparameters). Writes LC scores and the arguments used
    into a per-run output directory and returns None.

    Relies on module-level names: ``fdir``, ``Logger``, ``get_print_func``,
    ``dump_dict``, ``load_data``, ``extract_subset_fea``, ``scale_fea``,
    ``LearningCurve``.
    """
    # BUG FIX: removed leftover 'import pdb; pdb.set_trace()' that halted
    # every run waiting on an interactive debugger.
    t0 = time()
    datapath = Path(args['datapath']).resolve()

    if args['max_size'] is not None:
        assert args['min_size'] < args['max_size'], \
            f"min train size (min_size={args['min_size']}) " \
            f"must be smaller than max train size " \
            f"(max_size={args['max_size']})."

    if args['splitdir'] is None:
        splitdir = None
    else:
        splitdir = Path(args['splitdir']).resolve()
    split_id = args['split_id']

    # -----------------------------------------------
    # Global outdir
    # -----------------------------------------------
    if args['gout'] is not None:
        gout = Path(args['gout']).resolve()
    else:
        gout = fdir.parent / 'lc.trn'
        gout = gout / datapath.with_suffix('.lc').name
    args['gout'] = str(gout)
    os.makedirs(gout, exist_ok=True)

    # -----------------------------------------------
    # Run (single split) outdir
    # -----------------------------------------------
    if args['rout'] is not None:
        rout = gout / args['rout']
    else:
        if splitdir is None:
            rout = gout / 'run_0'
        else:
            rout = gout / f'split_{split_id}'
    args['rout'] = str(rout)
    os.makedirs(rout, exist_ok=True)

    # -----------------------------------------------
    # Logger
    # -----------------------------------------------
    lg = Logger(rout / 'lc.log')
    print_fn = get_print_func(lg.logger)
    print_fn(f'File path: {fdir}')
    print_fn(f'\n{pformat(args)}')
    dump_dict(args, outpath=rout / 'trn.args.txt')

    # -----------------------------------------------
    # Load data
    # -----------------------------------------------
    print_fn('\nLoad master dataset.')
    data = load_data(datapath)
    print_fn('data.shape {}'.format(data.shape))

    # Get features (x), target (y), and meta
    fea_list = args['fea_prfx']
    fea_sep = args['fea_sep']
    xdata = extract_subset_fea(data, fea_list=fea_list, fea_sep=fea_sep)
    meta = data.drop(columns=xdata.columns)
    ydata = meta[[args['trg_name']]]
    del data  # release the full frame; only x/y/meta are needed below

    # -----------------------------------------------
    # Scale features
    # -----------------------------------------------
    xdata = scale_fea(xdata=xdata, scaler_name=args['scaler'])

    # -----------------------------------------------
    # Data splits
    # -----------------------------------------------
    if splitdir is None:
        cv_lists = None
    else:
        # Files are named like '1fold_s<ID>_{tr,vl,te}_id.csv'.
        split_pattern = f'1fold_s{split_id}_*_id.csv'
        single_split_files = glob(str(splitdir / split_pattern))

        # Get indices for the split
        for id_file in single_split_files:
            if 'tr_id' in id_file:
                tr_id = load_data(id_file).values.reshape(-1, )
            elif 'vl_id' in id_file:
                vl_id = load_data(id_file).values.reshape(-1, )
            elif 'te_id' in id_file:
                te_id = load_data(id_file).values.reshape(-1, )
        cv_lists = (tr_id, vl_id, te_id)

    # -----------------------------------------------
    # ML model configs
    # -----------------------------------------------
    if args['ml'] == 'lgb':
        # LGBM regressor model definition
        import lightgbm as lgb
        framework = 'lightgbm'
        ml_model_def = lgb.LGBMRegressor
        mltype = 'reg'
        ml_init_kwargs = {
            'n_estimators': args['n_estimators'],
            'max_depth': args['max_depth'],
            'learning_rate': args['learning_rate'],
            'num_leaves': args['num_leaves'],
            'n_jobs': args['n_jobs'],
            'random_state': None
        }
        ml_fit_kwargs = {'verbose': False, 'early_stopping_rounds': 10}
        data_prep_def = None
        keras_callbacks_def = None
        keras_clr_kwargs = None

    elif args['ml'] == 'nn_reg0':
        # Keras model def (single-input NN)
        from models.keras_model import (nn_reg0_model_def, data_prep_nn0_def,
                                        model_callback_def)
        framework = 'keras'
        mltype = 'reg'
        keras_callbacks_def = model_callback_def
        data_prep_def = data_prep_nn0_def

        if (args['ml'] == 'nn_reg0'):
            ml_model_def = nn_reg0_model_def
            ml_init_kwargs = {
                'input_dim': xdata.shape[1],
                'dr_rate': args['dr_rate'],
                'opt_name': args['opt'],
                'lr': args['lr'],
                'batchnorm': args['batchnorm']
            }
            ml_fit_kwargs = {
                'epochs': args['epoch'],
                'batch_size': args['batch_size'],
                'verbose': 1
            }
            keras_clr_kwargs = {}

    elif args['ml'] == 'nn_reg1':
        # Keras model def (two-input NN: gene expression + descriptors)
        from models.keras_model import (nn_reg1_model_def, data_prep_nn1_def,
                                        model_callback_def)
        framework = 'keras'
        mltype = 'reg'
        keras_callbacks_def = model_callback_def
        data_prep_def = data_prep_nn1_def

        if (args['ml'] == 'nn_reg1'):
            ml_model_def = nn_reg1_model_def
            # Input dims for the two branches come from the fea prefixes.
            x_ge = extract_subset_fea(xdata, fea_list=['ge'], fea_sep='_')
            x_dd = extract_subset_fea(xdata, fea_list=['dd'], fea_sep='_')
            ml_init_kwargs = {
                'in_dim_ge': x_ge.shape[1],
                'in_dim_dd': x_dd.shape[1],
                'dr_rate': args['dr_rate'],
                'opt_name': args['opt'],
                'lr': args['lr'],
                'batchnorm': args['batchnorm']
            }
            ml_fit_kwargs = {
                'epochs': args['epoch'],
                'batch_size': args['batch_size'],
                'verbose': 1
            }
            keras_clr_kwargs = {}
            del x_ge, x_dd

    # Print NN summary into the log
    if len(ml_init_kwargs) and ('nn' in args['ml']):
        model = ml_model_def(**ml_init_kwargs)
        model.summary(print_fn=lg.logger.info)
        del model

    # -----------------------------------------------
    # Learning curve
    # -----------------------------------------------
    # LC args
    lc_init_args = {
        'cv_lists': cv_lists,
        'n_splits': args['n_splits'],
        'mltype': mltype,
        'lc_step_scale': args['lc_step_scale'],
        'lc_sizes': args['lc_sizes'],
        'min_size': args['min_size'],
        'max_size': args['max_size'],
        'lc_sizes_arr': args['lc_sizes_arr'],
        'outdir': rout,
        'print_fn': print_fn
    }
    lc_trn_args = {
        'framework': framework,
        'n_jobs': args['n_jobs'],
        'ml_model_def': ml_model_def,
        'ml_init_args': ml_init_kwargs,
        'ml_fit_args': ml_fit_kwargs,
        'data_prep_def': data_prep_def,
        'keras_callbacks_def': keras_callbacks_def,
        'keras_clr_args': keras_clr_kwargs
    }

    # LC object
    lc_obj = LearningCurve(X=xdata, Y=ydata, meta=meta, **lc_init_args)
    lc_scores = lc_obj.trn_learning_curve(**lc_trn_args)

    # Dump all scores
    lc_scores.to_csv(rout / 'lc_scores.csv', index=False)

    # Dump args
    dump_dict(args, outpath=rout / 'args.txt')

    # ------------------------------------------------------
    if (time() - t0) // 3600 > 0:
        print_fn('Runtime: {:.1f} hrs'.format((time() - t0) / 3600))
    else:
        print_fn('Runtime: {:.1f} mins'.format((time() - t0) / 60))

    print_fn('Done.')
    lg.close_logger()
    del xdata, ydata

    return None
def run(args):
    """Generate hold-out train/val/test splits for an app's annotated dataset.

    Reads ``<MAIN_APPDIR>/<appname>/annotations.csv``, optionally plots the
    target distribution, and writes split index files under
    ``<appdir>/splits`` via ``data_splitter``.

    Relies on module-level names: ``MAIN_APPDIR``, ``fdir``, ``verify_size``,
    ``Logger``, ``get_print_func``, ``dump_dict``, ``load_data``,
    ``plot_hist``, ``data_splitter``.
    """
    # Validate/normalize the requested test-set size.
    te_size = verify_size(args.te_size)

    # Path
    appdir = MAIN_APPDIR / args.appname

    # import ipdb; ipdb.set_trace(context=11)

    # Hard split: optional column name to group-split on (upper-cased).
    split_on = None if args.split_on is None else args.split_on.upper()
    te_method = args.cv_method

    # Specify ML task (regression or classification)
    if args.cv_method == 'strat':
        mltype = 'cls'  # cast mltype to cls in case of stratification
    else:
        mltype = args.ml_task

    # -----------------------------------------------
    # Create appdir
    # -----------------------------------------------
    gout = appdir / 'splits'
    outfigs = gout / 'outfigs'
    os.makedirs(gout, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)

    # -----------------------------------------------
    # Create logger
    # -----------------------------------------------
    lg = Logger(gout / 'data.splitter.log')
    print_fn = get_print_func(lg.logger)
    print_fn(f'File dir: {fdir}')
    print_fn(f'\n{pformat(vars(args))}')
    dump_dict(vars(args), outpath=gout / 'data.splitter.args.txt')  # dump args

    # -----------------------------------------------
    # Load data
    # -----------------------------------------------
    print_fn('\nLoad master dataset.')
    data = load_data(appdir / 'annotations.csv')
    print_fn('data.shape {}'.format(data.shape))

    # Counts of gene-expression ('ge_') and drug-descriptor ('dd_') columns.
    # NOTE(review): these are not used later in this function -- presumably
    # kept for debugging/inspection; confirm before removing.
    GE_LEN = sum([1 for c in data.columns if c.startswith('ge_')])
    DD_LEN = sum([1 for c in data.columns if c.startswith('dd_')])

    # import ipdb; ipdb.set_trace(context=11)

    # -----------------------------------------------
    # Determine the dataset
    # -----------------------------------------------
    # Target vector; None when the target column is absent (allowed unless
    # stratified splitting was requested).
    ydata = data[args.trg_name] if args.trg_name in data.columns else None
    if (ydata is None) and (args.cv_method == 'strat'):
        raise ValueError(
            'Y data must be specified if splits needs to be stratified.')
    if ydata is not None:
        plot_hist(ydata, title=f'{args.trg_name}', fit=None, bins=100,
                  path=outfigs / f'{args.trg_name}_hist_all.png')

    # -----------------------------------------------
    # Generate splits (train/val/test)
    # -----------------------------------------------
    print_fn('\n{}'.format('-' * 50))
    print_fn('Split into hold-out train/val/test')
    print_fn('{}'.format('-' * 50))

    kwargs = {
        'data': data,
        'cv_method': args.cv_method,
        'te_method': te_method,
        'te_size': te_size,
        'mltype': mltype,
        'split_on': split_on
    }

    data_splitter(n_splits=args.n_splits, gout=gout, outfigs=outfigs,
                  ydata=ydata, print_fn=print_fn, **kwargs)

    lg.kill_logger()
def run(args):
    """Build a tidy ML dataframe by merging drug response with cell
    expression (ge) and drug descriptors (dd), then save it to parquet and
    verify the round-trip.

    Relies on module-level names: ``filepath``, ``fea_prfx_dct``,
    ``create_outdir``, ``Logger``, ``get_print_func``, ``dump_dict``,
    ``load_rsp``, ``load_ge``, ``load_dd``, ``groupby_src_and_print``,
    ``plot_rsp_dists``, ``create_basename``.
    """
    # import pdb; pdb.set_trace()
    t0 = time()

    # Drug-response target columns whose distributions are plotted below.
    rsp_cols = [
        'AUC', 'AUC1', 'EC50', 'EC50se', 'R2fit', 'Einf', 'IC50', 'HS',
        'AAC1', 'DSS1'
    ]
    outdir = create_outdir(args.gout, args)

    # -----------------------------------------------
    # Logger
    # -----------------------------------------------
    lg = Logger(outdir / 'gen.df.log')
    print_fn = get_print_func(lg.logger)
    print_fn(f'File path: {filepath}')
    print_fn(f'\n{pformat(vars(args))}')
    dump_dict(vars(args), outpath=outdir / 'gen.df.args')

    # -----------------------------------------------
    # Load response data and features
    # -----------------------------------------------
    rsp = load_rsp(args.rsp_path, src=args.src, r2fit_th=args.r2fit_th,
                   print_fn=print_fn)
    ge = load_ge(args.cell_path, print_fn=print_fn, float_type=np.float32)
    dd = load_dd(args.drug_path, dropna_th=args.dropna_th, print_fn=print_fn,
                 float_type=np.float32, src=args.src)

    # -----------------------------------------------
    # Merge data
    # -----------------------------------------------
    print_fn('\n{}'.format('-' * 40))
    print_fn('Start merging response with other dfs.')
    print_fn('-' * 40)
    data = rsp

    # Merge with ge (inner join keeps only cells present in both frames).
    print_fn('\nMerge with expression (ge).')
    data = pd.merge(data, ge, on='CELL', how='inner')
    groupby_src_and_print(data, print_fn=print_fn)
    del ge

    # Merge with dd (inner join keeps only drugs present in both frames).
    print_fn('\nMerge with descriptors (dd).')
    data = pd.merge(data, dd, on='DRUG', how='inner')
    groupby_src_and_print(data, print_fn=print_fn)
    del dd

    # Sample
    # if (args.n_samples is not None):
    #     print_fn('\nSample the final dataset.')
    #     if args.flatten:
    #         data = flatten_dist(df=data, n=args.n_samples, score_name=args.trg_name)
    #     else:
    #         if args.n_samaples <= data.shape[0]:
    #             data = data.sample(n=args.n_samples, replace=False, random_state=0)
    #     print_fn(f'data.shape {data.shape}\n')

    # Memory usage per feature type (sys.getsizeof on a DataFrame is shallow,
    # so treat these figures as approximate).
    print_fn('\nTidy dataframe: {:.1f} GB'.format(sys.getsizeof(data) / 1e9))
    for fea_name, fea_prfx in fea_prfx_dct.items():
        cols = [c for c in data.columns if fea_prfx in c]
        aa = data[cols]
        mem = 0 if aa.shape[1] == 0 else sys.getsizeof(aa) / 1e9
        print_fn('Memory occupied by {} features: {} ({:.1f} GB)'.format(
            fea_name, len(cols), mem))

    print_fn(f"\nData final: {data.shape}")

    # Plot histograms of target variables
    plot_rsp_dists(data, rsp_cols=rsp_cols, savepath=outdir / 'rsp_dists.png')

    # -----------------------------------------------
    # Save data
    # -----------------------------------------------
    # Save data
    print_fn('\nSave dataframe.')
    fname = create_basename(args)
    fpath = outdir / (fname + '.parquet')
    data.to_parquet(fpath)

    # Load data (round-trip to verify the file on disk)
    print_fn('Load dataframe.')
    data_fromfile = pd.read_parquet(fpath)

    # Check that the saved data is the same as original one
    print_fn(f'Loaded df is same as original: {data.equals(data_fromfile)}')

    print_fn('\n{}'.format('-' * 70))
    print_fn(f'Dataframe filepath:\n{fpath.resolve()}')
    print_fn('-' * 70)

    # -------------------------------------------------------
    print_fn('\nRuntime: {:.1f} mins'.format((time() - t0) / 60))
    print_fn('Done.')
    lg.close_logger()
    return None
def run(args):
    """Merge molecular features with docking scores and generate one ML
    dataframe (plus baseline scores) per docking target via ``gen_ml_df``.

    ``args`` is a dict with keys: scores_path, fea_path, par_jobs, fea_list,
    outdir, q_bins.

    Relies on module-level names: ``filepath``, ``meta_cols``, ``Logger``,
    ``get_print_func``, ``load_data``, ``drop_dup_rows``, ``gen_ml_df``,
    and joblib's ``Parallel``/``delayed``.
    """
    t0 = time()
    scores_path = Path(args['scores_path']).resolve()
    fea_path = Path(args['fea_path']).resolve()
    par_jobs = int(args['par_jobs'])
    fea_list = args['fea_list']
    assert par_jobs > 0, f"The arg 'par_jobs' must be at least 1 (got {par_jobs})"

    if args['outdir'] is not None:
        outdir = Path(args['outdir']).resolve()
    else:
        # Default: out/<batch_name> next to this script.
        batch_name = scores_path.parent.name
        outdir = Path(filepath / '../out' / batch_name).resolve()
    outfigs = outdir / 'figs'
    os.makedirs(outdir, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)
    args['outdir'] = outdir

    # Logger
    lg = Logger(outdir / 'gen.ml.data.log')
    print_fn = get_print_func(lg.logger)
    print_fn(f'File path: {filepath}')
    print_fn(f'\n{pformat(args)}')

    print_fn('\nDocking scores path {}'.format(scores_path))
    print_fn('Features path {}'.format(fea_path))
    print_fn('Outdir path {}'.format(outdir))

    # -----------------------------------------
    # Load data (features and docking scores)
    # -----------------------------------------
    # Features (with SMILES)
    print_fn('\nLoad features ...')
    fea = load_data(fea_path)
    print_fn('Features {}'.format(fea.shape))
    fea = drop_dup_rows(fea, print_fn=print_fn)

    # Docking scores
    print_fn('\nLoad docking scores ...')
    rsp = load_data(args['scores_path'])
    print_fn('Docking {}'.format(rsp.shape))
    rsp = drop_dup_rows(rsp, print_fn=print_fn)

    # # Check that 'SMILES' col exists
    # if 'SMILES' in rsp.columns:
    #     rsp = rsp.rename(columns={'SMILES': 'SMILES'})
    # assert 'SMILES' in rsp.columns, "Column 'SMILES' must exists in the docking scores file."

    # print_fn('\nCanonicalize SMILES ...')
    # can_smi_vec = canon_SMILES( rsp['SMILES'], par_jobs=args['par_jobs'] )
    # can_smi_vec = pd.Series(can_smi_vec)
    # # Save to file bad SMILES (that were not canonicalized)
    # nan_ids = can_smi_vec.isna()
    # bad_smi = rsp[ nan_ids ]
    # if len(bad_smi)>0:
    #     bad_smi.to_csv(outdir/'smi_canon_err.csv', index=False)
    # # Keep the good (canonicalized) SMILES
    # rsp['SMILES'] = can_smi_vec
    # rsp = rsp[ ~nan_ids ].reset_index(drop=True)

    print_fn('\n{}'.format(rsp.columns.tolist()))
    print_fn('\n{}\n'.format(rsp.iloc[:3, :4]))

    # -----------------------------------------
    # Merge features with dock scores
    # -----------------------------------------
    # merger = 'SMILES'
    merger = 'TITLE'  # join key shared by the scores and features frames
    assert merger in rsp.columns, f"Column '{merger}' must exist in the docking scores file."

    unq_smiles = set(rsp[merger]).intersection(set(fea[merger]))
    print_fn('Unique {} in rsp: {}'.format(merger, rsp[merger].nunique()))
    print_fn('Unique {} in fea: {}'.format(merger, fea[merger].nunique()))
    print_fn('Intersect on {}: {}'.format(merger, len(unq_smiles)))

    print_fn(f'\nMerge features with docking scores on {merger} ...')
    dd = pd.merge(rsp, fea, on=merger, how='inner')
    print_fn('Merged {}'.format(dd.shape))
    print_fn('Unique {} in final df: {}'.format(merger, dd[merger].nunique()))

    # All score columns after the join key are treated as docking targets.
    trg_names = rsp.columns[1:].tolist()
    del rsp, fea

    score_name = 'reg'  # unified name for docking scores column in all output dfs
    bin_th = 2.0  # threshold value for the binner column (classifier)
    kwargs = {
        'dd': dd,
        'meta_cols': meta_cols,
        'fea_list': fea_list,
        'score_name': score_name,
        'q_cls': args['q_bins'],
        'bin_th': bin_th,
        'print_fn': print_fn,
        'outdir': outdir,
        'outfigs': outfigs
    }

    if par_jobs > 1:
        # https://joblib.readthedocs.io/en/latest/parallel.html
        results = Parallel(n_jobs=par_jobs, verbose=20)(
            delayed(gen_ml_df)(trg_name=trg, **kwargs) for trg in trg_names)
    else:
        results = []  # docking summary including ML baseline scores
        for trg in trg_names:
            res = gen_ml_df(trg_name=trg, **kwargs)
            results.append(res)

    results = np.round(pd.DataFrame(results), decimals=3)
    results.to_csv(outdir / 'dock.ml.baseline.csv', index=False)

    # --------------------------------------------------------
    print_fn('\nRuntime {:.2f} mins'.format((time() - t0) / 60))
    print_fn('Done.')
    lg.kill_logger()
def run(args):
    """Generate hold-out train/val/test splits for a dataset file.

    Loads the master dataset from ``args.datapath``, optionally plots the
    target distribution, and writes split index files under a ``.splits``
    output directory via ``data_splitter``.

    Relies on module-level names: ``filepath``, ``verify_size``, ``Logger``,
    ``get_print_func``, ``dump_dict``, ``load_data``, ``plot_hist``,
    ``data_splitter``.
    """
    t0 = time()
    n_splits = int(args.n_splits)
    te_size = verify_size(args.te_size)
    # te_size = args['te_size']
    datapath = Path(args.datapath).resolve()

    # Hard split: optional column name to group-split on (upper-cased).
    split_on = None if args.split_on is None else args.split_on.upper()
    cv_method = args.cv_method
    te_method = cv_method

    # Specify ML task (regression or classification)
    if cv_method == 'strat':
        mltype = 'cls'  # cast mltype to cls in case of stratification
    else:
        mltype = args.ml_task

    # Target column name
    trg_name = str(args.trg_name)

    # -----------------------------------------------
    # Create outdir
    # -----------------------------------------------
    if args.gout is not None:
        gout = Path(args.gout).resolve()
        gout = gout / datapath.with_suffix('.splits').name
    else:
        # Note! useful for drug response
        # sufx = 'none' if split_on is None else split_on
        # gout = gout / f'split_on_{sufx}'
        gout = datapath.with_suffix('.splits')
    outfigs = gout / 'outfigs'
    os.makedirs(gout, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)

    # -----------------------------------------------
    # Create logger
    # -----------------------------------------------
    lg = Logger(gout / 'data.splitter.log')
    print_fn = get_print_func(lg.logger)
    print_fn(f'File path: {filepath}')
    print_fn(f'\n{pformat(vars(args))}')
    dump_dict(vars(args), outpath=gout / 'data.splitter.args.txt')  # dump args

    # -----------------------------------------------
    # Load data
    # -----------------------------------------------
    print_fn('\nLoad master dataset.')
    data = load_data(datapath)
    print_fn('data.shape {}'.format(data.shape))

    # Target vector; None when the target column is absent (allowed unless
    # stratified splitting was requested).
    ydata = data[trg_name] if trg_name in data.columns else None
    if (ydata is None) and (cv_method == 'strat'):
        # BUG FIX: reworded the previously ungrammatical error message
        # ('... if splits are required to stratified.').
        raise ValueError(
            'Y data must be available if splits need to be stratified.')
    if ydata is not None:
        plot_hist(ydata, title=f'{trg_name}', fit=None, bins=100,
                  path=outfigs / f'{trg_name}_hist_all.png')

    # -----------------------------------------------
    # Generate splits (train/val/test)
    # -----------------------------------------------
    print_fn('\n{}'.format('-' * 50))
    print_fn('Split into hold-out train/val/test')
    print_fn('{}'.format('-' * 50))

    kwargs = {
        'data': data,
        'cv_method': cv_method,
        'te_method': te_method,
        'te_size': te_size,
        'mltype': mltype,
        'split_on': split_on
    }

    data_splitter(n_splits=n_splits, gout=gout, outfigs=outfigs,
                  ydata=ydata, print_fn=print_fn, **kwargs)

    print_fn('Runtime: {:.1f} min'.format((time() - t0) / 60))
    print_fn('Done.')
    lg.kill_logger()
def run(args):
    """Generate per-target ML data from docking scores: dump each target's
    scores, optionally pair them with molecule images, then merge with
    descriptor features and produce baseline ML dataframes per target.

    ``args`` is a dict with keys: scores_path, fea_path, img_path, par_jobs,
    fea_list, outdir, q_bins.

    Relies on module-level names: ``filepath``, ``GOUT``, ``meta_cols``,
    ``Logger``, ``get_print_func``, ``load_data``, ``drop_dup_rows``,
    ``dump_single_trg``, ``gen_ml_images``, ``gen_ml_df``, and joblib's
    ``Parallel``/``delayed``.
    """
    # import ipdb; ipdb.set_trace()
    t0 = time()
    scores_path = Path(args['scores_path']).resolve()
    fea_path = Path(args['fea_path']).resolve()
    # Images are optional; None disables the image-processing stage below.
    img_path = None if args['img_path'] is None else Path(
        args['img_path']).resolve()
    par_jobs = int(args['par_jobs'])
    fea_list = args['fea_list']
    assert par_jobs > 0, f"The arg 'par_jobs' must be at least 1 (got {par_jobs})"

    if args['outdir'] is not None:
        outdir = Path(args['outdir']).resolve()
    else:
        batch_name = scores_path.parent.name
        outdir = Path(GOUT / batch_name).resolve()
    outfigs = outdir / 'figs'
    os.makedirs(outdir, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)
    args['outdir'] = outdir

    # Logger
    lg = Logger(outdir / 'gen.ml.data.log')
    print_fn = get_print_func(lg.logger)
    print_fn(f'File path: {filepath}')
    print_fn(f'\n{pformat(args)}')

    print_fn('\nDocking scores {}'.format(scores_path))
    print_fn('Features {}'.format(fea_path))
    print_fn('Images {}'.format(img_path))
    print_fn('Outdir {}'.format(outdir))

    # -----------------------------------------
    # Load data (features and docking scores)
    # -----------------------------------------
    # Docking scores
    print_fn('\nLoad docking scores ...')
    rsp = load_data(args['scores_path'])
    print_fn('Docking {}'.format(rsp.shape))
    rsp = drop_dup_rows(rsp, print_fn=print_fn)

    # Get target names.
    # NOTE(review): the trailing [:2] limits processing to the first two
    # targets -- looks like a debugging leftover; confirm before removing.
    trg_names = rsp.columns[1:].tolist()[:2]

    # -----------------------------------------
    # Dump docks of each trg to separate file
    # -----------------------------------------
    score_name = 'reg'  # unified name for docking scores column in all output dfs
    bin_th = 2.0  # threshold value for the binner column (classifier)
    kwargs = {
        'rsp': rsp,
        'meta_cols': meta_cols,
        'score_name': score_name,
        'q_cls': args['q_bins'],
        'print_fn': print_fn,
        'outdir': outdir
    }

    # import pdb; pdb.set_trace()
    if par_jobs > 1:
        results = Parallel(n_jobs=par_jobs, verbose=20)(
            delayed(dump_single_trg)(trg_name=trg, **kwargs)
            for trg in trg_names)
    else:
        for trg in trg_names:
            dump_single_trg(trg_name=trg, **kwargs)

    # -----------------------------------------------------
    # -----------------------------------------
    # Process Images
    # -----------------------------------------
    # Load images
    # import pdb; pdb.set_trace()
    if img_path is not None:
        print_fn('\nLoad images ...')
        images = load_data(img_path)
        print_fn('Images {} {}'.format(type(images), len(images)))

        # Keep intersect on samples (TITLE)
        kwargs = {
            'images': images,
            'rsp': rsp,
            'print_fn': print_fn,
            'outdir': outdir
        }
        if par_jobs > 1:
            Parallel(n_jobs=par_jobs,
                     verbose=20)(delayed(gen_ml_images)(trg_name=trg, **kwargs)
                                 for trg in trg_names)
        else:
            for trg in trg_names:
                gen_ml_images(trg_name=trg, **kwargs)

    # -----------------------------------------------------
    # Features (with SMILES)
    print_fn('\nLoad features ...')
    fea = load_data(fea_path)
    print_fn('Features {}'.format(fea.shape))
    fea = drop_dup_rows(fea, print_fn=print_fn)

    print_fn('\n{}'.format(rsp.columns.tolist()))
    print_fn('\n{}\n'.format(rsp.iloc[:3, :4]))

    # -----------------------------------------
    # Merge features with dock scores
    # -----------------------------------------
    merger = 'TITLE'  # we used 'SMILES' before
    assert merger in rsp.columns, f"Column '{merger}' must exist in the docking scores file."

    unq_smiles = set(rsp[merger]).intersection(set(fea[merger]))
    print_fn('Unique {} in rsp: {}'.format(merger, rsp[merger].nunique()))
    print_fn('Unique {} in fea: {}'.format(merger, fea[merger].nunique()))
    print_fn('Intersect on {}: {}'.format(merger, len(unq_smiles)))

    print_fn(f'\nMerge features with docking scores on {merger} ...')
    dd = pd.merge(rsp, fea, on=merger, how='inner')
    print_fn('Merged {}'.format(dd.shape))
    print_fn('Unique {} in final df: {}'.format(merger, dd[merger].nunique()))
    del rsp, fea

    score_name = 'reg'  # unified name for docking scores column in all output dfs
    bin_th = 2.0  # threshold value for the binner column (classifier)
    kwargs = {
        'dd': dd,
        'meta_cols': meta_cols,
        'fea_list': fea_list,
        'score_name': score_name,
        'q_cls': args['q_bins'],
        'bin_th': bin_th,
        'print_fn': print_fn,
        'outdir': outdir,
        'outfigs': outfigs
    }

    # import pdb; pdb.set_trace()
    if par_jobs > 1:
        results = Parallel(n_jobs=par_jobs, verbose=20)(
            delayed(gen_ml_df)(trg_name=trg, **kwargs) for trg in trg_names)
    else:
        results = []  # docking summary including ML baseline scores
        for trg in trg_names:
            res = gen_ml_df(trg_name=trg, **kwargs)
            results.append(res)

    # TODO consider to generate baselines using ecfp features as well
    results = np.round(pd.DataFrame(results), decimals=3)
    results.to_csv(outdir / 'dock.ml.dsc.baseline.csv', index=False)

    # --------------------------------------------------------
    print_fn('\nRuntime {:.2f} mins'.format((time() - t0) / 60))
    print_fn('Done.')
    lg.kill_logger()
def run(args):
    """Generate ML data for each docking-score file in a batch.

    Globs the receptor score files, loads the available feature types
    (descriptors / fingerprints / images), restricts every feature frame to
    the samples common to all loaded types (required for multimodal
    learning), and runs ``gen_ml_data`` per score file, collecting baseline
    scores into ``dock.ml.baseline.csv``.

    Relies on module-level names: ``filepath``, ``GOUT``, ``Logger``,
    ``get_print_func``, ``load_data``, ``gen_ml_data``, and joblib's
    ``Parallel``/``delayed``.
    """
    # BUG FIX: removed leftover 'import pdb; pdb.set_trace()' that halted
    # every run waiting on an interactive debugger.
    t0 = time()
    drg_set = Path(args.drg_set)
    scr_dir = Path(args.scr_dir).resolve()
    # fea_type = args.fea_type
    ID = 'TITLE'  # sample identifier shared by all feature files
    # ID = 'SMILES'

    par_jobs = int(args.par_jobs)
    assert par_jobs > 0, f"The arg 'par_jobs' must be int >0 (got {par_jobs})"

    if args.outdir is not None:
        outdir = Path(args.outdir).resolve()
    else:
        batch_name = scr_dir.parent.name
        outdir = Path(GOUT, batch_name).resolve()
    outfigs = outdir / 'figs'
    os.makedirs(outdir, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)

    # Logger
    lg = Logger(outdir / 'gen.ml.data.log')
    print_fn = get_print_func(lg.logger)
    print_fn(f'File path: {filepath}')
    print_fn(f'\n{pformat(vars(args))}')

    print_fn(f'\nDocking files {scr_dir}')
    print_fn(f'Outdir {outdir}')

    # ========================================================
    # Glob the docking files
    # ----------------------
    scr_dir = Path(scr_dir, drg_set).resolve()
    # Earlier batches used '*4col.csv' (V5.1) and '*sorted*csv' (V7.0).
    scr_file_pattern = 'rec_*3col.csv'  # Tom's receptors
    scr_files = sorted(scr_dir.glob(scr_file_pattern))

    # ========================================================
    # Load features
    # ------------------------------
    dd_names = None
    fps_names = None
    img_names = None

    # Descriptors (optional; 'none' string also disables).
    if (args.dd_fpath is not None) and (args.dd_fpath.lower() != 'none'):
        dd_fea = load_data(args.dd_fpath)
        dd_names = dd_fea[ID].tolist()
        dd_fea = dd_fea.drop(columns='SMILES')
        dd_fea = dd_fea.fillna(0)
    else:
        dd_fea = None
        dd_names = None

    # Fingerprints (optional).
    if (args.fps_fpath is not None) and (args.fps_fpath.lower() != 'none'):
        fps_fea = load_data(args.fps_fpath)
        fps_names = fps_fea[ID].tolist()
        fps_fea = fps_fea.drop(columns='SMILES')
        fps_fea = fps_fea.fillna(0)
    else:
        fps_fea = None
        fps_names = None

    # Images (not implemented yet).
    if (args.img_fpath is not None) and (args.img_fpath.lower() != 'none'):
        # TODO
        pass
    else:
        img_fea = None
        img_names = None

    # ========================================================
    # Get the common samples (by ID)
    # ------------------------------
    # For each feature type (descriptors, fps, images), obtain the list of
    # drug names for which the features are available. Also, get the
    # intersect of drug names across the feature types. This is required for
    # multimodal learning (we want to make sure that we have all the feature
    # types for a compound).

    # Union of TITLE names across all features types
    all_names = []
    for ii in [dd_names, fps_names, img_names]:
        if ii is not None:
            all_names.extend(list(ii))
    print_fn(
        f'Union of titles across all feature types: {len(set(all_names))}')

    # Intersect of TITLE names across all features types.
    # NOTE(review): if no feature type was loaded, common_names stays None
    # and set(common_names) below raises TypeError -- at least one of
    # dd_fpath/fps_fpath is presumably required; confirm against the CLI.
    common_names = None
    for ii in [dd_names, fps_names, img_names]:
        if (common_names is not None) and (ii is not None):
            common_names = set(common_names).intersection(set(ii))
        elif (common_names is None) and (ii is not None):
            common_names = ii
    print_fn(
        f'Intersect of titles across all feature types: {len(set(common_names))}'
    )

    # Get TITLEs that are not available across all feature types
    bb_names = list(set(all_names).difference(set(common_names)))
    if len(bb_names) > 0:
        # TODO consider to dump these titles!
        print_fn(
            f'Difference of titles across all feature types: {len(set(bb_names))}'
        )

    # Retain the common samples in fea dfs
    if dd_fea is not None:
        dd_fea = dd_fea[dd_fea[ID].isin(
            common_names)]  # .reset_index(drop=True)
    if fps_fea is not None:
        fps_fea = fps_fea[fps_fea[ID].isin(
            common_names)]  # .reset_index(drop=True)

    # ========================================================
    kwargs = {
        'common_samples': common_names,
        # 'fea_type': fea_type,
        # 'drg_set': drg_set,
        'dd_fea': dd_fea,
        'fps_fea': fps_fea,
        'img_fea': img_fea,
        'ID': ID,
        'print_fn': print_fn,
        'outdir': outdir,
        'outfigs': outfigs,
        'baseline': args.baseline,
        'n_samples': args.n_samples,
        'n_top': args.n_top,
        # 'flatten': args.flatten,
        'sampling': args.sampling,
    }

    if par_jobs > 1:
        results = Parallel(n_jobs=par_jobs,
                           verbose=20)(delayed(gen_ml_data)(fpath=f, **kwargs)
                                       for f in scr_files)
    else:
        results = []  # dock summary including ML baseline scores
        for f in scr_files:
            res = gen_ml_data(fpath=f, **kwargs)
            results.append(res)

    results = [r for r in results if r is not None]
    results = np.round(pd.DataFrame(results), decimals=3)
    # BUG FIX: the sorted frame was previously discarded (sort_values returns
    # a new DataFrame); assign it so the CSV is actually sorted by target.
    results = results.sort_values('target').reset_index(drop=True)
    results.to_csv(outdir / 'dock.ml.baseline.csv', index=False)

    # ========================================================
    if (time() - t0) // 3600 > 0:
        print_fn('\nRuntime: {:.1f} hrs'.format((time() - t0) / 3600))
    else:
        print_fn('\nRuntime: {:.1f} min'.format((time() - t0) / 60))
    print_fn('Done.')
    lg.kill_logger()
def run(args):
    """Generate molecular feature dataframes from a SMILES file.

    Loads and canonicalizes SMILES, then — depending on ``args.fea_type`` —
    generates molecule images (pickle), ECFP fingerprints at radii 1-3
    (parquet + tsv), and Mordred descriptors with NaN filtering and optional
    imputation (parquet + csv), all written under ``args.gout``.

    Relies on module-level names: ``filepath``, ``Logger``, ``get_print_func``,
    ``drop_dup_rows``, ``canon_smiles``, ``smiles_to_images``,
    ``smiles_to_fps``, ``smiles_to_mordred``, ``add_fea_prfx``, ``dropna``.
    """
    # BUG FIX: removed leftover 'import ipdb; ipdb.set_trace(context=5)' that
    # halted every run waiting on an interactive (third-party) debugger.
    t0 = time()
    smiles_path = args.smiles_path
    id_name = args.id_name
    par_jobs = args.par_jobs
    fea_type = args.fea_type

    print('\nLoad SMILES.')
    smiles_path = Path(args.smiles_path)
    smi = pd.read_csv(smiles_path, sep='\t')
    smi = smi.astype({'SMILES': str, id_name: str})
    # Strip stray whitespace from both the SMILES strings and the IDs.
    smi['SMILES'] = smi['SMILES'].map(lambda x: x.strip())
    smi[id_name] = smi[id_name].map(lambda x: x.strip())
    # n_smiles = smi.shape[0]
    fea_id0 = smi.shape[1]  # index of the first feature

    # Create Outdir
    # i1, i2 = args.i1, args.i2
    # ids_dir = 'smi.ids.{}-{}'.format(i1, i2)
    # if i2 is None:
    #     i2 = n_smiles
    # gout = Path(args.gout, ids_dir)
    gout = Path(args.gout)
    os.makedirs(gout, exist_ok=True)

    # Logger
    lg = Logger(gout / 'gen.fea.dfs.log')
    print_fn = get_print_func(lg.logger)
    print_fn(f'File path: {filepath}')
    print_fn(f'\n{pformat(vars(args))}')
    print_fn('\nInput data path {}'.format(smiles_path))
    print_fn('Output data dir {}'.format(gout))

    # Drop duplicates
    smi = drop_dup_rows(smi, print_fn)

    print_fn('\nCanonicalize SMILES.')
    can_smi_vec = canon_smiles(smi['SMILES'], par_jobs=par_jobs)
    can_smi_vec = pd.Series(can_smi_vec)

    # Save bad SMILES to file (that were not canonicalized)
    nan_ids = can_smi_vec.isna()
    bad_smi = smi[nan_ids]
    if len(bad_smi) > 0:
        bad_smi.to_csv(gout / 'smi_canon_err.csv', index=False)

    # Keep the good (canonicalized) SMILES
    smi['SMILES'] = can_smi_vec
    smi = smi[~nan_ids].reset_index(drop=True)

    # ========================================================
    # Generate images
    # ---------------
    if 'images' in fea_type:
        images = smiles_to_images(smi,
                                  smi_col_name='SMILES',
                                  title_col_name=id_name,
                                  molSize=(128, 128),
                                  kekulize=True,
                                  par_jobs=par_jobs)
        img_outpath = gout / 'images.pkl'

        # Dump images to file (list of dicts).
        # BUG FIX: use a context manager; the bare open() leaked the handle.
        with open(img_outpath, 'wb') as f:
            pickle.dump(images, f)

    # ========================================================
    # Generate fingerprints
    # ---------------------
    if 'fps' in fea_type:

        def gen_fps_and_save(smi, radius=1, par_jobs=par_jobs):
            # Compute ECFP fingerprints at the given radius and persist them.
            ecfp = smiles_to_fps(smi,
                                 smi_name='SMILES',
                                 radius=radius,
                                 par_jobs=par_jobs)
            ecfp = add_fea_prfx(ecfp, prfx=f'ecfp{2*radius}.', id0=fea_id0)
            ecfp.to_parquet(gout / f'ecfp{2*radius}.parquet')
            # NOTE(review): the tsv output has no file extension -- confirm
            # downstream consumers before adding one.
            ecfp.to_csv(gout / f'ecfp{2*radius}', sep='\t', index=False)
            del ecfp

        gen_fps_and_save(smi, radius=1, par_jobs=par_jobs)
        gen_fps_and_save(smi, radius=2, par_jobs=par_jobs)
        gen_fps_and_save(smi, radius=3, par_jobs=par_jobs)

    # ========================================================
    # Generate descriptors
    # --------------------
    if 'descriptors' in fea_type:
        dd = smiles_to_mordred(smi,
                               smi_name='SMILES',
                               ignore_3D=args.ignore_3D,
                               par_jobs=par_jobs)
        dd = add_fea_prfx(dd, prfx='dd_', id0=fea_id0)

        # Filter NaNs (step 1): drop rows where all values are NaNs.
        print_fn('\nDrop rows where all values are NaN.')
        print_fn('Shape: {}'.format(dd.shape))
        idx = (dd.isna().sum(axis=1) == dd.shape[1]).values
        dd = dd.iloc[~idx, :].reset_index(drop=True)
        print_fn('Shape: {}'.format(dd.shape))

        # Filter NaNs (step 2): drop rows whose NaN fraction exceeds th.
        th = 0.2
        print_fn('\nDrop rows with at least {} NaNs (at least {} out of {}).'.
                 format(th, int(th * dd.shape[1]), dd.shape[1]))
        print_fn('Shape: {}'.format(dd.shape))
        dd = dropna(dd, axis=0, th=th)
        print_fn('Shape: {}'.format(dd.shape))

        # Cast features (descriptors)
        print_fn('\nCast descriptors to float.')
        dd = dd.astype({c: np.float32 for c in dd.columns[fea_id0:]})

        # Dump the count of NaNs in each column
        aa = dd.isna().sum(axis=0).reset_index()
        aa = aa.rename(columns={'index': 'col', 0: 'count'})
        aa = aa.sort_values('count', ascending=False).reset_index(drop=True)
        aa.to_csv(gout / 'nan_count_per_col.csv', index=False)

        # Impute missing values
        if args.impute:
            print_fn('\nImpute NaNs.')
            print_fn('Total NaNs: {}'.format(dd.isna().values.flatten().sum()))
            dd = dd.fillna(0.0)
            print_fn('Total NaNs: {}'.format(dd.isna().values.flatten().sum()))

        # Save
        print_fn('\nSave.')
        dd = dd.reset_index(drop=True)
        # BUG FIX: the old "'dd.mordred.{}'.format('' if args.impute else
        # 'with.nans')" produced 'dd.mordred..parquet' (double dot) when
        # imputing; build the basename without the dangling separator.
        fname = 'dd.mordred' if args.impute else 'dd.mordred.with.nans'
        dd.to_parquet(gout / (fname + '.parquet'))
        dd.to_csv(gout / (fname + '.csv'), sep='\t', index=False)

    # ======================================================
    print_fn('\nRuntime {:.1f} mins'.format((time() - t0) / 60))
    print_fn('Done.')
    lg.kill_logger()
def run(args):
    """Split a master dataset into hold-out train/val/test index sets.

    Reads the dataset at ``args.datapath``, creates an output directory
    (``<datapath>.splits`` by default, or nested under ``args.gout`` when
    given), and delegates the actual split generation to ``data_splitter``.

    Args (attributes read from ``args``):
        datapath:  path to the master dataset file.
        te_size:   test-set size (validated via ``verify_size``).
        cv_method: CV split method; ``"strat"`` forces a classification task.
        ml_task:   ML task ("reg"/"cls") used when not stratifying.
        trg_name:  prediction target column name.
        split_on:  optional column for hard (group) splits; may be None.
        gout:      optional global output dir; None -> derive from datapath.
        n_splits:  number of splits to generate.

    Raises:
        ValueError: if ``cv_method == "strat"`` but the target column is
            missing from the data (stratification needs the target).

    Side effects: creates output dirs, writes a log file, args dump,
    histogram figure, and the split files produced by ``data_splitter``.
    """
    print("\nInput args:")
    pprint(vars(args))

    t0 = time()
    te_size = verify_size(args.te_size)
    datapath = Path(args.datapath).resolve()

    # Hard split
    # split_on = None if args.split_on is None else args.split_on.upper()
    cv_method = args.cv_method
    te_method = cv_method

    # Specify ML task (regression or classification).
    if cv_method == "strat":
        mltask = "cls"  # cast mltask to cls in case of stratification
    else:
        mltask = args.ml_task

    # Target column name.
    trg_name = str(args.trg_name)

    # -----------------------------------------------
    # Create outdir
    # -----------------------------------------------
    sufx = "none" if args.split_on is None else args.split_on
    if args.gout is not None:
        # BUG FIX: the original joined the *absolute* resolved datapath onto
        # gout (`gout / datapath.with_suffix(".splits")`). In pathlib, an
        # absolute right-hand operand replaces the left-hand path entirely,
        # so args.gout was silently ignored. Join only the file name so the
        # splits land under the requested output dir.
        gout = Path(args.gout).resolve()
        gout = gout / datapath.with_suffix(".splits").name
        gout = gout / f"split_on_{sufx}"
    else:
        # Note! useful for drug response
        gout = datapath.with_suffix(".splits")

    outfigs = gout / "outfigs"
    os.makedirs(gout, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)

    # -----------------------------------------------
    # Create logger
    # -----------------------------------------------
    lg = Logger(gout / "data.splitter.log")
    print_fn = get_print_func(lg.logger)
    print_fn(f"File path: {fdir}")
    print_fn(f"\n{pformat(vars(args))}")
    dump_dict(vars(args), outpath=gout / "data.splitter.args.txt")

    # -----------------------------------------------
    # Load data
    # -----------------------------------------------
    print_fn("\nLoad master dataset.")
    data = load_data(datapath)
    print_fn("data.shape {}".format(data.shape))

    # Stratified splits require the target column to be present.
    if (cv_method == "strat") and (trg_name not in data.columns):
        raise ValueError(
            "Prediction target column must be available if splits need to be stratified."
        )

    if trg_name in data.columns:
        plot_hist(data[trg_name], title=f"{trg_name}", fit=None, bins=100,
                  path=outfigs / f"{trg_name}_hist_all.png")

    # -----------------------------------------------
    # Generate splits (train/val/test)
    # -----------------------------------------------
    print_fn("\n{}".format("-" * 50))
    print_fn("Split data into hold-out train/val/test")
    print_fn("{}".format("-" * 50))

    kwargs = {
        "cv_method": cv_method,
        "te_method": te_method,
        "te_size": te_size,
        "mltask": mltask,
        "split_on": args.split_on,
    }

    data_splitter(
        data=data,
        n_splits=args.n_splits,
        gout=gout,
        outfigs=outfigs,
        target_name=trg_name,
        print_fn=print_fn,
        seed=seed,  # NOTE(review): `seed` is a module-level global — confirm it is defined at import time
        **kwargs)

    print_fn("Runtime: {:.1f} min".format((time() - t0) / 60))
    print_fn("Done.")
    lg.close_logger()