def make_write_jobs(mseqs, fasta_file, seqs_bed_file, seqs_cov_dir, tfr_dir,
                    gi, unmap_npy, targets_start, sum_targets, options):
  """Make basenji_data_write.py jobs for one genome."""
  write_jobs = []

  for tvt_set in ['train', 'valid', 'test']:
    tvt_set_indexes = [
        i for i in range(len(mseqs)) if mseqs[i].label == tvt_set
    ]
    tvt_set_start = tvt_set_indexes[0]
    tvt_set_end = tvt_set_indexes[-1] + 1

    tfr_i = 0
    tfr_start = tvt_set_start
    tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

    while tfr_start <= tvt_set_end:
      tfr_stem = '%s/%s-%d-%d' % (tfr_dir, tvt_set, gi, tfr_i)

      cmd = 'basenji_data_write.py'
      cmd += ' -s %d' % tfr_start
      cmd += ' -e %d' % tfr_end
      cmd += ' -g %d' % gi
      cmd += ' --ts %d' % targets_start
      cmd += ' --te %d' % sum_targets
      if unmap_npy is not None:
        cmd += ' -u %s' % unmap_npy
      if options.umap_set is not None:
        cmd += ' --umap_set %f' % options.umap_set

      cmd += ' %s' % fasta_file
      cmd += ' %s' % seqs_bed_file
      cmd += ' %s' % seqs_cov_dir
      cmd += ' %s.tfr' % tfr_stem

      if options.run_local:
        cmd += ' &> %s.err' % tfr_stem
        write_jobs.append(cmd)
      else:
        j = slurm.Job(cmd,
                      name='write_%s-%d' % (tvt_set, tfr_i),
                      out_file='%s.out' % tfr_stem,
                      err_file='%s.err' % tfr_stem,
                      queue='standard',
                      mem=15000, time='12:0:0')
        write_jobs.append(j)

      # update
      tfr_i += 1
      tfr_start += options.seqs_per_tfr
      tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

  return write_jobs
def make_read_jobs(seqs_bed_file, targets_df, gi, seqs_cov_dir, options):
    """Make basenji_data_read.py jobs for one genome."""
    # filter targets
    targets_df_gi = targets_df[targets_df.genome == gi]

    read_jobs = []
    for ti in range(targets_df_gi.shape[0]):
        genome_cov_file = targets_df_gi["file"].iloc[ti]
        seqs_cov_stem = "%s/%d-%d" % (seqs_cov_dir, gi, ti)
        seqs_cov_file = "%s.h5" % seqs_cov_stem

        clip_ti = None
        if "clip" in targets_df_gi.columns:
            clip_ti = targets_df_gi["clip"].iloc[ti]

        scale_ti = 1
        if "scale" in targets_df_gi.columns:
            scale_ti = targets_df_gi["scale"].iloc[ti]

        if os.path.isfile(seqs_cov_file):
            print("Skipping existing %s" % seqs_cov_file, file=sys.stderr)
        else:
            cmd = "basenji_data_read.py"
            cmd += " -u %s" % targets_df_gi["sum_stat"].iloc[ti]
            cmd += " -w %d" % options.pool_width
            if clip_ti is not None:
                cmd += " -c %f" % clip_ti
            if options.soft_clip:
                cmd += " --soft"
            cmd += " -s %f" % scale_ti
            if options.blacklist_beds[gi]:
                cmd += " -b %s" % options.blacklist_beds[gi]

            cmd += " %s" % genome_cov_file
            cmd += " %s" % seqs_bed_file
            cmd += " %s" % seqs_cov_file

            if options.run_local:
                cmd += " &> %s.err" % seqs_cov_stem
                read_jobs.append(cmd)
            else:
                j = slurm.Job(
                    cmd,
                    name="read_t%d" % ti,
                    out_file="%s.out" % seqs_cov_stem,
                    err_file="%s.err" % seqs_cov_stem,
                    queue="standard",
                    mem=15000,
                    time="12:0:0",
                )
                read_jobs.append(j)

    return read_jobs
def make_read_jobs(seqs_bed_file, targets_df, gi, seqs_cov_dir, options):
  """Make basenji_data_read.py jobs for one genome."""
  # filter targets
  targets_df_gi = targets_df[targets_df.genome == gi]

  read_jobs = []
  for ti in range(targets_df_gi.shape[0]):
    genome_cov_file = targets_df_gi['file'].iloc[ti]
    seqs_cov_stem = '%s/%d-%d' % (seqs_cov_dir, gi, ti)
    seqs_cov_file = '%s.h5' % seqs_cov_stem

    clip_ti = None
    if 'clip' in targets_df_gi.columns:
      clip_ti = targets_df_gi['clip'].iloc[ti]

    scale_ti = 1
    if 'scale' in targets_df_gi.columns:
      scale_ti = targets_df_gi['scale'].iloc[ti]

    if options.restart and os.path.isfile(seqs_cov_file):
      print('Skipping existing %s' % seqs_cov_file, file=sys.stderr)
    else:
      cmd = 'basenji_data_read.py'
      cmd += ' --crop %d' % options.crop_bp
      cmd += ' -u %s' % targets_df_gi['sum_stat'].iloc[ti]
      cmd += ' -w %d' % options.pool_width
      if clip_ti is not None:
        cmd += ' -c %f' % clip_ti
      if options.soft_clip:
        cmd += ' --soft'
      cmd += ' -s %f' % scale_ti
      if options.blacklist_beds[gi]:
        cmd += ' -b %s' % options.blacklist_beds[gi]
      if options.interp_nan:
        cmd += ' -i'

      cmd += ' %s' % genome_cov_file
      cmd += ' %s' % seqs_bed_file
      cmd += ' %s' % seqs_cov_file

      if options.run_local:
        cmd += ' &> %s.err' % seqs_cov_stem
        read_jobs.append(cmd)
      else:
        j = slurm.Job(cmd,
                      name='read_t%d' % ti,
                      out_file='%s.out' % seqs_cov_stem,
                      err_file='%s.err' % seqs_cov_stem,
                      queue='standard',
                      mem=15000, time='12:0:0')
        read_jobs.append(j)

  return read_jobs
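
# Minimal dispatch sketch (not part of the original file): assuming pandas,
# util, and slurm are imported as in the surrounding scripts, and given
# hypothetical per-genome 'seqs_bed_files' and 'seqs_cov_dirs' mappings,
# read jobs for every genome in the targets table could be collected and
# run locally or on SLURM, mirroring the pattern used elsewhere in this repo.
def launch_read_jobs_example(targets_file, seqs_bed_files, seqs_cov_dirs, options):
  targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')
  read_jobs = []
  for gi in sorted(targets_df.genome.unique()):
    read_jobs += make_read_jobs(seqs_bed_files[gi], targets_df, gi,
                                seqs_cov_dirs[gi], options)
  if options.run_local:
    util.exec_par(read_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(read_jobs, options.processes, verbose=True)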
def test_train(self):
  if os.path.isdir(self.exp_dir):
    shutil.rmtree(self.exp_dir)
  os.mkdir(self.exp_dir)

  ################################################################
  # basenji test
  ################################################################
  basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
  basenji_cmd += ' conda activate %s;' % self.conda_env
  basenji_cmd += ' basenji_test.py'
  basenji_cmd += ' -o %s' % self.exp_dir
  basenji_cmd += ' --rc'
  basenji_cmd += ' --shifts "1,0,-1"'
  basenji_cmd += ' %s' % self.params_file
  basenji_cmd += ' %s' % self.model_file
  basenji_cmd += ' %s' % self.data_dir

  basenji_job = slurm.Job(basenji_cmd,
                          name='test_test',
                          out_file='%s/test.out' % self.exp_dir,
                          err_file='%s/test.err' % self.exp_dir,
                          queue=self.queue,
                          cpu=1, gpu=1,
                          mem=23000, time='1:00:00')

  slurm.multi_run([basenji_job], verbose=True)

  ################################################################
  # compare
  ################################################################
  if os.path.isfile(self.ref_acc_file):
    ref_df = pd.read_csv(self.ref_acc_file, sep='\t', index_col=0)

    exp_acc_file = '%s/acc.txt' % self.exp_dir
    exp_df = pd.read_csv(exp_acc_file, sep='\t', index_col=0)

    np.testing.assert_allclose(ref_df.pearsonr, exp_df.pearsonr,
                               atol=1e-3, rtol=1e-3)
    np.testing.assert_allclose(ref_df.r2, exp_df.r2,
                               atol=1e-3, rtol=1e-3)
  else:
    print('Moving experiment to reference.')
    os.rename(self.exp_dir, os.path.split(self.ref_acc_file)[0])
def make_write_jobs(
    mseqs,
    fasta_file,
    seqs_bed_file,
    seqs_cov_dir,
    tfr_dir,
    gi,
    unmap_npy,
    targets_start,
    sum_targets,
    options,
):
    """Make basenji_data_write.py jobs for one genome."""
    write_jobs = []

    for tvt_set in ["train", "valid", "test"]:
        tvt_set_indexes = [i for i in range(len(mseqs)) if mseqs[i].label == tvt_set]
        tvt_set_start = tvt_set_indexes[0]
        tvt_set_end = tvt_set_indexes[-1] + 1

        tfr_i = 0
        tfr_start = tvt_set_start
        tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

        while tfr_start <= tvt_set_end:
            tfr_stem = "%s/%s-%d-%d" % (tfr_dir, tvt_set, gi, tfr_i)

            cmd = "basenji_data_write.py"
            cmd += " -s %d" % tfr_start
            cmd += " -e %d" % tfr_end
            cmd += " -g %d" % gi
            cmd += " --ts %d" % targets_start
            cmd += " --te %d" % sum_targets
            if unmap_npy is not None:
                cmd += " -u %s" % unmap_npy
            if options.umap_set is not None:
                cmd += " --umap_set %f" % options.umap_set

            cmd += " %s" % fasta_file
            cmd += " %s" % seqs_bed_file
            cmd += " %s" % seqs_cov_dir
            cmd += " %s.tfr" % tfr_stem

            if options.run_local:
                cmd += " &> %s.err" % tfr_stem
                write_jobs.append(cmd)
            else:
                j = slurm.Job(
                    cmd,
                    name="write_%s-%d" % (tvt_set, tfr_i),
                    out_file="%s.out" % tfr_stem,
                    err_file="%s.err" % tfr_stem,
                    queue="standard",
                    mem=15000,
                    time="12:0:0",
                )
                write_jobs.append(j)

            # update
            tfr_i += 1
            tfr_start += options.seqs_per_tfr
            tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

    return write_jobs
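
# Illustrative walkthrough (hypothetical values, not from the original file):
# with seqs_per_tfr=128 and a 'train' split covering mseqs[0:300] for genome
# gi=0, the loop above emits three basenji_data_write.py jobs covering
# sequence ranges [0,128), [128,256), and [256,300), writing
# tfr_dir/train-0-0.tfr, train-0-1.tfr, and train-0-2.tfr, each with a
# matching .out/.err log when dispatched through slurm.Job.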
def main():
  usage = 'usage: %prog [options] <params_file> <model_file> <vcf_file>'
  parser = OptionParser(usage)

  # sad
  parser.add_option('-c', dest='center_pct', default=0.25, type='float',
      help='Require clustered SNPs lie in center region [Default: %default]')
  parser.add_option('-f', dest='genome_fasta',
      default='%s/data/hg19.fa' % os.environ['BASENJIDIR'],
      help='Genome FASTA for sequences [Default: %default]')
  parser.add_option('-g', dest='genome_file',
      default='%s/data/human.hg19.genome' % os.environ['BASENJIDIR'],
      help='Chromosome lengths file [Default: %default]')
  parser.add_option('--h5', dest='out_h5', default=False, action='store_true',
      help='Output stats to sad.h5 [Default: %default]')
  parser.add_option('--local', dest='local', default=1024, type='int',
      help='Local SAD score [Default: %default]')
  parser.add_option('-n', dest='norm_file', default=None,
      help='Normalize SAD scores')
  parser.add_option('-o', dest='out_dir', default='sad',
      help='Output directory for tables and plots [Default: %default]')
  parser.add_option('--pseudo', dest='log_pseudo', default=1, type='float',
      help='Log2 pseudocount [Default: %default]')
  parser.add_option('--rc', dest='rc', default=False, action='store_true',
      help='Average forward and reverse complement predictions [Default: %default]')
  parser.add_option('--shifts', dest='shifts', default='0', type='str',
      help='Ensemble prediction shifts [Default: %default]')
  parser.add_option('--stats', dest='sad_stats', default='SAD',
      help='Comma-separated list of stats to save. [Default: %default]')
  parser.add_option('-t', dest='targets_file', default=None, type='str',
      help='File specifying target indexes and labels in table format')
  parser.add_option('--ti', dest='track_indexes', default=None, type='str',
      help='Comma-separated list of target indexes to output BigWig tracks')
  parser.add_option('-u', dest='penultimate', default=False, action='store_true',
      help='Compute SED in the penultimate layer [Default: %default]')
  parser.add_option('-z', dest='out_zarr', default=False, action='store_true',
      help='Output stats to sad.zarr [Default: %default]')

  # multi
  parser.add_option('--name', dest='name', default='sad',
      help='SLURM name prefix [Default: %default]')
  parser.add_option('-p', dest='processes', default=None, type='int',
      help='Number of processes, passed by multi script')
  parser.add_option('-q', dest='queue', default='k80',
      help='SLURM queue on which to run the jobs [Default: %default]')
  parser.add_option('-r', dest='restart', default=False, action='store_true',
      help='Restart a partially completed job [Default: %default]')

  (options, args) = parser.parse_args()

  if len(args) != 3:
    parser.error('Must provide parameters and model files and VCF file')
  else:
    params_file = args[0]
    model_file = args[1]
    vcf_file = args[2]

  #######################################################
  # prep work

  # output directory
  if not options.restart:
    if os.path.isdir(options.out_dir):
      print('Please remove %s' % options.out_dir, file=sys.stderr)
      exit(1)
    os.mkdir(options.out_dir)

  # pickle options
  options_pkl_file = '%s/options.pkl' % options.out_dir
  options_pkl = open(options_pkl_file, 'wb')
  pickle.dump(options, options_pkl)
  options_pkl.close()

  #######################################################
  # launch worker threads
  jobs = []
  for pi in range(options.processes):
    if not options.restart or not job_completed(options, pi):
      cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
      cmd += ' conda activate tf1.13-gpu;'
      cmd += ' echo $HOSTNAME;'
      cmd += ' basenji_sad_ref.py %s %s %d' % (options_pkl_file, ' '.join(args), pi)

      name = '%s_p%d' % (options.name, pi)
      outf = '%s/job%d.out' % (options.out_dir, pi)
      errf = '%s/job%d.err' % (options.out_dir, pi)

      j = slurm.Job(cmd, name,
                    outf, errf,
                    queue=options.queue, gpu=1,
                    mem=37000, time='7-0:0:0')
      jobs.append(j)

  slurm.multi_run(jobs, max_proc=options.processes, verbose=True,
                  launch_sleep=10, update_sleep=60)

  #######################################################
  # collect output

  if options.out_h5:
    collect_h5('sad.h5', options.out_dir, options.processes)
  elif options.out_zarr:
    collect_zarr('sad.zarr', options.out_dir, options.processes)
  else:
    collect_table('sad_table.txt', options.out_dir, options.processes)
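
# Worker/restart sketch (assumptions; neither helper is defined in this
# excerpt): each worker receives the pickled options path, the original
# positional args, and its process index pi. A worker would reload the
# options roughly like this, and a job_completed-style check could simply
# test for that worker's expected output; the per-process path below is
# hypothetical.
def load_options_example(options_pkl_file):
  with open(options_pkl_file, 'rb') as options_pkl:
    return pickle.load(options_pkl)

def job_completed_example(options, pi):
  # hypothetical per-process output file; adjust to the real worker output
  return os.path.isfile('%s/job%d/sad.h5' % (options.out_dir, pi))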
def main():
  usage = 'usage: %prog [options] <params_file> <data1_dir> ...'
  parser = OptionParser(usage)

  # train
  train_options = OptionGroup(parser, 'basenji_train.py options')
  train_options.add_option('-k', dest='keras_fit', default=False, action='store_true',
      help='Train with Keras fit method [Default: %default]')
  train_options.add_option('-o', dest='out_dir', default='train_out',
      help='Output directory for test statistics [Default: %default]')
  train_options.add_option('--restore', dest='restore',
      help='Restore model and continue training, from existing fold train dir [Default: %default]')
  train_options.add_option('--trunk', dest='trunk', default=False, action='store_true',
      help='Restore only model trunk [Default: %default]')
  train_options.add_option('--tfr_train', dest='tfr_train_pattern', default=None,
      help='Training TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]')
  train_options.add_option('--tfr_eval', dest='tfr_eval_pattern', default=None,
      help='Evaluation TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]')
  parser.add_option_group(train_options)

  # test
  test_options = OptionGroup(parser, 'basenji_test.py options')
  test_options.add_option('--rc', dest='rc', default=False, action='store_true',
      help='Average forward and reverse complement predictions [Default: %default]')
  test_options.add_option('--shifts', dest='shifts', default='0', type='str',
      help='Ensemble prediction shifts [Default: %default]')
  parser.add_option_group(test_options)

  # multi
  rep_options = OptionGroup(parser, 'replication options')
  rep_options.add_option('-c', dest='crosses', default=1, type='int',
      help='Number of cross-fold rounds [Default:%default]')
  rep_options.add_option('-e', dest='conda_env', default='tf2.4',
      help='Anaconda environment [Default: %default]')
  rep_options.add_option('-f', dest='fold_subset', default=None, type='int',
      help='Run a subset of folds [Default:%default]')
  rep_options.add_option('--name', dest='name', default='fold',
      help='SLURM name prefix [Default: %default]')
  rep_options.add_option('-p', dest='processes', default=None, type='int',
      help='Number of processes, passed by multi script')
  rep_options.add_option('-q', dest='queue', default='gtx1080ti',
      help='SLURM queue on which to run the jobs [Default: %default]')
  rep_options.add_option('-r', dest='restart', default=False, action='store_true')
  rep_options.add_option('--spec_off', dest='spec_off', default=False, action='store_true')
  rep_options.add_option('--test_off', dest='test_off', default=False, action='store_true')
  rep_options.add_option('--test_train_off', dest='test_train_off', default=False, action='store_true')
  parser.add_option_group(rep_options)

  (options, args) = parser.parse_args()

  if len(args) < 2:
    parser.error('Must provide parameters and data directory.')
  else:
    params_file = os.path.abspath(args[0])
    data_dirs = [os.path.abspath(arg) for arg in args[1:]]

  # read model parameters
  with open(params_file) as params_open:
    params = json.load(params_open)
  params_train = params['train']

  #######################################################
  # prep work

  if not options.restart and os.path.isdir(options.out_dir):
    print('Output directory %s exists. Please remove.' % options.out_dir)
    exit(1)
  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  # read data parameters
  num_data = len(data_dirs)
  data_stats_file = '%s/statistics.json' % data_dirs[0]
  with open(data_stats_file) as data_stats_open:
    data_stats = json.load(data_stats_open)

  # count folds
  num_folds = len([dkey for dkey in data_stats if dkey.startswith('fold')])

  # subset folds
  if options.fold_subset is not None:
    num_folds = min(options.fold_subset, num_folds)

  #######################################################
  # train

  jobs = []

  for ci in range(options.crosses):
    for fi in range(num_folds):
      rep_dir = '%s/f%d_c%d' % (options.out_dir, fi, ci)

      if options.restart and os.path.isdir(rep_dir):
        print('%s found and skipped.' % rep_dir)
      else:
        # make rep dir
        os.mkdir(rep_dir)

        # make rep data
        rep_data_dirs = []
        for di in range(num_data):
          rep_data_dirs.append('%s/data%d' % (rep_dir, di))
          make_rep_data(data_dirs[di], rep_data_dirs[-1], fi, ci)

        # train command
        cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
        cmd += ' conda activate %s;' % options.conda_env
        cmd += ' echo $HOSTNAME;'

        cmd += ' basenji_train.py'
        cmd += ' %s' % options_string(options, train_options, rep_dir)
        cmd += ' %s %s' % (params_file, ' '.join(rep_data_dirs))

        name = '%s-train-f%dc%d' % (options.name, fi, ci)
        sbf = os.path.abspath('%s/train.sb' % rep_dir)
        outf = os.path.abspath('%s/train.out' % rep_dir)
        errf = os.path.abspath('%s/train.err' % rep_dir)

        j = slurm.Job(cmd, name,
                      outf, errf, sbf,
                      queue=options.queue,
                      cpu=4,
                      gpu=params_train.get('num_gpu', 1),
                      mem=37000, time='28-0:0:0')
        jobs.append(j)

  slurm.multi_run(jobs, max_proc=options.processes, verbose=True,
                  launch_sleep=10, update_sleep=60)

  #######################################################
  # test train

  jobs = []

  if not options.test_train_off:
    for ci in range(options.crosses):
      for fi in range(num_folds):
        it_dir = '%s/f%d_c%d' % (options.out_dir, fi, ci)

        for di in range(num_data):
          if num_data == 1:
            out_dir = '%s/test_train' % it_dir
            model_file = '%s/train/model_check.h5' % it_dir
          else:
            out_dir = '%s/test%d_train' % (it_dir, di)
            model_file = '%s/train/model%d_check.h5' % (it_dir, di)

          # check if done
          acc_file = '%s/acc.txt' % out_dir
          if os.path.isfile(acc_file):
            print('%s already generated.' % acc_file)
          else:
            # basenji test
            basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
            basenji_cmd += ' conda activate %s;' % options.conda_env
            basenji_cmd += ' basenji_test.py'
            basenji_cmd += ' --head %d' % di
            basenji_cmd += ' -o %s' % out_dir
            if options.rc:
              basenji_cmd += ' --rc'
            if options.shifts:
              basenji_cmd += ' --shifts %s' % options.shifts
            basenji_cmd += ' --split train'
            basenji_cmd += ' %s' % params_file
            basenji_cmd += ' %s' % model_file
            basenji_cmd += ' %s/data%d' % (it_dir, di)

            name = '%s-testtr-f%dc%d' % (options.name, fi, ci)
            basenji_job = slurm.Job(basenji_cmd,
                                    name=name,
                                    out_file='%s.out' % out_dir,
                                    err_file='%s.err' % out_dir,
                                    queue=options.queue,
                                    cpu=1, gpu=1,
                                    mem=23000, time='8:00:00')
            jobs.append(basenji_job)

  #######################################################
  # test best

  if not options.test_off:
    for ci in range(options.crosses):
      for fi in range(num_folds):
        it_dir = '%s/f%d_c%d' % (options.out_dir, fi, ci)

        for di in range(num_data):
          if num_data == 1:
            out_dir = '%s/test' % it_dir
            model_file = '%s/train/model_best.h5' % it_dir
          else:
            out_dir = '%s/test%d' % (it_dir, di)
            model_file = '%s/train/model%d_best.h5' % (it_dir, di)

          # check if done
          acc_file = '%s/acc.txt' % out_dir
          if os.path.isfile(acc_file):
            print('%s already generated.' % acc_file)
          else:
            # basenji test
            basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
            basenji_cmd += ' conda activate %s;' % options.conda_env
            basenji_cmd += ' basenji_test.py'
            basenji_cmd += ' --head %d' % di
            basenji_cmd += ' -o %s' % out_dir
            if options.rc:
              basenji_cmd += ' --rc'
            if options.shifts:
              basenji_cmd += ' --shifts %s' % options.shifts
            basenji_cmd += ' %s' % params_file
            basenji_cmd += ' %s' % model_file
            basenji_cmd += ' %s/data%d' % (it_dir, di)

            name = '%s-test-f%dc%d' % (options.name, fi, ci)
            basenji_job = slurm.Job(basenji_cmd,
                                    name=name,
                                    out_file='%s.out' % out_dir,
                                    err_file='%s.err' % out_dir,
                                    queue=options.queue,
                                    cpu=1, gpu=1,
                                    mem=23000, time='4:00:00')
            jobs.append(basenji_job)

  #######################################################
  # test best specificity

  if not options.spec_off:
    for ci in range(options.crosses):
      for fi in range(num_folds):
        it_dir = '%s/f%d_c%d' % (options.out_dir, fi, ci)

        for di in range(num_data):
          if num_data == 1:
            out_dir = '%s/test_spec' % it_dir
            model_file = '%s/train/model_best.h5' % it_dir
          else:
            out_dir = '%s/test%d_spec' % (it_dir, di)
            model_file = '%s/train/model%d_best.h5' % (it_dir, di)

          # check if done
          acc_file = '%s/acc.txt' % out_dir
          if os.path.isfile(acc_file):
            print('%s already generated.' % acc_file)
          else:
            # basenji test
            basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
            basenji_cmd += ' conda activate %s;' % options.conda_env
            basenji_cmd += ' basenji_test_specificity.py'
            basenji_cmd += ' --head %d' % di
            basenji_cmd += ' -o %s' % out_dir
            if options.rc:
              basenji_cmd += ' --rc'
            if options.shifts:
              basenji_cmd += ' --shifts %s' % options.shifts
            basenji_cmd += ' %s' % params_file
            basenji_cmd += ' %s' % model_file
            basenji_cmd += ' %s/data%d' % (it_dir, di)

            name = '%s-spec-f%dc%d' % (options.name, fi, ci)
            basenji_job = slurm.Job(basenji_cmd,
                                    name=name,
                                    out_file='%s.out' % out_dir,
                                    err_file='%s.err' % out_dir,
                                    queue=options.queue,
                                    cpu=1, gpu=1,
                                    mem=90000, time='6:00:00')
            jobs.append(basenji_job)

  slurm.multi_run(jobs, max_proc=options.processes, verbose=True,
                  launch_sleep=10, update_sleep=60)
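
# Resulting layout (illustrative, derived from the paths constructed above)
# for out_dir='train_out', a single data directory, fold fi=0, cross ci=0:
#   train_out/f0_c0/data0/       replicated dataset from make_rep_data
#   train_out/f0_c0/train/       basenji_train.py output (model_check.h5, model_best.h5)
#   train_out/f0_c0/test_train/  basenji_test.py on the train split with the checkpoint model
#   train_out/f0_c0/test/        basenji_test.py on the test split with the best model
#   train_out/f0_c0/test_spec/   basenji_test_specificity.py output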
def main():
  usage = 'usage: %prog [options] <params_file> <model_file> <bed_file>'
  parser = OptionParser(usage)

  # basenji_predict_bed.py options
  parser.add_option('-b', dest='bigwig_indexes', default=None,
      help='Comma-separated list of target indexes to write BigWigs')
  parser.add_option('-e', dest='embed_layer', default=None, type='int',
      help='Embed sequences using the specified layer index.')
  parser.add_option('-f', dest='genome_fasta', default=None,
      help='Genome FASTA for sequences [Default: %default]')
  parser.add_option('-g', dest='genome_file', default=None,
      help='Chromosome length information [Default: %default]')
  parser.add_option('-l', dest='site_length', default=None, type='int',
      help='Prediction site length. [Default: params.seq_length]')
  parser.add_option('-o', dest='out_dir', default='pred_out',
      help='Output directory [Default: %default]')
  parser.add_option('--rc', dest='rc', default=False, action='store_true',
      help='Ensemble forward and reverse complement predictions [Default: %default]')
  parser.add_option('-s', dest='sum', default=False, action='store_true',
      help='Sum site predictions [Default: %default]')
  parser.add_option('--shifts', dest='shifts', default='0',
      help='Ensemble prediction shifts [Default: %default]')
  parser.add_option('-t', dest='targets_file', default=None, type='str',
      help='File specifying target indexes and labels in table format')

  # _multi.py options
  parser.add_option('-p', dest='processes', default=None, type='int',
      help='Number of processes, passed by multi script')
  parser.add_option('-q', dest='queue', default='gtx1080ti',
      help='SLURM queue on which to run the jobs [Default: %default]')
  parser.add_option('-r', dest='restart', default=False, action='store_true',
      help='Restart a partially completed job [Default: %default]')

  (options, args) = parser.parse_args()

  if len(args) != 3:
    print(args)
    parser.error('Must provide parameters and model files and BED file')
  else:
    params_file = args[0]
    model_file = args[1]
    bed_file = args[2]

  #######################################################
  # prep work

  # output directory
  if not options.restart:
    if os.path.isdir(options.out_dir):
      print('Please remove %s' % options.out_dir, file=sys.stderr)
      exit(1)
    os.mkdir(options.out_dir)

  # pickle options
  options_pkl_file = '%s/options.pkl' % options.out_dir
  options_pkl = open(options_pkl_file, 'wb')
  pickle.dump(options, options_pkl)
  options_pkl.close()

  #######################################################
  # launch worker threads
  jobs = []
  for pi in range(options.processes):
    if not options.restart or not job_completed(options, pi):
      cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
      cmd += ' conda activate tf1.15-gpu;'
      cmd += ' basenji_predict_bed.py %s %s %d' % (options_pkl_file, ' '.join(args), pi)

      name = 'pred_p%d' % pi
      outf = '%s/job%d.out' % (options.out_dir, pi)
      errf = '%s/job%d.err' % (options.out_dir, pi)

      j = slurm.Job(cmd, name,
                    outf, errf,
                    queue=options.queue, gpu=1,
                    mem=60000, time='14-0:0:0')
      jobs.append(j)

  slurm.multi_run(jobs, max_proc=options.processes, verbose=True,
                  launch_sleep=10, update_sleep=60)

  #######################################################
  # collect output

  collect_h5(options.out_dir, options.processes)
def test_train(self):
  exp_dir = 'train_full/exp'
  if os.path.isdir(exp_dir):
    shutil.rmtree(exp_dir)
  os.mkdir(exp_dir)

  ################################################################
  # train
  ################################################################
  jobs = []
  for i in range(self.iterations):
    it_dir = '%s/%d' % (exp_dir, i)
    os.mkdir(it_dir)

    # basenji train
    basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
    basenji_cmd += ' conda activate %s;' % self.conda_env
    basenji_cmd += ' %s/basenji_train.py' % self.basenji_path
    basenji_cmd += ' -o %s/train' % it_dir
    basenji_cmd += ' %s' % self.params_file
    basenji_cmd += ' %s' % self.data_dir

    basenji_job = slurm.Job(basenji_cmd,
                            name='train%d' % i,
                            out_file='%s/train.out' % it_dir,
                            err_file='%s/train.err' % it_dir,
                            queue=self.queue,
                            cpu=1, gpu=1,
                            mem=23000, time='12-00:00:00')
    jobs.append(basenji_job)

  slurm.multi_run(jobs, verbose=True)

  ################################################################
  # test check
  ################################################################
  jobs = []
  for i in range(self.iterations):
    it_dir = '%s/%d' % (exp_dir, i)

    # basenji test
    basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
    basenji_cmd += ' conda activate %s;' % self.conda_env
    basenji_cmd += ' %s/basenji_test.py' % self.basenji_path
    basenji_cmd += ' -o %s/test_train' % it_dir
    basenji_cmd += ' --tfr "train-*.tfr"'
    basenji_cmd += ' %s' % self.params_file
    basenji_cmd += ' %s/train/model_check.h5' % it_dir
    basenji_cmd += ' %s' % self.data_dir

    basenji_job = slurm.Job(basenji_cmd,
                            name='test%d' % i,
                            out_file='%s/test_train.out' % it_dir,
                            err_file='%s/test_train.err' % it_dir,
                            queue=self.queue,
                            cpu=1, gpu=1,
                            mem=23000, time='4:00:00')
    jobs.append(basenji_job)

  slurm.multi_run(jobs, verbose=True)

  ################################################################
  # test best
  ################################################################
  jobs = []
  for i in range(self.iterations):
    it_dir = '%s/%d' % (exp_dir, i)

    # basenji test
    basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
    basenji_cmd += ' conda activate %s;' % self.conda_env
    basenji_cmd += ' %s/basenji_test.py' % self.basenji_path
    basenji_cmd += ' -o %s/test' % it_dir
    basenji_cmd += ' %s' % self.params_file
    basenji_cmd += ' %s/train/model_best.h5' % it_dir
    basenji_cmd += ' %s' % self.data_dir

    basenji_job = slurm.Job(basenji_cmd,
                            name='test%d' % i,
                            out_file='%s/test.out' % it_dir,
                            err_file='%s/test.err' % it_dir,
                            queue=self.queue,
                            cpu=1, gpu=1,
                            mem=23000, time='4:00:00')
    jobs.append(basenji_job)

  slurm.multi_run(jobs, verbose=True)

  ################################################################
  # compare checkpoint on training set
  ################################################################
  ref_cors = []
  for acc_file in glob.glob('%s/*/test_train/acc.txt' % self.ref_dir):
    acc_df = pd.read_csv(acc_file, sep='\t', index_col=0)
    ref_cors.append(acc_df.pearsonr.mean())

  exp_cors = []
  for acc_file in glob.glob('%s/*/test_train/acc.txt' % exp_dir):
    acc_df = pd.read_csv(acc_file, sep='\t', index_col=0)
    exp_cors.append(acc_df.pearsonr.mean())

  _, mwp = mannwhitneyu(ref_cors, exp_cors, alternative='two-sided')
  _, tp = ttest_ind(ref_cors, exp_cors)
  print('\nTrain:')
  print('Reference PearsonR: %.4f (%.4f)' % (np.mean(ref_cors), np.std(ref_cors)))
  print('Experiment PearsonR: %.4f (%.4f)' % (np.mean(exp_cors), np.std(exp_cors)))
  print('Mann-Whitney U p-value: %.3g' % mwp)
  print('T-test p-value: %.3g' % tp)

  # self.assertGreater(mwp, 0.05)
  # self.assertGreater(tp, 0.05)

  ################################################################
  # compare best on test set
  ################################################################
  ref_cors = []
  for acc_file in glob.glob('%s/*/test/acc.txt' % self.ref_dir):
    acc_df = pd.read_csv(acc_file, sep='\t', index_col=0)
    ref_cors.append(acc_df.pearsonr.mean())

  exp_cors = []
  for acc_file in glob.glob('%s/*/test/acc.txt' % exp_dir):
    acc_df = pd.read_csv(acc_file, sep='\t', index_col=0)
    exp_cors.append(acc_df.pearsonr.mean())

  _, mwp = mannwhitneyu(ref_cors, exp_cors, alternative='two-sided')
  _, tp = ttest_ind(ref_cors, exp_cors)
  print('\nTest:')
  print('Reference PearsonR: %.4f (%.4f)' % (np.mean(ref_cors), np.std(ref_cors)))
  print('Experiment PearsonR: %.4f (%.4f)' % (np.mean(exp_cors), np.std(exp_cors)))
  print('Mann-Whitney U p-value: %.3g' % mwp)
  print('T-test p-value: %.3g' % tp)
def main():
  usage = 'usage: %prog [options] <fasta_file> <targets_file>'
  parser = OptionParser(usage)
  parser.add_option('-b', dest='blacklist_bed',
      help='Set blacklist nucleotides to a baseline value.')
  parser.add_option('--break', dest='break_t', default=8388608, type='int',
      help='Break in half contigs above length [Default: %default]')
  parser.add_option('--crop', dest='crop_bp', default=0, type='int',
      help='Crop bp off each end [Default: %default]')
  parser.add_option('-d', dest='diagonal_offset', default=2, type='int',
      help='Positions on the diagonal to ignore [Default: %default]')
  parser.add_option('-g', dest='gaps_file',
      help='Genome assembly gaps BED [Default: %default]')
  parser.add_option('-k', dest='kernel_stddev', default=0, type='int',
      help='Gaussian kernel stddev to smooth values [Default: %default]')
  parser.add_option('-l', dest='seq_length', default=131072, type='int',
      help='Sequence length [Default: %default]')
  parser.add_option('--limit', dest='limit_bed',
      help='Limit to segments that overlap regions in a BED file')
  parser.add_option('--local', dest='run_local', default=False, action='store_true',
      help='Run jobs locally as opposed to on SLURM [Default: %default]')
  parser.add_option('-o', dest='out_dir', default='data_out',
      help='Output directory [Default: %default]')
  parser.add_option('-p', dest='processes', default=None, type='int',
      help='Number parallel processes [Default: %default]')
  parser.add_option('-r', dest='seqs_per_tfr', default=128, type='int',
      help='Sequences per TFRecord file [Default: %default]')
  parser.add_option('--restart', dest='restart', default=False, action='store_true',
      help='Skip already read HDF5 coverage values. [Default: %default]')
  parser.add_option('--sample', dest='sample_pct', default=1.0, type='float',
      help='Down-sample the segments')
  parser.add_option('--seed', dest='seed', default=44, type='int',
      help='Random seed [Default: %default]')
  parser.add_option('--stride_train', dest='stride_train', default=1., type='float',
      help='Stride to advance train sequences [Default: seq_length]')
  parser.add_option('--stride_test', dest='stride_test', default=1., type='float',
      help='Stride to advance valid and test sequences [Default: seq_length]')
  parser.add_option('--soft', dest='soft_clip', default=False, action='store_true',
      help='Soft clip values, applying sqrt to the excess above the threshold [Default: %default]')
  parser.add_option('-t', dest='test_pct_or_chr', default=0.05, type='str',
      help='Proportion of the data for testing [Default: %default]')
  parser.add_option('-u', dest='umap_bed',
      help='Unmappable regions in BED format')
  parser.add_option('--umap_midpoints', dest='umap_midpoints',
      help='Regions with midpoints to exclude in BED format. Used for 4C/HiC.')
  parser.add_option('--umap_t', dest='umap_t', default=0.3, type='float',
      help='Remove sequences with more than this unmappable bin % [Default: %default]')
  parser.add_option('--umap_set', dest='umap_set', default=None, type='float',
      help='Set unmappable regions to this percentile in the sequences\' distribution of values')
  parser.add_option('-w', dest='pool_width', default=128, type='int',
      help='Sum pool width [Default: %default]')
  parser.add_option('-v', dest='valid_pct_or_chr', default=0.05, type='str',
      help='Proportion of the data for validation [Default: %default]')
  parser.add_option('--snap', dest='snap', default=None, type='int',
      help='snap stride to multiple for binned targets in bp, if not None seq_length must be a multiple of snap')
  parser.add_option('--as_obsexp', dest='as_obsexp', action="store_true", default=False,
      help='save targets as obsexp profiles')
  parser.add_option('--global_obsexp', dest='global_obsexp', action="store_true", default=False,
      help='use pre-calculated by-chromosome obs/exp')
  parser.add_option('--no_log', dest='no_log', action="store_true", default=False,
      help='do not take log for obs/exp')

  (options, args) = parser.parse_args()

  if len(args) != 2:
    parser.error('Must provide FASTA and sample coverage labels and paths.')
  else:
    fasta_file = args[0]
    targets_file = args[1]

  random.seed(options.seed)
  np.random.seed(options.seed)

  # transform proportion strides to base pairs
  if options.stride_train <= 1:
    print('stride_train %.f' % options.stride_train, end='')
    options.stride_train = options.stride_train * options.seq_length
    print(' converted to %f' % options.stride_train)
  options.stride_train = int(np.round(options.stride_train))

  if options.stride_test <= 1:
    print('stride_test %.f' % options.stride_test, end='')
    options.stride_test = options.stride_test * options.seq_length
    print(' converted to %f' % options.stride_test)
  options.stride_test = int(np.round(options.stride_test))

  if options.snap != None:
    if np.mod(options.seq_length, options.snap) != 0:
      raise ValueError('seq_length must be a multiple of snap')
    if np.mod(options.stride_train, options.snap) != 0:
      raise ValueError('stride_train must be a multiple of snap')
    if np.mod(options.stride_test, options.snap) != 0:
      raise ValueError('stride_test must be a multiple of snap')

  if os.path.isdir(options.out_dir) and not options.restart:
    print('Remove output directory %s or use --restart option.' % options.out_dir)
    exit(1)
  elif not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  # dump options
  with open('%s/options.json' % options.out_dir, 'w') as options_json_out:
    json.dump(options.__dict__, options_json_out, sort_keys=True, indent=4)

  ################################################################
  # define genomic contigs
  ################################################################
  chrom_contigs = genome.load_chromosomes(fasta_file)

  # remove gaps
  if options.gaps_file:
    chrom_contigs = genome.split_contigs(chrom_contigs, options.gaps_file)

  # ditch the chromosomes for contigs
  contigs = []
  for chrom in chrom_contigs:
    contigs += [
        Contig(chrom, ctg_start, ctg_end)
        for ctg_start, ctg_end in chrom_contigs[chrom]
    ]

  # limit to a BED file
  if options.limit_bed is not None:
    contigs = limit_contigs(contigs, options.limit_bed)

  # filter for large enough
  contigs = [
      ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length
  ]

  # break up large contigs
  if options.break_t is not None:
    contigs = break_large_contigs(contigs, options.break_t)

  # print contigs to BED file
  ctg_bed_file = '%s/contigs.bed' % options.out_dir
  write_seqs_bed(ctg_bed_file, contigs)

  ################################################################
  # divide between train/valid/test
  ################################################################
  try:
    # convert to float pct
    valid_pct = float(options.valid_pct_or_chr)
    test_pct = float(options.test_pct_or_chr)
    assert (0 <= valid_pct <= 1)
    assert (0 <= test_pct <= 1)

    # divide by pct
    contig_sets = divide_contigs_pct(contigs, test_pct, valid_pct)

  except (ValueError, AssertionError):
    # divide by chr
    valid_chrs = options.valid_pct_or_chr.split(',')
    test_chrs = options.test_pct_or_chr.split(',')
    contig_sets = divide_contigs_chr(contigs, test_chrs, valid_chrs)

  train_contigs, valid_contigs, test_contigs = contig_sets

  # rejoin broken contigs within set
  train_contigs = rejoin_large_contigs(train_contigs)
  valid_contigs = rejoin_large_contigs(valid_contigs)
  test_contigs = rejoin_large_contigs(test_contigs)

  ################################################################
  # define model sequences
  ################################################################
  # stride sequences across contig
  train_mseqs = contig_sequences(train_contigs, options.seq_length,
                                 options.stride_train, options.snap,
                                 label='train')
  valid_mseqs = contig_sequences(valid_contigs, options.seq_length,
                                 options.stride_test, options.snap,
                                 label='valid')
  test_mseqs = contig_sequences(test_contigs, options.seq_length,
                                options.stride_test, options.snap,
                                label='test')

  # shuffle
  random.shuffle(train_mseqs)
  random.shuffle(valid_mseqs)
  random.shuffle(test_mseqs)

  # down-sample
  if options.sample_pct < 1.0:
    train_mseqs = random.sample(train_mseqs, int(options.sample_pct * len(train_mseqs)))
    valid_mseqs = random.sample(valid_mseqs, int(options.sample_pct * len(valid_mseqs)))
    test_mseqs = random.sample(test_mseqs, int(options.sample_pct * len(test_mseqs)))

  # merge
  mseqs = train_mseqs + valid_mseqs + test_mseqs

  ################################################################
  # mappability
  ################################################################
  if (options.umap_bed is not None) or (options.umap_midpoints is not None):
    if shutil.which('bedtools') is None:
      print('Install Bedtools to annotate unmappable sites', file=sys.stderr)
      exit(1)

  if options.umap_bed is not None:
    # annotate unmappable positions
    mseqs_unmap = annotate_unmap(mseqs, options.umap_bed,
                                 options.seq_length, options.pool_width)

    # filter unmappable
    mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') < options.umap_t)
    mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
    mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

    # write to file
    unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
    np.save(unmap_npy, mseqs_unmap)

  if options.umap_midpoints is not None:
    # annotate unmappable midpoints for 4C/HiC
    mseqs_unmap = annotate_unmap(mseqs, options.umap_midpoints,
                                 options.seq_length, options.pool_width)

    # filter unmappable
    seqmid = mseqs_unmap.shape[1] // 2  # int( options.seq_length / options.pool_width /2)
    mseqs_map_mask = (np.sum(mseqs_unmap[:, seqmid - 1:seqmid + 1], axis=1) == 0)

    mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
    mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

    # write to file
    unmap_npy = '%s/mseqs_unmap_midpoints.npy' % options.out_dir
    np.save(unmap_npy, mseqs_unmap)

  # write sequences to BED
  print('writing sequences to BED')
  seqs_bed_file = '%s/sequences.bed' % options.out_dir
  write_seqs_bed(seqs_bed_file, mseqs, True)

  ################################################################
  # read sequence coverage values
  ################################################################
  # read target datasets
  targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')

  seqs_cov_dir = '%s/seqs_cov' % options.out_dir
  if not os.path.isdir(seqs_cov_dir):
    os.mkdir(seqs_cov_dir)

  read_jobs = []
  for ti in range(targets_df.shape[0]):
    genome_cov_file = targets_df['file'].iloc[ti]
    seqs_cov_stem = '%s/%d' % (seqs_cov_dir, ti)
    seqs_cov_file = '%s.h5' % seqs_cov_stem

    clip_ti = None
    if 'clip' in targets_df.columns:
      clip_ti = targets_df['clip'].iloc[ti]

    # scale_ti = 1
    # if 'scale' in targets_df.columns:
    #   scale_ti = targets_df['scale'].iloc[ti]

    if options.restart and os.path.isfile(seqs_cov_file):
      print('Skipping existing %s' % seqs_cov_file, file=sys.stderr)
    else:
      cmd = 'akita_data_read.py'
      cmd += ' --crop %d' % options.crop_bp
      cmd += ' -k %d' % options.kernel_stddev
      cmd += ' -w %d' % options.pool_width
      if clip_ti is not None:
        cmd += ' --clip %f' % clip_ti
      if options.soft_clip:
        cmd += ' --soft'
      # cmd += ' -s %f' % scale_ti
      if options.blacklist_bed:
        cmd += ' -b %s' % options.blacklist_bed
      if options.as_obsexp:
        cmd += ' --as_obsexp'
        if options.global_obsexp:
          cmd += ' --global_obsexp'
        if options.no_log:
          cmd += ' --no_log'

      cmd += ' %s' % genome_cov_file
      cmd += ' %s' % seqs_bed_file
      cmd += ' %s' % seqs_cov_file

      if options.run_local:
        # breaks on some OS
        # cmd += ' &> %s.err' % seqs_cov_stem
        read_jobs.append(cmd)
      else:
        j = slurm.Job(cmd,
                      name='read_t%d' % ti,
                      out_file='%s.out' % seqs_cov_stem,
                      err_file='%s.err' % seqs_cov_stem,
                      queue='standard', mem=15000, time='12:0:0')
        read_jobs.append(j)

  if options.run_local:
    util.exec_par(read_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(read_jobs, options.processes, verbose=True,
                    launch_sleep=1, update_sleep=5)

  ################################################################
  # write TF Records
  ################################################################
  # copy targets file
  shutil.copy(targets_file, '%s/targets.txt' % options.out_dir)

  # initialize TF Records dir
  tfr_dir = '%s/tfrecords' % options.out_dir
  if not os.path.isdir(tfr_dir):
    os.mkdir(tfr_dir)

  write_jobs = []

  for tvt_set in ['train', 'valid', 'test']:
    tvt_set_indexes = [
        i for i in range(len(mseqs)) if mseqs[i].label == tvt_set
    ]
    tvt_set_start = tvt_set_indexes[0]
    tvt_set_end = tvt_set_indexes[-1] + 1

    tfr_i = 0
    tfr_start = tvt_set_start
    tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

    while tfr_start <= tvt_set_end:
      tfr_stem = '%s/%s-%d' % (tfr_dir, tvt_set, tfr_i)

      cmd = 'basenji_data_write.py'
      cmd += ' -s %d' % tfr_start
      cmd += ' -e %d' % tfr_end

      # do not use
      # if options.umap_bed is not None:
      #   cmd += ' -u %s' % unmap_npy
      # if options.umap_set is not None:
      #   cmd += ' --umap_set %f' % options.umap_set

      cmd += ' %s' % fasta_file
      cmd += ' %s' % seqs_bed_file
      cmd += ' %s' % seqs_cov_dir
      cmd += ' %s.tfr' % tfr_stem

      if options.run_local:
        # breaks on some OS
        # cmd += ' &> %s.err' % tfr_stem
        write_jobs.append(cmd)
      else:
        j = slurm.Job(cmd,
                      name='write_%s-%d' % (tvt_set, tfr_i),
                      out_file='%s.out' % tfr_stem,
                      err_file='%s.err' % tfr_stem,
                      queue='standard', mem=15000, time='12:0:0')
        write_jobs.append(j)

      # update
      tfr_i += 1
      tfr_start += options.seqs_per_tfr
      tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

  if options.run_local:
    util.exec_par(write_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(write_jobs, options.processes, verbose=True,
                    launch_sleep=1, update_sleep=5)

  ################################################################
  # stats
  ################################################################
  stats_dict = {}
  stats_dict['num_targets'] = targets_df.shape[0]
  stats_dict['train_seqs'] = len(train_mseqs)
  stats_dict['valid_seqs'] = len(valid_mseqs)
  stats_dict['test_seqs'] = len(test_mseqs)
  stats_dict['seq_length'] = options.seq_length
  stats_dict['pool_width'] = options.pool_width
  stats_dict['crop_bp'] = options.crop_bp
  stats_dict['diagonal_offset'] = options.diagonal_offset

  target1_length = options.seq_length - 2 * options.crop_bp
  target1_length = target1_length // options.pool_width
  target1_length = target1_length - options.diagonal_offset
  target_length = target1_length * (target1_length + 1) // 2
  stats_dict['target_length'] = target_length

  with open('%s/statistics.json' % options.out_dir, 'w') as stats_json_out:
    json.dump(stats_dict, stats_json_out, indent=4)
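
# Worked example (illustrative, using the option defaults above):
# seq_length=131072, crop_bp=0, pool_width=128, diagonal_offset=2 gives
# target1_length = 131072 // 128 - 2 = 1022 bins per side, and
# target_length = 1022 * 1023 // 2 = 522753 upper-triangle entries
# stored per sequence in statistics.json.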
def main():
  usage = 'usage: %prog [options] <fasta_file> <sample_wigs_file> <hdf5_file>'
  parser = OptionParser(usage)
  parser.add_option('-b', dest='limit_bed',
      help='Limit to segments that overlap regions in a BED file')
  parser.add_option('-c', dest='clip', default=None, type='float',
      help='Clip target values to have minimum [Default: %default]')
  parser.add_option('--cluster_dir', dest='cluster_dir', default='basenji_hdf5')
  parser.add_option('-d', dest='sample_pct', default=1.0, type='float',
      help='Down-sample the segments')
  parser.add_option('-f', dest='fourier_dim', default=None, type='int',
      help='Fourier transform dimension [Default: %default]')
  parser.add_option('-g', dest='gaps_file',
      help='Genome assembly gaps BED [Default: %default]')
  parser.add_option('-l', dest='seq_length', default=1024, type='int',
      help='Sequence length [Default: %default]')
  parser.add_option('--log2', dest='log10to2', default=False, action='store_true',
      help='Transform values from log10 to log2 [Default: %default]')
  parser.add_option('--mult_cov', dest='cov_multiplier', default=1, type='float',
      help='Coverage multiplier, useful when the read extension and pool width do not match [Default: %default]')
  parser.add_option('-n', dest='na_t', default=0.25, type='float',
      help='Remove sequences with an NA% greater than this threshold [Default: %default]')
  parser.add_option('-o', dest='out_bed_file',
      help='Output the train/valid/test sequences as a BED file')
  parser.add_option('-p', dest='processes', default=1, type='int',
      help='Number parallel processes to load data [Default: %default]')
  parser.add_option('-s', dest='stride', type='int',
      help='Stride to advance segments [Default: seq_length]')
  parser.add_option('-t', dest='test_pct_or_chr', type='str', default=0.05,
      help='Proportion of the data for testing [Default: %default]')
  parser.add_option('-u', dest='unmap_bed',
      help='Unmappable segments to set to NA')
  parser.add_option('-w', dest='pool_width', type='int', default=1,
      help='Average pooling width [Default: %default]')
  parser.add_option('-v', dest='valid_pct_or_chr', type='str', default=0.05,
      help='Proportion of the data for validation [Default: %default]')
  parser.add_option('-z', dest='compression',
      help='h5py compression [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 3:
    parser.error(
        'Must provide genome FASTA file, sample Wig/BigWig labels and paths, '
        'and model output file')
  else:
    fasta_file = args[0]
    sample_wigs_file = args[1]
    hdf5_file = args[2]

  random.seed(1)

  if options.stride is None:
    options.stride = options.seq_length

  ################################################################
  # assess bigwigs
  ################################################################
  # get wig files and labels
  target_wigs = OrderedDict()
  target_strands = []
  target_labels = []
  for line in open(sample_wigs_file, encoding='UTF-8'):
    a = line.rstrip().split('\t')

    if a[0] in target_wigs:
      print('WARNING: duplicate target id %s' % a[0], file=sys.stderr)

    target_wigs[a[0]] = a[1]
    target_strands.append(a[2])
    if len(a) > 3:
      target_labels.append(a[3])
    else:
      target_labels.append('')

  if options.fourier_dim is not None and 2 * options.fourier_dim >= options.seq_length / options.pool_width:
    print(
        "Fourier transform to %d dims won't compress %d length sequences with %d pooling"
        % (options.fourier_dim, options.seq_length, options.pool_width),
        file=sys.stderr)
    exit(1)

  ################################################################
  # prepare genomic segments
  ################################################################
  chrom_segments = genome.load_chromosomes(fasta_file)

  # remove gaps
  if options.gaps_file:
    chrom_segments = genome.split_contigs(chrom_segments, options.gaps_file)

  # ditch the chromosomes
  segments = []
  for chrom in chrom_segments:
    segments += [(chrom, seg_start, seg_end)
                 for seg_start, seg_end in chrom_segments[chrom]]

  # standardize order
  segments.sort()

  # filter for large enough
  segments = [
      cse for cse in segments if cse[2] - cse[1] >= options.seq_length
  ]

  # down-sample
  if options.sample_pct < 1.0:
    segments = random.sample(segments, int(options.sample_pct * len(segments)))

  # limit to a BED file
  if options.limit_bed is not None:
    segments = limit_segments(segments, options.limit_bed)

  if not os.path.isdir(options.cluster_dir):
    os.mkdir(options.cluster_dir)

  # print segments to BED file
  seg_bed_file = '%s/segments.bed' % options.cluster_dir
  seg_bed_out = open(seg_bed_file, 'w')
  for chrom, seg_start, seg_end in segments:
    print('%s\t%d\t%d' % (chrom, seg_start, seg_end), file=seg_bed_out)
  seg_bed_out.close()

  ################################################################
  # bigwig read and process
  ################################################################
  print('Reading and pre-processing bigwigs for %d segments' % len(segments),
        flush=True)

  targets_real = []
  targets_imag = []

  # generate numpy arrays on cluster
  jobs = []
  for target_label in target_wigs.keys():
    wig_file = target_wigs[target_label]
    npy_file = '%s/%s' % (options.cluster_dir, target_label)
    if not os.path.isfile(npy_file) and not os.path.isfile('%s.npy' % npy_file):
      print(npy_file)

      if os.path.splitext(wig_file)[1] == '.h5':
        script = 'seqs_hdf5.py'
      else:
        script = 'bigwig_hdf5.py'

      cmd = 'echo $HOSTNAME; %s -l %d -s %d -w %d %s %s %s' % (
          script, options.seq_length, options.stride, options.pool_width,
          wig_file, seg_bed_file, npy_file)

      name = 'hdf5_%s' % target_label
      outf = '%s/%s.out' % (options.cluster_dir, target_label)
      errf = '%s/%s.err' % (options.cluster_dir, target_label)

      j = slurm.Job(cmd, name, outf, errf,
                    queue='standard,tbdisk', mem=15000, time='12:0:0')
      jobs.append(j)

  slurm.multi_run(jobs)

  # load into targets_real, targets_imag
  for target_label in target_wigs.keys():
    npy_file = '%s/%s.npy' % (options.cluster_dir, target_label)
    wig_targets = np.load(npy_file)
    targets_real.append(wig_targets)

  # transpose from TxSxL to SxLxT
  targets_real = np.transpose(np.array(targets_real), axes=(1, 2, 0))

  print('%d target sequences' % targets_real.shape[0])

  ################################################################
  # one hot code sequences
  ################################################################
  seqs_1hot, seqs_segments = segments_1hot(fasta_file, segments,
                                           options.seq_length, options.stride)
  print('%d sequences one hot coded' % seqs_1hot.shape[0])

  ################################################################
  # correct for unmappable regions
  ################################################################
  if options.unmap_bed is not None:
    seqs_na = annotate_na(seqs_segments, options.unmap_bed,
                          options.seq_length, options.pool_width)

    # determine mappable sequences and update test indexes
    map_indexes = []

    for i in range(seqs_na.shape[0]):
      # mappable
      if seqs_na[i, :].mean(dtype='float64') < options.na_t:
        map_indexes.append(i)

      # unmappable
      else:
        # forget it
        pass

    # update data structures
    targets_real = targets_real[map_indexes]
    if options.fourier_dim is not None:
      targets_imag = targets_imag[map_indexes]

    seqs_1hot = seqs_1hot[map_indexes]
    seqs_segments = [seqs_segments[mi] for mi in map_indexes]
    seqs_na = seqs_na[map_indexes]

  ################################################################
  # write to train, valid, test HDF5
  ################################################################

  # choose test indexes
  if options.test_pct_or_chr.startswith('chr'):
    test_indexes = [
        si for si in range(len(seqs_segments))
        if seqs_segments[si][0] == options.test_pct_or_chr
    ]
  else:
    test_pct = float(options.test_pct_or_chr)
    test_indexes = [
        twi for twi in range(len(seqs_segments)) if random.random() < test_pct
    ]

  # choose valid indexes
  if options.valid_pct_or_chr.startswith('chr'):
    # valid_indexes = np.array([seq_seg[0] == options.valid_pct_or_chr for seq_seg in seqs_segments])
    valid_indexes = [
        si for si in range(len(seqs_segments))
        if seqs_segments[si][0] == options.valid_pct_or_chr
    ]
  else:
    valid_pct = float(options.valid_pct_or_chr)
    valid_n = int(valid_pct * len(seqs_segments))
    nontest_indexes = set(range(len(seqs_segments))) - set(test_indexes)
    valid_indexes = random.sample(nontest_indexes, valid_n)

  # remainder is training
  train_indexes = list(
      set(range(len(seqs_segments))) - set(valid_indexes) - set(test_indexes))

  # training may require shuffling
  random.shuffle(train_indexes)
  random.shuffle(valid_indexes)
  random.shuffle(test_indexes)

  # write to HDF5
  hdf5_out = h5py.File(hdf5_file, 'w')

  # store pooling
  hdf5_out.create_dataset('pool_width', data=options.pool_width, dtype='int')

  # store targets
  target_ids = np.array(list(target_wigs.keys()), dtype='S')
  hdf5_out.create_dataset('target_ids', data=target_ids)

  target_labels = np.array(target_labels, dtype='S')
  hdf5_out.create_dataset('target_labels', data=target_labels)

  target_strands = np.array(target_strands, dtype='S')
  hdf5_out.create_dataset('target_strands', data=target_strands)

  # HDF5 train
  hdf5_out.create_dataset('train_in', data=seqs_1hot[train_indexes],
                          dtype='bool', compression=options.compression)
  hdf5_out.create_dataset('train_out', data=targets_real[train_indexes],
                          dtype='float16', compression=options.compression)
  if options.fourier_dim is not None:
    hdf5_out.create_dataset('train_out_imag', data=targets_imag[train_indexes],
                            dtype='float16', compression=options.compression)
  hdf5_out.create_dataset('train_na', data=seqs_na[train_indexes],
                          dtype='bool', compression=options.compression)

  # HDF5 valid
  hdf5_out.create_dataset('valid_in', data=seqs_1hot[valid_indexes],
                          dtype='bool', compression=options.compression)
  hdf5_out.create_dataset('valid_out', data=targets_real[valid_indexes],
                          dtype='float16', compression=options.compression)
  if options.fourier_dim is not None:
    hdf5_out.create_dataset('valid_out_imag', data=targets_imag[valid_indexes],
                            dtype='float16', compression=options.compression)
  hdf5_out.create_dataset('valid_na', data=seqs_na[valid_indexes],
                          dtype='bool', compression=options.compression)

  # HDF5 test
  hdf5_out.create_dataset('test_in', data=seqs_1hot[test_indexes],
                          dtype='bool', compression=options.compression)
  hdf5_out.create_dataset('test_out', data=targets_real[test_indexes],
                          dtype='float16', compression=options.compression)
  if options.fourier_dim is not None:
    hdf5_out.create_dataset('test_out_imag', data=targets_imag[test_indexes],
                            dtype='float16', compression=options.compression)
  hdf5_out.create_dataset('test_na', data=seqs_na[test_indexes],
                          dtype='bool', compression=options.compression)

  hdf5_out.close()

  # output BED file
  if options.out_bed_file:
    out_bed_out = open(options.out_bed_file, 'w')
    for si in train_indexes:
      print('%s\t%d\t%d\ttrain' % seqs_segments[si], file=out_bed_out)
    for si in valid_indexes:
      print('%s\t%d\t%d\tvalid' % seqs_segments[si], file=out_bed_out)
    for si in test_indexes:
      print('%s\t%d\t%d\ttest' % seqs_segments[si], file=out_bed_out)
    out_bed_out.close()
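
# Readback sketch (not part of the original script): the HDF5 written above
# can be inspected using the dataset names created by main(); 'train_data.h5'
# is a hypothetical filename, and h5py is assumed imported as in main().
def inspect_hdf5_example(hdf5_file='train_data.h5'):
  with h5py.File(hdf5_file, 'r') as h5:
    train_in = h5['train_in'][:]    # one-hot sequences, bool
    train_out = h5['train_out'][:]  # pooled targets, float16
    target_ids = [tid.decode() for tid in h5['target_ids'][:]]
    print(train_in.shape, train_out.shape, len(target_ids))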
def main(): usage = "usage: %prog [options] <params_file> <model_file> <vcf_file>" parser = OptionParser(usage) parser.add_option( "-b", dest="batch_size", default=256, type="int", help="Batch size [Default: %default]", ) parser.add_option( "-c", dest="csv", default=False, action="store_true", help="Print table as CSV [Default: %default]", ) parser.add_option( "-f", dest="genome_fasta", default="%s/data/hg19.fa" % os.environ["BASENJIDIR"], help="Genome FASTA for sequences [Default: %default]", ) parser.add_option( "-g", dest="genome_file", default="%s/data/human.hg19.genome" % os.environ["BASENJIDIR"], help="Chromosome lengths file [Default: %default]", ) parser.add_option( "--h5", dest="out_h5", default=False, action="store_true", help="Output stats to sad.h5 [Default: %default]", ) parser.add_option( "--local", dest="local", default=1024, type="int", help="Local SAD score [Default: %default]", ) parser.add_option("-n", dest="norm_file", default=None, help="Normalize SAD scores") parser.add_option( "-o", dest="out_dir", default="sad", help="Output directory for tables and plots [Default: %default]", ) parser.add_option( "-p", dest="processes", default=None, type="int", help="Number of processes, passed by multi script", ) parser.add_option( "--pseudo", dest="log_pseudo", default=1, type="float", help="Log2 pseudocount [Default: %default]", ) parser.add_option( "-q", dest="queue", default="k80", help="SLURM queue on which to run the jobs [Default: %default]", ) parser.add_option( "-r", dest="restart", default=False, action="store_true", help="Restart a partially completed job [Default: %default]", ) parser.add_option( "--rc", dest="rc", default=False, action="store_true", help= "Average forward and reverse complement predictions [Default: %default]", ) parser.add_option( "--shifts", dest="shifts", default="0", type="str", help="Ensemble prediction shifts [Default: %default]", ) parser.add_option( "--stats", dest="sad_stats", default="SAD,xSAR", help="Comma-separated list of stats to save. 
[Default: %default]", ) parser.add_option( "-t", dest="targets_file", default=None, type="str", help="File specifying target indexes and labels in table format", ) parser.add_option( "--ti", dest="track_indexes", default=None, type="str", help="Comma-separated list of target indexes to output BigWig tracks", ) parser.add_option( "-u", dest="penultimate", default=False, action="store_true", help="Compute SED in the penultimate layer [Default: %default]", ) parser.add_option( "-z", dest="out_zarr", default=False, action="store_true", help="Output stats to sad.zarr [Default: %default]", ) (options, args) = parser.parse_args() if len(args) != 3: parser.error("Must provide parameters and model files and VCF file") else: params_file = args[0] model_file = args[1] vcf_file = args[2] ####################################################### # prep work # output directory if not options.restart: if os.path.isdir(options.out_dir): print("Please remove %s" % options.out_dir, file=sys.stderr) exit(1) os.mkdir(options.out_dir) # pickle options options_pkl_file = "%s/options.pkl" % options.out_dir options_pkl = open(options_pkl_file, "wb") pickle.dump(options, options_pkl) options_pkl.close() ####################################################### # launch worker threads jobs = [] for pi in range(options.processes): if not options.restart or not job_completed(options, pi): cmd = "source activate tf1.12-gpu; basenji_sadf.py %s %s %d" % ( options_pkl_file, " ".join(args), pi, ) name = "sad_p%d" % pi outf = "%s/job%d.out" % (options.out_dir, pi) errf = "%s/job%d.err" % (options.out_dir, pi) j = slurm.Job( cmd, name, outf, errf, queue=options.queue, gpu=1, mem=15000, time="7-0:0:0", ) jobs.append(j) slurm.multi_run(jobs, max_proc=options.processes, verbose=True, launch_sleep=10, update_sleep=60) ####################################################### # collect output if options.out_h5: collect_h5("sad.h5", options.out_dir, options.processes) elif options.out_zarr: collect_zarr("sad.zarr", options.out_dir, options.processes) else: collect_table("sad_table.txt", options.out_dir, options.processes)
def main(): usage = 'usage: %prog [options] <exp_dir> <params_file> <data_dir> <bed_file>' parser = OptionParser(usage) # sat options sat_options = OptionGroup(parser, 'basenji_sat_bed.py options') sat_options.add_option( '-d', dest='mut_down', default=0, type='int', help= 'Nucleotides downstream of center sequence to mutate [Default: %default]' ) sat_options.add_option( '-f', dest='genome_fasta', default=None, help='Genome FASTA for sequences [Default: %default]') sat_options.add_option( '-l', dest='mut_len', default=0, type='int', help='Length of center sequence to mutate [Default: %default]') sat_options.add_option('-o', dest='out_dir', default='sat_mut', help='Output directory [Default: %default]') sat_options.add_option('--plots', dest='plots', default=False, action='store_true', help='Make heatmap plots [Default: %default]') sat_options.add_option('-p', dest='processes', default=None, type='int', help='Number of processes, passed by multi script') sat_options.add_option( '--rc', dest='rc', default=False, action='store_true', help= 'Ensemble forward and reverse complement predictions [Default: %default]' ) sat_options.add_option( '--shifts', dest='shifts', default='0', help='Ensemble prediction shifts [Default: %default]') sat_options.add_option( '--stats', dest='sad_stats', default='sum', help='Comma-separated list of stats to save. [Default: %default]') sat_options.add_option( '-t', dest='targets_file', default=None, type='str', help='File specifying target indexes and labels in table format') sat_options.add_option( '-u', dest='mut_up', default=0, type='int', help= 'Nucleotides upstream of center sequence to mutate [Default: %default]' ) parser.add_option_group(sat_options) phylop_options = OptionGroup(parser, 'basenji_bench_phylop.py options') # phylop_options.add_option('-e', dest='num_estimators', # default=100, type='int', # help='Number of random forest estimators [Default: %default]') phylop_options.add_option( '-g', dest='genome', default='ce11', help='PhyloP and FASTA genome [Default: %default]') # phylop_options.add_option('--pca', dest='n_components', # default=None, type='int', # help='PCA n_components [Default: %default]') parser.add_option_group(phylop_options) fold_options = OptionGroup(parser, 'cross-fold options') fold_options.add_option( '-a', '--alt', dest='alternative', default='two-sided', help='Statistical test alternative [Default: %default]') fold_options.add_option( '-c', dest='crosses', default=1, type='int', help='Number of cross-fold rounds [Default:%default]') fold_options.add_option('-e', dest='conda_env', default='tf2-gpu', help='Anaconda environment [Default: %default]') fold_options.add_option('--label_exp', dest='label_exp', default='Experiment', help='Experiment label [Default: %default]') fold_options.add_option('--label_ref', dest='label_ref', default='Reference', help='Reference label [Default: %default]') fold_options.add_option('--name', dest='name', default='sat', help='SLURM name prefix [Default: %default]') fold_options.add_option( '-q', dest='queue', default='gtx1080ti', help='SLURM queue on which to run the jobs [Default: %default]') fold_options.add_option('-r', dest='ref_dir', default=None, help='Reference directory for statistical tests') parser.add_option_group(fold_options) (options, args) = parser.parse_args() if len(args) != 4: parser.error('Must provide parameters file and data directory') else: exp_dir = args[0] params_file = args[1] data_dir = args[2] bed_file = args[3] # read data parameters data_stats_file = 
'%s/statistics.json' % data_dir with open(data_stats_file) as data_stats_open: data_stats = json.load(data_stats_open) # count folds num_folds = len([dkey for dkey in data_stats if dkey.startswith('fold')]) # genome genome_path = os.environ[options.genome.upper()] options.genome_fasta = '%s/assembly/%s.fa' % (genome_path, options.genome) ################################################################ # saturation mutagenesis ################################################################ jobs = [] scores_files = [] for ci in range(options.crosses): for fi in range(num_folds): it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci) # update output directory sat_dir = '%s/%s' % (it_dir, options.out_dir) # check if done scores_file = '%s/scores.h5' % sat_dir scores_files.append(scores_file) if os.path.isfile(scores_file): print('%s already generated.' % scores_file) else: basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;' basenji_cmd += ' conda activate %s;' % options.conda_env basenji_cmd += ' echo $HOSTNAME;' basenji_cmd += ' basenji_sat_bed.py' basenji_cmd += ' %s' % options_string(options, sat_options, sat_dir) basenji_cmd += ' %s' % params_file basenji_cmd += ' %s/train/model_best.h5' % it_dir basenji_cmd += ' %s' % bed_file name = '%s-f%dc%d' % (options.name, fi, ci) basenji_job = slurm.Job(basenji_cmd, name, out_file='%s.out' % sat_dir, err_file='%s.err' % sat_dir, cpu=2, gpu=1, queue=options.queue, mem=30000, time='7-0:00:00') jobs.append(basenji_job) slurm.multi_run(jobs, verbose=True) ################################################################ # ensemble ################################################################ ensemble_dir = '%s/ensemble' % exp_dir if not os.path.isdir(ensemble_dir): os.mkdir(ensemble_dir) sat_dir = '%s/%s' % (ensemble_dir, options.out_dir) if not os.path.isdir(sat_dir): os.mkdir(sat_dir) if not os.path.isfile('%s/scores.h5' % sat_dir): print('Generating ensemble scores.') ensemble_scores_h5(sat_dir, scores_files) else: print('Ensemble scores already generated.') ################################################################ # PhyloP regressors ################################################################ # num_pcs = int(data_stats['num_targets']**0.75) jobs = [] for ci in range(options.crosses): for fi in range(num_folds): it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci) sat_dir = '%s/%s' % (it_dir, options.out_dir) if not os.path.isfile('%s/stats.txt' % sat_dir): phylop_cmd = 'basenji_bench_phylop.py' phylop_cmd += ' -e 200 -p 4' # phylop_cmd += ' -d %d' % num_pcs phylop_cmd += ' -o %s' % sat_dir phylop_cmd += ' %s/scores.h5' % sat_dir name = '%s-f%dc%d' % (options.name, fi, ci) std_pre = '%s/phylop' % sat_dir j = slurm.Job(phylop_cmd, name, '%s.out' % std_pre, '%s.err' % std_pre, queue='standard', cpu=4, mem=45000, time='1-0:0:0') jobs.append(j) # ensemble sat_dir = '%s/%s' % (ensemble_dir, options.out_dir) if not os.path.isfile('%s/stats.txt' % sat_dir): phylop_cmd = 'basenji_bench_phylop.py' phylop_cmd += ' -e 200 -p 4' # phylop_cmd += ' -d %d' % num_pcs phylop_cmd += ' -o %s' % sat_dir phylop_cmd += ' %s/scores.h5' % sat_dir name = '%s-ens' % options.name std_pre = '%s/phylop' % sat_dir j = slurm.Job(phylop_cmd, name, '%s.out' % std_pre, '%s.err' % std_pre, queue='standard', cpu=4, mem=45000, time='1-0:0:0') jobs.append(j) slurm.multi_run(jobs, verbose=True) ################################################################ # compare ################################################################ ref_sat_dirs = [] exp_sat_dirs = [] for ci in 
range(options.crosses): for fi in range(num_folds): exp_sat_dir = '%s/f%d_c%d/%s' % (exp_dir, fi, ci, options.out_dir) exp_sat_dirs.append(exp_sat_dir) if options.ref_dir is not None: ref_sat_dir = '%s/f%d_c%d/%s' % (options.ref_dir, fi, ci, options.out_dir) ref_sat_dirs.append(ref_sat_dir) exp_pcor_folds, exp_r2_folds = read_metrics(exp_sat_dirs) exp_sat_dirs = ['%s/ensemble/%s' % (exp_dir, options.out_dir)] exp_pcor_ens, exp_r2_ens = read_metrics(exp_sat_dirs) if options.ref_dir is not None: ref_pcor_folds, ref_r2_folds = read_metrics(ref_sat_dirs) ref_sat_dirs = ['%s/ensemble/%s' % (options.ref_dir, options.out_dir)] ref_pcor_ens, ref_r2_ens = read_metrics(ref_sat_dirs) print('PearsonR') exp_mean = exp_pcor_folds.mean() exp_stdm = exp_pcor_folds.std() / np.sqrt(len(exp_pcor_folds)) expe_mean = exp_pcor_ens.mean() expe_stdm = exp_pcor_ens.std() / np.sqrt(len(exp_pcor_ens)) print('%12s: %.4f (%.4f)' % (options.label_exp, exp_mean, exp_stdm)) print('%12s (ens): %.4f (%.4f)' % (options.label_exp, expe_mean, expe_stdm)) if options.ref_dir is not None: ref_mean = ref_pcor_folds.mean() ref_stdm = ref_pcor_folds.std() / np.sqrt(len(ref_pcor_folds)) refe_mean = ref_pcor_ens.mean() refe_stdm = ref_pcor_ens.std() / np.sqrt(len(ref_pcor_ens)) print('%12s: %.4f (%.4f)' % (options.label_ref, ref_mean, ref_stdm)) print('%12s (ens): %.4f (%.4f)' % (options.label_ref, refe_mean, refe_stdm)) mwp, tp = stat_tests(exp_pcor_folds, ref_pcor_folds, options.alternative) print('Mann-Whitney U p-value: %.3g' % mwp) print('T-test p-value: %.3g' % tp) print('\nR2') exp_mean = exp_r2_folds.mean() exp_stdm = exp_r2_folds.std() / np.sqrt(len(exp_r2_folds)) expe_mean = exp_r2_ens.mean() expe_stdm = exp_r2_ens.std() / np.sqrt(len(exp_r2_ens)) print('%12s: %.4f (%.4f)' % (options.label_exp, exp_mean, exp_stdm)) print('%12s (ens): %.4f (%.4f)' % (options.label_exp, expe_mean, expe_stdm)) if options.ref_dir is not None: ref_mean = ref_r2_folds.mean() ref_stdm = ref_r2_folds.std() / np.sqrt(len(ref_r2_folds)) refe_mean = ref_r2_ens.mean() refe_stdm = ref_r2_ens.std() / np.sqrt(len(ref_r2_ens)) print('%12s: %.4f (%.4f)' % (options.label_ref, ref_mean, ref_stdm)) print('%12s (ens): %.4f (%.4f)' % (options.label_ref, refe_mean, refe_stdm)) mwp, tp = stat_tests(exp_r2_folds, ref_r2_folds, options.alternative) print('Mann-Whitney U p-value: %.3g' % mwp) print('T-test p-value: %.3g' % tp)
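# Hedged sketch: stat_tests() above compares per-fold metric vectors but is not
# defined in this excerpt. One plausible implementation pairs a Mann-Whitney U test
# with a paired t-test via SciPy; the source's exact test choices may differ, and
# the t-test here stays two-sided regardless of `alternative`.
import numpy as np
from scipy import stats

def stat_tests(exp_values, ref_values, alternative='two-sided'):
  """Return (Mann-Whitney U p-value, paired t-test p-value)."""
  exp_values = np.asarray(exp_values)
  ref_values = np.asarray(ref_values)
  _, mwp = stats.mannwhitneyu(exp_values, ref_values, alternative=alternative)
  _, tp = stats.ttest_rel(exp_values, ref_values)
  return mwp, tp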
def main(): usage = 'usage: %prog [options] <params_file> <model_file> <vcf_file>' parser = OptionParser(usage) # scd parser.add_option('-f', dest='genome_fasta', default='%s/data/hg19.fa' % os.environ['BASENJIDIR'], help='Genome FASTA for sequences [Default: %default]') parser.add_option('-m', dest='plot_map', default=False, action='store_true', help='Plot contact map for each allele [Default: %default]') parser.add_option('-o',dest='out_dir', default='scd', help='Output directory for tables and plots [Default: %default]') parser.add_option('--rc', dest='rc', default=False, action='store_true', help='Average forward and reverse complement predictions [Default: %default]') parser.add_option('--shifts', dest='shifts', default='0', type='str', help='Ensemble prediction shifts [Default: %default]') parser.add_option('--stats', dest='scd_stats', default='SCD', help='Comma-separated list of stats to save. [Default: %default]') parser.add_option('-t', dest='targets_file', default=None, type='str', help='File specifying target indexes and labels in table format') # multi parser.add_option('--cpu', dest='cpu', default=False, action='store_true', help='Run without a GPU [Default: %default]') parser.add_option('--name', dest='name', default='scd', help='SLURM name prefix [Default: %default]') parser.add_option('--max_proc', dest='max_proc', default=None, type='int', help='Maximum concurrent processes [Default: %default]') parser.add_option('-p', dest='processes', default=None, type='int', help='Number of processes, passed by multi script') parser.add_option('-q', dest='queue', default='gtx1080ti', help='SLURM queue on which to run the jobs [Default: %default]') parser.add_option('-r', dest='restart', default=False, action='store_true', help='Restart a partially completed job [Default: %default]') (options, args) = parser.parse_args() if len(args) != 3: parser.error('Must provide parameters and model files and VCF file') else: params_file = args[0] model_file = args[1] vcf_file = args[2] ####################################################### # prep work # output directory if not options.restart: if os.path.isdir(options.out_dir): print('Please remove %s' % options.out_dir, file=sys.stderr) exit(1) os.mkdir(options.out_dir) # pickle options options_pkl_file = '%s/options.pkl' % options.out_dir options_pkl = open(options_pkl_file, 'wb') pickle.dump(options, options_pkl) options_pkl.close() ####################################################### # launch worker threads jobs = [] for pi in range(options.processes): if not options.restart or not job_completed(options, pi): if options.cpu: cmd = '' else: cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;' cmd += 'conda activate tf1.15-gpu;' cmd += ' akita_scd.py %s %s %d' % ( options_pkl_file, ' '.join(args), pi) name = '%s_p%d' % (options.name, pi) outf = '%s/job%d.out' % (options.out_dir, pi) errf = '%s/job%d.err' % (options.out_dir, pi) num_gpu = 1*(not options.cpu) j = slurm.Job(cmd, name, outf, errf, queue=options.queue, gpu=num_gpu, mem=15000, time='14-0:0:0') jobs.append(j) slurm.multi_run(jobs, max_proc=options.max_proc, verbose=True, launch_sleep=10, update_sleep=60) ####################################################### # collect output collect_h5('scd.h5', options.out_dir, options.processes)
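# Hedged sketch: collect_h5() merges the per-process HDF5 outputs written by the
# workers above ('<out_dir>/job<pi>/scd.h5') into a single file. Concatenating
# every dataset along axis 0 is an assumption about the file contents; metadata
# datasets that are identical across jobs would instead need to be copied once.
import h5py
import numpy as np

def collect_h5(file_name, out_dir, num_procs):
  """Concatenate <out_dir>/job*/<file_name> into <out_dir>/<file_name>."""
  job_files = ['%s/job%d/%s' % (out_dir, pi, file_name) for pi in range(num_procs)]
  with h5py.File('%s/%s' % (out_dir, file_name), 'w') as final_h5:
    with h5py.File(job_files[0], 'r') as first_h5:
      keys = list(first_h5.keys())
    for key in keys:
      parts = []
      for job_file in job_files:
        with h5py.File(job_file, 'r') as job_h5:
          parts.append(job_h5[key][:])
      final_h5.create_dataset(key, data=np.concatenate(parts, axis=0))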
def main(): usage = 'usage: %prog [options] <exp_dir> <params_file> <data_dir>' parser = OptionParser(usage) parser.add_option('-a', '--alt', dest='alternative', default='two-sided', help='Statistical test alternative [Default: %default]') parser.add_option('-d', dest='dataset_i', default=None, type='int', help='Dataset index [Default:%default]') parser.add_option('-e', dest='conda_env', default='tf2-gpu', help='Anaconda environment [Default: %default]') parser.add_option('--name', dest='name', default='test', help='SLURM name prefix [Default: %default]') parser.add_option('-q', dest='queue', default='gtx1080ti') parser.add_option('-r', dest='ref_dir', default=None, help='Reference directory for statistical tests') parser.add_option( '--rc', dest='rc', default=False, action='store_true', help= 'Average forward and reverse complement predictions [Default: %default]' ) parser.add_option('--shifts', dest='shifts', default='0', type='str', help='Ensemble prediction shifts [Default: %default]') parser.add_option('--spec', dest='specificity', default=False, action='store_true', help='Test specificity [Default: %default]') parser.add_option('--train', dest='train', default=False, action='store_true', help='Test on the training set, too [Default: %default]') (options, args) = parser.parse_args() if len(args) != 3: parser.error('Must provide parameters file and data directory') else: exp_dir = args[0] params_file = args[1] data_dirs = [os.path.abspath(arg) for arg in args[2:]] if options.dataset_i is None: head_i = 0 else: head_i = options.dataset_i iterations = len(glob.glob('%s/*' % exp_dir)) ################################################################ # test check ################################################################ jobs = [] if options.train: for i in range(iterations): it_dir = '%s/%d' % (exp_dir, i) if options.dataset_i is None: out_dir = '%s/test_train' % it_dir model_file = '%s/train/model_check.h5' % it_dir data_dir = data_dirs[0] else: out_dir = '%s/test%d_train' % (it_dir, options.dataset_i) model_file = '%s/train/model%d_check.h5' % (it_dir, options.dataset_i) data_dir = data_dirs[options.dataset_i] # check if done acc_file = '%s/acc.txt' % out_dir if os.path.isfile(acc_file): print('%s already generated.' % acc_file) else: cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;' cmd += ' conda activate %s;' % options.conda_env cmd += ' basenji_test.py' cmd += ' --head %d' % head_i cmd += ' -o %s' % out_dir if options.rc: cmd += ' --rc' if options.shifts: cmd += ' --shifts %s' % options.shifts cmd += ' --split train' cmd += ' %s' % params_file cmd += ' %s' % model_file cmd += ' %s' % data_dir name = '%s-testtr%d' % (options.name, i) j = slurm.Job(cmd, name=name, out_file='%s.out' % out_dir, err_file='%s.err' % out_dir, queue=options.queue, cpu=1, gpu=1, mem=23000, time='4:00:00') jobs.append(j) ################################################################ # test best ################################################################ for i in range(iterations): it_dir = '%s/%d' % (exp_dir, i) if options.dataset_i is None: out_dir = '%s/test' % it_dir model_file = '%s/train/model_best.h5' % it_dir data_dir = data_dirs[0] else: out_dir = '%s/test%d' % (it_dir, options.dataset_i) model_file = '%s/train/model%d_best.h5' % (it_dir, options.dataset_i) data_dir = data_dirs[options.dataset_i] # check if done acc_file = '%s/acc.txt' % out_dir if os.path.isfile(acc_file): print('%s already generated.' % acc_file) else: # basenji test cmd = '. 
/home/drk/anaconda3/etc/profile.d/conda.sh;' cmd += ' conda activate %s;' % options.conda_env cmd += ' basenji_test.py' cmd += ' --head %d' % head_i cmd += ' -o %s' % out_dir if options.rc: cmd += ' --rc' if options.shifts: cmd += ' --shifts %s' % options.shifts cmd += ' %s' % params_file cmd += ' %s' % model_file cmd += ' %s' % data_dir name = '%s-test%d' % (options.name, i) j = slurm.Job(cmd, name=name, out_file='%s.out' % out_dir, err_file='%s.err' % out_dir, queue=options.queue, cpu=1, gpu=1, mem=23000, time='4:00:00') jobs.append(j) ################################################################ # test best specificity ################################################################ if options.specificity: for i in range(iterations): it_dir = '%s/%d' % (exp_dir, i) if options.dataset_i is None: out_dir = '%s/test_spec' % it_dir model_file = '%s/train/model_best.h5' % it_dir data_dir = data_dirs[0] else: out_dir = '%s/test%d_spec' % (it_dir, options.dataset_i) model_file = '%s/train/model%d_best.h5' % (it_dir, options.dataset_i) data_dir = data_dirs[options.dataset_i] # check if done acc_file = '%s/acc.txt' % out_dir if os.path.isfile(acc_file): print('%s already generated.' % acc_file) else: # basenji test cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;' cmd += ' conda activate %s;' % options.conda_env cmd += ' basenji_test_specificity.py' cmd += ' --head %d' % head_i cmd += ' -o %s' % out_dir if options.rc: cmd += ' --rc' if options.shifts: cmd += ' --shifts %s' % options.shifts cmd += ' %s' % params_file cmd += ' %s' % model_file cmd += ' %s' % data_dir name = '%s-spec%d' % (options.name, i) j = slurm.Job(cmd, name=name, out_file='%s.out' % out_dir, err_file='%s.err' % out_dir, queue=options.queue, cpu=1, gpu=1, mem=75000, time='6:00:00') jobs.append(j) slurm.multi_run(jobs, verbose=True) if options.ref_dir is not None: ################################################################ # compare checkpoint on training set ################################################################ if options.train: ref_glob_str = '%s/*/test_train/acc.txt' % options.ref_dir ref_cors, ref_mean, ref_stdm = read_cors(ref_glob_str) exp_glob_str = '%s/*/test_train/acc.txt' % exp_dir exp_cors, exp_mean, exp_stdm = read_cors(exp_glob_str) mwp, tp = stat_tests(ref_cors, exp_cors, options.alternative) print('\nTrain:') print('Reference PearsonR: %.4f (%.4f)' % (ref_mean, ref_stdm)) print('Experiment PearsonR: %.4f (%.4f)' % (exp_mean, exp_stdm)) print('Mann-Whitney U p-value: %.3g' % mwp) print('T-test p-value: %.3g' % tp) ################################################################ # compare best on test set ################################################################ ref_glob_str = '%s/*/test/acc.txt' % options.ref_dir ref_cors, ref_mean, ref_stdm = read_cors(ref_glob_str) exp_glob_str = '%s/*/test/acc.txt' % exp_dir exp_cors, exp_mean, exp_stdm = read_cors(exp_glob_str) mwp, tp = stat_tests(ref_cors, exp_cors, options.alternative) print('\nTest:') print('Reference PearsonR: %.4f (%.4f)' % (ref_mean, ref_stdm)) print('Experiment PearsonR: %.4f (%.4f)' % (exp_mean, exp_stdm)) print('Mann-Whitney U p-value: %.3g' % mwp) print('T-test p-value: %.3g' % tp) ################################################################ # compare best on test set specificity ################################################################ if options.specificity: ref_glob_str = '%s/*/test_spec/acc.txt' % options.ref_dir ref_cors, ref_mean, ref_stdm = read_cors(ref_glob_str) exp_glob_str = '%s/*/test_spec/acc.txt' % exp_dir exp_cors,
exp_mean, exp_stdm = read_cors(exp_glob_str) mwp, tp = stat_tests(ref_cors, exp_cors, options.alternative) print('\nSpecificity:') print('Reference PearsonR: %.4f (%.4f)' % (ref_mean, ref_stdm)) print('Experiment PearsonR: %.4f (%.4f)' % (exp_mean, exp_stdm)) print('Mann-Whitney U p-value: %.3g' % mwp) print('T-test p-value: %.3g' % tp)
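# Hedged sketch: read_cors() summarizes the acc.txt files matched by a glob pattern.
# Assuming acc.txt is a tab-delimited table with a 'pearsonr' column (one row per
# target), a minimal version returning the per-run values, their mean, and the
# standard error of the mean could be:
import glob
import numpy as np
import pandas as pd

def read_cors(acc_glob_str):
  """Collect the mean PearsonR from each acc.txt matched by the glob string."""
  cors = []
  for acc_file in glob.glob(acc_glob_str):
    acc_df = pd.read_csv(acc_file, sep='\t', index_col=0)
    cors.append(acc_df['pearsonr'].mean())
  cors = np.array(cors)
  return cors, cors.mean(), cors.std() / np.sqrt(len(cors))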
def main(): usage = 'usage: %prog [options] <model> <vcf_file>' parser = OptionParser(usage) # sad parser.add_option('-b', dest='batch_size', default=4, type='int', help='Batch size [Default: %default]') parser.add_option('-c', dest='slice_center', default=None, type='int', help='Slice center positions [Default: %default]') parser.add_option('-f', dest='genome_fasta', default='%s/data/hg19.fa' % os.environ['BASENJIDIR'], help='Genome FASTA for sequences [Default: %default]') parser.add_option( '-o', dest='out_dir', default='sad', help='Output directory for tables and plots [Default: %default]') parser.add_option('--pseudo', dest='log_pseudo', default=1, type='float', help='Log2 pseudocount [Default: %default]') parser.add_option( '--rc', dest='rc', default=False, action='store_true', help= 'Average forward and reverse complement predictions [Default: %default]' ) parser.add_option('--shifts', dest='shifts', default='0', type='str', help='Ensemble prediction shifts [Default: %default]') parser.add_option('--species', dest='species', default='human') parser.add_option( '--stats', dest='sad_stats', default='SAD', help='Comma-separated list of stats to save. [Default: %default]') parser.add_option( '-t', dest='targets_file', default=None, type='str', help='File specifying target indexes and labels in table format') # multi parser.add_option('-e', dest='conda_env', default='tf2.2-gpu', help='Anaconda environment [Default: %default]') parser.add_option('--name', dest='name', default='sad', help='SLURM name prefix [Default: %default]') parser.add_option('--max_proc', dest='max_proc', default=None, type='int', help='Maximum concurrent processes [Default: %default]') parser.add_option('-p', dest='processes', default=None, type='int', help='Number of processes, passed by multi script') parser.add_option( '-q', dest='queue', default='gtx1080ti', help='SLURM queue on which to run the jobs [Default: %default]') parser.add_option( '-r', dest='restart', default=False, action='store_true', help='Restart a partially completed job [Default: %default]') (options, args) = parser.parse_args() if len(args) != 2: parser.error('Must provide model and VCF file') else: model_file = args[0] vcf_file = args[1] ####################################################### # prep work # output directory if not options.restart: if os.path.isdir(options.out_dir): print('Please remove %s' % options.out_dir, file=sys.stderr) exit(1) os.mkdir(options.out_dir) # pickle options options_pkl_file = '%s/options.pkl' % options.out_dir options_pkl = open(options_pkl_file, 'wb') pickle.dump(options, options_pkl) options_pkl.close() ####################################################### # launch worker threads jobs = [] for pi in range(options.processes): if not options.restart or not job_completed(options, pi): cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;' cmd += ' conda activate %s;' % options.conda_env cmd += ' sonnet_sad.py %s %s %d' % (options_pkl_file, ' '.join(args), pi) name = '%s_p%d' % (options.name, pi) outf = '%s/job%d.out' % (options.out_dir, pi) errf = '%s/job%d.err' % (options.out_dir, pi) j = slurm.Job(cmd, name, outf, errf, queue=options.queue, gpu=1, mem=22000, time='14-0:0:0') jobs.append(j) slurm.multi_run(jobs, max_proc=options.max_proc, verbose=True, launch_sleep=10, update_sleep=60) ####################################################### # collect output collect_h5('sad.h5', options.out_dir, options.processes)
def main(): usage = ('usage: %prog [options] <params_file> <model_file> <genes_hdf5_file>' ' <vcf_file>') parser = OptionParser(usage) parser.add_option( '-a', dest='all_sed', default=False, action='store_true', help= 'Print all variant-gene pairs, as opposed to only nonzero [Default: %default]' ) parser.add_option( '-b', dest='batch_size', default=None, type='int', help='Batch size [Default: %default]') parser.add_option( '-c', dest='csv', default=False, action='store_true', help='Print table as CSV [Default: %default]') parser.add_option( '-g', dest='genome_file', default='%s/data/human.hg19.genome' % os.environ['BASENJIDIR'], help='Chromosome lengths file [Default: %default]') parser.add_option( '-o', dest='out_dir', default='sed', help='Output directory for tables and plots [Default: %default]') parser.add_option( '-p', dest='processes', default=2, type='int', help='Number of parallel processes to run [Default: %default]') parser.add_option( '--pseudo', dest='log_pseudo', default=0.125, type='float', help='Log2 pseudocount [Default: %default]') parser.add_option( '-q', dest='queue', default='k80', help='SLURM queue on which to run the jobs [Default: %default]') parser.add_option( '-r', dest='tss_radius', default=0, type='int', help='Radius of bins considered to quantify TSS transcription [Default: %default]') parser.add_option( '--rc', dest='rc', default=False, action='store_true', help= 'Average the forward and reverse complement predictions when testing [Default: %default]' ) parser.add_option( '--shifts', dest='shifts', default='0', help='Ensemble prediction shifts [Default: %default]') parser.add_option( '-t', dest='targets_file', default=None, help='File specifying target indexes and labels in table format.') parser.add_option( '--ti', dest='track_indexes', help='Comma-separated list of target indexes to output BigWig tracks') parser.add_option( '-u', dest='penultimate', default=False, action='store_true', help='Compute SED in the penultimate layer [Default: %default]') parser.add_option( '-x', dest='tss_table', default=False, action='store_true', help='Print TSS table in addition to gene [Default: %default]') (options, args) = parser.parse_args() if len(args) != 4: parser.error( 'Must provide parameters and model files, genes HDF5 file, and QTL VCF' ' file') else: params_file = args[0] model_file = args[1] genes_hdf5_file = args[2] vcf_file = args[3] ####################################################### # prep work # output directory if os.path.isdir(options.out_dir): shutil.rmtree(options.out_dir) os.mkdir(options.out_dir) # pickle options options_pkl_file = '%s/options.pkl' % options.out_dir options_pkl = open(options_pkl_file, 'wb') pickle.dump(options, options_pkl) options_pkl.close() ####################################################### # launch worker threads jobs = [] for pi in range(options.processes): cmd = 'source activate py3_gpu; basenji_sed.py %s %s %d' % ( options_pkl_file, ' '.join(args), pi) name = 'sed_p%d' % pi outf = '%s/job%d.out' % (options.out_dir, pi) errf = '%s/job%d.err' % (options.out_dir, pi) j = slurm.Job( cmd, name, outf, errf, queue=options.queue, mem=30000, time='4:0:0', gpu=1) jobs.append(j) slurm.multi_run(jobs, max_proc=options.processes, verbose=True, sleep_time=60) ####################################################### # collect output collect_table_multi('sed_gene.txt', options.out_dir, options.processes, options.log_pseudo) if options.tss_table: collect_table('sed_tss.txt', options.out_dir, options.processes) if options.track_indexes is 
not None: if not os.path.isdir('%s/tracks' % options.out_dir): os.mkdir('%s/tracks' % options.out_dir) for track_file in glob.glob('%s/job*/tracks/*' % options.out_dir): track_base = os.path.split(track_file)[1] os.rename(track_file, '%s/tracks/%s' % (options.out_dir, track_base)) for pi in range(options.processes): shutil.rmtree('%s/job%d' % (options.out_dir, pi))
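# Hedged sketch: collect_table() merges the per-process tables written by the
# workers above. Assuming each job writes a headered text table at
# '<out_dir>/job<pi>/<file_name>', a minimal merge that keeps only job 0's header
# could look like this (collect_table_multi, which also re-aggregates SED
# statistics across TSS, is not sketched here):
import os

def collect_table(file_name, out_dir, num_procs):
  """Concatenate per-job tables, keeping the header from job 0 only."""
  os.rename('%s/job0/%s' % (out_dir, file_name), '%s/%s' % (out_dir, file_name))
  with open('%s/%s' % (out_dir, file_name), 'a') as final_out:
    for pi in range(1, num_procs):
      with open('%s/job%d/%s' % (out_dir, pi, file_name)) as job_in:
        job_in.readline()  # skip the duplicate header
        for line in job_in:
          final_out.write(line)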
def main(): usage = "usage: %prog [options] <params_file> <model_file> <bed_file>" parser = OptionParser(usage) # basenji_sat_bed.py options parser.add_option( "-f", dest="genome_fasta", default=None, help="Genome FASTA for sequences [Default: %default]", ) parser.add_option( "-l", dest="mut_len", default=200, type="int", help="Length of center sequence to mutate [Default: %default]", ) parser.add_option( "-o", dest="out_dir", default="sat_mut", help="Output directory [Default: %default]", ) parser.add_option( "--plots", dest="plots", default=False, action="store_true", help="Make heatmap plots [Default: %default]", ) parser.add_option( "--rc", dest="rc", default=False, action="store_true", help= "Ensemble forward and reverse complement predictions [Default: %default]", ) parser.add_option( "--shifts", dest="shifts", default="0", help="Ensemble prediction shifts [Default: %default]", ) parser.add_option( "-t", dest="targets_file", default=None, type="str", help="File specifying target indexes and labels in table format", ) # _multi.py options parser.add_option( "-n", dest="name", default="sat", help="SLURM job name prefix [Default: %default]", ) parser.add_option( "-p", dest="processes", default=None, type="int", help="Number of processes, passed by multi script", ) parser.add_option( "-q", dest="queue", default="k80", help="SLURM queue on which to run the jobs [Default: %default]", ) parser.add_option( "-r", dest="restart", default=False, action="store_true", help="Restart a partially completed job [Default: %default]", ) (options, args) = parser.parse_args() if len(args) != 3: print(args) parser.error("Must provide parameters and model files and BED file") else: params_file = args[0] model_file = args[1] bed_file = args[2] ####################################################### # prep work # output directory if not options.restart: if os.path.isdir(options.out_dir): print("Please remove %s" % options.out_dir, file=sys.stderr) exit(1) os.mkdir(options.out_dir) # pickle options options_pkl_file = "%s/options.pkl" % options.out_dir options_pkl = open(options_pkl_file, "wb") pickle.dump(options, options_pkl) options_pkl.close() ####################################################### # launch worker threads jobs = [] for pi in range(options.processes): if not options.restart or not job_completed(options, pi): cmd = "source activate tf1.12-gpu; basenji_sat_bed.py %s %s %d" % ( options_pkl_file, " ".join(args), pi, ) name = "%s_p%d" % (options.name, pi) outf = "%s/job%d.out" % (options.out_dir, pi) errf = "%s/job%d.err" % (options.out_dir, pi) j = slurm.Job( cmd, name, outf, errf, queue=options.queue, gpu=1, mem=30000, time="14-0:0:0", ) jobs.append(j) slurm.multi_run(jobs, max_proc=options.processes, verbose=True, launch_sleep=10, update_sleep=60) ####################################################### # collect output collect_h5(options.out_dir, options.processes)
def main(): usage = 'usage: %prog [options] <params_file> <model_file> <bed_file>' parser = OptionParser(usage) # basenji_sat_bed.py options parser.add_option('-f', dest='genome_fasta', default='%s/assembly/hg19.fa' % os.environ['HG19'], help='Genome FASTA for sequences [Default: %default]') parser.add_option( '-l', dest='mut_len', default=200, type='int', help='Length of center sequence to mutate [Default: %default]') parser.add_option('-o', dest='out_dir', default='sat_mut', help='Output directory [Default: %default]') parser.add_option('--plots', dest='plots', default=False, action='store_true', help='Make heatmap plots [Default: %default]') parser.add_option( '--rc', dest='rc', default=False, action='store_true', help= 'Ensemble forward and reverse complement predictions [Default: %default]' ) parser.add_option('--shifts', dest='shifts', default='0', help='Ensemble prediction shifts [Default: %default]') parser.add_option( '-t', dest='targets_file', default=None, type='str', help='File specifying target indexes and labels in table format') # _multi.py options parser.add_option('-p', dest='processes', default=None, type='int', help='Number of processes, passed by multi script') parser.add_option( '-q', dest='queue', default='k80', help='SLURM queue on which to run the jobs [Default: %default]') parser.add_option( '-r', dest='restart', default=False, action='store_true', help='Restart a partially completed job [Default: %default]') (options, args) = parser.parse_args() if len(args) != 3: parser.error('Must provide parameters and model files and VCF file') else: params_file = args[0] model_file = args[1] bed_file = args[2] ####################################################### # prep work # output directory if not options.restart: if os.path.isdir(options.out_dir): print('Please remove %s' % options.out_dir, file=sys.stderr) exit(1) os.mkdir(options.out_dir) # pickle options options_pkl_file = '%s/options.pkl' % options.out_dir options_pkl = open(options_pkl_file, 'wb') pickle.dump(options, options_pkl) options_pkl.close() ####################################################### # launch worker threads jobs = [] for pi in range(options.processes): if not options.restart or not job_completed(options, pi): cmd = 'source activate py3_gpu; basenji_sat_bed.py %s %s %d' % ( options_pkl_file, ' '.join(args), pi) name = 'sat_p%d' % pi outf = '%s/job%d.out' % (options.out_dir, pi) errf = '%s/job%d.err' % (options.out_dir, pi) j = slurm.Job(cmd, name, outf, errf, queue=options.queue, gpu=1, mem=30000, time='14-0:0:0') jobs.append(j) slurm.multi_run(jobs, max_proc=options.processes, verbose=True, launch_sleep=10, update_sleep=60) ####################################################### # collect output collect_h5(options.out_dir, options.processes)
def main(): usage = 'usage: %prog [options] <exp_dir> <params_file> <data_dir>' parser = OptionParser(usage) parser.add_option('-a', '--alt', dest='alternative', default='two-sided', help='Statistical test alternative [Default: %default]') parser.add_option('-c', dest='crosses', default=1, type='int', help='Number of cross-fold rounds [Default:%default]') parser.add_option('-e', dest='conda_env', default='tf2-gpu', help='Anaconda environment [Default: %default]') parser.add_option('--l1', dest='label1', default='Reference', help='Reference label [Default: %default]') parser.add_option('--l2', dest='label2', default='Experiment', help='Experiment label [Default: %default]') parser.add_option('--name', dest='name', default='test', help='SLURM name prefix [Default: %default]') parser.add_option('-o', dest='out_stem', default=None, help='Outplut plot stem [Default: %default]') parser.add_option('-q', dest='queue', default='gtx1080ti') parser.add_option('-r', dest='ref_dir', default=None, help='Reference directory for statistical tests') parser.add_option( '--rc', dest='rc', default=False, action='store_true', help= 'Average forward and reverse complement predictions [Default: %default]' ) parser.add_option('--shifts', dest='shifts', default='0', type='str', help='Ensemble prediction shifts [Default: %default]') parser.add_option('--spec', dest='specificity', default=False, action='store_true', help='Test specificity [Default: %default]') parser.add_option('--train', dest='train', default=False, action='store_true', help='Test on the training set, too [Default: %default]') (options, args) = parser.parse_args() if len(args) != 3: parser.error('Must provide parameters file and data directory') else: exp_dir = args[0] params_file = args[1] data_dir = args[2] # read data parameters data_stats_file = '%s/statistics.json' % data_dir with open(data_stats_file) as data_stats_open: data_stats = json.load(data_stats_open) # count folds num_folds = len([dkey for dkey in data_stats if dkey.startswith('fold')]) ################################################################ # test check ################################################################ jobs = [] if options.train: for ci in range(options.crosses): for fi in range(num_folds): it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci) # check if done acc_file = '%s/test_train/acc.txt' % it_dir if os.path.isfile(acc_file): print('%s already generated.' % acc_file) else: # basenji test basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;' basenji_cmd += ' conda activate %s;' % options.conda_env basenji_cmd += ' basenji_test.py' basenji_cmd += ' -o %s/test_train' % it_dir if options.rc: basenji_cmd += ' --rc' if options.shifts: basenji_cmd += ' --shifts %s' % options.shifts basenji_cmd += ' --split train' basenji_cmd += ' %s' % params_file basenji_cmd += ' %s/train/model_check.h5' % it_dir basenji_cmd += ' %s/data' % it_dir name = '%s-testtr-f%dc%d' % (options.name, fi, ci) basenji_job = slurm.Job( basenji_cmd, name=name, out_file='%s/test_train.out' % it_dir, err_file='%s/test_train.err' % it_dir, queue=options.queue, cpu=1, gpu=1, mem=23000, time='4:00:00') jobs.append(basenji_job) ################################################################ # test best ################################################################ for ci in range(options.crosses): for fi in range(num_folds): it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci) # check if done acc_file = '%s/test/acc.txt' % it_dir if os.path.isfile(acc_file): print('%s already generated.' 
% acc_file) else: # basenji test basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;' basenji_cmd += ' conda activate %s;' % options.conda_env basenji_cmd += ' basenji_test.py' basenji_cmd += ' -o %s/test' % it_dir if options.rc: basenji_cmd += ' --rc' if options.shifts: basenji_cmd += ' --shifts %s' % options.shifts basenji_cmd += ' %s' % params_file basenji_cmd += ' %s/train/model_best.h5' % it_dir basenji_cmd += ' %s/data' % it_dir name = '%s-test-f%dc%d' % (options.name, fi, ci) basenji_job = slurm.Job(basenji_cmd, name=name, out_file='%s/test.out' % it_dir, err_file='%s/test.err' % it_dir, queue=options.queue, cpu=1, gpu=1, mem=23000, time='4:00:00') jobs.append(basenji_job) ################################################################ # test best specificity ################################################################ if options.specificity: for ci in range(options.crosses): for fi in range(num_folds): it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci) # check if done acc_file = '%s/test_spec/acc.txt' % it_dir if os.path.isfile(acc_file): print('%s already generated.' % acc_file) else: # basenji test basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;' basenji_cmd += ' conda activate %s;' % options.conda_env basenji_cmd += ' basenji_test_specificity.py' basenji_cmd += ' -o %s/test_spec' % it_dir if options.rc: basenji_cmd += ' --rc' if options.shifts: basenji_cmd += ' --shifts %s' % options.shifts basenji_cmd += ' %s' % params_file basenji_cmd += ' %s/train/model_best.h5' % it_dir basenji_cmd += ' %s/data' % it_dir name = '%s-spec-f%dc%d' % (options.name, fi, ci) basenji_job = slurm.Job( basenji_cmd, name=name, out_file='%s/test_spec.out' % it_dir, err_file='%s/test_spec.err' % it_dir, queue=options.queue, cpu=1, gpu=1, mem=60000, time='6:00:00') jobs.append(basenji_job) slurm.multi_run(jobs, verbose=True) if options.ref_dir is not None: # classification or regression with open('%s/f0_c0/test/acc.txt' % exp_dir) as test0_open: header = test0_open.readline().split() if 'pearsonr' in header: metric = 'pearsonr' else: metric = 'auprc' ################################################################ # compare checkpoint on training set ################################################################ if options.train: ref_glob_str = '%s/*/test_train/acc.txt' % options.ref_dir ref_cors, ref_mean, ref_stdm = read_metrics(ref_glob_str, metric) exp_glob_str = '%s/*/test_train/acc.txt' % exp_dir exp_cors, exp_mean, exp_stdm = read_metrics(exp_glob_str, metric) mwp, tp = stat_tests(ref_cors, exp_cors, options.alternative) print('\nTrain:') print('%12s %s: %.4f (%.4f)' % (options.label1, metric, ref_mean, ref_stdm)) print('%12s %s: %.4f (%.4f)' % (options.label2, metric, exp_mean, exp_stdm)) print('Mann-Whitney U p-value: %.3g' % mwp) print('T-test p-value: %.3g' % tp) if options.out_stem is not None: jointplot(ref_cors, exp_cors, '%s_train.pdf' % options.out_stem, options.label1, options.label2) ################################################################ # compare best on test set ################################################################ ref_glob_str = '%s/*/test/acc.txt' % options.ref_dir ref_cors, ref_mean, ref_stdm = read_metrics(ref_glob_str, metric) exp_glob_str = '%s/*/test/acc.txt' % exp_dir exp_cors, exp_mean, exp_stdm = read_metrics(exp_glob_str, metric) mwp, tp = stat_tests(ref_cors, exp_cors, options.alternative) print('\nTest:') print('%12s %s: %.4f (%.4f)' % (options.label1, metric, ref_mean, ref_stdm)) print('%12s %s: %.4f (%.4f)' % (options.label2, 
metric, exp_mean, exp_stdm)) print('Mann-Whitney U p-value: %.3g' % mwp) print('T-test p-value: %.3g' % tp) if options.out_stem is not None: jointplot(ref_cors, exp_cors, '%s_test.pdf' % options.out_stem, options.label1, options.label2) ################################################################ # compare best on test set specificity ################################################################ if options.specificity: ref_glob_str = '%s/*/test_spec/acc.txt' % options.ref_dir ref_cors, ref_mean, ref_stdm = read_metrics(ref_glob_str, metric) exp_glob_str = '%s/*/test_spec/acc.txt' % exp_dir exp_cors, exp_mean, exp_stdm = read_metrics(exp_glob_str, metric) mwp, tp = stat_tests(ref_cors, exp_cors, options.alternative) print('\nSpecificity:') print('%12s %s: %.4f (%.4f)' % (options.label1, metric, ref_mean, ref_stdm)) print('%12s %s: %.4f (%.4f)' % (options.label2, metric, exp_mean, exp_stdm)) print('Mann-Whitney U p-value: %.3g' % mwp) print('T-test p-value: %.3g' % tp) if options.out_stem is not None: jointplot(ref_cors, exp_cors, '%s_spec.pdf' % options.out_stem, options.label1, options.label2)
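# Hedged sketch: jointplot() above writes a reference-vs-experiment scatter to a
# PDF. A minimal seaborn-based version (the styling and the y=x guide line are
# assumptions, not the source's plot) could be:
import seaborn as sns

def jointplot(ref_cors, exp_cors, out_pdf, label1='Reference', label2='Experiment'):
  """Scatter per-run reference vs experiment metrics and save to PDF."""
  g = sns.jointplot(x=ref_cors, y=exp_cors, space=0)
  vmin = min(min(ref_cors), min(exp_cors))
  vmax = max(max(ref_cors), max(exp_cors))
  g.ax_joint.plot([vmin, vmax], [vmin, vmax], linestyle='--', color='gold')
  g.set_axis_labels(label1, label2)
  g.savefig(out_pdf)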
def main(): usage = ( "usage: %prog [options] <params_file> <model_file> <genes_hdf5_file>" " <vcf_file>") parser = OptionParser(usage) parser.add_option( "-a", dest="all_sed", default=False, action="store_true", help= "Print all variant-gene pairs, as opposed to only nonzero [Default: %default]", ) parser.add_option( "-b", dest="batch_size", default=None, type="int", help="Batch size [Default: %default]", ) parser.add_option( "-c", dest="csv", default=False, action="store_true", help="Print table as CSV [Default: %default]", ) parser.add_option( "-g", dest="genome_file", default="%s/data/human.hg19.genome" % os.environ["BASENJIDIR"], help="Chromosome lengths file [Default: %default]", ) parser.add_option( "-o", dest="out_dir", default="sed", help="Output directory for tables and plots [Default: %default]", ) parser.add_option( "-p", dest="processes", default=2, type="int", help="Number of parallel processes to run [Default: %default]", ) parser.add_option( "--pseudo", dest="log_pseudo", default=0.125, type="float", help="Log2 pseudocount [Default: %default]", ) parser.add_option( "-q", dest="queue", default="k80", help="SLURM queue on which to run the jobs [Default: %default]", ) parser.add_option( "-r", dest="tss_radius", default=0, type="int", help= "Radius of bins considered to quantify TSS transcription [Default: %default]", ) parser.add_option( "--rc", dest="rc", default=False, action="store_true", help= "Average the forward and reverse complement predictions when testing [Default: %default]", ) parser.add_option( "--shifts", dest="shifts", default="0", help="Ensemble prediction shifts [Default: %default]", ) parser.add_option( "-t", dest="targets_file", default=None, help="File specifying target indexes and labels in table format.", ) parser.add_option( "--ti", dest="track_indexes", help="Comma-separated list of target indexes to output BigWig tracks", ) parser.add_option( "-u", dest="penultimate", default=False, action="store_true", help="Compute SED in the penultimate layer [Default: %default]", ) parser.add_option( "-x", dest="tss_table", default=False, action="store_true", help="Print TSS table in addition to gene [Default: %default]", ) (options, args) = parser.parse_args() if len(args) != 4: parser.error( "Must provide parameters and model files, genes HDF5 file, and QTL VCF" " file") else: params_file = args[0] model_file = args[1] genes_hdf5_file = args[2] vcf_file = args[3] ####################################################### # prep work # output directory if os.path.isdir(options.out_dir): shutil.rmtree(options.out_dir) os.mkdir(options.out_dir) # pickle options options_pkl_file = "%s/options.pkl" % options.out_dir options_pkl = open(options_pkl_file, "wb") pickle.dump(options, options_pkl) options_pkl.close() ####################################################### # launch worker threads jobs = [] for pi in range(options.processes): cmd = "source activate py3_gpu; basenji_sed.py %s %s %d" % ( options_pkl_file, " ".join(args), pi, ) name = "sed_p%d" % pi outf = "%s/job%d.out" % (options.out_dir, pi) errf = "%s/job%d.err" % (options.out_dir, pi) j = slurm.Job(cmd, name, outf, errf, queue=options.queue, mem=30000, time="4:0:0", gpu=1) jobs.append(j) slurm.multi_run(jobs, max_proc=options.processes, verbose=True, sleep_time=60) ####################################################### # collect output collect_table_multi("sed_gene.txt", options.out_dir, options.processes, options.log_pseudo) if options.tss_table: collect_table("sed_tss.txt", options.out_dir, 
options.processes) if options.track_indexes is not None: if not os.path.isdir("%s/tracks" % options.out_dir): os.mkdir("%s/tracks" % options.out_dir) for track_file in glob.glob("%s/job*/tracks/*" % options.out_dir): track_base = os.path.split(track_file)[1] os.rename(track_file, "%s/tracks/%s" % (options.out_dir, track_base)) for pi in range(options.processes): shutil.rmtree("%s/job%d" % (options.out_dir, pi))
def main(): usage = 'usage: %prog [options] <fasta_file> <targets_file>' parser = OptionParser(usage) parser.add_option('-b', dest='blacklist_bed', help='Set blacklist nucleotides to a baseline value.') parser.add_option( '--break', dest='break_t', default=786432, type='int', help='Break in half contigs above length [Default: %default]') # parser.add_option('-c', dest='clip', # default=None, type='float', # help='Clip target values to have minimum [Default: %default]') parser.add_option('-d', dest='sample_pct', default=1.0, type='float', help='Down-sample the segments') parser.add_option('-g', dest='gaps_file', help='Genome assembly gaps BED [Default: %default]') parser.add_option('-l', dest='seq_length', default=131072, type='int', help='Sequence length [Default: %default]') parser.add_option( '--limit', dest='limit_bed', help='Limit to segments that overlap regions in a BED file') parser.add_option( '--local', dest='run_local', default=False, action='store_true', help='Run jobs locally as opposed to on SLURM [Default: %default]') parser.add_option('-o', dest='out_dir', default='data_out', help='Output directory [Default: %default]') parser.add_option('-p', dest='processes', default=None, type='int', help='Number parallel processes [Default: %default]') parser.add_option('-r', dest='seqs_per_tfr', default=256, type='int', help='Sequences per TFRecord file [Default: %default]') parser.add_option('--seed', dest='seed', default=44, type='int', help='Random seed [Default: %default]') parser.add_option( '--stride_train', dest='stride_train', default=1., type='float', help='Stride to advance train sequences [Default: seq_length]') parser.add_option( '--stride_test', dest='stride_test', default=1., type='float', help='Stride to advance valid and test sequences [Default: seq_length]' ) parser.add_option( '--soft', dest='soft_clip', default=False, action='store_true', help= 'Soft clip values, applying sqrt to the execess above the threshold [Default: %default]' ) parser.add_option( '-t', dest='test_pct_or_chr', default=0.05, type='str', help='Proportion of the data for testing [Default: %default]') parser.add_option('-u', dest='umap_bed', help='Unmappable regions in BED format') parser.add_option( '--umap_t', dest='umap_t', default=0.3, type='float', help= 'Remove sequences with more than this unmappable bin % [Default: %default]' ) parser.add_option( '--umap_set', dest='umap_set', default=None, type='float', help= 'Set unmappable regions to this percentile in the sequences\' distribution of values' ) parser.add_option('-w', dest='pool_width', default=128, type='int', help='Sum pool width [Default: %default]') parser.add_option( '-v', dest='valid_pct_or_chr', default=0.05, type='str', help='Proportion of the data for validation [Default: %default]') (options, args) = parser.parse_args() if len(args) != 2: parser.error( 'Must provide FASTA and sample coverage labels and paths.') else: fasta_file = args[0] targets_file = args[1] random.seed(options.seed) np.random.seed(options.seed) if not os.path.isdir(options.out_dir): os.mkdir(options.out_dir) if options.stride_train <= 0 or options.stride_train > 1: parser.error('Train stride =%f must be in [0,1]' % options.stride_train) if options.stride_test <= 0 or options.stride_test > 1: parser.error('Test stride =%f must be in [0,1]' % options.stride_test) ################################################################ # define genomic contigs ################################################################ chrom_contigs = 
genome.load_chromosomes(fasta_file) # remove gaps if options.gaps_file: chrom_contigs = genome.split_contigs(chrom_contigs, options.gaps_file) # ditch the chromosomes for contigs contigs = [] for chrom in chrom_contigs: contigs += [ Contig(chrom, ctg_start, ctg_end) for ctg_start, ctg_end in chrom_contigs[chrom] ] # limit to a BED file if options.limit_bed is not None: contigs = limit_contigs(contigs, options.limit_bed) # filter for large enough contigs = [ ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length ] # break up large contigs if options.break_t is not None: contigs = break_large_contigs(contigs, options.break_t) # print contigs to BED file ctg_bed_file = '%s/contigs.bed' % options.out_dir write_seqs_bed(ctg_bed_file, contigs) ################################################################ # divide between train/valid/test ################################################################ try: # convert to float pct valid_pct = float(options.valid_pct_or_chr) test_pct = float(options.test_pct_or_chr) assert (0 <= valid_pct <= 1) assert (0 <= test_pct <= 1) # divide by pct contig_sets = divide_contigs_pct(contigs, test_pct, valid_pct) except (ValueError, AssertionError): # divide by chr valid_chr = options.valid_pct_or_chr test_chr = options.test_pct_or_chr contig_sets = divide_contigs_chr(contigs, test_chr, valid_chr) train_contigs, valid_contigs, test_contigs = contig_sets # rejoin broken contigs within set train_contigs = rejoin_large_contigs(train_contigs) valid_contigs = rejoin_large_contigs(valid_contigs) test_contigs = rejoin_large_contigs(test_contigs) ################################################################ # define model sequences ################################################################ # stride sequences across contig train_mseqs = contig_sequences(train_contigs, options.seq_length, options.stride_train, label='train') valid_mseqs = contig_sequences(valid_contigs, options.seq_length, options.stride_test, label='valid') test_mseqs = contig_sequences(test_contigs, options.seq_length, options.stride_test, label='test') # shuffle random.shuffle(train_mseqs) random.shuffle(valid_mseqs) random.shuffle(test_mseqs) # merge mseqs = train_mseqs + valid_mseqs + test_mseqs ################################################################ # mappability ################################################################ if options.umap_bed is not None: # annotate unmappable positions mseqs_unmap = annotate_unmap(mseqs, options.umap_bed, options.seq_length, options.pool_width) # filter unmappable mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') < options.umap_t) mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]] mseqs_unmap = mseqs_unmap[mseqs_map_mask, :] # write to file unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir np.save(unmap_npy, mseqs_unmap) # down-sample if options.sample_pct < 1.0: mseqs = random.sample(mseqs, int(options.sample_pct * len(contigs))) # write sequences to BED seqs_bed_file = '%s/sequences.bed' % options.out_dir write_seqs_bed(seqs_bed_file, mseqs, True) ################################################################ # read sequence coverage values ################################################################ # read target datasets targets_df = pd.read_table(targets_file, index_col=0) seqs_cov_dir = '%s/seqs_cov' % options.out_dir if not os.path.isdir(seqs_cov_dir): os.mkdir(seqs_cov_dir) read_jobs = [] for ti in range(targets_df.shape[0]): genome_cov_file = targets_df['file'].iloc[ti] seqs_cov_stem = 
'%s/%d' % (seqs_cov_dir, ti) seqs_cov_file = '%s.h5' % seqs_cov_stem clip_ti = None if 'clip' in targets_df.columns: clip_ti = targets_df['clip'].iloc[ti] scale_ti = 1 if 'scale' in targets_df.columns: scale_ti = targets_df['scale'].iloc[ti] if os.path.isfile(seqs_cov_file): print('Skipping existing %s' % seqs_cov_file, file=sys.stderr) else: cmd = 'basenji_data_read.py' cmd += ' -w %d' % options.pool_width cmd += ' -u %s' % targets_df['sum_stat'].iloc[ti] if clip_ti is not None: cmd += ' -c %f' % clip_ti if options.soft_clip: cmd += ' --soft' cmd += ' -s %f' % scale_ti if options.blacklist_bed: cmd += ' -b %s' % options.blacklist_bed cmd += ' %s' % genome_cov_file cmd += ' %s' % seqs_bed_file cmd += ' %s' % seqs_cov_file if options.run_local: cmd += ' &> %s.err' % seqs_cov_stem read_jobs.append(cmd) else: j = slurm.Job(cmd, name='read_t%d' % ti, out_file='%s.out' % seqs_cov_stem, err_file='%s.err' % seqs_cov_stem, queue='standard,tbdisk', mem=15000, time='12:0:0') read_jobs.append(j) if options.run_local: util.exec_par(read_jobs, options.processes, verbose=True) else: slurm.multi_run(read_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5) ################################################################ # write TF Records ################################################################ # copy targets file shutil.copy(targets_file, '%s/targets.txt' % options.out_dir) # initialize TF Records dir tfr_dir = '%s/tfrecords' % options.out_dir if not os.path.isdir(tfr_dir): os.mkdir(tfr_dir) write_jobs = [] for tvt_set in ['train', 'valid', 'test']: tvt_set_indexes = [ i for i in range(len(mseqs)) if mseqs[i].label == tvt_set ] tvt_set_start = tvt_set_indexes[0] tvt_set_end = tvt_set_indexes[-1] + 1 tfr_i = 0 tfr_start = tvt_set_start tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end) while tfr_start <= tvt_set_end: tfr_stem = '%s/%s-%d' % (tfr_dir, tvt_set, tfr_i) cmd = 'basenji_data_write.py' cmd += ' -s %d' % tfr_start cmd += ' -e %d' % tfr_end if options.umap_bed is not None: cmd += ' -u %s' % unmap_npy if options.umap_set is not None: cmd += ' --umap_set %f' % options.umap_set cmd += ' %s' % fasta_file cmd += ' %s' % seqs_bed_file cmd += ' %s' % seqs_cov_dir cmd += ' %s.tfr' % tfr_stem if options.run_local: cmd += ' &> %s.err' % tfr_stem write_jobs.append(cmd) else: j = slurm.Job(cmd, name='write_%s-%d' % (tvt_set, tfr_i), out_file='%s.out' % tfr_stem, err_file='%s.err' % tfr_stem, queue='standard,tbdisk', mem=15000, time='12:0:0') write_jobs.append(j) # update tfr_i += 1 tfr_start += options.seqs_per_tfr tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end) if options.run_local: util.exec_par(write_jobs, options.processes, verbose=True) else: slurm.multi_run(write_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5)
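# Hedged sketch: write_seqs_bed() records the contigs and model sequences built
# above. The attribute names (chr, start, end, label) on the Contig / model-sequence
# objects are assumptions inferred from how they are constructed in this excerpt.
def write_seqs_bed(bed_file, seqs, labels=False):
  """Write sequence intervals, optionally with train/valid/test labels, to BED."""
  with open(bed_file, 'w') as bed_out:
    for seq in seqs:
      line = '%s\t%d\t%d' % (seq.chr, seq.start, seq.end)
      if labels:
        line += '\t%s' % seq.label
      print(line, file=bed_out)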
def main(): usage = 'usage: %prog [options] <params_file> <model_file>' parser = OptionParser(usage) # sad parser.add_option('-f', dest='genome_fasta', default='%s/data/hg38.fa' % os.environ['BASENJIDIR'], help='Genome FASTA for sequences [Default: %default]') parser.add_option('--local', dest='local', default=1024, type='int', help='Local SAD score [Default: %default]') parser.add_option('-n', dest='norm_file', default=None, help='Normalize SAD scores') parser.add_option( '-o', dest='out_dir', default='sad_gtex', help='Output directory for tables and plots [Default: %default]') parser.add_option('--pseudo', dest='log_pseudo', default=1, type='float', help='Log2 pseudocount [Default: %default]') parser.add_option( '--rc', dest='rc', default=False, action='store_true', help= 'Average forward and reverse complement predictions [Default: %default]' ) parser.add_option('--shifts', dest='shifts', default='0', type='str', help='Ensemble prediction shifts [Default: %default]') parser.add_option( '--stats', dest='sad_stats', default='SAD', help='Comma-separated list of stats to save. [Default: %default]') parser.add_option( '-t', dest='targets_file', default=None, type='str', help='File specifying target indexes and labels in table format') parser.add_option( '--ti', dest='track_indexes', default=None, type='str', help='Comma-separated list of target indexes to output BigWig tracks') parser.add_option( '--threads', dest='threads', default=False, action='store_true', help='Run CPU math and output in a separate thread [Default: %default]' ) parser.add_option( '-u', dest='penultimate', default=False, action='store_true', help='Compute SED in the penultimate layer [Default: %default]') # multi parser.add_option('-e', dest='conda_env', default='tf2.2-gpu', help='Anaconda environment [Default: %default]') parser.add_option('-g', dest='gtex_vcf_dir', default='/home/drk/seqnn/data/gtex_fine/susie_pip90') parser.add_option('--name', dest='name', default='gtex', help='SLURM name prefix [Default: %default]') parser.add_option('--max_proc', dest='max_proc', default=None, type='int', help='Maximum concurrent processes [Default: %default]') parser.add_option('-p', dest='processes', default=None, type='int', help='Number of processes, passed by multi script. \ (Unused, but needs to appear as dummy.)') parser.add_option( '-q', dest='queue', default='gtx1080ti', help='SLURM queue on which to run the jobs [Default: %default]') parser.add_option( '-r', dest='restart', default=False, action='store_true', help='Restart a partially completed job [Default: %default]') (options, args) = parser.parse_args() if len(args) != 2: parser.error('Must provide parameters and model files') else: params_file = args[0] model_file = args[1] ####################################################### # prep work # output directory if not options.restart: if os.path.isdir(options.out_dir): print('Please remove %s' % options.out_dir, file=sys.stderr) exit(1) os.mkdir(options.out_dir) # pickle options options_pkl_file = '%s/options.pkl' % options.out_dir options_pkl = open(options_pkl_file, 'wb') pickle.dump(options, options_pkl) options_pkl.close() ####################################################### # predict cmd_base = '. 
/home/drk/anaconda3/etc/profile.d/conda.sh;' cmd_base += ' conda activate %s;' % options.conda_env cmd_base += ' basenji_sad.py %s %s %s' % (options_pkl_file, params_file, model_file) jobs = [] for gtex_pos_vcf in glob.glob('%s/*_pos.vcf' % options.gtex_vcf_dir): # positive job job_base = os.path.splitext(os.path.split(gtex_pos_vcf)[1])[0] out_dir = '%s/%s' % (options.out_dir, job_base) if not options.restart or not os.path.isfile('%s/sad.h5' % out_dir): cmd = '%s -o %s %s' % (cmd_base, out_dir, gtex_pos_vcf) name = '%s_%s' % (options.name, job_base) j = slurm.Job(cmd, name, '%s.out' % out_dir, '%s.err' % out_dir, queue=options.queue, gpu=1, mem=22000, time='1-0:0:0') jobs.append(j) # negative job gtex_neg_vcf = gtex_pos_vcf.replace('_pos.', '_neg.') job_base = os.path.splitext(os.path.split(gtex_neg_vcf)[1])[0] out_dir = '%s/%s' % (options.out_dir, job_base) if not options.restart or not os.path.isfile('%s/sad.h5' % out_dir): cmd = '%s -o %s %s' % (cmd_base, out_dir, gtex_neg_vcf) name = '%s_%s' % (options.name, job_base) j = slurm.Job(cmd, name, '%s.out' % out_dir, '%s.err' % out_dir, queue=options.queue, gpu=1, mem=22000, time='1-0:0:0') jobs.append(j) slurm.multi_run(jobs, max_proc=options.max_proc, verbose=True, launch_sleep=10, update_sleep=60) ####################################################### # classify cmd_base = 'basenji_bench_classify.py -i 100 -p 2 -r 44 -s' jobs = [] for gtex_pos_vcf in glob.glob('%s/*_pos.vcf' % options.gtex_vcf_dir): tissue = os.path.splitext(os.path.split(gtex_pos_vcf)[1])[0][:-4] sad_pos = '%s/%s_pos/sad.h5' % (options.out_dir, tissue) sad_neg = '%s/%s_neg/sad.h5' % (options.out_dir, tissue) out_dir = '%s/%s_class' % (options.out_dir, tissue) if not options.restart or not os.path.isfile('%s/stats.txt' % out_dir): cmd = '%s -o %s %s %s' % (cmd_base, out_dir, sad_pos, sad_neg) j = slurm.Job(cmd, tissue, '%s.out' % out_dir, '%s.err' % out_dir, queue='standard', cpu=2, mem=22000, time='1-0:0:0') jobs.append(j) slurm.multi_run(jobs, verbose=True)
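The classification step above pairs each tissue's positive and negative SuSiE VCFs purely by filename; a small standalone illustration of that pairing (the directory path is hypothetical):

import glob
import os

gtex_vcf_dir = '/path/to/gtex_fine/susie_pip90'  # hypothetical directory
for gtex_pos_vcf in glob.glob('%s/*_pos.vcf' % gtex_vcf_dir):
  # 'Liver_pos.vcf' -> tissue 'Liver'; the negative set shares the stem
  tissue = os.path.splitext(os.path.split(gtex_pos_vcf)[1])[0][:-4]
  gtex_neg_vcf = gtex_pos_vcf.replace('_pos.', '_neg.')
  print(tissue, gtex_pos_vcf, gtex_neg_vcf)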
def main(): usage = 'usage: %prog [options] <exp_dir> <params_file> <data1_dir> ...' parser = OptionParser(usage) parser.add_option('-a', '--alt', dest='alternative', default='two-sided', help='Statistical test alternative [Default: %default]') parser.add_option('-c', dest='crosses', default=1, type='int', help='Number of cross-fold rounds [Default:%default]') parser.add_option('-d', dest='dataset_i', default=None, type='int', help='Dataset index [Default:%default]') parser.add_option('--d_ref', dest='dataset_ref_i', default=None, type='int', help='Reference Dataset index [Default:%default]') parser.add_option('-e', dest='conda_env', default='tf2-gpu', help='Anaconda environment [Default: %default]') parser.add_option('-f', dest='fold_subset', default=None, type='int', help='Run a subset of folds [Default:%default]') parser.add_option('--label_exp', dest='label_exp', default='Experiment', help='Experiment label [Default: %default]') parser.add_option('--label_ref', dest='label_ref', default='Reference', help='Reference label [Default: %default]') parser.add_option('-m', dest='metric', default=None, help='Train/test metric [Default: Pearsonr or AUPRC]') parser.add_option('--name', dest='name', default='test', help='SLURM name prefix [Default: %default]') parser.add_option('-o', dest='out_stem', default=None, help='Output plot stem [Default: %default]') parser.add_option('-q', dest='queue', default='gtx1080ti') parser.add_option('-r', dest='ref_dir', default=None, help='Reference directory for statistical tests') parser.add_option( '--rc', dest='rc', default=False, action='store_true', help= 'Average forward and reverse complement predictions [Default: %default]' ) parser.add_option('--shifts', dest='shifts', default='0', type='str', help='Ensemble prediction shifts [Default: %default]') parser.add_option('--spec', dest='specificity', default=False, action='store_true', help='Test specificity [Default: %default]') parser.add_option('--train', dest='train', default=False, action='store_true', help='Test on the training set, too [Default: %default]') (options, args) = parser.parse_args() if len(args) < 3: parser.error('Must provide parameters file and data directory') else: exp_dir = args[0] params_file = args[1] data_dirs = [os.path.abspath(arg) for arg in args[2:]] # read data parameters data_stats_file = '%s/statistics.json' % data_dirs[0] with open(data_stats_file) as data_stats_open: data_stats = json.load(data_stats_open) if options.dataset_i is None: head_i = 0 else: head_i = options.dataset_i # count folds num_folds = len([dkey for dkey in data_stats if dkey.startswith('fold')]) # subset folds if options.fold_subset is not None: num_folds = min(options.fold_subset, num_folds) ################################################################ # test check ################################################################ jobs = [] if options.train: for ci in range(options.crosses): for fi in range(num_folds): it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci) if options.dataset_i is None: out_dir = '%s/test_train' % it_dir model_file = '%s/train/model_check.h5' % it_dir else: out_dir = '%s/test%d_train' % (it_dir, options.dataset_i) model_file = '%s/train/model%d_check.h5' % ( it_dir, options.dataset_i) # check if done acc_file = '%s/acc.txt' % out_dir if os.path.isfile(acc_file): # print('%s already generated.' % acc_file) pass else: cmd = '. 
/home/drk/anaconda3/etc/profile.d/conda.sh;' cmd += ' conda activate %s;' % options.conda_env cmd += ' basenji_test.py' cmd += ' --head %d' % head_i cmd += ' -o %s' % out_dir if options.rc: cmd += ' --rc' if options.shifts: cmd += ' --shifts %s' % options.shifts cmd += ' --split train' cmd += ' %s' % params_file cmd += ' %s' % model_file cmd += ' %s/data%d' % (it_dir, head_i) name = '%s-testtr-f%dc%d' % (options.name, fi, ci) j = slurm.Job(cmd, name=name, out_file='%s.out' % out_dir, err_file='%s.err' % out_dir, queue=options.queue, cpu=1, gpu=1, mem=23000, time='4:00:00') jobs.append(j) ################################################################ # test best ################################################################ for ci in range(options.crosses): for fi in range(num_folds): it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci) if options.dataset_i is None: out_dir = '%s/test' % it_dir model_file = '%s/train/model_best.h5' % it_dir else: out_dir = '%s/test%d' % (it_dir, options.dataset_i) model_file = '%s/train/model%d_best.h5' % (it_dir, options.dataset_i) # check if done acc_file = '%s/acc.txt' % out_dir if os.path.isfile(acc_file): # print('%s already generated.' % acc_file) pass else: # basenji test cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;' cmd += ' conda activate %s;' % options.conda_env cmd += ' basenji_test.py' cmd += ' --head %d' % head_i cmd += ' -o %s' % out_dir if options.rc: cmd += ' --rc' if options.shifts: cmd += ' --shifts %s' % options.shifts cmd += ' %s' % params_file cmd += ' %s' % model_file cmd += ' %s/data%d' % (it_dir, head_i) name = '%s-test-f%dc%d' % (options.name, fi, ci) j = slurm.Job(cmd, name=name, out_file='%s.out' % out_dir, err_file='%s.err' % out_dir, queue=options.queue, cpu=1, gpu=1, mem=23000, time='4:00:00') jobs.append(j) ################################################################ # test best specificity ################################################################ if options.specificity: for ci in range(options.crosses): for fi in range(num_folds): it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci) if options.dataset_i is None: out_dir = '%s/test_spec' % it_dir model_file = '%s/train/model_best.h5' % it_dir else: out_dir = '%s/test%d_spec' % (it_dir, options.dataset_i) model_file = '%s/train/model%d_best.h5' % ( it_dir, options.dataset_i) # check if done acc_file = '%s/acc.txt' % out_dir if os.path.isfile(acc_file): # print('%s already generated.' % acc_file) pass else: # basenji test cmd = '. 
/home/drk/anaconda3/etc/profile.d/conda.sh;' cmd += ' conda activate %s;' % options.conda_env cmd += ' basenji_test_specificity.py' cmd += ' --head %d' % head_i cmd += ' -o %s' % out_dir if options.rc: cmd += ' --rc' if options.shifts: cmd += ' --shifts %s' % options.shifts cmd += ' %s' % params_file cmd += ' %s' % model_file cmd += ' %s/data%d' % (it_dir, head_i) name = '%s-spec-f%dc%d' % (options.name, fi, ci) j = slurm.Job(cmd, name=name, out_file='%s.out' % out_dir, err_file='%s.err' % out_dir, queue=options.queue, cpu=1, gpu=1, mem=75000, time='6:00:00') jobs.append(j) slurm.multi_run(jobs, verbose=True) if options.dataset_i is None: test_prefix = 'test' else: test_prefix = 'test%d' % options.dataset_i if options.dataset_ref_i is None: test_ref_prefix = 'test' else: test_ref_prefix = 'test%d' % options.dataset_ref_i # classification or regression if options.metric is None: with open('%s/f0_c0/%s/acc.txt' % (exp_dir, test_prefix)) as test0_open: header = test0_open.readline().split() if 'pearsonr' in header: options.metric = 'pearsonr' else: options.metric = 'auprc' ################################################################ # compare checkpoint on training set ################################################################ if options.train: exp_glob_str = '%s/*/%s_train/acc.txt' % (exp_dir, test_prefix) exp_cors, exp_mean, exp_stdm = read_metrics(exp_glob_str, options.metric) if options.ref_dir is not None: ref_glob_str = '%s/*/%s_train/acc.txt' % (options.ref_dir, test_ref_prefix) ref_cors, ref_mean, ref_stdm = read_metrics( ref_glob_str, options.metric) mwp, tp = stat_tests(ref_cors, exp_cors, options.alternative) print('\nTrain:') print('%12s %s: %.4f (%.4f)' % (options.label_exp, options.metric, exp_mean, exp_stdm)) if options.ref_dir is not None: print('%12s %s: %.4f (%.4f)' % (options.label_ref, options.metric, ref_mean, ref_stdm)) print('Mann-Whitney U p-value: %.3g' % mwp) print('T-test p-value: %.3g' % tp) if options.out_stem is not None: jointplot(ref_cors, exp_cors, '%s_train.pdf' % options.out_stem, options.label_ref, options.label_exp) ################################################################ # compare best on test set ################################################################ exp_glob_str = '%s/*/%s/acc.txt' % (exp_dir, test_prefix) exp_cors, exp_mean, exp_stdm = read_metrics(exp_glob_str, options.metric) if options.ref_dir is not None: ref_glob_str = '%s/*/%s/acc.txt' % (options.ref_dir, test_ref_prefix) ref_cors, ref_mean, ref_stdm = read_metrics(ref_glob_str, options.metric) mwp, tp = stat_tests(ref_cors, exp_cors, options.alternative) print('\nTest:') print('%12s %s: %.4f (%.4f)' % (options.label_exp, options.metric, exp_mean, exp_stdm)) if options.ref_dir is not None: print('%12s %s: %.4f (%.4f)' % (options.label_ref, options.metric, ref_mean, ref_stdm)) print('Mann-Whitney U p-value: %.3g' % mwp) print('T-test p-value: %.3g' % tp) if options.out_stem is not None: jointplot(ref_cors, exp_cors, '%s_test.pdf' % options.out_stem, options.label_ref, options.label_exp) ################################################################ # compare best on test set specificity ################################################################ if options.specificity: exp_glob_str = '%s/*/%s_spec/acc.txt' % (exp_dir, test_prefix) exp_cors, exp_mean, exp_stdm = read_metrics(exp_glob_str, options.metric) if options.ref_dir is not None: ref_glob_str = '%s/*/%s_spec/acc.txt' % (options.ref_dir, test_ref_prefix) ref_cors, ref_mean, ref_stdm = read_metrics( 
ref_glob_str, options.metric) mwp, tp = stat_tests(ref_cors, exp_cors, options.alternative) print('\nSpecificity:') print('%12s %s: %.4f (%.4f)' % (options.label_exp, options.metric, exp_mean, exp_stdm)) if options.ref_dir is not None: print('%12s %s: %.4f (%.4f)' % (options.label_ref, options.metric, ref_mean, ref_stdm)) print('Mann-Whitney U p-value: %.3g' % mwp) print('T-test p-value: %.3g' % tp) if options.out_stem is not None: jointplot(ref_cors, exp_cors, '%s_spec.pdf' % options.out_stem, options.label_ref, options.label_exp)
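stat_tests() is defined elsewhere in this script; a plausible sketch, assuming it wraps SciPy's Mann-Whitney U and paired t-test and returns the two p-values printed above:

from scipy import stats

def stat_tests(ref_cors, exp_cors, alternative):
  """Assumed sketch: return (Mann-Whitney U p-value, paired t-test p-value).
  The real helper may treat one-sided alternatives differently."""
  _, mwp = stats.mannwhitneyu(exp_cors, ref_cors, alternative=alternative)
  _, tp = stats.ttest_rel(exp_cors, ref_cors)  # assumes fold results are paired
  return mwp, tp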
def main(): usage = 'usage: %prog [options] <params_file> <seed_model> <data_file>' parser = OptionParser(usage) parser.add_option( '-e', dest='num_epochs', default=4, type='int', help='Number of epochs to train models [Default: %default]') parser.add_option('-n', dest='num_models', default=3, type='int', help='Number of models to train [Default: %default]') parser.add_option( '-o', dest='out_dir', default='seqnn_avg', help='Output directory in which to train [Default: %default]') parser.add_option( '-s', dest='num_steps', default=None, type='int', help='Number of steps to train models [Default: %default]') (options, args) = parser.parse_args() if len(args) != 3: parser.error('Must provide parameters, seed model, and data') else: params_file = args[0] seed_model = args[1] data_file = args[2] if not os.path.isdir(options.out_dir): os.mkdir(options.out_dir) jobs = [] for mi in range(options.num_models): model_dir = '%s/m%d' % (options.out_dir, mi) cmd = 'source activate py3_gpu;' cmd += ' basenji_train.py' cmd += ' --rc --shifts "3,2,1,0,-1,-2,-3"' cmd += ' --logdir %s' % model_dir cmd += ' --check_all' cmd += ' --num_train_epochs %d' % options.num_epochs cmd += ' --restart %s' % seed_model cmd += ' --params %s' % params_file cmd += ' --data %s' % data_file j = slurm.Job(cmd, name=model_dir, out_file='%s.out' % model_dir, err_file='%s.err' % model_dir, queue='gtx1080ti', gpu=1, cpu=1, time='4-0:0:0', mem=30000) jobs.append(j) slurm.multi_run(jobs, verbose=True)
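The default output directory 'seqnn_avg' suggests these replicate models are combined downstream by averaging their predictions; a minimal, purely hypothetical sketch of that combination with NumPy:

import numpy as np

# hypothetical predictions from three replicate models, shape (seqs, bins, targets)
preds = [np.random.rand(8, 1024, 3) for _ in range(3)]
ensemble_pred = np.mean(preds, axis=0)
print(ensemble_pred.shape)  # (8, 1024, 3)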
def main(): usage = 'usage: %prog [options] <fasta_file> <targets_file>' parser = OptionParser(usage) parser.add_option('-b', dest='limit_bed', help='Limit to segments that overlap regions in a BED file') # parser.add_option('-c', dest='clip', # default=None, type='float', # help='Clip target values to have minimum [Default: %default]') parser.add_option('-d', dest='sample_pct', default=1.0, type='float', help='Down-sample the segments') parser.add_option('-g', dest='gaps_file', help='Genome assembly gaps BED [Default: %default]') parser.add_option('-l', dest='seq_length', default=131072, type='int', help='Sequence length [Default: %default]') parser.add_option('--local', dest='run_local', default=False, action='store_true', help='Run jobs locally as opposed to on SLURM [Default: %default]') parser.add_option('-o', dest='out_dir', default='data_out', help='Output directory [Default: %default]') parser.add_option('-p', dest='processes', default=None, type='int', help='Number parallel processes [Default: %default]') parser.add_option('--seed', dest='seed', default=44, type='int', help='Random seed [Default: %default]') parser.add_option('--stride_train', dest='stride_train', default=1., type='float', help='Stride to advance train sequences [Default: seq_length]') parser.add_option('--stride_test', dest='stride_test', default=1., type='float', help='Stride to advance valid and test sequences [Default: seq_length]') parser.add_option('-r', dest='seqs_per_tfr', default=256, type='int', help='Sequences per TFRecord file [Default: %default]') parser.add_option('-t', dest='test_pct', default=0.05, type='float', help='Proportion of the data for testing [Default: %default]') parser.add_option('-u', dest='unmap_bed', help='Unmappable segments to set to NA') parser.add_option('--unmap_t', dest='unmap_t', default=0.3, type='float', help='Remove sequences with more than this unmappable bin % [Default: %default]') parser.add_option('-w', dest='pool_width', default=128, type='int', help='Sum pool width [Default: %default]') parser.add_option('-v', dest='valid_pct', default=0.05, type='float', help='Proportion of the data for validation [Default: %default]') (options, args) = parser.parse_args() if len(args) != 2: parser.error('Must provide FASTA and sample coverage labels and paths.') else: fasta_file = args[0] targets_file = args[1] random.seed(options.seed) np.random.seed(options.seed) if not os.path.isdir(options.out_dir): os.mkdir(options.out_dir) ################################################################ # define genomic contigs ################################################################ chrom_contigs = basenji.genome.load_chromosomes(fasta_file) # remove gaps if options.gaps_file: chrom_contigs = basenji.genome.split_contigs(chrom_contigs, options.gaps_file) # ditch the chromosomes for contigs contigs = [] for chrom in chrom_contigs: contigs += [Contig(chrom, ctg_start, ctg_end) for ctg_start, ctg_end in chrom_contigs[chrom]] # limit to a BED file if options.limit_bed is not None: contigs = limit_contigs(contigs, options.limit_bed) # filter for large enough contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length] # down-sample if options.sample_pct < 1.0: contigs = random.sample(contigs, int(options.sample_pct*len(contigs))) # print contigs to BED file ctg_bed_file = '%s/contigs.bed' % options.out_dir write_seqs_bed(ctg_bed_file, contigs) ################################################################ # divide between train/valid/test 
################################################################ contig_sets = divide_contigs(contigs, options.test_pct, options.valid_pct) train_contigs, valid_contigs, test_contigs = contig_sets ################################################################ # define model sequences ################################################################ # stride sequences across contig train_mseqs = contig_sequences(train_contigs, options.seq_length, options.stride_train) valid_mseqs = contig_sequences(valid_contigs, options.seq_length, options.stride_test) test_mseqs = contig_sequences(test_contigs, options.seq_length, options.stride_test) # shuffle random.shuffle(train_mseqs) random.shuffle(valid_mseqs) random.shuffle(test_mseqs) # merge mseqs = train_mseqs + valid_mseqs + test_mseqs mseqs_labels = ['train']*len(train_mseqs) + ['valid']*len(valid_mseqs) + ['test']*len(test_mseqs) ################################################################ # mappability ################################################################ if options.unmap_bed is not None: # annotate unmappable positions mseqs_unmap = annotate_unmap(mseqs, options.unmap_bed, options.seq_length, options.pool_width) # filter unmappable mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') < options.unmap_t) mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]] mseqs_labels = [mseqs_labels[i] for i in range(len(mseqs_labels)) if mseqs_map_mask[i]] mseqs_unmap = mseqs_unmap[mseqs_map_mask,:] # write to file unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir np.save(unmap_npy, mseqs_unmap) # write sequences to BED seqs_bed_file = '%s/sequences.bed' % options.out_dir write_seqs_bed(seqs_bed_file, mseqs, mseqs_labels) ################################################################ # read sequence coverage values ################################################################ # read target datasets targets_df = pd.read_table(targets_file) seqs_cov_dir = '%s/seqs_cov' % options.out_dir if not os.path.isdir(seqs_cov_dir): os.mkdir(seqs_cov_dir) read_jobs = [] for ti in range(targets_df.shape[0]): genome_cov_file = targets_df['file'].iloc[ti] seqs_cov_stem = '%s/%d' % (seqs_cov_dir, ti) seqs_cov_file = '%s.h5' % seqs_cov_stem if os.path.isfile(seqs_cov_file): print('Skipping existing %s' % seqs_cov_file, file=sys.stderr) else: cmd = 'basenji_data_read.py' cmd += ' -w %d' % options.pool_width cmd += ' %s' % genome_cov_file cmd += ' %s' % seqs_bed_file cmd += ' %s' % seqs_cov_file if options.run_local: cmd += ' &> %s.err' % seqs_cov_stem read_jobs.append(cmd) else: j = slurm.Job(cmd, name='read_t%d' % ti, out_file='%s.out' % seqs_cov_stem, err_file='%s.err' % seqs_cov_stem, queue='standard,tbdisk', mem=15000, time='12:0:0') read_jobs.append(j) if options.run_local: util.exec_par(read_jobs, options.processes, verbose=True) else: slurm.multi_run(read_jobs, options.processes, verbose=True, sleep_time=1) ################################################################ # write TF Records ################################################################ tfr_dir = '%s/tfrecords' % options.out_dir if not os.path.isdir(tfr_dir): os.mkdir(tfr_dir) write_jobs = [] for tvt_set in ['train', 'valid', 'test']: tvt_set_indexes = [i for i in range(len(mseqs_labels)) if mseqs_labels[i] == tvt_set] tvt_set_start = tvt_set_indexes[0] tvt_set_end = tvt_set_indexes[-1] tfr_i = 0 tfr_start = tvt_set_start tfr_end = min(tfr_start+options.seqs_per_tfr, tvt_set_end) while tfr_start <= tvt_set_end: tfr_stem = '%s/%s-%d' % (tfr_dir, tvt_set, 
tfr_i) cmd = 'basenji_data_write.py' cmd += ' -s %d' % tfr_start cmd += ' -e %d' % tfr_end if options.unmap_bed is not None: cmd += ' -u %s' % unmap_npy cmd += ' %s' % fasta_file cmd += ' %s' % seqs_bed_file cmd += ' %s' % seqs_cov_dir cmd += ' %s.tfr' % tfr_stem if options.run_local: cmd += ' &> %s.err' % tfr_stem write_jobs.append(cmd) else: j = slurm.Job(cmd, name='write_%s-%d' % (tvt_set, tfr_i), out_file='%s.out' % tfr_stem, err_file='%s.err' % tfr_stem, queue='standard,tbdisk', mem=15000, time='12:0:0') write_jobs.append(j) # update tfr_i += 1 tfr_start += options.seqs_per_tfr tfr_end = min(tfr_start+options.seqs_per_tfr, tvt_set_end) if options.run_local: util.exec_par(write_jobs, options.processes, verbose=True) else: slurm.multi_run(write_jobs, options.processes, verbose=True, sleep_time=1)
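With --local, the command strings assembled above are executed by util.exec_par instead of SLURM; a rough stand-in sketch, assuming it simply shells out each command with a bounded worker pool (the real helper lives in the util module):

import subprocess
from concurrent.futures import ThreadPoolExecutor

def exec_par_sketch(cmds, max_proc=None, verbose=False):
  """Run shell command strings with a bounded worker pool.
  Illustrative stand-in for util.exec_par, not the real implementation."""
  max_workers = max(1, max_proc or len(cmds))
  def run(cmd):
    if verbose:
      print(cmd)
    return subprocess.call(cmd, shell=True)
  with ThreadPoolExecutor(max_workers=max_workers) as pool:
    return list(pool.map(run, cmds))

# e.g. exec_par_sketch(['echo read_t0', 'echo read_t1'], max_proc=2, verbose=True)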
def main(): usage = 'usage: %prog [options] <params_file> <data_dir>' parser = OptionParser(usage) # train train_options = OptionGroup(parser, 'basenji_train.py options') train_options.add_option( '-k', dest='keras_fit', default=False, action='store_true', help='Train with Keras fit method [Default: %default]') train_options.add_option( '-o', dest='out_dir', default='train_out', help='Output directory for test statistics [Default: %default]') train_options.add_option( '--restore', dest='restore', help='Restore model and continue training [Default: %default]') train_options.add_option( '--trunk', dest='trunk', default=False, action='store_true', help='Restore only model trunk [Default: %default]') train_options.add_option( '--tfr_train', dest='tfr_train_pattern', default='train-*.tfr', help= 'Training TFRecord pattern string appended to data_dir [Default: %default]' ) train_options.add_option( '--tfr_eval', dest='tfr_eval_pattern', default='valid-*.tfr', help= 'Evaluation TFRecord pattern string appended to data_dir [Default: %default]' ) parser.add_option_group(train_options) # test test_options = OptionGroup(parser, 'basenji_test.py options') test_options.add_option( '--rc', dest='rc', default=False, action='store_true', help= 'Average forward and reverse complement predictions [Default: %default]' ) test_options.add_option( '--shifts', dest='shifts', default='0', type='str', help='Ensemble prediction shifts [Default: %default]') parser.add_option_group(test_options) # multi rep_options = OptionGroup(parser, 'replication options') rep_options.add_option('-e', dest='conda_env', default='tf2-gpu', help='Anaconda environment [Default: %default]') rep_options.add_option('--name', dest='name', default='reps', help='SLURM name prefix [Default: %default]') rep_options.add_option('-p', dest='processes', default=None, type='int', help='Number of processes, passed by multi script') rep_options.add_option( '-q', dest='queue', default='gtx1080ti', help='SLURM queue on which to run the jobs [Default: %default]') rep_options.add_option('-r', dest='restart', default=False, action='store_true') parser.add_option_group(rep_options) (options, args) = parser.parse_args() if len(args) != 2: parser.error('Must provide parameters and data directory.') else: params_file = os.path.abspath(args[0]) data_dir = os.path.abspath(args[1]) # read model parameters with open(params_file) as params_open: params = json.load(params_open) params_train = params['train'] ####################################################### # prep work if not options.restart and os.path.isdir(options.out_dir): print('Output directory %s exists. Please remove.' % options.out_dir) exit(1) if not os.path.isdir(options.out_dir): os.mkdir(options.out_dir) ####################################################### # train jobs = [] for pi in range(options.processes): rep_dir = '%s/%d' % (options.out_dir, pi) if options.restart and os.path.isdir(rep_dir): print('%s found and skipped.' % rep_dir) else: os.mkdir(rep_dir) cmd = '. 
/home/drk/anaconda3/etc/profile.d/conda.sh;' cmd += ' conda activate %s;' % options.conda_env cmd += ' echo $HOSTNAME;' cmd += ' basenji_train.py' cmd += ' %s' % options_string(options, train_options, '%s/train' % rep_dir) cmd += ' %s %s' % (params_file, data_dir) name = '%s-train%d' % (options.name, pi) sbf = os.path.abspath('%s/train.sb' % rep_dir) outf = os.path.abspath('%s/train.out' % rep_dir) errf = os.path.abspath('%s/train.err' % rep_dir) j = slurm.Job(cmd, name, outf, errf, sbf, queue=options.queue, gpu=params_train.get('num_gpu', 1), mem=23000, time='28-0:0:0') jobs.append(j) slurm.multi_run(jobs, max_proc=options.processes, verbose=True, launch_sleep=10, update_sleep=60) ####################################################### # test train jobs = [] for pi in range(options.processes): rep_dir = '%s/%d' % (options.out_dir, pi) test_dir = '%s/test_train' % rep_dir # check if done acc_file = '%s/acc.txt' % test_dir if options.restart and os.path.isfile(acc_file): print('%s already generated.' % acc_file) else: cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;' cmd += ' conda activate %s;' % options.conda_env cmd += ' echo $HOSTNAME;' cmd += ' basenji_test.py' if options.rc: cmd += ' --rc' if options.shifts: cmd += ' --shifts %s' % options.shifts cmd += ' -o %s' % test_dir cmd += ' --tfr "train-*.tfr"' cmd += ' %s %s/train/model_check.h5 %s' % (params_file, rep_dir, data_dir) name = '%s-testtr%d' % (options.name, pi) sbf = os.path.abspath('%s/test_train.sb' % rep_dir) outf = os.path.abspath('%s/test_train.out' % rep_dir) errf = os.path.abspath('%s/test_train.err' % rep_dir) j = slurm.Job(cmd, name, outf, errf, sbf, queue=options.queue, gpu=params_train.get('num_gpu', 1), mem=23000, time='4:0:0') jobs.append(j) ####################################################### # test best for pi in range(options.processes): rep_dir = '%s/%d' % (options.out_dir, pi) test_dir = '%s/test' % rep_dir # check if done acc_file = '%s/acc.txt' % test_dir if options.restart and os.path.isfile(acc_file): print('%s already generated.' % acc_file) else: cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;' cmd += ' conda activate %s;' % options.conda_env cmd += ' echo $HOSTNAME;' cmd += ' basenji_test.py' if options.rc: cmd += ' --rc' if options.shifts: cmd += ' --shifts %s' % options.shifts cmd += ' -o %s' % test_dir cmd += ' %s %s/train/model_best.h5 %s' % (params_file, rep_dir, data_dir) name = '%s-test%d' % (options.name, pi) sbf = os.path.abspath('%s/test.sb' % rep_dir) outf = os.path.abspath('%s/test.out' % rep_dir) errf = os.path.abspath('%s/test.err' % rep_dir) j = slurm.Job(cmd, name, outf, errf, sbf, queue=options.queue, gpu=params_train.get('num_gpu', 1), mem=23000, time='4:0:0') jobs.append(j) ####################################################### # test best specificity for pi in range(options.processes): rep_dir = '%s/%d' % (options.out_dir, pi) test_dir = '%s/test_spec' % rep_dir # check if done acc_file = '%s/acc.txt' % test_dir if options.restart and os.path.isfile(acc_file): print('%s already generated.' % acc_file) else: cmd = '. 
/home/drk/anaconda3/etc/profile.d/conda.sh;' cmd += ' conda activate %s;' % options.conda_env cmd += ' echo $HOSTNAME;' cmd += ' basenji_test_specificity.py' if options.rc: cmd += ' --rc' if options.shifts: cmd += ' --shifts %s' % options.shifts cmd += ' -o %s' % test_dir cmd += ' %s %s/train/model_best.h5 %s' % (params_file, rep_dir, data_dir) name = '%s-spec%d' % (options.name, pi) sbf = os.path.abspath('%s/test_spec.sb' % rep_dir) outf = os.path.abspath('%s/test_spec.out' % rep_dir) errf = os.path.abspath('%s/test_spec.err' % rep_dir) # sticking to one gpu because the normalization time dominates # better would be to save predictions above. j = slurm.Job(cmd, name, outf, errf, sbf, queue=options.queue, gpu=1, mem=45000, time='8:0:0') jobs.append(j) slurm.multi_run(jobs, max_proc=options.processes, verbose=True, launch_sleep=10, update_sleep=60)
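options_string() forwards the basenji_train.py OptionGroup flags into the worker command; a plausible sketch of what it is assumed to do, substituting the per-replicate output directory (this is not the actual helper):

def options_string(options, train_options, rep_out_dir):
  """Assumed behavior: rebuild a CLI flag string from the OptionGroup,
  redirecting the output directory to the replicate's subdirectory.
  Sketch only; the real helper is defined in this script."""
  opts_str = ''
  for opt in train_options.option_list:
    opt_value = options.__dict__[opt.dest]
    if opt.dest == 'out_dir':
      opt_value = rep_out_dir
    if isinstance(opt_value, bool):
      if opt_value:
        opts_str += ' %s' % opt.get_opt_string()
    elif opt_value is not None:
      opts_str += ' %s %s' % (opt.get_opt_string(), opt_value)
  return opts_str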
def main(): usage = 'usage: %prog [options] <params_file> <model_file> <genes_hdf5_file> <vcf_file>' parser = OptionParser(usage) parser.add_option( '-a', dest='all_sed', default=False, action='store_true', help= 'Print all variant-gene pairs, as opposed to only nonzero [Default: %default]' ) parser.add_option('-b', dest='batch_size', default=None, type='int', help='Batch size [Default: %default]') parser.add_option('-c', dest='csv', default=False, action='store_true', help='Print table as CSV [Default: %default]') parser.add_option('-g', dest='genome_file', default='%s/assembly/human.hg19.genome' % os.environ['HG19'], help='Chromosome lengths file [Default: %default]') parser.add_option( '-i', dest='index_snp', default=False, action='store_true', help= 'SNPs are labeled with their index SNP as column 6 [Default: %default]' ) parser.add_option( '-o', dest='out_dir', default='sed', help='Output directory for tables and plots [Default: %default]') parser.add_option( '-p', dest='processes', default=2, type='int', help='Number of parallel processes to run [Default: %default]') parser.add_option( '-q', dest='queue', default='p100', help='SLURM queue on which to run the jobs [Default: %default]') parser.add_option( '--rc', dest='rc', default=False, action='store_true', help= 'Average the forward and reverse complement predictions when testing [Default: %default]' ) parser.add_option( '-s', dest='score', default=False, action='store_true', help='SNPs are labeled with scores as column 7 [Default: %default]') parser.add_option( '-t', dest='target_wigs_file', default=None, help='Store target values, extracted from this list of WIG files') parser.add_option( '--ti', dest='track_indexes', help='Comma-separated list of target indexes to output BigWig tracks') parser.add_option( '-x', dest='transcript_table', default=False, action='store_true', help='Print transcript table in addition to gene [Default: %default]') parser.add_option( '-w', dest='tss_width', default=1, type='int', help= 'Width of bins considered to quantify TSS transcription [Default: %default]' ) (options, args) = parser.parse_args() if len(args) != 4: parser.error( 'Must provide parameters and model files, genes HDF5 file, and QTL VCF file' ) else: params_file = args[0] model_file = args[1] genes_hdf5_file = args[2] vcf_file = args[3] ####################################################### # prep work # output directory if os.path.isdir(options.out_dir): shutil.rmtree(options.out_dir) os.mkdir(options.out_dir) # pickle options options_pkl_file = '%s/options.pkl' % options.out_dir options_pkl = open(options_pkl_file, 'wb') pickle.dump(options, options_pkl) options_pkl.close() ####################################################### # launch worker threads jobs = [] for pi in range(options.processes): cmd = 'source activate py3_gpu; basenji_sed.py %s %s %d' % ( options_pkl_file, ' '.join(args), pi) name = 'sed_p%d' % pi outf = '%s/job%d.out' % (options.out_dir, pi) errf = '%s/job%d.err' % (options.out_dir, pi) j = slurm.Job(cmd, name, outf, errf, queue=options.queue, mem=16000, time='4:0:0', gpu=1) jobs.append(j) slurm.multi_run(jobs, max_proc=options.processes, verbose=True, sleep_time=60) ####################################################### # collect output collect_table_multi('sed_gene.txt', options.out_dir, options.processes) if options.transcript_table: collect_table('sed_tx.txt', options.out_dir, options.processes) if options.track_indexes is not None: if not os.path.isdir('%s/tracks' % options.out_dir): os.mkdir('%s/tracks' % 
options.out_dir) for track_file in glob.glob('%s/job*/tracks/*' % options.out_dir): track_base = os.path.split(track_file)[1] os.rename(track_file, '%s/tracks/%s' % (options.out_dir, track_base)) for pi in range(options.processes): shutil.rmtree('%s/job%d' % (options.out_dir, pi))
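collect_table() and collect_table_multi() merge the per-process job tables back into one file; a minimal sketch under the assumption that every job writes the same header followed by data rows:

import os

def collect_table_sketch(file_name, out_dir, num_procs):
  """Concatenate out_dir/job*/file_name into out_dir/file_name, keeping the
  header from the first job only (illustrative stand-in for collect_table)."""
  with open(os.path.join(out_dir, file_name), 'w') as collect_out:
    for pi in range(num_procs):
      with open(os.path.join(out_dir, 'job%d' % pi, file_name)) as job_in:
        header = job_in.readline()
        if pi == 0:
          collect_out.write(header)
        for line in job_in:
          collect_out.write(line)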
def main(): usage = 'usage: %prog [options] <fasta_file> <targets_file>' parser = OptionParser(usage) parser.add_option('-b', dest='blacklist_bed', help='Set blacklist nucleotides to a baseline value.') parser.add_option( '--break', dest='break_t', default=786432, type='int', help='Break in half contigs above length [Default: %default]') parser.add_option('-c', '--crop', dest='crop_bp', default=0, type='int', help='Crop bp off each end [Default: %default]') parser.add_option('-d', dest='sample_pct', default=1.0, type='float', help='Down-sample the segments') parser.add_option('-f', dest='folds', default=None, type='int', help='Generate cross fold split [Default: %default]') parser.add_option('-g', dest='gaps_file', help='Genome assembly gaps BED [Default: %default]') parser.add_option('-i', dest='interp_nan', default=False, action='store_true', help='Interpolate NaNs [Default: %default]') parser.add_option('-l', dest='seq_length', default=131072, type='int', help='Sequence length [Default: %default]') parser.add_option( '--limit', dest='limit_bed', help='Limit to segments that overlap regions in a BED file') parser.add_option( '--local', dest='run_local', default=False, action='store_true', help='Run jobs locally as opposed to on SLURM [Default: %default]') parser.add_option('-o', dest='out_dir', default='data_out', help='Output directory [Default: %default]') parser.add_option('-p', dest='processes', default=None, type='int', help='Number parallel processes [Default: %default]') parser.add_option( '--peaks', dest='peaks_only', default=False, action='store_true', help='Create contigs only from peaks [Default: %default]') parser.add_option('-r', dest='seqs_per_tfr', default=256, type='int', help='Sequences per TFRecord file [Default: %default]') parser.add_option( '--restart', dest='restart', default=False, action='store_true', help='Continue progress from midpoint. [Default: %default]') parser.add_option('--seed', dest='seed', default=44, type='int', help='Random seed [Default: %default]') parser.add_option( '--snap', dest='snap', default=1, type='int', help='Snap sequences to multiple of the given value [Default: %default]' ) parser.add_option('--st', '--split_test', dest='split_test', default=False, action='store_true', help='Exit after split. [Default: %default]') parser.add_option( '--stride', '--stride_train', dest='stride_train', default=1., type='float', help='Stride to advance train sequences [Default: seq_length]') parser.add_option( '--stride_test', dest='stride_test', default=1., type='float', help='Stride to advance valid and test sequences [Default: seq_length]' ) parser.add_option( '-t', dest='test_pct_or_chr', default=0.05, type='str', help='Proportion of the data for testing [Default: %default]') parser.add_option('-u', dest='umap_bed', help='Unmappable regions in BED format') parser.add_option( '--umap_t', dest='umap_t', default=0.5, type='float', help= 'Remove sequences with more than this unmappable bin % [Default: %default]' ) parser.add_option( '--umap_clip', dest='umap_clip', default=1, type='float', help= 'Clip values at unmappable positions to distribution quantiles, eg 0.25. 
[Default: %default]' ) parser.add_option( '--umap_tfr', dest='umap_tfr', default=False, action='store_true', help='Save umap array into TFRecords [Default: %default]') parser.add_option('-w', dest='pool_width', default=128, type='int', help='Sum pool width [Default: %default]') parser.add_option( '-v', dest='valid_pct_or_chr', default=0.05, type='str', help='Proportion of the data for validation [Default: %default]') parser.add_option('--norm', dest='norm', default='', type='str', help='Normalize coverage values') parser.add_option('--step', dest='step', default=0, type='int', help='Stride using bp size [Default: %pool_window]') parser.add_option('--padding', dest='padding', default='valid', type='str', help='Padding method for sliding window approach') (options, args) = parser.parse_args() if len(args) != 2: parser.error( 'Must provide FASTA and sample coverage labels and paths.') else: fasta_file = args[0] targets_file = args[1] random.seed(options.seed) np.random.seed(options.seed) if options.break_t is not None and options.break_t < options.seq_length: print( 'Maximum contig length --break cannot be less than sequence length.', file=sys.stderr) exit(1) # transform proportion strides to base pairs if options.stride_train <= 1: print('stride_train %.f' % options.stride_train, end='') options.stride_train = options.stride_train * options.seq_length print(' converted to %f' % options.stride_train) options.stride_train = int(np.round(options.stride_train)) if options.stride_test <= 1: if options.folds is None: print('stride_test %.f' % options.stride_test, end='') options.stride_test = options.stride_test * options.seq_length print(' converted to %f' % options.stride_test) options.stride_test = int(np.round(options.stride_test)) # check snap if options.snap is not None: if np.mod(options.seq_length, options.snap) != 0: raise ValueError('seq_length must be a multiple of snap') if np.mod(options.stride_train, options.snap) != 0: raise ValueError('stride_train must be a multiple of snap') if np.mod(options.stride_test, options.snap) != 0: raise ValueError('stride_test must be a multiple of snap') # setup output directory if os.path.isdir(options.out_dir) and not options.restart: print('Remove output directory %s or use --restart option.' 
% options.out_dir) exit(1) elif not os.path.isdir(options.out_dir): os.mkdir(options.out_dir) # read target datasets targets_df = pd.read_csv(targets_file, index_col=0, sep='\t') ################################################################ # define genomic contigs ################################################################ if not options.restart: chrom_contigs = genome.load_chromosomes(fasta_file) # remove gaps if options.gaps_file: chrom_contigs = genome.split_contigs(chrom_contigs, options.gaps_file) # ditch the chromosomes for contigs contigs = [] for chrom in chrom_contigs: if len(chrom.split('_')) == 1 and chrom != 'chrM': contigs += [ Contig(chrom, ctg_start, ctg_end) for ctg_start, ctg_end in chrom_contigs[chrom] ] # limit to a BED file if options.limit_bed is not None: contigs = limit_contigs(contigs, options.limit_bed) # limit to peaks if options.peaks_only: peaks_bed = curate_peaks(targets_df, options.out_dir, options.pool_width, options.crop_bp) contigs = limit_contigs(contigs, peaks_bed) # filter for large enough contigs = [ ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length ] # break up large contigs if options.break_t is not None: contigs = break_large_contigs(contigs, options.break_t) # print contigs to BED file # ctg_bed_file = '%s/contigs.bed' % options.out_dir # write_seqs_bed(ctg_bed_file, contigs) ################################################################ # divide between train/valid/test ################################################################ # label folds if options.folds is not None: fold_labels = ['fold%d' % fi for fi in range(options.folds)] num_folds = options.folds else: fold_labels = ['train', 'valid', 'test'] num_folds = 3 if not options.restart: if options.folds is not None: # divide by fold pct fold_contigs = divide_contigs_folds(contigs, options.folds) else: try: # convert to float pct valid_pct = float(options.valid_pct_or_chr) test_pct = float(options.test_pct_or_chr) assert (0 <= valid_pct <= 1) assert (0 <= test_pct <= 1) # divide by pct fold_contigs = divide_contigs_pct(contigs, test_pct, valid_pct) except (ValueError, AssertionError): # divide by chr valid_chrs = options.valid_pct_or_chr.split(',') test_chrs = options.test_pct_or_chr.split(',') fold_contigs = divide_contigs_chr(contigs, test_chrs, valid_chrs) # rejoin broken contigs within set for fi in range(len(fold_contigs)): fold_contigs[fi] = rejoin_large_contigs(fold_contigs[fi]) # write labeled contigs to BED file ctg_bed_file = '%s/contigs.bed' % options.out_dir ctg_bed_out = open(ctg_bed_file, 'w') for fi in range(len(fold_contigs)): for ctg in fold_contigs[fi]: line = '%s\t%d\t%d\t%s' % (ctg.chr, ctg.start, ctg.end, fold_labels[fi]) print(line, file=ctg_bed_out) ctg_bed_out.close() if options.split_test: exit() ################################################################ # define model sequences ################################################################ if not options.restart: fold_mseqs = [] for fi in range(num_folds): if fold_labels[fi] in ['valid', 'test']: stride_fold = options.stride_test else: stride_fold = options.stride_train # stride sequences across contig fold_mseqs_fi = contig_sequences(fold_contigs[fi], options.seq_length, stride_fold, options.snap, fold_labels[fi]) fold_mseqs.append(fold_mseqs_fi) # shuffle random.shuffle(fold_mseqs[fi]) # down-sample if options.sample_pct < 1.0: fold_mseqs[fi] = random.sample( fold_mseqs[fi], int(options.sample_pct * len(fold_mseqs[fi]))) # merge into one list mseqs = [ms for fm in fold_mseqs 
for ms in fm] ################################################################ # mappability ################################################################ if not options.restart: if options.umap_bed is not None: if shutil.which('bedtools') is None: print('Install Bedtools to annotate unmappable sites', file=sys.stderr) exit(1) # annotate unmappable positions mseqs_unmap = annotate_unmap(mseqs, options.umap_bed, options.seq_length, options.pool_width, options.crop_bp) # filter unmappable mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') < options.umap_t) mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]] mseqs_unmap = mseqs_unmap[mseqs_map_mask, :] # write to file unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir np.save(unmap_npy, mseqs_unmap) # write sequences to BED seqs_bed_file = '%s/sequences.bed' % options.out_dir write_seqs_bed(seqs_bed_file, mseqs, True) else: # read from directory seqs_bed_file = '%s/sequences.bed' % options.out_dir unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir mseqs = [] fold_mseqs = [] for fi in range(num_folds): fold_mseqs.append([]) for line in open(seqs_bed_file): a = line.split() msg = ModelSeq(a[0], int(a[1]), int(a[2]), a[3]) mseqs.append(msg) if a[3] == 'train': fi = 0 elif a[3] == 'valid': fi = 1 elif a[3] == 'test': fi = 2 else: fi = int(a[3].replace('fold', '')) fold_mseqs[fi].append(msg) ################################################################ # read sequence coverage values ################################################################ seqs_cov_dir = '%s/seqs_cov' % options.out_dir if not os.path.isdir(seqs_cov_dir): os.mkdir(seqs_cov_dir) read_jobs = [] for ti in range(targets_df.shape[0]): genome_cov_file = targets_df['file'].iloc[ti] seqs_cov_stem = '%s/%d' % (seqs_cov_dir, ti) seqs_cov_file = '%s.h5' % seqs_cov_stem clip_ti = None if 'clip' in targets_df.columns: clip_ti = targets_df['clip'].iloc[ti] clipsoft_ti = None if 'clip_soft' in targets_df.columns: clipsoft_ti = targets_df['clip_soft'].iloc[ti] scale_ti = 1 if 'scale' in targets_df.columns: scale_ti = targets_df['scale'].iloc[ti] if options.restart and os.path.isfile(seqs_cov_file): print('Skipping existing %s' % seqs_cov_file, file=sys.stderr) else: cmd = '/home/shush/profile/tfprofile/bin/basenji_data_read.py' cmd += ' --crop %d' % options.crop_bp cmd += ' -w %d' % options.pool_width cmd += ' -u %s' % targets_df['sum_stat'].iloc[ti] if clip_ti is not None: cmd += ' -c %f' % clip_ti if clipsoft_ti is not None: cmd += ' --clip_soft %f' % clipsoft_ti cmd += ' -s %f' % scale_ti if options.blacklist_bed: cmd += ' -b %s' % options.blacklist_bed if options.interp_nan: cmd += ' -i' if options.norm: cmd += ' --norm %s' % options.norm if options.step: cmd += ' --step %i' % options.step if options.padding: cmd += ' --padding %s' % options.padding cmd += ' %s' % genome_cov_file cmd += ' %s' % seqs_bed_file cmd += ' %s' % seqs_cov_file if options.run_local: # breaks on some OS # cmd += ' &> %s.err' % seqs_cov_stem read_jobs.append(cmd) else: j = slurm.Job(cmd, name='read_t%d' % ti, out_file='%s.out' % seqs_cov_stem, err_file='%s.err' % seqs_cov_stem, queue='standard', mem=15000, time='12:0:0') read_jobs.append(j) if options.run_local: util.exec_par(read_jobs, options.processes, verbose=True) else: slurm.multi_run(read_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5) ################################################################ # write TF Records ################################################################ # copy targets file 
shutil.copy(targets_file, '%s/targets.txt' % options.out_dir) # initialize TF Records dir tfr_dir = '%s/tfrecords' % options.out_dir if not os.path.isdir(tfr_dir): os.mkdir(tfr_dir) write_jobs = [] for fold_set in fold_labels: fold_set_indexes = [ i for i in range(len(mseqs)) if mseqs[i].label == fold_set ] fold_set_start = fold_set_indexes[0] fold_set_end = fold_set_indexes[-1] + 1 tfr_i = 0 tfr_start = fold_set_start tfr_end = min(tfr_start + options.seqs_per_tfr, fold_set_end) while tfr_start <= fold_set_end: tfr_stem = '%s/%s-%d' % (tfr_dir, fold_set, tfr_i) cmd = '/home/shush/profile/tfprofile/bin/basenji_data_write.py' cmd += ' -s %d' % tfr_start cmd += ' -e %d' % tfr_end cmd += ' --umap_clip %f' % options.umap_clip if options.umap_tfr: cmd += ' --umap_tfr' if options.umap_bed is not None: cmd += ' -u %s' % unmap_npy cmd += ' %s' % fasta_file cmd += ' %s' % seqs_bed_file cmd += ' %s' % seqs_cov_dir cmd += ' %s.tfr' % tfr_stem if options.run_local: # breaks on some OS # cmd += ' &> %s.err' % tfr_stem write_jobs.append(cmd) else: j = slurm.Job(cmd, name='write_%s-%d' % (fold_set, tfr_i), out_file='%s.out' % tfr_stem, err_file='%s.err' % tfr_stem, queue='standard', mem=15000, time='12:0:0') write_jobs.append(j) # update tfr_i += 1 tfr_start += options.seqs_per_tfr tfr_end = min(tfr_start + options.seqs_per_tfr, fold_set_end) if options.run_local: util.exec_par(write_jobs, options.processes, verbose=True) else: slurm.multi_run(write_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5) ################################################################ # stats ################################################################ stats_dict = {} stats_dict['num_targets'] = targets_df.shape[0] stats_dict['seq_length'] = options.seq_length stats_dict['pool_width'] = options.pool_width stats_dict['crop_bp'] = options.crop_bp target_length = options.seq_length - 2 * options.crop_bp target_length = target_length // options.pool_width stats_dict['target_length'] = target_length for fi in range(num_folds): stats_dict['%s_seqs' % fold_labels[fi]] = len(fold_mseqs[fi]) for i in range(10): print('~~~') print('%s/statistics.json' % options.out_dir) for i in range(10): print('~~~') with open('%s/statistics.json' % options.out_dir, 'w') as stats_json_out: json.dump(stats_dict, stats_json_out, indent=4)
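The target_length recorded in statistics.json follows directly from the sequence length, cropping, and pooling options; a quick worked example with the defaults above:

seq_length = 131072  # -l default
crop_bp = 0          # -c/--crop default
pool_width = 128     # -w default

target_length = (seq_length - 2 * crop_bp) // pool_width
print(target_length)  # 1024 bins per sequence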
def main(): usage = 'usage: %prog [options] <params_file> <model_file> <vcf_file>' parser = OptionParser(usage) parser.add_option('-b', dest='batch_size', default=256, type='int', help='Batch size [Default: %default]') parser.add_option('-c', dest='csv', default=False, action='store_true', help='Print table as CSV [Default: %default]') parser.add_option( '-e', dest='heatmaps', default=False, action='store_true', help='Draw score heatmaps, grouped by index SNP [Default: %default]') parser.add_option( '-f', dest='genome_fasta', default='%s/assembly/hg19.fa' % os.environ['HG19'], help= 'Genome FASTA from which sequences will be drawn [Default: %default]') parser.add_option('-g', dest='genome_file', default='%s/assembly/human.hg19.genome' % os.environ['HG19'], help='Chromosome lengths file [Default: %default]') parser.add_option( '-l', dest='seq_len', type='int', default=131072, help='Sequence length provided to the model [Default: %default]') parser.add_option('--local', dest='local', default=1024, type='int', help='Local SAD score [Default: %default]') parser.add_option('-n', dest='norm_file', default=None, help='Normalize SAD scores') parser.add_option( '-o', dest='out_dir', default='sad', help='Output directory for tables and plots [Default: %default]') parser.add_option('-p', dest='processes', default=2, type='int', help='Number of parallel processes to run.') parser.add_option('--pseudo', dest='log_pseudo', default=1, type='float', help='Log2 pseudocount [Default: %default]') parser.add_option( '-q', dest='queue', default='p100', help='SLURM queue on which to run the jobs [Default: %default]') parser.add_option( '--rc', dest='rc', default=False, action='store_true', help= 'Average the forward and reverse complement predictions when testing [Default: %default]' ) parser.add_option('--shifts', dest='shifts', default='0', help='Ensemble prediction shifts [Default: %default]') parser.add_option( '-t', dest='targets_file', default=None, help='File specifying target indexes and labels in table format') parser.add_option( '--ti', dest='track_indexes', help='Comma-separated list of target indexes to output BigWig tracks') parser.add_option( '-u', dest='penultimate', default=False, action='store_true', help='Compute SED in the penultimate layer [Default: %default]') (options, args) = parser.parse_args() if len(args) != 3: parser.error('Must provide parameters and model files and VCF file') else: params_file = args[0] model_file = args[1] vcf_file = args[2] ####################################################### # prep work # output directory if os.path.isdir(options.out_dir): shutil.rmtree(options.out_dir) os.mkdir(options.out_dir) # pickle options options_pkl_file = '%s/options.pkl' % options.out_dir options_pkl = open(options_pkl_file, 'wb') pickle.dump(options, options_pkl) options_pkl.close() ####################################################### # launch worker threads jobs = [] for pi in range(options.processes): cmd = 'source activate py3_gpu; basenji_sad.py %s %s %d' % ( options_pkl_file, ' '.join(args), pi) name = 'sad_p%d' % pi outf = '%s/job%d.out' % (options.out_dir, pi) errf = '%s/job%d.err' % (options.out_dir, pi) j = slurm.Job(cmd, name, outf, errf, queue=options.queue, mem=15000, time='7-0:0:0', gpu=1) jobs.append(j) slurm.multi_run(jobs, max_proc=options.processes, verbose=True, sleep_time=60) ####################################################### # collect output collect_table('sad_table.txt', options.out_dir, options.processes)
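Each *_multi script above pickles the parsed options so the GPU workers can reload them by path; a minimal round-trip sketch of that pattern (paths and option values here are hypothetical):

import os
import pickle
from optparse import Values

options = Values({'out_dir': 'sad', 'processes': 2})  # hypothetical option values
os.makedirs(options.out_dir, exist_ok=True)

options_pkl_file = '%s/options.pkl' % options.out_dir
with open(options_pkl_file, 'wb') as options_pkl:
  pickle.dump(options, options_pkl)

# in the worker (e.g. basenji_sad.py), the same options are restored by path
with open(options_pkl_file, 'rb') as options_pkl:
  worker_options = pickle.load(options_pkl)
print(worker_options.out_dir, worker_options.processes)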