def score_reads(k, readsf, par):
  # score every read against each of the k cluster ICMs in parallel
  cmds = []
  for c in range(k):
    cmds.append('%s/simple-score -N cluster-%d.icm < %s > icm-%d.scores.tmp 2>/dev/null' % (bin_dir, c, readsf, c))
  util.exec_par(cmds, par)
def train_imm(k, soft_assign, par):
  # build an interpolated Markov model (ICM) for each cluster in parallel,
  # using the EM builder when reads are softly assigned to clusters
  cmds = []
  for i in range(k):
    if soft_assign:
      cmds.append('%s/em_build-icm -p 1 cluster-%d.icm < cluster-%d.build.fa' % (bin_dir, i, i))
    else:
      cmds.append('%s/build-icm -p 1 cluster-%d.icm < cluster-%d.fa' % (bin_dir, i, i))
  util.exec_par(cmds, par)
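
# Both helpers above hand their shell commands to util.exec_par, which is not
# included in this excerpt. The sketch below is only an assumption of its
# behavior (run each command in a subprocess, at most `par` at a time), not the
# actual implementation; exec_par_sketch is a hypothetical name.
import subprocess
from concurrent.futures import ThreadPoolExecutor

def exec_par_sketch(cmds, par):
  """Run shell commands with at most `par` running concurrently (illustrative only)."""
  with ThreadPoolExecutor(max_workers=par) as pool:
    # shell=True because the commands rely on redirection (<, >, 2>/dev/null)
    futures = [pool.submit(subprocess.run, cmd, shell=True) for cmd in cmds]
    for f in futures:
      f.result()
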
def main():
  usage = 'usage: %prog [options] <fasta_file> <targets_file>'
  parser = OptionParser(usage)
  parser.add_option('-b', dest='blacklist_bed',
      help='Set blacklist nucleotides to a baseline value.')
  parser.add_option('--break', dest='break_t',
      default=786432, type='int',
      help='Break in half contigs above length [Default: %default]')
  parser.add_option('-c', '--crop', dest='crop_bp',
      default=0, type='int',
      help='Crop bp off each end [Default: %default]')
  parser.add_option('-d', dest='sample_pct',
      default=1.0, type='float',
      help='Down-sample the segments')
  parser.add_option('-f', dest='folds',
      default=None, type='int',
      help='Generate cross fold split [Default: %default]')
  parser.add_option('-g', dest='gaps_file',
      help='Genome assembly gaps BED [Default: %default]')
  parser.add_option('-i', dest='interp_nan',
      default=False, action='store_true',
      help='Interpolate NaNs [Default: %default]')
  parser.add_option('-l', dest='seq_length',
      default=131072, type='int',
      help='Sequence length [Default: %default]')
  parser.add_option('--limit', dest='limit_bed',
      help='Limit to segments that overlap regions in a BED file')
  parser.add_option('--local', dest='run_local',
      default=False, action='store_true',
      help='Run jobs locally as opposed to on SLURM [Default: %default]')
  parser.add_option('-o', dest='out_dir',
      default='data_out',
      help='Output directory [Default: %default]')
  parser.add_option('-p', dest='processes',
      default=None, type='int',
      help='Number parallel processes [Default: %default]')
  parser.add_option('--peaks', dest='peaks_only',
      default=False, action='store_true',
      help='Create contigs only from peaks [Default: %default]')
  parser.add_option('-r', dest='seqs_per_tfr',
      default=256, type='int',
      help='Sequences per TFRecord file [Default: %default]')
  parser.add_option('--restart', dest='restart',
      default=False, action='store_true',
      help='Continue progress from midpoint. [Default: %default]')
  parser.add_option('--seed', dest='seed',
      default=44, type='int',
      help='Random seed [Default: %default]')
  parser.add_option('--snap', dest='snap',
      default=1, type='int',
      help='Snap sequences to multiple of the given value [Default: %default]')
  parser.add_option('--st', '--split_test', dest='split_test',
      default=False, action='store_true',
      help='Exit after split. [Default: %default]')
  parser.add_option('--stride', '--stride_train', dest='stride_train',
      default=1., type='float',
      help='Stride to advance train sequences [Default: seq_length]')
  parser.add_option('--stride_test', dest='stride_test',
      default=1., type='float',
      help='Stride to advance valid and test sequences [Default: seq_length]')
  parser.add_option('-t', dest='test_pct_or_chr',
      default=0.05, type='str',
      help='Proportion of the data for testing [Default: %default]')
  parser.add_option('-u', dest='umap_bed',
      help='Unmappable regions in BED format')
  parser.add_option('--umap_t', dest='umap_t',
      default=0.5, type='float',
      help='Remove sequences with more than this unmappable bin % [Default: %default]')
  parser.add_option('--umap_clip', dest='umap_clip',
      default=1, type='float',
      help='Clip values at unmappable positions to distribution quantiles, eg 0.25. [Default: %default]')
  parser.add_option('--umap_tfr', dest='umap_tfr',
      default=False, action='store_true',
      help='Save umap array into TFRecords [Default: %default]')
  parser.add_option('-w', dest='pool_width',
      default=128, type='int',
      help='Sum pool width [Default: %default]')
  parser.add_option('-v', dest='valid_pct_or_chr',
      default=0.05, type='str',
      help='Proportion of the data for validation [Default: %default]')
  parser.add_option('--norm', dest='norm',
      default='', type='str',
      help='Normalize coverage values')
  parser.add_option('--step', dest='step',
      default=0, type='int',
      help='Stride using bp size [Default: %pool_window]')
  parser.add_option('--padding', dest='padding',
      default='valid', type='str',
      help='Padding method for sliding window approach')
  (options, args) = parser.parse_args()

  if len(args) != 2:
    parser.error('Must provide FASTA and sample coverage labels and paths.')
  else:
    fasta_file = args[0]
    targets_file = args[1]

  random.seed(options.seed)
  np.random.seed(options.seed)

  if options.break_t is not None and options.break_t < options.seq_length:
    print('Maximum contig length --break cannot be less than sequence length.',
          file=sys.stderr)
    exit(1)

  # transform proportion strides to base pairs
  if options.stride_train <= 1:
    print('stride_train %.f' % options.stride_train, end='')
    options.stride_train = options.stride_train * options.seq_length
    print(' converted to %f' % options.stride_train)
  options.stride_train = int(np.round(options.stride_train))

  if options.stride_test <= 1:
    if options.folds is None:
      print('stride_test %.f' % options.stride_test, end='')
      options.stride_test = options.stride_test * options.seq_length
      print(' converted to %f' % options.stride_test)
  options.stride_test = int(np.round(options.stride_test))

  # check snap
  if options.snap is not None:
    if np.mod(options.seq_length, options.snap) != 0:
      raise ValueError('seq_length must be a multiple of snap')
    if np.mod(options.stride_train, options.snap) != 0:
      raise ValueError('stride_train must be a multiple of snap')
    if np.mod(options.stride_test, options.snap) != 0:
      raise ValueError('stride_test must be a multiple of snap')

  # setup output directory
  if os.path.isdir(options.out_dir) and not options.restart:
    print('Remove output directory %s or use --restart option.' % options.out_dir)
    exit(1)
  elif not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  # read target datasets
  targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')

  ################################################################
  # define genomic contigs
  ################################################################
  if not options.restart:
    chrom_contigs = genome.load_chromosomes(fasta_file)

    # remove gaps
    if options.gaps_file:
      chrom_contigs = genome.split_contigs(chrom_contigs, options.gaps_file)

    # ditch the chromosomes for contigs
    contigs = []
    for chrom in chrom_contigs:
      if len(chrom.split('_')) == 1 and chrom != 'chrM':
        contigs += [Contig(chrom, ctg_start, ctg_end)
                    for ctg_start, ctg_end in chrom_contigs[chrom]]

    # limit to a BED file
    if options.limit_bed is not None:
      contigs = limit_contigs(contigs, options.limit_bed)

    # limit to peaks
    if options.peaks_only:
      peaks_bed = curate_peaks(targets_df, options.out_dir,
                               options.pool_width, options.crop_bp)
      contigs = limit_contigs(contigs, peaks_bed)

    # filter for large enough
    contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length]

    # break up large contigs
    if options.break_t is not None:
      contigs = break_large_contigs(contigs, options.break_t)

    # print contigs to BED file
    # ctg_bed_file = '%s/contigs.bed' % options.out_dir
    # write_seqs_bed(ctg_bed_file, contigs)

  ################################################################
  # divide between train/valid/test
  ################################################################
  # label folds
  if options.folds is not None:
    fold_labels = ['fold%d' % fi for fi in range(options.folds)]
    num_folds = options.folds
  else:
    fold_labels = ['train', 'valid', 'test']
    num_folds = 3

  if not options.restart:
    if options.folds is not None:
      # divide by fold pct
      fold_contigs = divide_contigs_folds(contigs, options.folds)

    else:
      try:
        # convert to float pct
        valid_pct = float(options.valid_pct_or_chr)
        test_pct = float(options.test_pct_or_chr)
        assert(0 <= valid_pct <= 1)
        assert(0 <= test_pct <= 1)

        # divide by pct
        fold_contigs = divide_contigs_pct(contigs, test_pct, valid_pct)

      except (ValueError, AssertionError):
        # divide by chr
        valid_chrs = options.valid_pct_or_chr.split(',')
        test_chrs = options.test_pct_or_chr.split(',')
        fold_contigs = divide_contigs_chr(contigs, test_chrs, valid_chrs)

    # rejoin broken contigs within set
    for fi in range(len(fold_contigs)):
      fold_contigs[fi] = rejoin_large_contigs(fold_contigs[fi])

    # write labeled contigs to BED file
    ctg_bed_file = '%s/contigs.bed' % options.out_dir
    ctg_bed_out = open(ctg_bed_file, 'w')
    for fi in range(len(fold_contigs)):
      for ctg in fold_contigs[fi]:
        line = '%s\t%d\t%d\t%s' % (ctg.chr, ctg.start, ctg.end, fold_labels[fi])
        print(line, file=ctg_bed_out)
    ctg_bed_out.close()

  if options.split_test:
    exit()

  ################################################################
  # define model sequences
  ################################################################
  if not options.restart:
    fold_mseqs = []
    for fi in range(num_folds):
      if fold_labels[fi] in ['valid', 'test']:
        stride_fold = options.stride_test
      else:
        stride_fold = options.stride_train

      # stride sequences across contig
      fold_mseqs_fi = contig_sequences(fold_contigs[fi], options.seq_length,
                                       stride_fold, options.snap, fold_labels[fi])
      fold_mseqs.append(fold_mseqs_fi)

      # shuffle
      random.shuffle(fold_mseqs[fi])

      # down-sample
      if options.sample_pct < 1.0:
        fold_mseqs[fi] = random.sample(fold_mseqs[fi],
                                       int(options.sample_pct * len(fold_mseqs[fi])))

    # merge into one list
    mseqs = [ms for fm in fold_mseqs for ms in fm]

  ################################################################
  # mappability
  ################################################################
  if not options.restart:
    if options.umap_bed is not None:
      if shutil.which('bedtools') is None:
        print('Install Bedtools to annotate unmappable sites', file=sys.stderr)
        exit(1)

      # annotate unmappable positions
      mseqs_unmap = annotate_unmap(mseqs, options.umap_bed, options.seq_length,
                                   options.pool_width, options.crop_bp)

      # filter unmappable
      mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') < options.umap_t)
      mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
      mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

      # write to file
      unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
      np.save(unmap_npy, mseqs_unmap)

    # write sequences to BED
    seqs_bed_file = '%s/sequences.bed' % options.out_dir
    write_seqs_bed(seqs_bed_file, mseqs, True)

  else:
    # read from directory
    seqs_bed_file = '%s/sequences.bed' % options.out_dir
    unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
    mseqs = []
    fold_mseqs = []
    for fi in range(num_folds):
      fold_mseqs.append([])
    for line in open(seqs_bed_file):
      a = line.split()
      msg = ModelSeq(a[0], int(a[1]), int(a[2]), a[3])
      mseqs.append(msg)
      if a[3] == 'train':
        fi = 0
      elif a[3] == 'valid':
        fi = 1
      elif a[3] == 'test':
        fi = 2
      else:
        fi = int(a[3].replace('fold', ''))
      fold_mseqs[fi].append(msg)

  ################################################################
  # read sequence coverage values
  ################################################################
  seqs_cov_dir = '%s/seqs_cov' % options.out_dir
  if not os.path.isdir(seqs_cov_dir):
    os.mkdir(seqs_cov_dir)

  read_jobs = []
  for ti in range(targets_df.shape[0]):
    genome_cov_file = targets_df['file'].iloc[ti]
    seqs_cov_stem = '%s/%d' % (seqs_cov_dir, ti)
    seqs_cov_file = '%s.h5' % seqs_cov_stem

    clip_ti = None
    if 'clip' in targets_df.columns:
      clip_ti = targets_df['clip'].iloc[ti]

    clipsoft_ti = None
    if 'clip_soft' in targets_df.columns:
      clipsoft_ti = targets_df['clip_soft'].iloc[ti]

    scale_ti = 1
    if 'scale' in targets_df.columns:
      scale_ti = targets_df['scale'].iloc[ti]

    if options.restart and os.path.isfile(seqs_cov_file):
      print('Skipping existing %s' % seqs_cov_file, file=sys.stderr)
    else:
      cmd = '/home/shush/profile/tfprofile/bin/basenji_data_read.py'
      cmd += ' --crop %d' % options.crop_bp
      cmd += ' -w %d' % options.pool_width
      cmd += ' -u %s' % targets_df['sum_stat'].iloc[ti]
      if clip_ti is not None:
        cmd += ' -c %f' % clip_ti
      if clipsoft_ti is not None:
        cmd += ' --clip_soft %f' % clipsoft_ti
      cmd += ' -s %f' % scale_ti
      if options.blacklist_bed:
        cmd += ' -b %s' % options.blacklist_bed
      if options.interp_nan:
        cmd += ' -i'
      if options.norm:
        cmd += ' --norm %s' % options.norm
      if options.step:
        cmd += ' --step %i' % options.step
      if options.padding:
        cmd += ' --padding %s' % options.padding
      cmd += ' %s' % genome_cov_file
      cmd += ' %s' % seqs_bed_file
      cmd += ' %s' % seqs_cov_file

      if options.run_local:
        # breaks on some OS
        # cmd += ' &> %s.err' % seqs_cov_stem
        read_jobs.append(cmd)
      else:
        j = slurm.Job(cmd,
                      name='read_t%d' % ti,
                      out_file='%s.out' % seqs_cov_stem,
                      err_file='%s.err' % seqs_cov_stem,
                      queue='standard', mem=15000, time='12:0:0')
        read_jobs.append(j)

  if options.run_local:
    util.exec_par(read_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(read_jobs, options.processes, verbose=True,
                    launch_sleep=1, update_sleep=5)

  ################################################################
  # write TF Records
  ################################################################
  # copy targets file
  shutil.copy(targets_file, '%s/targets.txt' % options.out_dir)

  # initialize TF Records dir
  tfr_dir = '%s/tfrecords' % options.out_dir
  if not os.path.isdir(tfr_dir):
    os.mkdir(tfr_dir)

  write_jobs = []
  for fold_set in fold_labels:
    fold_set_indexes = [i for i in range(len(mseqs)) if mseqs[i].label == fold_set]
    fold_set_start = fold_set_indexes[0]
    fold_set_end = fold_set_indexes[-1] + 1

    tfr_i = 0
    tfr_start = fold_set_start
    tfr_end = min(tfr_start + options.seqs_per_tfr, fold_set_end)

    while tfr_start <= fold_set_end:
      tfr_stem = '%s/%s-%d' % (tfr_dir, fold_set, tfr_i)

      cmd = '/home/shush/profile/tfprofile/bin/basenji_data_write.py'
      cmd += ' -s %d' % tfr_start
      cmd += ' -e %d' % tfr_end
      cmd += ' --umap_clip %f' % options.umap_clip
      if options.umap_tfr:
        cmd += ' --umap_tfr'
      if options.umap_bed is not None:
        cmd += ' -u %s' % unmap_npy

      cmd += ' %s' % fasta_file
      cmd += ' %s' % seqs_bed_file
      cmd += ' %s' % seqs_cov_dir
      cmd += ' %s.tfr' % tfr_stem

      if options.run_local:
        # breaks on some OS
        # cmd += ' &> %s.err' % tfr_stem
        write_jobs.append(cmd)
      else:
        j = slurm.Job(cmd,
                      name='write_%s-%d' % (fold_set, tfr_i),
                      out_file='%s.out' % tfr_stem,
                      err_file='%s.err' % tfr_stem,
                      queue='standard', mem=15000, time='12:0:0')
        write_jobs.append(j)

      # update
      tfr_i += 1
      tfr_start += options.seqs_per_tfr
      tfr_end = min(tfr_start + options.seqs_per_tfr, fold_set_end)

  if options.run_local:
    util.exec_par(write_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(write_jobs, options.processes, verbose=True,
                    launch_sleep=1, update_sleep=5)

  ################################################################
  # stats
  ################################################################
  stats_dict = {}
  stats_dict['num_targets'] = targets_df.shape[0]
  stats_dict['seq_length'] = options.seq_length
  stats_dict['pool_width'] = options.pool_width
  stats_dict['crop_bp'] = options.crop_bp

  target_length = options.seq_length - 2 * options.crop_bp
  target_length = target_length // options.pool_width
  stats_dict['target_length'] = target_length

  for fi in range(num_folds):
    stats_dict['%s_seqs' % fold_labels[fi]] = len(fold_mseqs[fi])

  for i in range(10):
    print('~~~')
  print('%s/statistics.json' % options.out_dir)
  for i in range(10):
    print('~~~')

  with open('%s/statistics.json' % options.out_dir, 'w') as stats_json_out:
    json.dump(stats_dict, stats_json_out, indent=4)
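
# With the defaults above (seq_length=131072, crop_bp=0, pool_width=128), the
# target_length written to statistics.json comes out to 1024 bins; a quick
# illustrative check of that arithmetic (not part of the original script):
seq_length, crop_bp, pool_width = 131072, 0, 128
target_length = (seq_length - 2 * crop_bp) // pool_width
assert target_length == 1024
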
def main():
  usage = 'usage: %prog [options] <exp_dir> <params_file> <data_dir> <bed_file>'
  parser = OptionParser(usage)

  # sat options
  sat_options = OptionGroup(parser, 'basenji_sat_bed.py options')
  sat_options.add_option('-d', dest='mut_down',
      default=0, type='int',
      help='Nucleotides downstream of center sequence to mutate [Default: %default]')
  sat_options.add_option('-f', dest='genome_fasta',
      default=None,
      help='Genome FASTA for sequences [Default: %default]')
  sat_options.add_option('-l', dest='mut_len',
      default=0, type='int',
      help='Length of center sequence to mutate [Default: %default]')
  sat_options.add_option('-o', dest='out_dir',
      default='sat_mut',
      help='Output directory [Default: %default]')
  sat_options.add_option('--plots', dest='plots',
      default=False, action='store_true',
      help='Make heatmap plots [Default: %default]')
  sat_options.add_option('-p', dest='processes',
      default=None, type='int',
      help='Number of processes, passed by multi script')
  sat_options.add_option('--rc', dest='rc',
      default=False, action='store_true',
      help='Ensemble forward and reverse complement predictions [Default: %default]')
  sat_options.add_option('--shifts', dest='shifts',
      default='0',
      help='Ensemble prediction shifts [Default: %default]')
  sat_options.add_option('--stats', dest='sad_stats',
      default='sum',
      help='Comma-separated list of stats to save. [Default: %default]')
  sat_options.add_option('-t', dest='targets_file',
      default=None, type='str',
      help='File specifying target indexes and labels in table format')
  sat_options.add_option('-u', dest='mut_up',
      default=0, type='int',
      help='Nucleotides upstream of center sequence to mutate [Default: %default]')
  parser.add_option_group(sat_options)

  phylop_options = OptionGroup(parser, 'basenji_bench_phylop.py options')
  # phylop_options.add_option('-e', dest='num_estimators',
  #     default=100, type='int',
  #     help='Number of random forest estimators [Default: %default]')
  phylop_options.add_option('-g', dest='genome',
      default='ce11',
      help='PhyloP and FASTA genome [Default: %default]')
  # phylop_options.add_option('--pca', dest='n_components',
  #     default=None, type='int',
  #     help='PCA n_components [Default: %default]')
  parser.add_option_group(phylop_options)

  fold_options = OptionGroup(parser, 'cross-fold options')
  fold_options.add_option('-a', '--alt', dest='alternative',
      default='two-sided',
      help='Statistical test alternative [Default: %default]')
  fold_options.add_option('-c', dest='crosses',
      default=1, type='int',
      help='Number of cross-fold rounds [Default: %default]')
  fold_options.add_option('-e', dest='conda_env',
      default='tf2.4',
      help='Anaconda environment [Default: %default]')
  fold_options.add_option('--label_exp', dest='label_exp',
      default='Experiment',
      help='Experiment label [Default: %default]')
  fold_options.add_option('--label_ref', dest='label_ref',
      default='Reference',
      help='Reference label [Default: %default]')
  fold_options.add_option('--max_proc', dest='max_proc',
      default=None, type='int',
      help='Maximum concurrent processes [Default: %default]')
  fold_options.add_option('--name', dest='name',
      default='sat',
      help='SLURM name prefix [Default: %default]')
  fold_options.add_option('-q', dest='queue',
      default='gtx1080ti',
      help='SLURM queue on which to run the jobs [Default: %default]')
  fold_options.add_option('-r', dest='ref_dir',
      default=None,
      help='Reference directory for statistical tests')
  parser.add_option_group(fold_options)

  (options, args) = parser.parse_args()

  if len(args) != 4:
    parser.error('Must provide experiment directory, parameters file, data directory, and BED file.')
  else:
    exp_dir = args[0]
    params_file = args[1]
    data_dir = args[2]
    bed_file = args[3]

  # read data parameters
  data_stats_file = '%s/statistics.json' % data_dir
  with open(data_stats_file) as data_stats_open:
    data_stats = json.load(data_stats_open)

  # count folds
  num_folds = len([dkey for dkey in data_stats if dkey.startswith('fold')])

  # genome
  genome_path = os.environ[options.genome.upper()]
  options.genome_fasta = '%s/assembly/%s.fa' % (genome_path, options.genome)

  ################################################################
  # saturation mutagenesis
  ################################################################
  jobs = []
  scores_files = []
  for ci in range(options.crosses):
    for fi in range(num_folds):
      it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci)
      name = '%s-f%dc%d' % (options.name, fi, ci)

      # update output directory
      sat_dir = '%s/%s' % (it_dir, options.out_dir)

      # check if done
      scores_file = '%s/scores.h5' % sat_dir
      scores_files.append(scores_file)
      if os.path.isfile(scores_file):
        print('%s already generated.' % scores_file)
      else:
        basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
        basenji_cmd += ' conda activate %s;' % options.conda_env
        basenji_cmd += ' echo $HOSTNAME;'

        if options.processes > 1:
          basenji_cmd += ' basenji_sat_bed_multi.py'
          basenji_cmd += ' --max_proc %d' % (options.max_proc // num_folds)
          basenji_cmd += ' -q %s' % options.queue
          basenji_cmd += ' -n %s' % name
          basenji_cmd += ' -r'
        else:
          basenji_cmd += ' basenji_sat_bed.py'

        basenji_cmd += ' %s' % options_string(options, sat_options, sat_dir)
        basenji_cmd += ' %s' % params_file
        basenji_cmd += ' %s/train/model_best.h5' % it_dir
        basenji_cmd += ' %s' % bed_file

        if options.processes > 1:
          jobs.append(basenji_cmd)
        else:
          basenji_job = slurm.Job(basenji_cmd, name,
                                  out_file='%s.out' % sat_dir,
                                  err_file='%s.err' % sat_dir,
                                  cpu=2, gpu=1,
                                  queue=options.queue,
                                  mem=30000, time='28-0:00:00')
          jobs.append(basenji_job)

  if options.processes > 1:
    util.exec_par(jobs, verbose=True)
  else:
    slurm.multi_run(jobs, verbose=True)

  ################################################################
  # ensemble
  ################################################################
  ensemble_dir = '%s/ensemble' % exp_dir
  if not os.path.isdir(ensemble_dir):
    os.mkdir(ensemble_dir)

  sat_dir = '%s/%s' % (ensemble_dir, options.out_dir)
  if not os.path.isdir(sat_dir):
    os.mkdir(sat_dir)

  if not os.path.isfile('%s/scores.h5' % sat_dir):
    print('Generating ensemble scores.')
    ensemble_scores_h5(sat_dir, scores_files)
  else:
    print('Ensemble scores already generated.')

  ################################################################
  # PhyloP regressors
  ################################################################
  # num_pcs = int(data_stats['num_targets']**0.75)

  jobs = []
  for ci in range(options.crosses):
    for fi in range(num_folds):
      it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci)
      sat_dir = '%s/%s' % (it_dir, options.out_dir)

      if not os.path.isfile('%s/stats.txt' % sat_dir):
        phylop_cmd = 'basenji_bench_phylop.py'
        phylop_cmd += ' -e 200 -p 4'
        # phylop_cmd += ' -d %d' % num_pcs
        phylop_cmd += ' -o %s' % sat_dir
        phylop_cmd += ' %s/scores.h5' % sat_dir

        name = '%s-f%dc%d' % (options.name, fi, ci)
        std_pre = '%s/phylop' % sat_dir
        j = slurm.Job(phylop_cmd, name,
                      '%s.out' % std_pre, '%s.err' % std_pre,
                      queue='standard', cpu=4,
                      mem=90000, time='1-0:0:0')
        jobs.append(j)

  # ensemble
  sat_dir = '%s/%s' % (ensemble_dir, options.out_dir)
  if not os.path.isfile('%s/stats.txt' % sat_dir):
    phylop_cmd = 'basenji_bench_phylop.py'
    phylop_cmd += ' -e 200 -p 4'
    # phylop_cmd += ' -d %d' % num_pcs
    phylop_cmd += ' -o %s' % sat_dir
    phylop_cmd += ' %s/scores.h5' % sat_dir

    name = '%s-ens' % options.name
    std_pre = '%s/phylop' % sat_dir
    j = slurm.Job(phylop_cmd, name,
                  '%s.out' % std_pre, '%s.err' % std_pre,
                  queue='standard', cpu=4,
                  mem=90000, time='1-0:0:0')
    jobs.append(j)

  slurm.multi_run(jobs, verbose=True)

  ################################################################
  # compare
  ################################################################
  ref_sat_dirs = []
  exp_sat_dirs = []
  for ci in range(options.crosses):
    for fi in range(num_folds):
      exp_sat_dir = '%s/f%d_c%d/%s' % (exp_dir, fi, ci, options.out_dir)
      exp_sat_dirs.append(exp_sat_dir)
      if options.ref_dir is not None:
        ref_sat_dir = '%s/f%d_c%d/%s' % (options.ref_dir, fi, ci, options.out_dir)
        ref_sat_dirs.append(ref_sat_dir)

  exp_pcor_folds, exp_r2_folds = read_metrics(exp_sat_dirs)
  exp_sat_dirs = ['%s/ensemble/%s' % (exp_dir, options.out_dir)]
  exp_pcor_ens, exp_r2_ens = read_metrics(exp_sat_dirs)

  if options.ref_dir is not None:
    ref_pcor_folds, ref_r2_folds = read_metrics(ref_sat_dirs)
    ref_sat_dirs = ['%s/ensemble/%s' % (options.ref_dir, options.out_dir)]
    ref_pcor_ens, ref_r2_ens = read_metrics(ref_sat_dirs)

  print('PearsonR')
  exp_mean = exp_pcor_folds.mean()
  exp_stdm = exp_pcor_folds.std() / np.sqrt(len(exp_pcor_folds))
  expe_mean = exp_pcor_ens.mean()
  expe_stdm = exp_pcor_ens.std() / np.sqrt(len(exp_pcor_ens))
  print('%12s: %.4f (%.4f)' % (options.label_exp, exp_mean, exp_stdm))
  print('%12s (ens): %.4f (%.4f)' % (options.label_exp, expe_mean, expe_stdm))
  if options.ref_dir is not None:
    ref_mean = ref_pcor_folds.mean()
    ref_stdm = ref_pcor_folds.std() / np.sqrt(len(ref_pcor_folds))
    refe_mean = ref_pcor_ens.mean()
    refe_stdm = ref_pcor_ens.std() / np.sqrt(len(ref_pcor_ens))
    print('%12s: %.4f (%.4f)' % (options.label_ref, ref_mean, ref_stdm))
    print('%12s (ens): %.4f (%.4f)' % (options.label_ref, refe_mean, refe_stdm))

    mwp, tp = stat_tests(exp_pcor_folds, ref_pcor_folds, options.alternative)
    print('Mann-Whitney U p-value: %.3g' % mwp)
    print('T-test p-value: %.3g' % tp)

  print('\nR2')
  exp_mean = exp_r2_folds.mean()
  exp_stdm = exp_r2_folds.std() / np.sqrt(len(exp_r2_folds))
  expe_mean = exp_r2_ens.mean()
  expe_stdm = exp_r2_ens.std() / np.sqrt(len(exp_r2_ens))
  print('%12s: %.4f (%.4f)' % (options.label_exp, exp_mean, exp_stdm))
  print('%12s (ens): %.4f (%.4f)' % (options.label_exp, expe_mean, expe_stdm))
  if options.ref_dir is not None:
    ref_mean = ref_r2_folds.mean()
    ref_stdm = ref_r2_folds.std() / np.sqrt(len(ref_r2_folds))
    refe_mean = ref_r2_ens.mean()
    refe_stdm = ref_r2_ens.std() / np.sqrt(len(ref_r2_ens))
    print('%12s: %.4f (%.4f)' % (options.label_ref, ref_mean, ref_stdm))
    print('%12s (ens): %.4f (%.4f)' % (options.label_ref, refe_mean, refe_stdm))

    mwp, tp = stat_tests(exp_r2_folds, ref_r2_folds, options.alternative)
    print('Mann-Whitney U p-value: %.3g' % mwp)
    print('T-test p-value: %.3g' % tp)
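
# read_metrics and stat_tests are defined elsewhere in this script. Judging by
# the labels printed above, stat_tests wraps a Mann-Whitney U test and a t-test;
# the sketch below is a hedged guess at that helper using scipy (the real
# version may differ, e.g. by using a paired t-test). stat_tests_sketch is a
# hypothetical name; the `alternative` keyword of ttest_ind needs scipy >= 1.6.
from scipy import stats as scipy_stats

def stat_tests_sketch(exp_folds, ref_folds, alternative='two-sided'):
  mw_stat, mw_p = scipy_stats.mannwhitneyu(exp_folds, ref_folds, alternative=alternative)
  t_stat, t_p = scipy_stats.ttest_ind(exp_folds, ref_folds, alternative=alternative)
  return mw_p, t_p
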
def main():
  usage = 'usage: %prog [options] <fasta_file> <targets_file>'
  parser = OptionParser(usage)
  parser.add_option('-b', dest='limit_bed',
      help='Limit to segments that overlap regions in a BED file')
  # parser.add_option('-c', dest='clip',
  #     default=None, type='float',
  #     help='Clip target values to have minimum [Default: %default]')
  parser.add_option('-d', dest='sample_pct',
      default=1.0, type='float',
      help='Down-sample the segments')
  parser.add_option('-g', dest='gaps_file',
      help='Genome assembly gaps BED [Default: %default]')
  parser.add_option('-l', dest='seq_length',
      default=131072, type='int',
      help='Sequence length [Default: %default]')
  parser.add_option('--local', dest='run_local',
      default=False, action='store_true',
      help='Run jobs locally as opposed to on SLURM [Default: %default]')
  parser.add_option('-o', dest='out_dir',
      default='data_out',
      help='Output directory [Default: %default]')
  parser.add_option('-p', dest='processes',
      default=None, type='int',
      help='Number parallel processes [Default: %default]')
  parser.add_option('--seed', dest='seed',
      default=44, type='int',
      help='Random seed [Default: %default]')
  parser.add_option('--stride_train', dest='stride_train',
      default=1., type='float',
      help='Stride to advance train sequences [Default: seq_length]')
  parser.add_option('--stride_test', dest='stride_test',
      default=1., type='float',
      help='Stride to advance valid and test sequences [Default: seq_length]')
  parser.add_option('-r', dest='seqs_per_tfr',
      default=256, type='int',
      help='Sequences per TFRecord file [Default: %default]')
  parser.add_option('-t', dest='test_pct',
      default=0.05, type='float',
      help='Proportion of the data for testing [Default: %default]')
  parser.add_option('-u', dest='unmap_bed',
      help='Unmappable segments to set to NA')
  parser.add_option('--unmap_t', dest='unmap_t',
      default=0.3, type='float',
      help='Remove sequences with more than this unmappable bin % [Default: %default]')
  parser.add_option('-w', dest='pool_width',
      default=128, type='int',
      help='Sum pool width [Default: %default]')
  parser.add_option('-v', dest='valid_pct',
      default=0.05, type='float',
      help='Proportion of the data for validation [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 2:
    parser.error('Must provide FASTA and sample coverage labels and paths.')
  else:
    fasta_file = args[0]
    targets_file = args[1]

  random.seed(options.seed)
  np.random.seed(options.seed)

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  ################################################################
  # define genomic contigs
  ################################################################
  chrom_contigs = basenji.genome.load_chromosomes(fasta_file)

  # remove gaps
  if options.gaps_file:
    chrom_contigs = basenji.genome.split_contigs(chrom_contigs, options.gaps_file)

  # ditch the chromosomes for contigs
  contigs = []
  for chrom in chrom_contigs:
    contigs += [Contig(chrom, ctg_start, ctg_end)
                for ctg_start, ctg_end in chrom_contigs[chrom]]

  # limit to a BED file
  if options.limit_bed is not None:
    contigs = limit_contigs(contigs, options.limit_bed)

  # filter for large enough
  contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length]

  # down-sample
  if options.sample_pct < 1.0:
    contigs = random.sample(contigs, int(options.sample_pct * len(contigs)))

  # print contigs to BED file
  ctg_bed_file = '%s/contigs.bed' % options.out_dir
  write_seqs_bed(ctg_bed_file, contigs)

  ################################################################
  # divide between train/valid/test
  ################################################################
  contig_sets = divide_contigs(contigs, options.test_pct, options.valid_pct)
  train_contigs, valid_contigs, test_contigs = contig_sets

  ################################################################
  # define model sequences
  ################################################################
  # stride sequences across contig
  train_mseqs = contig_sequences(train_contigs, options.seq_length, options.stride_train)
  valid_mseqs = contig_sequences(valid_contigs, options.seq_length, options.stride_test)
  test_mseqs = contig_sequences(test_contigs, options.seq_length, options.stride_test)

  # shuffle
  random.shuffle(train_mseqs)
  random.shuffle(valid_mseqs)
  random.shuffle(test_mseqs)

  # merge
  mseqs = train_mseqs + valid_mseqs + test_mseqs
  mseqs_labels = ['train'] * len(train_mseqs) + ['valid'] * len(valid_mseqs) + ['test'] * len(test_mseqs)

  ################################################################
  # mappability
  ################################################################
  if options.unmap_bed is not None:
    # annotate unmappable positions
    mseqs_unmap = annotate_unmap(mseqs, options.unmap_bed,
                                 options.seq_length, options.pool_width)

    # filter unmappable
    mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') < options.unmap_t)
    mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
    mseqs_labels = [mseqs_labels[i] for i in range(len(mseqs_labels)) if mseqs_map_mask[i]]
    mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

    # write to file
    unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
    np.save(unmap_npy, mseqs_unmap)

  # write sequences to BED
  seqs_bed_file = '%s/sequences.bed' % options.out_dir
  write_seqs_bed(seqs_bed_file, mseqs, mseqs_labels)

  ################################################################
  # read sequence coverage values
  ################################################################
  # read target datasets
  targets_df = pd.read_table(targets_file)

  seqs_cov_dir = '%s/seqs_cov' % options.out_dir
  if not os.path.isdir(seqs_cov_dir):
    os.mkdir(seqs_cov_dir)

  read_jobs = []
  for ti in range(targets_df.shape[0]):
    genome_cov_file = targets_df['file'].iloc[ti]
    seqs_cov_stem = '%s/%d' % (seqs_cov_dir, ti)
    seqs_cov_file = '%s.h5' % seqs_cov_stem

    if os.path.isfile(seqs_cov_file):
      print('Skipping existing %s' % seqs_cov_file, file=sys.stderr)
    else:
      cmd = 'basenji_data_read.py'
      cmd += ' -w %d' % options.pool_width
      cmd += ' %s' % genome_cov_file
      cmd += ' %s' % seqs_bed_file
      cmd += ' %s' % seqs_cov_file

      if options.run_local:
        cmd += ' &> %s.err' % seqs_cov_stem
        read_jobs.append(cmd)
      else:
        j = slurm.Job(cmd,
                      name='read_t%d' % ti,
                      out_file='%s.out' % seqs_cov_stem,
                      err_file='%s.err' % seqs_cov_stem,
                      queue='standard,tbdisk', mem=15000, time='12:0:0')
        read_jobs.append(j)

  if options.run_local:
    util.exec_par(read_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(read_jobs, options.processes, verbose=True, sleep_time=1)

  ################################################################
  # write TF Records
  ################################################################
  tfr_dir = '%s/tfrecords' % options.out_dir
  if not os.path.isdir(tfr_dir):
    os.mkdir(tfr_dir)

  write_jobs = []
  for tvt_set in ['train', 'valid', 'test']:
    tvt_set_indexes = [i for i in range(len(mseqs_labels)) if mseqs_labels[i] == tvt_set]
    tvt_set_start = tvt_set_indexes[0]
    tvt_set_end = tvt_set_indexes[-1]

    tfr_i = 0
    tfr_start = tvt_set_start
    tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

    while tfr_start <= tvt_set_end:
      tfr_stem = '%s/%s-%d' % (tfr_dir, tvt_set, tfr_i)

      cmd = 'basenji_data_write.py'
      cmd += ' -s %d' % tfr_start
      cmd += ' -e %d' % tfr_end
      if options.unmap_bed is not None:
        cmd += ' -u %s' % unmap_npy

      cmd += ' %s' % fasta_file
      cmd += ' %s' % seqs_bed_file
      cmd += ' %s' % seqs_cov_dir
      cmd += ' %s.tfr' % tfr_stem

      if options.run_local:
        cmd += ' &> %s.err' % tfr_stem
        write_jobs.append(cmd)
      else:
        j = slurm.Job(cmd,
                      name='write_%s-%d' % (tvt_set, tfr_i),
                      out_file='%s.out' % tfr_stem,
                      err_file='%s.err' % tfr_stem,
                      queue='standard,tbdisk', mem=15000, time='12:0:0')
        write_jobs.append(j)

      # update
      tfr_i += 1
      tfr_start += options.seqs_per_tfr
      tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

  if options.run_local:
    util.exec_par(write_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(write_jobs, options.processes, verbose=True, sleep_time=1)
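
# The TFRecord loop above slices each train/valid/test index range into chunks
# of seqs_per_tfr sequences, one basenji_data_write.py job per chunk. A small
# illustrative example with made-up numbers (120 sequences, 50 per file), not
# part of the original script:
tvt_set_start, tvt_set_end = 0, 119
seqs_per_tfr = 50
tfr_start = tvt_set_start
tfr_end = min(tfr_start + seqs_per_tfr, tvt_set_end)
chunks = []
while tfr_start <= tvt_set_end:
  chunks.append((tfr_start, tfr_end))
  tfr_start += seqs_per_tfr
  tfr_end = min(tfr_start + seqs_per_tfr, tvt_set_end)
assert chunks == [(0, 50), (50, 100), (100, 119)]
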
def main(): usage = "usage: %prog [options] <input_file>" parser = OptionParser(usage) parser.add_option( "-k", dest="k_fold", type="int", default=10, help="Number of folds to use for cross-validation [Default: %default]", ) parser.add_option( "--lambda_min", dest="lambda_min", type="float", default=0.01, help="Minimum -lambda value to attempt [Default: %default]", ) parser.add_option( "--lambda_max", dest="lambda_max", type="float", default=10.0, help="Maximum -lambda value to attempt [Default: %default]", ) parser.add_option( "--lambda_mult", dest="lambda_mult", type="float", default=2.0, help="Multiplier for next -lambda value to attempt [Default: %default]", ) parser.add_option( "-l", dest="lesser_kmers", action="store_true", default=False, help="Use all kmers of length less than and equal to that given by -k [Default: %default]", ) # parser.add_option('-m', dest='model_file', help='File to output model to') parser.add_option( "-p", dest="parallel", type="int", default=4, help="Number of parallel threads to run [Default: %default]" ) parser.add_option( "-r", dest="replicates", type="int", default=1, help="Number of times to repeat the optimization for each fold [Default: %default]", ) parser.add_option( "-w", dest="weights", action="store_true", default=False, help="Print a summary of the weight vectors" ) (options, args) = parser.parse_args() if len(args) != 1: parser.error("Must provide input file") else: input_file = args[0] input_base = os.path.splitext(input_file)[0] if options.weights: summarize_weights(input_base, options) exit() # determine % of positive examples input_pos, input_total = positive_percent(input_file) f1_base = input_pos / float(input_total) # trust me, it works for r in range(options.replicates): rep_dir = "%s_rep%d" % (input_base, r) if os.path.isdir(rep_dir): shutil.rmtree(rep_dir) os.mkdir(rep_dir) os.chdir(rep_dir) # divide data into folds divide_data("../" + input_file, options.k_fold) # collect pegasos commands cmds = [] peg_lambda = options.lambda_min while peg_lambda <= options.lambda_max: # run on each fold for f in range(options.k_fold): cmds.append( "pegasos -lambda %f -modelFile fold%d/train_%.1e.mod fold%d/train.dat &> /dev/null" % (peg_lambda, f, peg_lambda, f) ) # increase lambda peg_lambda *= options.lambda_mult # exceute pegasos commands util.exec_par(cmds, options.parallel) # start to clean up space for f in range(options.k_fold): os.remove("fold%d/train.dat" % f) os.chdir("..") # collect results peg_lambda = options.lambda_min while peg_lambda <= options.lambda_max: recalls = [] precisions = [] failed = False for r in range(options.replicates): if not failed: outcomes = {"tp": 0, "fp": 0, "fn": 0} # collect each fold for f in range(options.k_fold): if not compute_accuracy(outcomes, "%s_rep%d/fold%d" % (input_base, r, f), peg_lambda): failed = True break # save if not failed: recalls.append(float(outcomes["tp"]) / (outcomes["tp"] + outcomes["fn"])) precisions.append(float(outcomes["tp"]) / (outcomes["tp"] + outcomes["fp"])) # summarize and print if failed: print "%.1e %8s %7s %8s %7s %8s %8s" % (peg_lambda, "NA", "NA", "NA", "NA", "NA", "NA") else: recall, rsd = stats.mean_sd(recalls) rsd /= math.sqrt(len(recalls)) precision, psd = stats.mean_sd(precisions) psd /= math.sqrt(len(precisions)) # null_p = 1.0-binom.cdf(int(recall*input_total+0.5)-1, int(recall*input_total/precision + 0.5), float(input_pos)/input_total) f1 = 2 * recall * precision / (recall + precision) # print '%.1e %8.3f %6.3f %8.3f %6.3f %8.3f %8.3f %8.1e' % (peg_lambda, recall, 
rsd, precision, psd, f1, (f1-f1_base), null_p) print "%.1e %8.4f %7.4f %8.4f %7.4f %8.4f %8.4f" % ( peg_lambda, recall, rsd, precision, psd, f1, (f1 - f1_base), ) peg_lambda *= options.lambda_mult
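
# The two while loops above sweep the pegasos regularization strength
# geometrically from lambda_min to lambda_max by factors of lambda_mult. For the
# defaults (0.01, 10.0, 2.0) that yields ten values; a quick illustrative check,
# not part of the original script:
lams = []
lam = 0.01
while lam <= 10.0:
    lams.append(lam)
    lam *= 2.0
# -> 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56, 5.12
assert len(lams) == 10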