# NOTE: these concatenated scripts assume the usual Basenji preprocessing
# imports, e.g.:
#   from optparse import OptionParser
#   from collections import OrderedDict
#   import json, multiprocessing, os, random, shutil, sys
#   import h5py, joblib, numpy as np, pandas as pd, tensorflow as tf
#   from basenji import autoencoder, dna_io, genome
#   import slurm, util
# (the exact import set is an assumption; it is not shown in this section)


def main():
  usage = 'usage: %prog [options] <fasta_file> <sample_wigs_file> <hdf5_file>'
  parser = OptionParser(usage)
  parser.add_option('-b', dest='limit_bed',
      help='Limit to segments that overlap regions in a BED file')
  parser.add_option('-c', dest='clip', default=None, type='float',
      help='Clip target values to this minimum [Default: %default]')
  parser.add_option('-d', dest='sample_pct', default=1.0, type='float',
      help='Down-sample the segments')
  parser.add_option('-f', dest='fourier_dim', default=None, type='int',
      help='Fourier transform dimension [Default: %default]')
  parser.add_option('-g', dest='gaps_file',
      help='Genome assembly gaps BED [Default: %default]')
  parser.add_option('-l', dest='seq_length', default=131072, type='int',
      help='Sequence length [Default: %default]')
  parser.add_option('--log2', dest='log10to2', default=False, action='store_true',
      help='Transform values from log10 to log2 [Default: %default]')
  parser.add_option('-m', dest='params_file',
      help='Dimension reduction hyper-parameters file')
  parser.add_option('--mult_cov', dest='cov_multiplier', default=1, type='float',
      help='Coverage multiplier, useful when the read extension and pool width '
           'do not match [Default: %default]')
  parser.add_option('-n', dest='na_t', default=0.25, type='float',
      help='Remove sequences with an NA proportion greater than this threshold '
           '[Default: %default]')
  parser.add_option('--no_full', dest='no_full', default=False, action='store_true',
      help='Do not save full test sequence targets [Default: %default]')
  parser.add_option('-o', dest='out_bed_file',
      help='Output the train/valid/test sequences as a BED file')
  parser.add_option('-p', dest='processes', default=1, type='int',
      help='Number of parallel processes to load data [Default: %default]')
  parser.add_option('-s', dest='stride', default=None, type='int',
      help='Stride to advance segments [Default: seq_length]')
  parser.add_option('--scent', dest='scent_file',
      help='Dimension reduction model file')
  # string default so that chromosome names (e.g. 'chr8') also work
  parser.add_option('-t', dest='test_pct_or_chr', type='str', default='0.05',
      help='Proportion of the data for testing [Default: %default]')
  parser.add_option('-u', dest='unmap_bed',
      help='Unmappable segments to set to NA')
  parser.add_option('-w', dest='pool_width', type='int', default=128,
      help='Average pooling width [Default: %default]')
  parser.add_option('--w5', dest='w5', default=False, action='store_true',
      help='Coverage files are w5 rather than BigWig [Default: %default]')
  parser.add_option('-v', dest='valid_pct_or_chr', type='str', default='0.05',
      help='Proportion of the data for validation [Default: %default]')
  parser.add_option('-z', dest='compression',
      help='h5py compression [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 3:
    parser.error('Must provide genome FASTA file, sample Wig/BigWig labels '
                 'and paths, and model output file')
  else:
    fasta_file = args[0]
    sample_wigs_file = args[1]
    hdf5_file = args[2]

  random.seed(1)

  if options.stride is None:
    options.stride = options.seq_length

  ################################################################
  # assess bigwigs
  ################################################################
  # get wig files and labels
  target_wigs = OrderedDict()
  target_strands = []
  target_labels = []
  for line in open(sample_wigs_file, encoding='UTF-8'):
    a = line.rstrip().split('\t')
    target_wigs[a[0]] = a[1]
    if len(a) > 2:
      target_strands.append(a[2])
    else:
      target_strands.append('*')
    if len(a) > 3:
      target_labels.append(a[3])
    else:
      target_labels.append('')

  if options.fourier_dim is not None and \
      2 * options.fourier_dim >= options.seq_length / options.pool_width:
    print("Fourier transform to %d dims won't compress %d length sequences with %d pooling"
          % (options.fourier_dim, options.seq_length, options.pool_width),
          file=sys.stderr)
    exit(1)

  ################################################################
  # prepare genomic segments
  ################################################################
  chrom_segments = genome.load_chromosomes(fasta_file)

  # remove gaps
  if options.gaps_file:
    chrom_segments = genome.split_contigs(chrom_segments, options.gaps_file)

  # ditch the chromosomes
  segments = []
  for chrom in chrom_segments:
    segments += [(chrom, seg_start, seg_end)
                 for seg_start, seg_end in chrom_segments[chrom]]

  # standardize order
  segments.sort()

  # filter for large enough
  segments = [cse for cse in segments if cse[2] - cse[1] >= options.seq_length]

  # down-sample
  if options.sample_pct < 1.0:
    segments = random.sample(segments, int(options.sample_pct * len(segments)))

  # limit to a BED file
  if options.limit_bed is not None:
    segments = limit_segments(segments, options.limit_bed)

  ################################################################
  # one hot code sequences
  ################################################################
  seqs_1hot, seqs_segments = segments_1hot(fasta_file, segments,
                                           options.seq_length, options.stride)
  print('%d sequences one hot coded' % seqs_1hot.shape[0])

  ################################################################
  # load model
  ################################################################
  if options.params_file:
    job = dna_io.read_job_params(options.params_file)
    job['num_targets'] = len(target_wigs)
    job['batch_size'] = 1024
    job['model'] = job.get('model', 'autoencoder')

    if job['model'] == 'autoencoder':
      model = autoencoder.AE(job)
      saver = tf.train.Saver()
    else:
      model = joblib.load(options.scent_file)

  ################################################################
  # bigwig read and process
  ################################################################
  print('Reading and pre-processing bigwigs for %d segments' % len(segments),
        flush=True)

  targets_real = []
  targets_imag = []

  include_indexes = []
  include_marker = 0

  targets_test = []
  test_indexes = []
  test_marker = 0

  update_i = 0
  ssi = 0

  # initialize multiprocessing pool
  pool = multiprocessing.Pool(options.processes)

  with tf.Session() as sess:
    if options.scent_file and job['model'] == 'autoencoder':
      saver.restore(sess, options.scent_file)

    # batch segment processing
    bstart = 0
    while bstart < len(segments):
      if update_i % 1 == 0:  # print every batch
        print('Tiling from %s:%d-%d' % segments[bstart], flush=True)

      # determine batch end
      bend = batch_end(segments, bstart, 400000)

      # bigwig_read parameters
      bwr_params = [(wig_file, segments[bstart:bend], options.seq_length,
                     options.pool_width, options.stride, options.log10to2,
                     options.cov_multiplier)
                    for wig_file in target_wigs.values()]

      # pull the target values in parallel
      if options.w5:
        wig_targets = pool.starmap(w5_batch, bwr_params)
      else:
        wig_targets = pool.starmap(bigwig_batch, bwr_params)

      # transpose to S x L x T (making a copy?)
      targets_wig = np.transpose(np.array(wig_targets), axes=(1, 2, 0))

      # clip
      if options.clip is not None:
        targets_wig = targets_wig.clip(options.clip)

      # sample indexes from this batch
      if options.test_pct_or_chr.startswith('chr'):
        test_bindexes = [twi for twi in range(targets_wig.shape[0])
                         if seqs_segments[ssi + twi][0] == options.test_pct_or_chr]
      else:
        test_pct = float(options.test_pct_or_chr)
        test_bindexes = [twi for twi in range(targets_wig.shape[0])
                         if random.random() < test_pct]

      # capture test indexes
      test_indexes += [test_marker + tbi for tbi in test_bindexes]

      # update test marker
      test_marker += targets_wig.shape[0]

      # save the full test targets
      if not options.no_full:
        targets_test.append(targets_wig[test_bindexes])

      # map to latent space
      if options.scent_file is None:
        targets_latent = targets_wig
      else:
        targets_latent = latent_transform(sess, model, job, targets_wig)

      # compress across length
      if options.fourier_dim is None:
        targets_rfour = targets_latent
        targets_ifour = None
      else:
        targets_rfour, targets_ifour = fourier_transform(targets_latent,
                                                         options.fourier_dim)

      # save
      targets_real.append(targets_rfour)
      targets_imag.append(targets_ifour)

      # update seqs_segments index
      ssi += targets_wig.shape[0]

      # update batch
      bstart = bend
      update_i += 1

  pool.close()

  # stack arrays
  targets_real = np.vstack(targets_real)
  if options.fourier_dim is not None:
    targets_imag = np.vstack(targets_imag)
  if not options.no_full:
    targets_test = np.vstack(targets_test)

  print('%d target sequences' % targets_real.shape[0])

  ################################################################
  # correct for unmappable regions
  ################################################################
  if options.unmap_bed is not None:
    seqs_na = annotate_na(seqs_segments, options.unmap_bed,
                          options.seq_length, options.pool_width)

    # determine mappable sequences and update test indexes
    map_indexes = []
    test_indexes_set = set(test_indexes)
    print('test_indexes', len(test_indexes))
    test_indexes_na = []

    new_i = 0
    for old_i in range(seqs_na.shape[0]):
      # mappable
      if seqs_na[old_i, :].mean(dtype='float64') < options.na_t:
        map_indexes.append(old_i)
        if old_i in test_indexes_set:
          test_indexes_na.append(new_i)
        new_i += 1
      # unmappable
      else:
        # forget it
        pass

    # update data structures
    targets_real = targets_real[map_indexes]
    if options.fourier_dim is not None:
      targets_imag = targets_imag[map_indexes]
    seqs_1hot = seqs_1hot[map_indexes]
    seqs_segments = [seqs_segments[mi] for mi in map_indexes]
    seqs_na = seqs_na[map_indexes]
    test_indexes = test_indexes_na
    print('test_indexes', len(test_indexes))

  ################################################################
  # write to train, valid, test HDF5
  ################################################################
  if options.valid_pct_or_chr.startswith('chr'):
    # sample valid chromosome
    valid_indexes = [si for si in range(len(seqs_segments))
                     if seqs_segments[si][0] == options.valid_pct_or_chr]
  else:
    # sample valid indexes (we already have test)
    valid_pct = float(options.valid_pct_or_chr)
    valid_n = int(valid_pct * targets_real.shape[0])
    nontest_indexes = set(range(targets_real.shape[0])) - set(test_indexes)
    valid_indexes = random.sample(sorted(nontest_indexes), valid_n)

  # remainder is training
  train_indexes = list(set(range(len(seqs_segments))) - set(valid_indexes)
                       - set(test_indexes))

  # training may require shuffling (in place)
  random.shuffle(train_indexes)
  random.shuffle(valid_indexes)
  random.shuffle(test_indexes)

  # write to HDF5
  hdf5_out = h5py.File(hdf5_file, 'w')

  # store pooling
  hdf5_out.create_dataset('pool_width', data=options.pool_width, dtype='int')

  # store targets
  target_ids = np.array(list(target_wigs.keys()), dtype='S')
  hdf5_out.create_dataset('target_ids', data=target_ids)

  target_labels = np.array(target_labels, dtype='S')
  hdf5_out.create_dataset('target_labels', data=target_labels)

  target_strands = np.array(target_strands, dtype='S')
  hdf5_out.create_dataset('target_strands', data=target_strands)

  # HDF5 train
  hdf5_out.create_dataset('train_in', data=seqs_1hot[train_indexes],
                          dtype='bool', compression=options.compression)
  hdf5_out.create_dataset('train_out', data=targets_real[train_indexes],
                          dtype='float16', compression=options.compression)
  if options.fourier_dim is not None:
    hdf5_out.create_dataset('train_out_imag', data=targets_imag[train_indexes],
                            dtype='float16', compression=options.compression)
  if options.unmap_bed is not None:
    hdf5_out.create_dataset('train_na', data=seqs_na[train_indexes],
                            dtype='bool', compression=options.compression)

  # HDF5 valid
  hdf5_out.create_dataset('valid_in', data=seqs_1hot[valid_indexes],
                          dtype='bool', compression=options.compression)
  hdf5_out.create_dataset('valid_out', data=targets_real[valid_indexes],
                          dtype='float16', compression=options.compression)
  if options.fourier_dim is not None:
    hdf5_out.create_dataset('valid_out_imag', data=targets_imag[valid_indexes],
                            dtype='float16', compression=options.compression)
  if options.unmap_bed is not None:
    hdf5_out.create_dataset('valid_na', data=seqs_na[valid_indexes],
                            dtype='bool', compression=options.compression)

  # HDF5 test
  hdf5_out.create_dataset('test_in', data=seqs_1hot[test_indexes],
                          dtype='bool', compression=options.compression)
  hdf5_out.create_dataset('test_out', data=targets_real[test_indexes],
                          dtype='float16', compression=options.compression)
  if options.fourier_dim is not None:
    hdf5_out.create_dataset('test_out_imag', data=targets_imag[test_indexes],
                            dtype='float16', compression=options.compression)
  if not options.no_full:
    hdf5_out.create_dataset('test_out_full', data=targets_test,
                            dtype='float16', compression=options.compression)
  if options.unmap_bed is not None:
    hdf5_out.create_dataset('test_na', data=seqs_na[test_indexes],
                            dtype='bool', compression=options.compression)

  hdf5_out.close()

  # output BED file
  if options.out_bed_file:
    out_bed_out = open(options.out_bed_file, 'w')
    for si in train_indexes:
      print('%s\t%d\t%d\ttrain' % seqs_segments[si], file=out_bed_out)
    for si in valid_indexes:
      print('%s\t%d\t%d\tvalid' % seqs_segments[si], file=out_bed_out)
    for si in test_indexes:
      print('%s\t%d\t%d\ttest' % seqs_segments[si], file=out_bed_out)
    out_bed_out.close()
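# For reference, a minimal sketch of the fourier_transform() call above:
# compress each pooled target profile along its length axis with a truncated
# real FFT, returning separate real and imaginary coefficient arrays. The
# name matches the call site, but this body is an assumption, not the
# project's implementation.
def fourier_transform_sketch(targets, fourier_dim):
  '''Truncated rFFT over the length axis of an S x L x T array.'''
  import numpy as np
  tfour = np.fft.rfft(targets, axis=1)[:, :fourier_dim, :]
  return tfour.real, tfour.imag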
def main():
  usage = 'usage: %prog [options] <fasta_file> <targets_file>'
  parser = OptionParser(usage)
  parser.add_option('-b', dest='blacklist_bed',
      help='Set blacklist nucleotides to a baseline value.')
  parser.add_option('--break', dest='break_t', default=786432, type='int',
      help='Break contigs above this length in half [Default: %default]')
  parser.add_option('-c', '--crop', dest='crop_bp', default=0, type='int',
      help='Crop bp off each end [Default: %default]')
  parser.add_option('-d', dest='sample_pct', default=1.0, type='float',
      help='Down-sample the segments')
  parser.add_option('-f', dest='folds', default=None, type='int',
      help='Generate cross fold split [Default: %default]')
  parser.add_option('-g', dest='gaps_file',
      help='Genome assembly gaps BED [Default: %default]')
  parser.add_option('-i', dest='interp_nan', default=False, action='store_true',
      help='Interpolate NaNs [Default: %default]')
  parser.add_option('-l', dest='seq_length', default=131072, type='int',
      help='Sequence length [Default: %default]')
  parser.add_option('--limit', dest='limit_bed',
      help='Limit to segments that overlap regions in a BED file')
  parser.add_option('--local', dest='run_local', default=False, action='store_true',
      help='Run jobs locally as opposed to on SLURM [Default: %default]')
  parser.add_option('-o', dest='out_dir', default='data_out',
      help='Output directory [Default: %default]')
  parser.add_option('-p', dest='processes', default=None, type='int',
      help='Number of parallel processes [Default: %default]')
  parser.add_option('--peaks', dest='peaks_only', default=False, action='store_true',
      help='Create contigs only from peaks [Default: %default]')
  parser.add_option('-r', dest='seqs_per_tfr', default=256, type='int',
      help='Sequences per TFRecord file [Default: %default]')
  parser.add_option('--restart', dest='restart', default=False, action='store_true',
      help='Skip already read HDF5 coverage values. [Default: %default]')
  parser.add_option('--seed', dest='seed', default=44, type='int',
      help='Random seed [Default: %default]')
  parser.add_option('--snap', dest='snap', default=1, type='int',
      help='Snap sequences to multiple of the given value [Default: %default]')
  parser.add_option('--st', '--split_test', dest='split_test',
      default=False, action='store_true',
      help='Exit after split. [Default: %default]')
  parser.add_option('--stride', '--stride_train', dest='stride_train',
      default=1., type='float',
      help='Stride to advance train sequences [Default: seq_length]')
  parser.add_option('--stride_test', dest='stride_test', default=1., type='float',
      help='Stride to advance valid and test sequences [Default: seq_length]')
  # string default so that chromosome names (e.g. 'chr8') also work
  parser.add_option('-t', dest='test_pct_or_chr', default='0.05', type='str',
      help='Proportion of the data for testing [Default: %default]')
  parser.add_option('-u', dest='umap_bed',
      help='Unmappable regions in BED format')
  parser.add_option('--umap_t', dest='umap_t', default=0.5, type='float',
      help='Remove sequences with more than this unmappable bin proportion '
           '[Default: %default]')
  parser.add_option('--umap_clip', dest='umap_clip', default=1, type='float',
      help='Clip values at unmappable positions to distribution quantiles, '
           'e.g. 0.25. [Default: %default]')
  parser.add_option('--umap_tfr', dest='umap_tfr', default=False, action='store_true',
      help='Save umap array into TFRecords [Default: %default]')
  parser.add_option('-w', dest='pool_width', default=128, type='int',
      help='Sum pool width [Default: %default]')
  parser.add_option('-v', dest='valid_pct_or_chr', default='0.05', type='str',
      help='Proportion of the data for validation [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 2:
    parser.error('Must provide FASTA and sample coverage labels and paths.')
  else:
    fasta_file = args[0]
    targets_file = args[1]

  random.seed(options.seed)
  np.random.seed(options.seed)

  # transform proportion strides to base pairs
  if options.stride_train <= 1:
    print('stride_train %.f' % options.stride_train, end='')
    options.stride_train = options.stride_train * options.seq_length
    print(' converted to %f' % options.stride_train)
  options.stride_train = int(np.round(options.stride_train))
  if options.stride_test <= 1:
    if options.folds is None:
      print('stride_test %.f' % options.stride_test, end='')
      options.stride_test = options.stride_test * options.seq_length
      print(' converted to %f' % options.stride_test)
  options.stride_test = int(np.round(options.stride_test))

  # check snap
  if options.snap is not None:
    if np.mod(options.seq_length, options.snap) != 0:
      raise ValueError('seq_length must be a multiple of snap')
    if np.mod(options.stride_train, options.snap) != 0:
      raise ValueError('stride_train must be a multiple of snap')
    if np.mod(options.stride_test, options.snap) != 0:
      raise ValueError('stride_test must be a multiple of snap')

  # setup output directory
  if os.path.isdir(options.out_dir) and not options.restart:
    print('Remove output directory %s or use --restart option.' % options.out_dir)
    exit(1)
  elif not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  # read target datasets
  targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')

  ################################################################
  # define genomic contigs
  ################################################################
  if not options.restart:
    chrom_contigs = genome.load_chromosomes(fasta_file)

    # remove gaps
    if options.gaps_file:
      chrom_contigs = genome.split_contigs(chrom_contigs, options.gaps_file)

    # ditch the chromosomes for contigs
    contigs = []
    for chrom in chrom_contigs:
      contigs += [Contig(chrom, ctg_start, ctg_end)
                  for ctg_start, ctg_end in chrom_contigs[chrom]]

    # limit to a BED file
    if options.limit_bed is not None:
      contigs = limit_contigs(contigs, options.limit_bed)

    # limit to peaks
    if options.peaks_only:
      peaks_bed = curate_peaks(targets_df, options.out_dir,
                               options.pool_width, options.crop_bp)
      contigs = limit_contigs(contigs, peaks_bed)

    # filter for large enough
    contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length]

    # break up large contigs
    if options.break_t is not None:
      contigs = break_large_contigs(contigs, options.break_t)

    # print contigs to BED file
    # ctg_bed_file = '%s/contigs.bed' % options.out_dir
    # write_seqs_bed(ctg_bed_file, contigs)

  ################################################################
  # divide between train/valid/test
  ################################################################
  # label folds
  if options.folds is not None:
    fold_labels = ['fold%d' % fi for fi in range(options.folds)]
    num_folds = options.folds
  else:
    fold_labels = ['train', 'valid', 'test']
    num_folds = 3

  if not options.restart:
    if options.folds is not None:
      # divide by fold pct
      fold_contigs = divide_contigs_folds(contigs, options.folds)

    else:
      try:
        # convert to float pct
        valid_pct = float(options.valid_pct_or_chr)
        test_pct = float(options.test_pct_or_chr)
        assert (0 <= valid_pct <= 1)
        assert (0 <= test_pct <= 1)

        # divide by pct
        fold_contigs = divide_contigs_pct(contigs, test_pct, valid_pct)

      except (ValueError, AssertionError):
        # divide by chr
        valid_chrs = options.valid_pct_or_chr.split(',')
        test_chrs = options.test_pct_or_chr.split(',')
        fold_contigs = divide_contigs_chr(contigs, test_chrs, valid_chrs)

    # rejoin broken contigs within set
    for fi in range(len(fold_contigs)):
      fold_contigs[fi] = rejoin_large_contigs(fold_contigs[fi])

    # write labeled contigs to BED file
    ctg_bed_file = '%s/contigs.bed' % options.out_dir
    ctg_bed_out = open(ctg_bed_file, 'w')
    for fi in range(len(fold_contigs)):
      for ctg in fold_contigs[fi]:
        line = '%s\t%d\t%d\t%s' % (ctg.chr, ctg.start, ctg.end, fold_labels[fi])
        print(line, file=ctg_bed_out)
    ctg_bed_out.close()

  if options.split_test:
    exit()

  ################################################################
  # define model sequences
  ################################################################
  if not options.restart:
    fold_mseqs = []
    for fi in range(num_folds):
      if fold_labels[fi] in ['valid', 'test']:
        stride_fold = options.stride_test
      else:
        stride_fold = options.stride_train

      # stride sequences across contig
      fold_mseqs_fi = contig_sequences(fold_contigs[fi], options.seq_length,
                                       stride_fold, options.snap, fold_labels[fi])
      fold_mseqs.append(fold_mseqs_fi)

      # shuffle
      random.shuffle(fold_mseqs[fi])

      # down-sample
      if options.sample_pct < 1.0:
        fold_mseqs[fi] = random.sample(
            fold_mseqs[fi], int(options.sample_pct * len(fold_mseqs[fi])))

    # merge into one list
    mseqs = [ms for fm in fold_mseqs for ms in fm]

  ################################################################
  # mappability
  ################################################################
  if not options.restart:
    if options.umap_bed is not None:
      if shutil.which('bedtools') is None:
        print('Install Bedtools to annotate unmappable sites', file=sys.stderr)
        exit(1)

      # annotate unmappable positions
      mseqs_unmap = annotate_unmap(mseqs, options.umap_bed, options.seq_length,
                                   options.pool_width, options.crop_bp)

      # filter unmappable
      mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') < options.umap_t)
      mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
      mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

      # write to file
      unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
      np.save(unmap_npy, mseqs_unmap)

    # write sequences to BED
    seqs_bed_file = '%s/sequences.bed' % options.out_dir
    write_seqs_bed(seqs_bed_file, mseqs, True)

  else:
    # read from directory
    seqs_bed_file = '%s/sequences.bed' % options.out_dir
    unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
    mseqs = []
    fold_mseqs = []
    for fi in range(num_folds):
      fold_mseqs.append([])
    for line in open(seqs_bed_file):
      a = line.split()
      msg = ModelSeq(a[0], int(a[1]), int(a[2]), a[3])
      mseqs.append(msg)
      if a[3] == 'train':
        fi = 0
      elif a[3] == 'valid':
        fi = 1
      elif a[3] == 'test':
        fi = 2
      else:
        fi = int(a[3].replace('fold', ''))
      fold_mseqs[fi].append(msg)

  ################################################################
  # read sequence coverage values
  ################################################################
  seqs_cov_dir = '%s/seqs_cov' % options.out_dir
  if not os.path.isdir(seqs_cov_dir):
    os.mkdir(seqs_cov_dir)

  read_jobs = []
  for ti in range(targets_df.shape[0]):
    genome_cov_file = targets_df['file'].iloc[ti]
    seqs_cov_stem = '%s/%d' % (seqs_cov_dir, ti)
    seqs_cov_file = '%s.h5' % seqs_cov_stem

    clip_ti = None
    if 'clip' in targets_df.columns:
      clip_ti = targets_df['clip'].iloc[ti]

    clipsoft_ti = None
    if 'clip_soft' in targets_df.columns:
      clipsoft_ti = targets_df['clip_soft'].iloc[ti]

    scale_ti = 1
    if 'scale' in targets_df.columns:
      scale_ti = targets_df['scale'].iloc[ti]

    if options.restart and os.path.isfile(seqs_cov_file):
      print('Skipping existing %s' % seqs_cov_file, file=sys.stderr)
    else:
      cmd = 'basenji_data_read.py'
      cmd += ' --crop %d' % options.crop_bp
      cmd += ' -w %d' % options.pool_width
      cmd += ' -u %s' % targets_df['sum_stat'].iloc[ti]
      if clip_ti is not None:
        cmd += ' -c %f' % clip_ti
      if clipsoft_ti is not None:
        cmd += ' --clip_soft %f' % clipsoft_ti
      cmd += ' -s %f' % scale_ti
      if options.blacklist_bed:
        cmd += ' -b %s' % options.blacklist_bed
      if options.interp_nan:
        cmd += ' -i'
      cmd += ' %s' % genome_cov_file
      cmd += ' %s' % seqs_bed_file
      cmd += ' %s' % seqs_cov_file

      if options.run_local:
        # breaks on some OS
        # cmd += ' &> %s.err' % seqs_cov_stem
        read_jobs.append(cmd)
      else:
        j = slurm.Job(cmd,
                      name='read_t%d' % ti,
                      out_file='%s.out' % seqs_cov_stem,
                      err_file='%s.err' % seqs_cov_stem,
                      queue='standard', mem=15000, time='12:0:0')
        read_jobs.append(j)

  if options.run_local:
    util.exec_par(read_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(read_jobs, options.processes, verbose=True,
                    launch_sleep=1, update_sleep=5)

  ################################################################
  # write TF Records
  ################################################################
  # copy targets file
  shutil.copy(targets_file, '%s/targets.txt' % options.out_dir)

  # initialize TF Records dir
  tfr_dir = '%s/tfrecords' % options.out_dir
  if not os.path.isdir(tfr_dir):
    os.mkdir(tfr_dir)

  write_jobs = []
  for fold_set in fold_labels:
    fold_set_indexes = [i for i in range(len(mseqs)) if mseqs[i].label == fold_set]
    fold_set_start = fold_set_indexes[0]
    fold_set_end = fold_set_indexes[-1] + 1

    tfr_i = 0
    tfr_start = fold_set_start
    tfr_end = min(tfr_start + options.seqs_per_tfr, fold_set_end)

    while tfr_start <= fold_set_end:
      tfr_stem = '%s/%s-%d' % (tfr_dir, fold_set, tfr_i)

      cmd = 'basenji_data_write.py'
      cmd += ' -s %d' % tfr_start
      cmd += ' -e %d' % tfr_end
      cmd += ' --umap_clip %f' % options.umap_clip
      if options.umap_tfr:
        cmd += ' --umap_tfr'
      if options.umap_bed is not None:
        cmd += ' -u %s' % unmap_npy

      cmd += ' %s' % fasta_file
      cmd += ' %s' % seqs_bed_file
      cmd += ' %s' % seqs_cov_dir
      cmd += ' %s.tfr' % tfr_stem

      if options.run_local:
        # breaks on some OS
        # cmd += ' &> %s.err' % tfr_stem
        write_jobs.append(cmd)
      else:
        j = slurm.Job(cmd,
                      name='write_%s-%d' % (fold_set, tfr_i),
                      out_file='%s.out' % tfr_stem,
                      err_file='%s.err' % tfr_stem,
                      queue='standard', mem=15000, time='12:0:0')
        write_jobs.append(j)

      # update
      tfr_i += 1
      tfr_start += options.seqs_per_tfr
      tfr_end = min(tfr_start + options.seqs_per_tfr, fold_set_end)

  if options.run_local:
    util.exec_par(write_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(write_jobs, options.processes, verbose=True,
                    launch_sleep=1, update_sleep=5)

  ################################################################
  # stats
  ################################################################
  stats_dict = {}
  stats_dict['num_targets'] = targets_df.shape[0]
  stats_dict['seq_length'] = options.seq_length
  stats_dict['pool_width'] = options.pool_width
  stats_dict['crop_bp'] = options.crop_bp

  target_length = options.seq_length - 2 * options.crop_bp
  target_length = target_length // options.pool_width
  stats_dict['target_length'] = target_length

  for fi in range(num_folds):
    stats_dict['%s_seqs' % fold_labels[fi]] = len(fold_mseqs[fi])

  with open('%s/statistics.json' % options.out_dir, 'w') as stats_json_out:
    json.dump(stats_dict, stats_json_out, indent=4)
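# A minimal sketch of the contig_sequences() striding used above: walk each
# contig with the given stride, snapping window starts to a multiple of
# `snap`, and emit (chr, start, end, label) records. The record fields match
# the ModelSeq reads in the --restart branch above; the body itself is an
# assumption, not the project's implementation.
def contig_sequences_sketch(contigs, seq_length, stride, snap=1, label=None):
  import math
  mseqs = []
  for ctg in contigs:
    # snap the first window start up to a multiple of snap within the contig
    seq_start = int(math.ceil(ctg.start / snap)) * snap
    while seq_start + seq_length <= ctg.end:
      mseqs.append(ModelSeq(ctg.chr, seq_start, seq_start + seq_length, label))
      seq_start += stride
  return mseqs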
def main():
  usage = 'usage: %prog [options] <fasta_file> <targets_file>'
  parser = OptionParser(usage)
  parser.add_option('-b', dest='blacklist_bed',
      help='Set blacklist nucleotides to a baseline value.')
  parser.add_option('--break', dest='break_t', default=786432, type='int',
      help='Break contigs above this length in half [Default: %default]')
  # parser.add_option('-c', dest='clip',
  #     default=None, type='float',
  #     help='Clip target values to have minimum [Default: %default]')
  parser.add_option('-d', dest='sample_pct', default=1.0, type='float',
      help='Down-sample the segments')
  parser.add_option('-g', dest='gaps_file',
      help='Genome assembly gaps BED [Default: %default]')
  parser.add_option('-l', dest='seq_length', default=131072, type='int',
      help='Sequence length [Default: %default]')
  parser.add_option('--limit', dest='limit_bed',
      help='Limit to segments that overlap regions in a BED file')
  parser.add_option('--local', dest='run_local', default=False, action='store_true',
      help='Run jobs locally as opposed to on SLURM [Default: %default]')
  parser.add_option('-o', dest='out_dir', default='data_out',
      help='Output directory [Default: %default]')
  parser.add_option('-p', dest='processes', default=None, type='int',
      help='Number of parallel processes [Default: %default]')
  parser.add_option('-r', dest='seqs_per_tfr', default=256, type='int',
      help='Sequences per TFRecord file [Default: %default]')
  parser.add_option('--seed', dest='seed', default=44, type='int',
      help='Random seed [Default: %default]')
  parser.add_option('--stride_train', dest='stride_train', default=1., type='float',
      help='Stride to advance train sequences [Default: seq_length]')
  parser.add_option('--stride_test', dest='stride_test', default=1., type='float',
      help='Stride to advance valid and test sequences [Default: seq_length]')
  parser.add_option('--soft', dest='soft_clip', default=False, action='store_true',
      help='Soft clip values, applying sqrt to the excess above the threshold '
           '[Default: %default]')
  # string default so that chromosome names (e.g. 'chr8') also work
  parser.add_option('-t', dest='test_pct_or_chr', default='0.05', type='str',
      help='Proportion of the data for testing [Default: %default]')
  parser.add_option('-u', dest='umap_bed',
      help='Unmappable regions in BED format')
  parser.add_option('--umap_t', dest='umap_t', default=0.3, type='float',
      help='Remove sequences with more than this unmappable bin proportion '
           '[Default: %default]')
  parser.add_option('--umap_set', dest='umap_set', default=None, type='float',
      help="Set unmappable regions to this percentile in the sequences' "
           'distribution of values')
  parser.add_option('-w', dest='pool_width', default=128, type='int',
      help='Sum pool width [Default: %default]')
  parser.add_option('-v', dest='valid_pct_or_chr', default='0.05', type='str',
      help='Proportion of the data for validation [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 2:
    parser.error('Must provide FASTA and sample coverage labels and paths.')
  else:
    fasta_file = args[0]
    targets_file = args[1]

  random.seed(options.seed)
  np.random.seed(options.seed)

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  if options.stride_train <= 0 or options.stride_train > 1:
    parser.error('Train stride %f must be in (0, 1]' % options.stride_train)
  if options.stride_test <= 0 or options.stride_test > 1:
    parser.error('Test stride %f must be in (0, 1]' % options.stride_test)

  ################################################################
  # define genomic contigs
  ################################################################
  chrom_contigs = genome.load_chromosomes(fasta_file)

  # remove gaps
  if options.gaps_file:
    chrom_contigs = genome.split_contigs(chrom_contigs, options.gaps_file)

  # ditch the chromosomes for contigs
  contigs = []
  for chrom in chrom_contigs:
    contigs += [Contig(chrom, ctg_start, ctg_end)
                for ctg_start, ctg_end in chrom_contigs[chrom]]

  # limit to a BED file
  if options.limit_bed is not None:
    contigs = limit_contigs(contigs, options.limit_bed)

  # filter for large enough
  contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length]

  # break up large contigs
  if options.break_t is not None:
    contigs = break_large_contigs(contigs, options.break_t)

  # print contigs to BED file
  ctg_bed_file = '%s/contigs.bed' % options.out_dir
  write_seqs_bed(ctg_bed_file, contigs)

  ################################################################
  # divide between train/valid/test
  ################################################################
  try:
    # convert to float pct
    valid_pct = float(options.valid_pct_or_chr)
    test_pct = float(options.test_pct_or_chr)
    assert (0 <= valid_pct <= 1)
    assert (0 <= test_pct <= 1)

    # divide by pct
    contig_sets = divide_contigs_pct(contigs, test_pct, valid_pct)

  except (ValueError, AssertionError):
    # divide by chr
    valid_chr = options.valid_pct_or_chr
    test_chr = options.test_pct_or_chr
    contig_sets = divide_contigs_chr(contigs, test_chr, valid_chr)

  train_contigs, valid_contigs, test_contigs = contig_sets

  # rejoin broken contigs within set
  train_contigs = rejoin_large_contigs(train_contigs)
  valid_contigs = rejoin_large_contigs(valid_contigs)
  test_contigs = rejoin_large_contigs(test_contigs)

  ################################################################
  # define model sequences
  ################################################################
  # stride sequences across contig
  train_mseqs = contig_sequences(train_contigs, options.seq_length,
                                 options.stride_train, label='train')
  valid_mseqs = contig_sequences(valid_contigs, options.seq_length,
                                 options.stride_test, label='valid')
  test_mseqs = contig_sequences(test_contigs, options.seq_length,
                                options.stride_test, label='test')

  # shuffle
  random.shuffle(train_mseqs)
  random.shuffle(valid_mseqs)
  random.shuffle(test_mseqs)

  # merge
  mseqs = train_mseqs + valid_mseqs + test_mseqs

  ################################################################
  # mappability
  ################################################################
  if options.umap_bed is not None:
    # annotate unmappable positions
    mseqs_unmap = annotate_unmap(mseqs, options.umap_bed,
                                 options.seq_length, options.pool_width)

    # filter unmappable
    mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') < options.umap_t)
    mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
    mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

    # write to file
    unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
    np.save(unmap_npy, mseqs_unmap)

  # down-sample
  if options.sample_pct < 1.0:
    mseqs = random.sample(mseqs, int(options.sample_pct * len(mseqs)))

  # write sequences to BED
  seqs_bed_file = '%s/sequences.bed' % options.out_dir
  write_seqs_bed(seqs_bed_file, mseqs, True)

  ################################################################
  # read sequence coverage values
  ################################################################
  # read target datasets
  targets_df = pd.read_table(targets_file, index_col=0)

  seqs_cov_dir = '%s/seqs_cov' % options.out_dir
  if not os.path.isdir(seqs_cov_dir):
    os.mkdir(seqs_cov_dir)

  read_jobs = []
  for ti in range(targets_df.shape[0]):
    genome_cov_file = targets_df['file'].iloc[ti]
    seqs_cov_stem = '%s/%d' % (seqs_cov_dir, ti)
    seqs_cov_file = '%s.h5' % seqs_cov_stem

    clip_ti = None
    if 'clip' in targets_df.columns:
      clip_ti = targets_df['clip'].iloc[ti]

    scale_ti = 1
    if 'scale' in targets_df.columns:
      scale_ti = targets_df['scale'].iloc[ti]

    if os.path.isfile(seqs_cov_file):
      print('Skipping existing %s' % seqs_cov_file, file=sys.stderr)
    else:
      cmd = 'basenji_data_read.py'
      cmd += ' -w %d' % options.pool_width
      cmd += ' -u %s' % targets_df['sum_stat'].iloc[ti]
      if clip_ti is not None:
        cmd += ' -c %f' % clip_ti
      if options.soft_clip:
        cmd += ' --soft'
      cmd += ' -s %f' % scale_ti
      if options.blacklist_bed:
        cmd += ' -b %s' % options.blacklist_bed
      cmd += ' %s' % genome_cov_file
      cmd += ' %s' % seqs_bed_file
      cmd += ' %s' % seqs_cov_file

      if options.run_local:
        cmd += ' &> %s.err' % seqs_cov_stem
        read_jobs.append(cmd)
      else:
        j = slurm.Job(cmd,
                      name='read_t%d' % ti,
                      out_file='%s.out' % seqs_cov_stem,
                      err_file='%s.err' % seqs_cov_stem,
                      queue='standard,tbdisk', mem=15000, time='12:0:0')
        read_jobs.append(j)

  if options.run_local:
    util.exec_par(read_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(read_jobs, options.processes, verbose=True,
                    launch_sleep=1, update_sleep=5)

  ################################################################
  # write TF Records
  ################################################################
  # copy targets file
  shutil.copy(targets_file, '%s/targets.txt' % options.out_dir)

  # initialize TF Records dir
  tfr_dir = '%s/tfrecords' % options.out_dir
  if not os.path.isdir(tfr_dir):
    os.mkdir(tfr_dir)

  write_jobs = []
  for tvt_set in ['train', 'valid', 'test']:
    tvt_set_indexes = [i for i in range(len(mseqs)) if mseqs[i].label == tvt_set]
    tvt_set_start = tvt_set_indexes[0]
    tvt_set_end = tvt_set_indexes[-1] + 1

    tfr_i = 0
    tfr_start = tvt_set_start
    tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

    while tfr_start <= tvt_set_end:
      tfr_stem = '%s/%s-%d' % (tfr_dir, tvt_set, tfr_i)

      cmd = 'basenji_data_write.py'
      cmd += ' -s %d' % tfr_start
      cmd += ' -e %d' % tfr_end
      if options.umap_bed is not None:
        cmd += ' -u %s' % unmap_npy
      if options.umap_set is not None:
        cmd += ' --umap_set %f' % options.umap_set

      cmd += ' %s' % fasta_file
      cmd += ' %s' % seqs_bed_file
      cmd += ' %s' % seqs_cov_dir
      cmd += ' %s.tfr' % tfr_stem

      if options.run_local:
        cmd += ' &> %s.err' % tfr_stem
        write_jobs.append(cmd)
      else:
        j = slurm.Job(cmd,
                      name='write_%s-%d' % (tvt_set, tfr_i),
                      out_file='%s.out' % tfr_stem,
                      err_file='%s.err' % tfr_stem,
                      queue='standard,tbdisk', mem=15000, time='12:0:0')
        write_jobs.append(j)

      # update
      tfr_i += 1
      tfr_start += options.seqs_per_tfr
      tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

  if options.run_local:
    util.exec_par(write_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(write_jobs, options.processes, verbose=True,
                    launch_sleep=1, update_sleep=5)
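# A sketch of the greedy division that divide_contigs_pct() above plausibly
# performs: visit contigs longest-first and assign each to whichever of
# train/valid/test is furthest below its nucleotide quota. This matches the
# call signature but is an assumed implementation, not the project's.
def divide_contigs_pct_sketch(contigs, test_pct, valid_pct):
  total_nt = sum(ctg.end - ctg.start for ctg in contigs)
  quota_nt = [(1 - test_pct - valid_pct) * total_nt,
              valid_pct * total_nt,
              test_pct * total_nt]
  assigned_nt = [0, 0, 0]
  contig_sets = [[], [], []]  # train, valid, test
  for ctg in sorted(contigs, key=lambda c: c.start - c.end):  # longest first
    # the set with the greatest remaining deficit gets the next contig
    si = max(range(3), key=lambda i: quota_nt[i] - assigned_nt[i])
    contig_sets[si].append(ctg)
    assigned_nt[si] += ctg.end - ctg.start
  return contig_sets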
def main():
  usage = 'usage: %prog [options] <fasta0_file,fasta1_file> <targets_file>'
  parser = OptionParser(usage)
  parser.add_option('-a', dest='align_net',
      help='Alignment .net file')
  parser.add_option('-b', dest='blacklist_beds',
      help='Set blacklist nucleotides to a baseline value.')
  parser.add_option('--break', dest='break_t', default=None, type='int',
      help='Break contigs above this length in half [Default: %default]')
  parser.add_option('-c', '--crop', dest='crop_bp', default=0, type='int',
      help='Crop bp off each end [Default: %default]')
  parser.add_option('-d', dest='sample_pct', default=1.0, type='float',
      help='Down-sample the segments')
  parser.add_option('-g', dest='gap_files',
      help='Comma-separated list of assembly gaps BED files [Default: %default]')
  parser.add_option('-i', dest='interp_nan', default=False, action='store_true',
      help='Interpolate NaNs [Default: %default]')
  parser.add_option('-l', dest='seq_length', default=131072, type='int',
      help='Sequence length [Default: %default]')
  parser.add_option('--local', dest='run_local', default=False, action='store_true',
      help='Run jobs locally as opposed to on SLURM [Default: %default]')
  parser.add_option('-n', dest='net_fill_min', default=100000, type='int',
      help='Alignment net fill size minimum [Default: %default]')
  parser.add_option('-o', dest='out_dir', default='data_out',
      help='Output directory [Default: %default]')
  parser.add_option('-p', dest='processes', default=None, type='int',
      help='Number of parallel processes [Default: %default]')
  parser.add_option('-r', dest='seqs_per_tfr', default=256, type='int',
      help='Sequences per TFRecord file [Default: %default]')
  parser.add_option('--restart', dest='restart', default=False, action='store_true',
      help='Skip already read HDF5 coverage values. [Default: %default]')
  parser.add_option('--seed', dest='seed', default=44, type='int',
      help='Random seed [Default: %default]')
  parser.add_option('--snap', dest='snap', default=None, type='int',
      help='Snap sequences to multiple of the given value [Default: %default]')
  parser.add_option('--stride', '--stride_train', dest='stride_train',
      default=1., type='float',
      help='Stride to advance train sequences [Default: seq_length]')
  parser.add_option('--stride_test', dest='stride_test', default=1., type='float',
      help='Stride to advance valid and test sequences [Default: %default]')
  parser.add_option('--soft', dest='soft_clip', default=False, action='store_true',
      help='Soft clip values, applying sqrt to the excess above the threshold '
           '[Default: %default]')
  parser.add_option('-t', dest='test_pct', default=0.1, type='float',
      help='Proportion of the data for testing [Default: %default]')
  parser.add_option('-u', dest='umap_beds',
      help='Comma-separated genome unmappable segments to set to NA')
  parser.add_option('--umap_t', dest='umap_t', default=0.5, type='float',
      help='Remove sequences with more than this unmappable bin proportion '
           '[Default: %default]')
  parser.add_option('--umap_clip', dest='umap_clip', default=None, type='float',
      help="Clip unmappable regions to this percentile in the sequences' "
           'distribution of values')
  parser.add_option('-w', dest='pool_width', default=128, type='int',
      help='Sum pool width [Default: %default]')
  parser.add_option('-v', dest='valid_pct', default=0.1, type='float',
      help='Proportion of the data for validation [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 2:
    parser.error('Must provide FASTA and sample coverage label and path files '
                 'for two genomes.')
  else:
    fasta_files = args[0].split(',')
    targets_file = args[1]

  # there is still some source of stochasticity
  random.seed(options.seed)
  np.random.seed(options.seed)

  # transform proportion strides to base pairs
  if options.stride_train <= 1:
    print('stride_train %.f' % options.stride_train, end='')
    options.stride_train = options.stride_train * options.seq_length
    print(' converted to %f' % options.stride_train)
  options.stride_train = int(np.round(options.stride_train))
  if options.stride_test <= 1:
    print('stride_test %.f' % options.stride_test, end='')
    options.stride_test = options.stride_test * options.seq_length
    print(' converted to %f' % options.stride_test)
  options.stride_test = int(np.round(options.stride_test))

  # check snap
  if options.snap is not None:
    if np.mod(options.seq_length, options.snap) != 0:
      raise ValueError('seq_length must be a multiple of snap')
    if np.mod(options.stride_train, options.snap) != 0:
      raise ValueError('stride_train must be a multiple of snap')
    if np.mod(options.stride_test, options.snap) != 0:
      raise ValueError('stride_test must be a multiple of snap')

  if os.path.isdir(options.out_dir) and not options.restart:
    print('Remove output directory %s or use --restart option.' % options.out_dir)
    exit(1)
  elif not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  if options.gap_files is not None:
    options.gap_files = options.gap_files.split(',')

  if options.blacklist_beds is not None:
    options.blacklist_beds = options.blacklist_beds.split(',')

  # read targets
  targets_df = pd.read_table(targets_file, index_col=0)

  # verify genomes
  num_genomes = len(fasta_files)
  assert (len(set(targets_df.genome)) == num_genomes)

  ################################################################
  # define genomic contigs
  ################################################################
  genome_chr_contigs = []
  for gi in range(num_genomes):
    genome_chr_contigs.append(genome.load_chromosomes(fasta_files[gi]))

    # remove gaps (guard against unset -g)
    if options.gap_files is not None and options.gap_files[gi]:
      genome_chr_contigs[gi] = genome.split_contigs(genome_chr_contigs[gi],
                                                    options.gap_files[gi])

  # ditch the chromosomes
  contigs = []
  for gi in range(num_genomes):
    for chrom in genome_chr_contigs[gi]:
      contigs += [Contig(gi, chrom, ctg_start, ctg_end)
                  for ctg_start, ctg_end in genome_chr_contigs[gi][chrom]]

  # filter for large enough
  contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length]

  # break up large contigs
  if options.break_t is not None:
    contigs = break_large_contigs(contigs, options.break_t)

  # print contigs to BED file
  for gi in range(num_genomes):
    contigs_i = [ctg for ctg in contigs if ctg.genome == gi]
    ctg_bed_file = '%s/contigs%d.bed' % (options.out_dir, gi)
    write_seqs_bed(ctg_bed_file, contigs_i)

  ################################################################
  # divide between train/valid/test
  ################################################################
  # connect contigs across genomes by alignment
  contig_components = connect_contigs(contigs, options.align_net,
                                      options.net_fill_min, options.out_dir)

  # divide contig connected components between train/valid/test
  contig_sets = divide_contig_components(contig_components, options.test_pct,
                                         options.valid_pct)
  train_contigs, valid_contigs, test_contigs = contig_sets

  # rejoin broken contigs within set
  train_contigs = rejoin_large_contigs(train_contigs)
  valid_contigs = rejoin_large_contigs(valid_contigs)
  test_contigs = rejoin_large_contigs(test_contigs)

  # quantify leakage across sets
  quantify_leakage(options.align_net, train_contigs, valid_contigs,
                   test_contigs, options.out_dir)

  ################################################################
  # define model sequences
  ################################################################
  # stride sequences across contig
  train_mseqs = contig_sequences(train_contigs, options.seq_length,
                                 options.stride_train, options.snap, 'train')
  valid_mseqs = contig_sequences(valid_contigs, options.seq_length,
                                 options.stride_test, options.snap, 'valid')
  test_mseqs = contig_sequences(test_contigs, options.seq_length,
                                options.stride_test, options.snap, 'test')

  # shuffle
  random.shuffle(train_mseqs)
  random.shuffle(valid_mseqs)
  random.shuffle(test_mseqs)

  # down-sample
  if options.sample_pct < 1.0:
    train_mseqs = random.sample(train_mseqs,
                                int(options.sample_pct * len(train_mseqs)))
    valid_mseqs = random.sample(valid_mseqs,
                                int(options.sample_pct * len(valid_mseqs)))
    test_mseqs = random.sample(test_mseqs,
                               int(options.sample_pct * len(test_mseqs)))

  # merge
  mseqs = train_mseqs + valid_mseqs + test_mseqs

  ################################################################
  # separate sequences by genome
  ################################################################
  mseqs_genome = []
  for gi in range(num_genomes):
    mseqs_gi = [mseqs[si] for si in range(len(mseqs)) if mseqs[si].genome == gi]
    mseqs_genome.append(mseqs_gi)

  ################################################################
  # mappability
  ################################################################
  if options.umap_beds is not None:
    options.umap_beds = options.umap_beds.split(',')
  else:
    options.umap_beds = [None] * num_genomes

  unmap_npys = [None] * num_genomes
  for gi in range(num_genomes):
    if options.umap_beds[gi]:
      # annotate unmappable positions
      mseqs_unmap = annotate_unmap(mseqs_genome[gi], options.umap_beds[gi],
                                   options.seq_length, options.pool_width)

      # filter unmappable
      mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') < options.umap_t)
      mseqs_genome[gi] = [mseqs_genome[gi][si]
                          for si in range(len(mseqs_genome[gi]))
                          if mseqs_map_mask[si]]
      mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

      # write to file
      unmap_npys[gi] = '%s/mseqs%d_unmap.npy' % (options.out_dir, gi)
      np.save(unmap_npys[gi], mseqs_unmap)

  seqs_bed_files = []
  for gi in range(num_genomes):
    # write sequences to BED
    seqs_bed_files.append('%s/sequences%d.bed' % (options.out_dir, gi))
    write_seqs_bed(seqs_bed_files[gi], mseqs_genome[gi], True)

  ################################################################
  # read sequence coverage values
  ################################################################
  seqs_cov_dir = '%s/seqs_cov' % options.out_dir
  if not os.path.isdir(seqs_cov_dir):
    os.mkdir(seqs_cov_dir)

  read_jobs = []
  for gi in range(num_genomes):
    read_jobs += make_read_jobs(seqs_bed_files[gi], targets_df, gi,
                                seqs_cov_dir, options)

  if options.run_local:
    util.exec_par(read_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(read_jobs, options.processes, verbose=True,
                    launch_sleep=1, update_sleep=5)

  ################################################################
  # write TF Records
  ################################################################
  tfr_dir = '%s/tfrecords' % options.out_dir
  if not os.path.isdir(tfr_dir):
    os.mkdir(tfr_dir)

  # set genome target index starts
  sum_targets = 0
  genome_targets_start = []
  for gi in range(num_genomes):
    genome_targets_start.append(sum_targets)
    targets_df_gi = targets_df[targets_df.genome == gi]
    sum_targets += targets_df_gi.shape[0]

  write_jobs = []
  for gi in range(num_genomes):
    write_jobs += make_write_jobs(mseqs_genome[gi], fasta_files[gi],
                                  seqs_bed_files[gi], seqs_cov_dir, tfr_dir, gi,
                                  unmap_npys[gi], genome_targets_start[gi],
                                  sum_targets, options)

  if options.run_local:
    util.exec_par(write_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(write_jobs, options.processes, verbose=True,
                    launch_sleep=1, update_sleep=5)

  ################################################################
  # stats
  ################################################################
  stats_dict = {}
  # stats_dict['num_targets'] = targets_df.shape[0]
  # stats_dict['train_seqs'] = len(train_mseqs)
  # stats_dict['valid_seqs'] = len(valid_mseqs)
  # stats_dict['test_seqs'] = len(test_mseqs)
  stats_dict['seq_length'] = options.seq_length
  stats_dict['pool_width'] = options.pool_width
  stats_dict['crop_bp'] = options.crop_bp

  target_length = options.seq_length - 2 * options.crop_bp
  target_length = target_length // options.pool_width
  stats_dict['target_length'] = target_length

  with open('%s/statistics.json' % options.out_dir, 'w') as stats_json_out:
    json.dump(stats_dict, stats_json_out, indent=4)
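# For reference, a sketch of the write_seqs_bed() helper used throughout:
# one BED line per sequence, with the train/valid/test label appended as a
# fourth column when the third argument is True. The body is an assumed
# implementation consistent with the call sites above.
def write_seqs_bed_sketch(bed_file, seqs, labels=False):
  with open(bed_file, 'w') as bed_out:
    for seq in seqs:
      line = '%s\t%d\t%d' % (seq.chr, seq.start, seq.end)
      if labels:
        line += '\t%s' % seq.label
      print(line, file=bed_out)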
def main(): usage = 'usage: %prog [options] <fasta_file> <sample_wigs_file> <hdf5_file>' parser = OptionParser(usage) parser.add_option( '-b', dest='limit_bed', help='Limit to segments that overlap regions in a BED file') parser.add_option( '-c', dest='clip', default=None, type='float', help='Clip target values to have minimum [Default: %default]') parser.add_option('--cluster_dir', dest='cluster_dir', default='basenji_hdf5') parser.add_option('-d', dest='sample_pct', default=1.0, type='float', help='Down-sample the segments') parser.add_option('-f', dest='fourier_dim', default=None, type='int', help='Fourier transform dimension [Default: %default]') parser.add_option('-g', dest='gaps_file', help='Genome assembly gaps BED [Default: %default]') parser.add_option('-l', dest='seq_length', default=1024, type='int', help='Sequence length [Default: %default]') parser.add_option( '--log2', dest='log10to2', default=False, action='store_true', help='Transform values from log10 to log2 [Default: %default]') parser.add_option( '--mult_cov', dest='cov_multiplier', default=1, type='float', help= 'Coverage multiplier, useful when the read extension and pool width do not match [Default: %default]' ) parser.add_option( '-n', dest='na_t', default=0.25, type='float', help= 'Remove sequences with an NA% greater than this threshold [Default: %default]' ) parser.add_option( '-o', dest='out_bed_file', help='Output the train/valid/test sequences as a BED file') parser.add_option( '-p', dest='processes', default=1, type='int', help='Number parallel processes to load data [Default: %default]') parser.add_option('-s', dest='stride', type='int', help='Stride to advance segments [Default: seq_length]') parser.add_option( '-t', dest='test_pct_or_chr', type='str', default=0.05, help='Proportion of the data for testing [Default: %default]') parser.add_option('-u', dest='unmap_bed', help='Unmappable segments to set to NA') parser.add_option('-w', dest='pool_width', type='int', default=1, help='Average pooling width [Default: %default]') parser.add_option( '-v', dest='valid_pct_or_chr', type='str', default=0.05, help='Proportion of the data for validation [Default: %default]') parser.add_option('-z', dest='compression', help='h5py compression [Default: %default]') (options, args) = parser.parse_args() if len(args) != 3: parser.error( 'Must provide genome FASTA file, sample Wig/BigWig labels and paths, ' 'and model output file') else: fasta_file = args[0] sample_wigs_file = args[1] hdf5_file = args[2] random.seed(1) if options.stride is None: options.stride = options.seq_length ################################################################ # assess bigwigs ################################################################ # get wig files and labels target_wigs = OrderedDict() target_strands = [] target_labels = [] for line in open(sample_wigs_file, encoding='UTF-8'): a = line.rstrip().split('\t') if a[0] in target_wigs: print('WARNING: duplicate target id %s' % a[0], file=sys.stderr) target_wigs[a[0]] = a[1] target_strands.append(a[2]) if len(a) > 3: target_labels.append(a[3]) else: target_labels.append('') if options.fourier_dim is not None and 2 * options.fourier_dim >= options.seq_length / options.pool_width: print( "Fourier transform to %d dims won't compress %d length sequences with %d pooling" % (options.fourier_dim, options.seq_length, options.pool_width), file=sys.stderr) exit(1) ################################################################ # prepare genomic segments 
################################################################ chrom_segments = genome.load_chromosomes(fasta_file) # remove gaps if options.gaps_file: chrom_segments = genome.split_contigs(chrom_segments, options.gaps_file) # ditch the chromosomes segments = [] for chrom in chrom_segments: segments += [(chrom, seg_start, seg_end) for seg_start, seg_end in chrom_segments[chrom]] # standardize order segments.sort() # filter for large enough segments = [ cse for cse in segments if cse[2] - cse[1] >= options.seq_length ] # down-sample if options.sample_pct < 1.0: segments = random.sample(segments, int(options.sample_pct * len(segments))) # limit to a BED file if options.limit_bed is not None: segments = limit_segments(segments, options.limit_bed) if not os.path.isdir(options.cluster_dir): os.mkdir(options.cluster_dir) # print segments to BED file seg_bed_file = '%s/segments.bed' % options.cluster_dir seg_bed_out = open(seg_bed_file, 'w') for chrom, seg_start, seg_end in segments: print('%s\t%d\t%d' % (chrom, seg_start, seg_end), file=seg_bed_out) seg_bed_out.close() ################################################################ # bigwig read and process ################################################################ print('Reading and pre-processing bigwigs for %d segments' % len(segments), flush=True) targets_real = [] targets_imag = [] # generate numpy arrays on cluster jobs = [] for target_label in target_wigs.keys(): wig_file = target_wigs[target_label] npy_file = '%s/%s' % (options.cluster_dir, target_label) if not os.path.isfile(npy_file) and not os.path.isfile( '%s.npy' % npy_file): print(npy_file) if os.path.splitext(wig_file)[1] == '.h5': script = 'seqs_hdf5.py' else: script = 'bigwig_hdf5.py' cmd = 'echo $HOSTNAME; %s -l %d -s %d -w %d %s %s %s' % ( script, options.seq_length, options.stride, options.pool_width, wig_file, seg_bed_file, npy_file) name = 'hdf5_%s' % target_label outf = '%s/%s.out' % (options.cluster_dir, target_label) errf = '%s/%s.err' % (options.cluster_dir, target_label) j = slurm.Job(cmd, name, outf, errf, queue='standard,tbdisk', mem=15000, time='12:0:0') jobs.append(j) slurm.multi_run(jobs) # load into targets_real, targets_imag for target_label in target_wigs.keys(): npy_file = '%s/%s.npy' % (options.cluster_dir, target_label) wig_targets = np.load(npy_file) targets_real.append(wig_targets) # transpose from TxSxL to SxLxT targets_real = np.transpose(np.array(targets_real), axes=(1, 2, 0)) print('%d target sequences' % targets_real.shape[0]) ################################################################ # one hot code sequences ################################################################ seqs_1hot, seqs_segments = segments_1hot(fasta_file, segments, options.seq_length, options.stride) print('%d sequences one hot coded' % seqs_1hot.shape[0]) ################################################################ # correct for unmappable regions ################################################################ if options.unmap_bed is not None: seqs_na = annotate_na(seqs_segments, options.unmap_bed, options.seq_length, options.pool_width) # determine mappable sequences and update test indexes map_indexes = [] for i in range(seqs_na.shape[0]): # mappable if seqs_na[i, :].mean(dtype='float64') < options.na_t: map_indexes.append(i) # unmappable else: # forget it pass # update data structures targets_real = targets_real[map_indexes] if options.fourier_dim is not None: targets_imag = targets_imag[map_indexes] seqs_1hot = seqs_1hot[map_indexes] seqs_segments = 
[seqs_segments[mi] for mi in map_indexes] seqs_na = seqs_na[map_indexes] ################################################################ # write to train, valid, test HDF5 ################################################################ # choose test indexes if options.test_pct_or_chr.startswith('chr'): test_indexes = [ si for si in range(len(seqs_segments)) if seqs_segments[si][0] == options.test_pct_or_chr ] else: test_pct = float(options.test_pct_or_chr) test_indexes = [ twi for twi in range(len(seqs_segments)) if random.random() < test_pct ] # choose valid indexes if options.valid_pct_or_chr.startswith('chr'): # valid_indexes = np.array([seq_seg[0] == options.valid_pct_or_chr for seq_seg in seqs_segments]) valid_indexes = [ si for si in range(len(seqs_segments)) if seqs_segments[si][0] == options.valid_pct_or_chr ] else: valid_pct = float(options.valid_pct_or_chr) valid_n = int(valid_pct * len(seqs_segments)) nontest_indexes = set(range(len(seqs_segments))) - set(test_indexes) valid_indexes = random.sample(nontest_indexes, valid_n) # remainder is training train_indexes = list( set(range(len(seqs_segments))) - set(valid_indexes) - set(test_indexes)) # training may require shuffling random.shuffle(train_indexes) random.shuffle(valid_indexes) random.shuffle(test_indexes) # write to HDF5 hdf5_out = h5py.File(hdf5_file, 'w') # store pooling hdf5_out.create_dataset('pool_width', data=options.pool_width, dtype='int') # store targets target_ids = np.array(list(target_wigs.keys()), dtype='S') hdf5_out.create_dataset('target_ids', data=target_ids) target_labels = np.array(target_labels, dtype='S') hdf5_out.create_dataset('target_labels', data=target_labels) target_strands = np.array(target_strands, dtype='S') hdf5_out.create_dataset('target_strands', data=target_strands) # HDF5 train hdf5_out.create_dataset('train_in', data=seqs_1hot[train_indexes], dtype='bool', compression=options.compression) hdf5_out.create_dataset('train_out', data=targets_real[train_indexes], dtype='float16', compression=options.compression) if options.fourier_dim is not None: hdf5_out.create_dataset('train_out_imag', data=targets_imag[train_indexes], dtype='float16', compression=options.compression) hdf5_out.create_dataset('train_na', data=seqs_na[train_indexes], dtype='bool', compression=options.compression) # HDF5 valid hdf5_out.create_dataset('valid_in', data=seqs_1hot[valid_indexes], dtype='bool', compression=options.compression) hdf5_out.create_dataset('valid_out', data=targets_real[valid_indexes], dtype='float16', compression=options.compression) if options.fourier_dim is not None: hdf5_out.create_dataset('valid_out_imag', data=targets_imag[valid_indexes], dtype='float16', compression=options.compression) hdf5_out.create_dataset('valid_na', data=seqs_na[valid_indexes], dtype='bool', compression=options.compression) # HDF5 test hdf5_out.create_dataset('test_in', data=seqs_1hot[test_indexes], dtype='bool', compression=options.compression) hdf5_out.create_dataset('test_out', data=targets_real[test_indexes], dtype='float16', compression=options.compression) if options.fourier_dim is not None: hdf5_out.create_dataset('test_out_imag', data=targets_imag[test_indexes], dtype='float16', compression=options.compression) hdf5_out.create_dataset('test_na', data=seqs_na[test_indexes], dtype='bool', compression=options.compression) hdf5_out.close() # output BED file if options.out_bed_file: out_bed_out = open(options.out_bed_file, 'w') for si in train_indexes: print('%s\t%d\t%d\ttrain' % seqs_segments[si], file=out_bed_out) for 
si in valid_indexes: print('%s\t%d\t%d\tvalid' % seqs_segments[si], file=out_bed_out) for si in test_indexes: print('%s\t%d\t%d\ttest' % seqs_segments[si], file=out_bed_out) out_bed_out.close()
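################################################################
# Illustrative sketch (not from the original script): the -t/-v
# options above accept either a chromosome name ('chr...') or a
# float proportion. choose_holdout_indexes is a hypothetical
# standalone version of that dispatch, assuming seqs_segments is a
# list of (chrom, start, end) tuples.
################################################################
import random

def choose_holdout_indexes(seqs_segments, pct_or_chr, exclude=()):
  """Return held-out indexes by chromosome match or random proportion."""
  if str(pct_or_chr).startswith('chr'):
    # hold out every sequence on the named chromosome
    return [si for si in range(len(seqs_segments))
            if seqs_segments[si][0] == pct_or_chr]
  else:
    # hold out a random proportion, skipping already-held-out indexes
    pct = float(pct_or_chr)
    excluded = set(exclude)
    return [si for si in range(len(seqs_segments))
            if si not in excluded and random.random() < pct]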
def main(): usage = "usage: %prog [options] <fasta0_file,fasta1_file> <targets_file>" parser = OptionParser(usage) parser.add_option("-a", dest="align_net", help="Alignment .net file") parser.add_option( "-b", dest="blacklist_beds", help="Set blacklist nucleotides to a baseline value.", ) parser.add_option( "--break", dest="break_t", default=None, type="int", help="Break in half contigs above length [Default: %default]", ) # parser.add_option('-c', dest='clip', # default=None, type='float', # help='Clip target values to have minimum [Default: %default]') parser.add_option( "-d", dest="sample_pct", default=1.0, type="float", help="Down-sample the segments", ) parser.add_option( "-f", dest="fill_min", default=100000, type="int", help="Alignment net fill size minimum [Default: %default]", ) parser.add_option( "-g", dest="gap_files", help="Comma-separated list of assembly gaps BED files [Default: %default]", ) parser.add_option( "-l", dest="seq_length", default=131072, type="int", help="Sequence length [Default: %default]", ) parser.add_option( "--local", dest="run_local", default=False, action="store_true", help="Run jobs locally as opposed to on SLURM [Default: %default]", ) parser.add_option( "-o", dest="out_dir", default="data_out", help="Output directory [Default: %default]", ) parser.add_option( "-p", dest="processes", default=None, type="int", help="Number parallel processes [Default: %default]", ) parser.add_option( "-r", dest="seqs_per_tfr", default=256, type="int", help="Sequences per TFRecord file [Default: %default]", ) parser.add_option( "--seed", dest="seed", default=44, type="int", help="Random seed [Default: %default]", ) parser.add_option( "--stride_train", dest="stride_train", default=1.0, type="float", help="Stride to advance train sequences [Default: %default]", ) parser.add_option( "--stride_test", dest="stride_test", default=1.0, type="float", help="Stride to advance valid and test sequences [Default: %default]", ) parser.add_option( "--soft", dest="soft_clip", default=False, action="store_true", help="Soft clip values, applying sqrt to the execess above the threshold [Default: %default]", ) parser.add_option( "-t", dest="test_pct", default=0.1, type="float", help="Proportion of the data for testing [Default: %default]", ) parser.add_option( "-u", dest="umap_beds", help="Comma-separated genome unmappable segments to set to NA", ) parser.add_option( "--umap_t", dest="umap_t", default=0.5, type="float", help="Remove sequences with more than this unmappable bin % [Default: %default]", ) parser.add_option( "--umap_set", dest="umap_set", default=None, type="float", help="Set unmappable regions to this percentile in the sequences' distribution of values", ) parser.add_option( "-w", dest="pool_width", default=128, type="int", help="Sum pool width [Default: %default]", ) parser.add_option( "-v", dest="valid_pct", default=0.1, type="float", help="Proportion of the data for validation [Default: %default]", ) (options, args) = parser.parse_args() if len(args) != 2: parser.error( "Must provide FASTA and sample coverage label and path files for two genomes." 
) else: fasta_files = args[0].split(",") targets_file = args[1] # there is still some source of stochasticity random.seed(options.seed) np.random.seed(options.seed) # transform proportion strides to base pairs if options.stride_train <= 1: print("stride_train %.f" % options.stride_train, end="") options.stride_train = options.stride_train * options.seq_length print(" converted to %f" % options.stride_train) options.stride_train = int(np.round(options.stride_train)) if options.stride_test <= 1: print("stride_test %.f" % options.stride_test, end="") options.stride_test = options.stride_test * options.seq_length print(" converted to %f" % options.stride_test) options.stride_test = int(np.round(options.stride_test)) if not os.path.isdir(options.out_dir): os.mkdir(options.out_dir) if options.gap_files is not None: options.gap_files = options.gap_files.split(",") else: options.gap_files = [None] * len(fasta_files) # avoid indexing None below if options.blacklist_beds is not None: options.blacklist_beds = options.blacklist_beds.split(",") # read targets targets_df = pd.read_table(targets_file, index_col=0) # verify genomes num_genomes = len(fasta_files) assert len(set(targets_df.genome)) == num_genomes ################################################################ # define genomic contigs ################################################################ genome_chr_contigs = [] for gi in range(num_genomes): genome_chr_contigs.append(genome.load_chromosomes(fasta_files[gi])) # remove gaps if options.gap_files[gi]: genome_chr_contigs[gi] = genome.split_contigs( genome_chr_contigs[gi], options.gap_files[gi] ) # ditch the chromosomes contigs = [] for gi in range(num_genomes): for chrom in genome_chr_contigs[gi]: contigs += [ Contig(gi, chrom, ctg_start, ctg_end) for ctg_start, ctg_end in genome_chr_contigs[gi][chrom] ] # filter for large enough contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length] # break up large contigs if options.break_t is not None: contigs = break_large_contigs(contigs, options.break_t) # print contigs to BED file for gi in range(num_genomes): contigs_i = [ctg for ctg in contigs if ctg.genome == gi] ctg_bed_file = "%s/contigs%d.bed" % (options.out_dir, gi) write_seqs_bed(ctg_bed_file, contigs_i) ################################################################ # divide between train/valid/test ################################################################ # connect contigs across genomes by alignment contig_components = connect_contigs( contigs, options.align_net, options.fill_min, options.out_dir ) # divide contig connected components between train/valid/test contig_sets = divide_contig_components( contig_components, options.test_pct, options.valid_pct ) train_contigs, valid_contigs, test_contigs = contig_sets # rejoin broken contigs within set train_contigs = rejoin_large_contigs(train_contigs) valid_contigs = rejoin_large_contigs(valid_contigs) test_contigs = rejoin_large_contigs(test_contigs) ################################################################ # define model sequences ################################################################ # stride sequences across contig train_mseqs = contig_sequences( train_contigs, options.seq_length, options.stride_train, label="train" ) valid_mseqs = contig_sequences( valid_contigs, options.seq_length, options.stride_test, label="valid" ) test_mseqs = contig_sequences( test_contigs, options.seq_length, options.stride_test, label="test" ) # shuffle random.shuffle(train_mseqs) random.shuffle(valid_mseqs) random.shuffle(test_mseqs) # down-sample if options.sample_pct < 1.0: train_mseqs
= random.sample( train_mseqs, int(options.sample_pct * len(train_mseqs)) ) valid_mseqs = random.sample( valid_mseqs, int(options.sample_pct * len(valid_mseqs)) ) test_mseqs = random.sample( test_mseqs, int(options.sample_pct * len(test_mseqs)) ) # merge mseqs = train_mseqs + valid_mseqs + test_mseqs ################################################################ # separate sequences by genome ################################################################ mseqs_genome = [] for gi in range(num_genomes): mseqs_gi = [mseqs[si] for si in range(len(mseqs)) if mseqs[si].genome == gi] mseqs_genome.append(mseqs_gi) ################################################################ # mappability ################################################################ if options.umap_beds is not None: options.umap_beds = options.umap_beds.split(",") else: options.umap_beds = [None] * num_genomes # guard against a missing -u option unmap_npys = [None] * num_genomes for gi in range(num_genomes): if options.umap_beds[gi] is not None: # annotate unmappable positions mseqs_unmap = annotate_unmap( mseqs_genome[gi], options.umap_beds[gi], options.seq_length, options.pool_width, ) # filter unmappable mseqs_map_mask = mseqs_unmap.mean(axis=1, dtype="float64") < options.umap_t mseqs_genome[gi] = [ mseqs_genome[gi][si] for si in range(len(mseqs_genome[gi])) if mseqs_map_mask[si] ] mseqs_unmap = mseqs_unmap[mseqs_map_mask, :] # write to file unmap_npys[gi] = "%s/mseqs%d_unmap.npy" % (options.out_dir, gi) np.save(unmap_npys[gi], mseqs_unmap) seqs_bed_files = [] for gi in range(num_genomes): # write sequences to BED seqs_bed_files.append("%s/sequences%d.bed" % (options.out_dir, gi)) write_seqs_bed(seqs_bed_files[gi], mseqs_genome[gi], True) ################################################################ # read sequence coverage values ################################################################ seqs_cov_dir = "%s/seqs_cov" % options.out_dir if not os.path.isdir(seqs_cov_dir): os.mkdir(seqs_cov_dir) read_jobs = [] for gi in range(num_genomes): read_jobs += make_read_jobs( seqs_bed_files[gi], targets_df, gi, seqs_cov_dir, options ) if options.run_local: util.exec_par(read_jobs, options.processes, verbose=True) else: slurm.multi_run( read_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5 ) ################################################################ # write TF Records ################################################################ tfr_dir = "%s/tfrecords" % options.out_dir if not os.path.isdir(tfr_dir): os.mkdir(tfr_dir) # set genome target index starts sum_targets = 0 genome_targets_start = [] for gi in range(num_genomes): genome_targets_start.append(sum_targets) targets_df_gi = targets_df[targets_df.genome == gi] sum_targets += targets_df_gi.shape[0] write_jobs = [] for gi in range(num_genomes): write_jobs += make_write_jobs( mseqs_genome[gi], fasta_files[gi], seqs_bed_files[gi], seqs_cov_dir, tfr_dir, gi, unmap_npys[gi], genome_targets_start[gi], sum_targets, options, ) if options.run_local: util.exec_par(write_jobs, options.processes, verbose=True) else: slurm.multi_run( write_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5 )
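################################################################
# Illustrative sketch (assumed, not the repository's implementation):
# write_seqs_bed is called above both with and without a third
# argument, implying a signature like write_seqs_bed(bed_file, seqs,
# labels=False). Assuming each contig/sequence carries .chr, .start,
# .end and, when labeled, a .label attribute, a minimal version
# consistent with those call sites:
################################################################
def write_seqs_bed(bed_file, seqs, labels=False):
    """Write contigs or model sequences to a BED file, optionally labeled."""
    with open(bed_file, "w") as bed_out:
        for seq in seqs:
            line = "%s\t%d\t%d" % (seq.chr, seq.start, seq.end)
            if labels:
                # fourth BED column carries the train/valid/test label
                line += "\t%s" % seq.label
            print(line, file=bed_out)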
def main(): usage = 'usage: %prog [options] <align_net> <fasta0_file,fasta1_file>' parser = OptionParser(usage) parser.add_option('-a', dest='genome_labels', default=None, help='Genome labels in output') parser.add_option('--break', dest='break_t', default=None, type='int', help='Break in half contigs above length [Default: %default]') parser.add_option('-c','--crop', dest='crop_bp', default=0, type='int', help='Crop bp off each end [Default: %default]') parser.add_option('-d', dest='sample_pct', default=1.0, type='float', help='Down-sample the segments') parser.add_option('-f', dest='folds', default=None, type='int', help='Generate cross fold split [Default: %default]') parser.add_option('-g', dest='gap_files', help='Comma-separated list of assembly gaps BED files [Default: %default]') parser.add_option('-l', dest='seq_length', default=131072, type='int', help='Sequence length [Default: %default]') parser.add_option('--nf', dest='net_fill_min', default=100000, type='int', help='Alignment net fill size minimum [Default: %default]') parser.add_option('--no', dest='net_olap_min', default=1024, type='int', help='Alignment net and contig overlap minimum [Default: %default]') parser.add_option('-o', dest='out_dir', default='align_out', help='Output directory [Default: %default]') parser.add_option('--seed', dest='seed', default=44, type='int', help='Random seed [Default: %default]') parser.add_option('--snap', dest='snap', default=1, type='int', help='Snap sequences to multiple of the given value [Default: %default]') parser.add_option('--stride', '--stride_train', dest='stride_train', default=1., type='float', help='Stride to advance train sequences [Default: seq_length]') parser.add_option('--stride_test', dest='stride_test', default=1., type='float', help='Stride to advance valid and test sequences [Default: %default]') parser.add_option('-t', dest='test_pct', default=0.1, type='float', help='Proportion of the data for testing [Default: %default]') parser.add_option('-u', dest='umap_beds', help='Comma-separated genome unmappable segments to set to NA') parser.add_option('--umap_t', dest='umap_t', default=0.5, type='float', help='Remove sequences with more than this unmappable bin % [Default: %default]') parser.add_option('-w', dest='pool_width', default=128, type='int', help='Sum pool width [Default: %default]') parser.add_option('-v', dest='valid_pct', default=0.1, type='float', help='Proportion of the data for validation [Default: %default]') (options, args) = parser.parse_args() if len(args) != 2: parser.error('Must provide alignment and FASTA files.') else: align_net_file = args[0] fasta_files = args[1].split(',') # there is still some source of stochasticity random.seed(options.seed) np.random.seed(options.seed) # transform proportion strides to base pairs if options.stride_train <= 1: print('stride_train %.f'%options.stride_train, end='') options.stride_train = options.stride_train*options.seq_length print(' converted to %f' % options.stride_train) options.stride_train = int(np.round(options.stride_train)) if options.stride_test <= 1: print('stride_test %.f'%options.stride_test, end='') options.stride_test = options.stride_test*options.seq_length print(' converted to %f' % options.stride_test) options.stride_test = int(np.round(options.stride_test)) # check snap if options.snap is not None: if np.mod(options.seq_length, options.snap) != 0: raise ValueError('seq_length must be a multiple of snap') if np.mod(options.stride_train, options.snap) != 0: raise ValueError('stride_train must be 
a multiple of snap') if np.mod(options.stride_test, options.snap) != 0: raise ValueError('stride_test must be a multiple of snap') # count genomes num_genomes = len(fasta_files) # parse gap files if options.gap_files is not None: options.gap_files = options.gap_files.split(',') assert(len(options.gap_files) == num_genomes) else: options.gap_files = [None] * num_genomes # parse unmappable files if options.umap_beds is not None: options.umap_beds = options.umap_beds.split(',') assert(len(options.umap_beds) == num_genomes) else: options.umap_beds = [None] * num_genomes # label genomes if options.genome_labels is None: options.genome_labels = ['genome%d' % (gi+1) for gi in range(num_genomes)] else: options.genome_labels = options.genome_labels.split(',') assert(len(options.genome_labels) == num_genomes) # create output directories if not os.path.isdir(options.out_dir): os.mkdir(options.out_dir) genome_out_dirs = [] for gi in range(num_genomes): gout_dir = '%s/%s' % (options.out_dir, options.genome_labels[gi]) if not os.path.isdir(gout_dir): os.mkdir(gout_dir) genome_out_dirs.append(gout_dir) ################################################################ # define genomic contigs ################################################################ genome_chr_contigs = [] for gi in range(num_genomes): genome_chr_contigs.append(genome.load_chromosomes(fasta_files[gi])) # remove gaps if options.gap_files[gi]: genome_chr_contigs[gi] = genome.split_contigs(genome_chr_contigs[gi], options.gap_files[gi]) # ditch the chromosomes contigs = [] for gi in range(num_genomes): for chrom in genome_chr_contigs[gi]: contigs += [Contig(gi, chrom, ctg_start, ctg_end) for ctg_start, ctg_end in genome_chr_contigs[gi][chrom]] # filter for large enough contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length] # break up large contigs if options.break_t is not None: contigs = break_large_contigs(contigs, options.break_t) # print contigs to BED file for gi in range(num_genomes): contigs_i = [ctg for ctg in contigs if ctg.genome == gi] ctg_bed_file = '%s/contigs.bed' % genome_out_dirs[gi] write_seqs_bed(ctg_bed_file, contigs_i) ################################################################ # divide between train/valid/test ################################################################ # connect contigs across genomes by alignment contig_components = connect_contigs(contigs, align_net_file, options.net_fill_min, options.net_olap_min, options.out_dir, genome_out_dirs) if options.folds is not None: # divide by fold fold_contigs = divide_components_folds(contig_components, options.folds) else: # divide by train/valid/test pct fold_contigs = divide_components_pct(contig_components, options.test_pct, options.valid_pct) # rejoin broken contigs within set for fi in range(len(fold_contigs)): fold_contigs[fi] = rejoin_large_contigs(fold_contigs[fi]) # label folds if options.folds is not None: fold_labels = ['fold%d' % fi for fi in range(options.folds)] num_folds = options.folds else: fold_labels = ['train', 'valid', 'test'] num_folds = 3 if options.folds is None: # quantify leakage across sets quantify_leakage(align_net_file, fold_contigs[0], fold_contigs[1], fold_contigs[2], options.out_dir) ################################################################ # define model sequences ################################################################ fold_mseqs = [] for fi in range(num_folds): if fold_labels[fi] in ['valid','test']: stride_fold = options.stride_test else: stride_fold = options.stride_train # stride sequences across contig fold_mseqs_fi = contig_sequences(fold_contigs[fi],
options.seq_length, stride_fold, options.snap, fold_labels[fi]) fold_mseqs.append(fold_mseqs_fi) # shuffle random.shuffle(fold_mseqs[fi]) # down-sample if options.sample_pct < 1.0: fold_mseqs[fi] = random.sample(fold_mseqs[fi], int(options.sample_pct*len(fold_mseqs[fi]))) # merge into one list mseqs = [ms for fm in fold_mseqs for ms in fm] # separate by genome mseqs_genome = [] for gi in range(num_genomes): mseqs_gi = [mseqs[si] for si in range(len(mseqs)) if mseqs[si].genome == gi] mseqs_genome.append(mseqs_gi) ################################################################ # filter for sufficient mappability ################################################################ for gi in range(num_genomes): if options.umap_beds[gi] is not None: # annotate unmappable positions mseqs_unmap = annotate_unmap(mseqs_genome[gi], options.umap_beds[gi], options.seq_length, options.pool_width, options.crop_bp) # filter unmappable mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') < options.umap_t) mseqs_genome[gi] = [mseqs_genome[gi][si] for si in range(len(mseqs_genome[gi])) if mseqs_map_mask[si]] mseqs_unmap = mseqs_unmap[mseqs_map_mask,:] # write to file unmap_npy_file = '%s/mseqs_unmap.npy' % genome_out_dirs[gi] np.save(unmap_npy_file, mseqs_unmap) seqs_bed_files = [] for gi in range(num_genomes): # write sequences to BED seqs_bed_files.append('%s/sequences.bed' % genome_out_dirs[gi]) write_seqs_bed(seqs_bed_files[gi], mseqs_genome[gi], True)
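################################################################
# Illustrative sketch (assumed): contig_sequences above tiles
# seq_length windows across each contig at the given stride, snapping
# start positions to a multiple of snap so that binned targets stay
# aligned across sequences. The ModelSeq namedtuple is hypothetical;
# the original's sequence type is not shown in this excerpt.
################################################################
from collections import namedtuple
import numpy as np

ModelSeq = namedtuple('ModelSeq', ['genome', 'chr', 'start', 'end', 'label'])

def contig_sequences(contigs, seq_length, stride, snap=1, label=None):
  """Tile fixed-length model sequences across contigs."""
  mseqs = []
  for ctg in contigs:
    seq_start = ctg.start
    if snap is not None and snap > 1:
      # round the first start up to the next multiple of snap
      seq_start = int(np.ceil(seq_start / snap) * snap)
    while seq_start + seq_length <= ctg.end:
      mseqs.append(ModelSeq(ctg.genome, ctg.chr, seq_start,
                            seq_start + seq_length, label))
      seq_start += stride
  return mseqs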
def main(): usage = "usage: %prog [options] <fasta_file> <sample_wigs_file> <hdf5_file>" parser = OptionParser(usage) parser.add_option( "-b", dest="limit_bed", help="Limit to segments that overlap regions in a BED file", ) parser.add_option( "-c", dest="clip", default=None, type="float", help="Clip target values to have minimum [Default: %default]", ) parser.add_option( "-d", dest="sample_pct", default=1.0, type="float", help="Down-sample the segments", ) parser.add_option( "-f", dest="fourier_dim", default=None, type="int", help="Fourier transform dimension [Default: %default]", ) parser.add_option("-g", dest="gaps_file", help="Genome assembly gaps BED [Default: %default]") parser.add_option( "-l", dest="seq_length", default=131072, type="int", help="Sequence length [Default: %default]", ) parser.add_option( "--log2", dest="log10to2", default=False, action="store_true", help="Transform values from log10 to log2 [Default: %default]", ) parser.add_option("-m", dest="params_file", help="Dimension reduction hyper-parameters file") parser.add_option( "--mult_cov", dest="cov_multiplier", default=1, type="float", help= "Coverage multiplier, useful when the read extension and pool width do not match [Default: %default]", ) parser.add_option( "-n", dest="na_t", default=0.25, type="float", help= "Remove sequences with an NA% greater than this threshold [Default: %default]", ) parser.add_option( "--no_full", dest="no_full", default=False, action="store_true", help="Do not save full test sequence targets [Default: %default]", ) parser.add_option( "-o", dest="out_bed_file", help="Output the train/valid/test sequences as a BED file", ) parser.add_option( "-p", dest="processes", default=1, type="int", help="Number parallel processes to load data [Default: %default]", ) parser.add_option( "-s", dest="stride", default=None, type="int", help="Stride to advance segments [Default: seq_length]", ) parser.add_option("--scent", dest="scent_file", help="Dimension reduction model file") parser.add_option( "-t", dest="test_pct_or_chr", type="str", default=0.05, help="Proportion of the data for testing [Default: %default]", ) parser.add_option("-u", dest="unmap_bed", help="Unmappable segments to set to NA") parser.add_option( "-w", dest="pool_width", type="int", default=128, help="Average pooling width [Default: %default]", ) parser.add_option( "--w5", dest="w5", default=False, action="store_true", help="Coverage files are w5 rather than BigWig [Default: %default]", ) parser.add_option( "-v", dest="valid_pct_or_chr", type="str", default=0.05, help="Proportion of the data for validation [Default: %default]", ) parser.add_option("-z", dest="compression", help="h5py compression [Default: %default]") (options, args) = parser.parse_args() if len(args) != 3: parser.error( "Must provide genome FASTA file, sample Wig/BigWig labels and paths, " "and model output file") else: fasta_file = args[0] sample_wigs_file = args[1] hdf5_file = args[2] random.seed(1) if options.stride is None: options.stride = options.seq_length ################################################################ # assess bigwigs ################################################################ # get wig files and labels target_wigs = OrderedDict() target_strands = [] target_labels = [] for line in open(sample_wigs_file, encoding="UTF-8"): a = line.rstrip().split("\t") target_wigs[a[0]] = a[1] if len(a) > 2: target_strands.append(a[2]) else: target_strands.append("*") if len(a) > 3: target_labels.append(a[3]) else: target_labels.append("") if 
(options.fourier_dim is not None and 2 * options.fourier_dim >= options.seq_length / options.pool_width): print( "Fourier transform to %d dims won't compress %d length sequences with %d pooling" % (options.fourier_dim, options.seq_length, options.pool_width), file=sys.stderr, ) exit(1) ################################################################ # prepare genomic segments ################################################################ chrom_segments = genome.load_chromosomes(fasta_file) # remove gaps if options.gaps_file: chrom_segments = genome.split_contigs(chrom_segments, options.gaps_file) # ditch the chromosomes segments = [] for chrom in chrom_segments: segments += [(chrom, seg_start, seg_end) for seg_start, seg_end in chrom_segments[chrom]] # standardize order segments.sort() # filter for large enough segments = [ cse for cse in segments if cse[2] - cse[1] >= options.seq_length ] # down-sample if options.sample_pct < 1.0: segments = random.sample(segments, int(options.sample_pct * len(segments))) # limit to a BED file if options.limit_bed is not None: segments = limit_segments(segments, options.limit_bed) ################################################################ # one hot code sequences ################################################################ seqs_1hot, seqs_segments = segments_1hot(fasta_file, segments, options.seq_length, options.stride) print("%d sequences one hot coded" % seqs_1hot.shape[0]) ################################################################ # load model ################################################################ if options.params_file: job = dna_io.read_job_params(options.params_file) job["num_targets"] = len(target_wigs) job["batch_size"] = 1024 job["model"] = job.get("model", "autoencoder") if job["model"] == "autoencoder": model = autoencoder.AE(job) saver = tf.train.Saver() else: model = joblib.load(options.scent_file) ################################################################ # bigwig read and process ################################################################ print("Reading and pre-processing bigwigs for %d segments" % len(segments), flush=True) targets_real = [] targets_imag = [] include_indexes = [] include_marker = 0 targets_test = [] test_indexes = [] test_marker = 0 update_i = 0 ssi = 0 # initialize multiprocessing pool pool = multiprocessing.Pool(options.processes) with tf.Session() as sess: if options.scent_file and job["model"] == "autoencoder": saver.restore(sess, options.scent_file) # batch segment processing bstart = 0 while bstart < len(segments): if update_i % 1 == 0: print("Tiling from %s:%d-%d" % segments[bstart], flush=True) # determine batch end bend = batch_end(segments, bstart, 400000) # bigwig_read parameters bwr_params = [( wig_file, segments[bstart:bend], options.seq_length, options.pool_width, options.stride, options.log10to2, options.cov_multiplier, ) for wig_file in target_wigs.values()] # pull the target values in parallel if options.w5: wig_targets = pool.starmap(w5_batch, bwr_params) else: wig_targets = pool.starmap(bigwig_batch, bwr_params) # transpose to S x L x T (making a copy?) 
targets_wig = np.transpose(np.array(wig_targets), axes=(1, 2, 0)) # clip if options.clip is not None: targets_wig = targets_wig.clip(options.clip) # sample indexes from this batch if options.test_pct_or_chr.startswith("chr"): test_bindexes = [ twi for twi in range(targets_wig.shape[0]) if seqs_segments[ssi + twi][0] == options.test_pct_or_chr ] else: test_pct = float(options.test_pct_or_chr) test_bindexes = [ twi for twi in range(targets_wig.shape[0]) if random.random() < test_pct ] # capture test indexes test_indexes += [test_marker + tbi for tbi in test_bindexes] # update test marker test_marker += targets_wig.shape[0] # save the full test targets if not options.no_full: targets_test.append(targets_wig[test_bindexes]) # map to latent space if options.scent_file is None: targets_latent = targets_wig else: targets_latent = latent_transform(sess, model, job, targets_wig) # compress across length if options.fourier_dim is None: targets_rfour = targets_latent targets_ifour = None else: targets_rfour, targets_ifour = fourier_transform( targets_latent, options.fourier_dim) # save targets_real.append(targets_rfour) targets_imag.append(targets_ifour) # update seqs_segments index ssi += targets_wig.shape[0] # update batch bstart = bend update_i += 1 pool.close() # stack arrays targets_real = np.vstack(targets_real) if options.fourier_dim is not None: targets_imag = np.vstack(targets_imag) if not options.no_full: targets_test = np.vstack(targets_test) print("%d target sequences" % targets_real.shape[0]) ################################################################ # correct for unmappable regions ################################################################ if options.unmap_bed is not None: seqs_na = annotate_na(seqs_segments, options.unmap_bed, options.seq_length, options.pool_width) # determine mappable sequences and update test indexes map_indexes = [] test_indexes_set = set(test_indexes) print("test_indexes", len(test_indexes)) test_indexes_na = [] new_i = 0 for old_i in range(seqs_na.shape[0]): # mappable if seqs_na[old_i, :].mean(dtype="float64") < options.na_t: map_indexes.append(old_i) if old_i in test_indexes_set: test_indexes_na.append(new_i) new_i += 1 # unmappable else: # forget it pass # update data structures targets_real = targets_real[map_indexes] if options.fourier_dim is not None: targets_imag = targets_imag[map_indexes] seqs_1hot = seqs_1hot[map_indexes] seqs_segments = [seqs_segments[mi] for mi in map_indexes] seqs_na = seqs_na[map_indexes] test_indexes = test_indexes_na print("test_indexes", len(test_indexes)) ################################################################ # write to train, valid, test HDF5 ################################################################ if options.valid_pct_or_chr.startswith("chr"): # sample valid chromosome valid_indexes = [ si for si in range(len(seqs_segments)) if seqs_segments[si][0] == options.valid_pct_or_chr ] else: # sample valid indexes (we already have test) valid_pct = float(options.valid_pct_or_chr) valid_n = int(valid_pct * targets_real.shape[0]) nontest_indexes = set(range(targets_real.shape[0])) - set(test_indexes) # random.sample requires a sequence, so sort the set first valid_indexes = random.sample(sorted(nontest_indexes), valid_n) # remainder is training train_indexes = list( set(range(len(seqs_segments))) - set(valid_indexes) - set(test_indexes)) # training may require shuffling; random.shuffle(sorted(x)) would only shuffle a throwaway copy, so sort first and shuffle in place train_indexes = sorted(train_indexes) valid_indexes = sorted(valid_indexes) test_indexes = sorted(test_indexes) random.shuffle(train_indexes) random.shuffle(valid_indexes) random.shuffle(test_indexes) # write to HDF5 hdf5_out = h5py.File(hdf5_file, "w") # store pooling
hdf5_out.create_dataset("pool_width", data=options.pool_width, dtype="int") # store targets target_ids = np.array(list(target_wigs.keys()), dtype="S") hdf5_out.create_dataset("target_ids", data=target_ids) target_labels = np.array(target_labels, dtype="S") hdf5_out.create_dataset("target_labels", data=target_labels) target_strands = np.array(target_strands, dtype="S") hdf5_out.create_dataset("target_strands", data=target_strands) # HDF5 train hdf5_out.create_dataset( "train_in", data=seqs_1hot[train_indexes], dtype="bool", compression=options.compression, ) hdf5_out.create_dataset( "train_out", data=targets_real[train_indexes], dtype="float16", compression=options.compression, ) if options.fourier_dim is not None: hdf5_out.create_dataset( "train_out_imag", data=targets_imag[train_indexes], dtype="float16", compression=options.compression, ) if options.unmap_bed is not None: hdf5_out.create_dataset( "train_na", data=seqs_na[train_indexes], dtype="bool", compression=options.compression, ) # HDF5 valid hdf5_out.create_dataset( "valid_in", data=seqs_1hot[valid_indexes], dtype="bool", compression=options.compression, ) hdf5_out.create_dataset( "valid_out", data=targets_real[valid_indexes], dtype="float16", compression=options.compression, ) if options.fourier_dim is not None: hdf5_out.create_dataset( "valid_out_imag", data=targets_imag[valid_indexes], dtype="float16", compression=options.compression, ) if options.unmap_bed is not None: hdf5_out.create_dataset( "valid_na", data=seqs_na[valid_indexes], dtype="bool", compression=options.compression, ) # HDF5 test hdf5_out.create_dataset( "test_in", data=seqs_1hot[test_indexes], dtype="bool", compression=options.compression, ) hdf5_out.create_dataset( "test_out", data=targets_real[test_indexes], dtype="float16", compression=options.compression, ) if options.fourier_dim is not None: hdf5_out.create_dataset( "test_out_imag", data=targets_imag[test_indexes], dtype="float16", compression=options.compression, ) if not options.no_full: hdf5_out.create_dataset( "test_out_full", data=targets_test, dtype="float16", compression=options.compression, ) if options.unmap_bed is not None: hdf5_out.create_dataset( "test_na", data=seqs_na[test_indexes], dtype="bool", compression=options.compression, ) hdf5_out.close() # output BED file if options.out_bed_file: out_bed_out = open(options.out_bed_file, "w") for si in train_indexes: print("%s\t%d\t%d\ttrain" % seqs_segments[si], file=out_bed_out) for si in valid_indexes: print("%s\t%d\t%d\tvalid" % seqs_segments[si], file=out_bed_out) for si in test_indexes: print("%s\t%d\t%d\ttest" % seqs_segments[si], file=out_bed_out) out_bed_out.close()
def main(): usage = 'usage: %prog [options] <fasta_file> <targets_file>' parser = OptionParser(usage) parser.add_option('-b', dest='blacklist_bed', help='Set blacklist nucleotides to a baseline value.') parser.add_option( '--break', dest='break_t', default=8388608, type='int', help='Break in half contigs above length [Default: %default]') parser.add_option('--crop', dest='crop_bp', default=0, type='int', help='Crop bp off each end [Default: %default]') parser.add_option( '-d', dest='diagonal_offset', default=2, type='int', help='Positions on the diagonal to ignore [Default: %default]') parser.add_option('-g', dest='gaps_file', help='Genome assembly gaps BED [Default: %default]') parser.add_option( '-k', dest='kernel_stddev', default=0, type='int', help='Gaussian kernel stddev to smooth values [Default: %default]') parser.add_option('-l', dest='seq_length', default=131072, type='int', help='Sequence length [Default: %default]') parser.add_option( '--limit', dest='limit_bed', help='Limit to segments that overlap regions in a BED file') parser.add_option( '--local', dest='run_local', default=False, action='store_true', help='Run jobs locally as opposed to on SLURM [Default: %default]') parser.add_option('-o', dest='out_dir', default='data_out', help='Output directory [Default: %default]') parser.add_option('-p', dest='processes', default=None, type='int', help='Number parallel processes [Default: %default]') parser.add_option('-r', dest='seqs_per_tfr', default=128, type='int', help='Sequences per TFRecord file [Default: %default]') parser.add_option( '--restart', dest='restart', default=False, action='store_true', help='Skip already read HDF5 coverage values. [Default: %default]') parser.add_option('--sample', dest='sample_pct', default=1.0, type='float', help='Down-sample the segments') parser.add_option('--seed', dest='seed', default=44, type='int', help='Random seed [Default: %default]') parser.add_option( '--stride_train', dest='stride_train', default=1., type='float', help='Stride to advance train sequences [Default: seq_length]') parser.add_option( '--stride_test', dest='stride_test', default=1., type='float', help='Stride to advance valid and test sequences [Default: seq_length]' ) parser.add_option( '--soft', dest='soft_clip', default=False, action='store_true', help= 'Soft clip values, applying sqrt to the excess above the threshold [Default: %default]' ) parser.add_option( '-t', dest='test_pct_or_chr', default=0.05, type='str', help='Proportion of the data for testing [Default: %default]') parser.add_option('-u', dest='umap_bed', help='Unmappable regions in BED format') parser.add_option( '--umap_midpoints', dest='umap_midpoints', help='Regions with midpoints to exclude in BED format. Used for 4C/HiC.'
) parser.add_option( '--umap_t', dest='umap_t', default=0.3, type='float', help= 'Remove sequences with more than this unmappable bin % [Default: %default]' ) parser.add_option( '--umap_set', dest='umap_set', default=None, type='float', help= 'Set unmappable regions to this percentile in the sequences\' distribution of values' ) parser.add_option('-w', dest='pool_width', default=128, type='int', help='Sum pool width [Default: %default]') parser.add_option( '-v', dest='valid_pct_or_chr', default=0.05, type='str', help='Proportion of the data for validation [Default: %default]') parser.add_option( '--snap', dest='snap', default=None, type='int', help= 'snap stride to multiple for binned targets in bp, if not None seq_length must be a multiple of snap' ) parser.add_option('--as_obsexp', dest='as_obsexp', action="store_true", default=False, help='save targets as obsexp profiles') parser.add_option('--global_obsexp', dest='global_obsexp', action="store_true", default=False, help='use pre-calculated by-chromosome obs/exp') parser.add_option('--no_log', dest='no_log', action="store_true", default=False, help='do not take log for obs/exp') (options, args) = parser.parse_args() if len(args) != 2: parser.error( 'Must provide FASTA and sample coverage labels and paths.') else: fasta_file = args[0] targets_file = args[1] random.seed(options.seed) np.random.seed(options.seed) # transform proportion strides to base pairs if options.stride_train <= 1: print('stride_train %.f' % options.stride_train, end='') options.stride_train = options.stride_train * options.seq_length print(' converted to %f' % options.stride_train) options.stride_train = int(np.round(options.stride_train)) if options.stride_test <= 1: print('stride_test %.f' % options.stride_test, end='') options.stride_test = options.stride_test * options.seq_length print(' converted to %f' % options.stride_test) options.stride_test = int(np.round(options.stride_test)) if options.snap is not None: if np.mod(options.seq_length, options.snap) != 0: raise ValueError('seq_length must be a multiple of snap') if np.mod(options.stride_train, options.snap) != 0: raise ValueError('stride_train must be a multiple of snap') if np.mod(options.stride_test, options.snap) != 0: raise ValueError('stride_test must be a multiple of snap') if os.path.isdir(options.out_dir) and not options.restart: print('Remove output directory %s or use --restart option.'
% options.out_dir) exit(1) elif not os.path.isdir(options.out_dir): os.mkdir(options.out_dir) # dump options with open('%s/options.json' % options.out_dir, 'w') as options_json_out: json.dump(options.__dict__, options_json_out, sort_keys=True, indent=4) ################################################################ # define genomic contigs ################################################################ chrom_contigs = genome.load_chromosomes(fasta_file) # remove gaps if options.gaps_file: chrom_contigs = genome.split_contigs(chrom_contigs, options.gaps_file) # ditch the chromosomes for contigs contigs = [] for chrom in chrom_contigs: contigs += [ Contig(chrom, ctg_start, ctg_end) for ctg_start, ctg_end in chrom_contigs[chrom] ] # limit to a BED file if options.limit_bed is not None: contigs = limit_contigs(contigs, options.limit_bed) # filter for large enough contigs = [ ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length ] # break up large contigs if options.break_t is not None: contigs = break_large_contigs(contigs, options.break_t) # print contigs to BED file ctg_bed_file = '%s/contigs.bed' % options.out_dir write_seqs_bed(ctg_bed_file, contigs) ################################################################ # divide between train/valid/test ################################################################ try: # convert to float pct valid_pct = float(options.valid_pct_or_chr) test_pct = float(options.test_pct_or_chr) assert (0 <= valid_pct <= 1) assert (0 <= test_pct <= 1) # divide by pct contig_sets = divide_contigs_pct(contigs, test_pct, valid_pct) except (ValueError, AssertionError): # divide by chr valid_chrs = options.valid_pct_or_chr.split(',') test_chrs = options.test_pct_or_chr.split(',') contig_sets = divide_contigs_chr(contigs, test_chrs, valid_chrs) train_contigs, valid_contigs, test_contigs = contig_sets # rejoin broken contigs within set train_contigs = rejoin_large_contigs(train_contigs) valid_contigs = rejoin_large_contigs(valid_contigs) test_contigs = rejoin_large_contigs(test_contigs) ################################################################ # define model sequences ################################################################ # stride sequences across contig train_mseqs = contig_sequences(train_contigs, options.seq_length, options.stride_train, options.snap, label='train') valid_mseqs = contig_sequences(valid_contigs, options.seq_length, options.stride_test, options.snap, label='valid') test_mseqs = contig_sequences(test_contigs, options.seq_length, options.stride_test, options.snap, label='test') # shuffle random.shuffle(train_mseqs) random.shuffle(valid_mseqs) random.shuffle(test_mseqs) # down-sample if options.sample_pct < 1.0: train_mseqs = random.sample(train_mseqs, int(options.sample_pct * len(train_mseqs))) valid_mseqs = random.sample(valid_mseqs, int(options.sample_pct * len(valid_mseqs))) test_mseqs = random.sample(test_mseqs, int(options.sample_pct * len(test_mseqs))) # merge mseqs = train_mseqs + valid_mseqs + test_mseqs ################################################################ # mappability ################################################################ if (options.umap_bed is not None) or (options.umap_midpoints is not None): if shutil.which('bedtools') is None: print('Install Bedtools to annotate unmappable sites', file=sys.stderr) exit(1) if options.umap_bed is not None: # annotate unmappable positions mseqs_unmap = annotate_unmap(mseqs, options.umap_bed, options.seq_length, options.pool_width) # filter unmappable 
mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') < options.umap_t) mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]] mseqs_unmap = mseqs_unmap[mseqs_map_mask, :] # write to file unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir np.save(unmap_npy, mseqs_unmap) if options.umap_midpoints is not None: # annotate unmappable midpoints for 4C/HiC mseqs_unmap = annotate_unmap(mseqs, options.umap_midpoints, options.seq_length, options.pool_width) # filter unmappable seqmid = mseqs_unmap.shape[ 1] // 2 #int( options.seq_length / options.pool_width /2) mseqs_map_mask = (np.sum(mseqs_unmap[:, seqmid - 1:seqmid + 1], axis=1) == 0) mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]] mseqs_unmap = mseqs_unmap[mseqs_map_mask, :] # write to file unmap_npy = '%s/mseqs_unmap_midpoints.npy' % options.out_dir np.save(unmap_npy, mseqs_unmap) # write sequences to BED print('writing sequences to BED') seqs_bed_file = '%s/sequences.bed' % options.out_dir write_seqs_bed(seqs_bed_file, mseqs, True) ################################################################ # read sequence coverage values ################################################################ # read target datasets targets_df = pd.read_csv(targets_file, index_col=0, sep='\t') seqs_cov_dir = '%s/seqs_cov' % options.out_dir if not os.path.isdir(seqs_cov_dir): os.mkdir(seqs_cov_dir) read_jobs = [] for ti in range(targets_df.shape[0]): genome_cov_file = targets_df['file'].iloc[ti] seqs_cov_stem = '%s/%d' % (seqs_cov_dir, ti) seqs_cov_file = '%s.h5' % seqs_cov_stem clip_ti = None if 'clip' in targets_df.columns: clip_ti = targets_df['clip'].iloc[ti] # scale_ti = 1 # if 'scale' in targets_df.columns: # scale_ti = targets_df['scale'].iloc[ti] if options.restart and os.path.isfile(seqs_cov_file): print('Skipping existing %s' % seqs_cov_file, file=sys.stderr) else: cmd = 'akita_data_read.py' cmd += ' --crop %d' % options.crop_bp cmd += ' -k %d' % options.kernel_stddev cmd += ' -w %d' % options.pool_width if clip_ti is not None: cmd += ' --clip %f' % clip_ti if options.soft_clip: cmd += ' --soft' # cmd += ' -s %f' % scale_ti if options.blacklist_bed: cmd += ' -b %s' % options.blacklist_bed if options.as_obsexp: cmd += ' --as_obsexp' if options.global_obsexp: cmd += ' --global_obsexp' if options.no_log: cmd += ' --no_log' cmd += ' %s' % genome_cov_file cmd += ' %s' % seqs_bed_file cmd += ' %s' % seqs_cov_file if options.run_local: # breaks on some OS # cmd += ' &> %s.err' % seqs_cov_stem read_jobs.append(cmd) else: j = slurm.Job(cmd, name='read_t%d' % ti, out_file='%s.out' % seqs_cov_stem, err_file='%s.err' % seqs_cov_stem, queue='standard', mem=15000, time='12:0:0') read_jobs.append(j) if options.run_local: util.exec_par(read_jobs, options.processes, verbose=True) else: slurm.multi_run(read_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5) ################################################################ # write TF Records ################################################################ # copy targets file shutil.copy(targets_file, '%s/targets.txt' % options.out_dir) # initialize TF Records dir tfr_dir = '%s/tfrecords' % options.out_dir if not os.path.isdir(tfr_dir): os.mkdir(tfr_dir) write_jobs = [] for tvt_set in ['train', 'valid', 'test']: tvt_set_indexes = [ i for i in range(len(mseqs)) if mseqs[i].label == tvt_set ] tvt_set_start = tvt_set_indexes[0] tvt_set_end = tvt_set_indexes[-1] + 1 tfr_i = 0 tfr_start = tvt_set_start tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end) while 
tfr_start <= tvt_set_end: tfr_stem = '%s/%s-%d' % (tfr_dir, tvt_set, tfr_i) cmd = 'basenji_data_write.py' cmd += ' -s %d' % tfr_start cmd += ' -e %d' % tfr_end # do not use # if options.umap_bed is not None: # cmd += ' -u %s' % unmap_npy # if options.umap_set is not None: # cmd += ' --umap_set %f' % options.umap_set cmd += ' %s' % fasta_file cmd += ' %s' % seqs_bed_file cmd += ' %s' % seqs_cov_dir cmd += ' %s.tfr' % tfr_stem if options.run_local: # breaks on some OS # cmd += ' &> %s.err' % tfr_stem write_jobs.append(cmd) else: j = slurm.Job(cmd, name='write_%s-%d' % (tvt_set, tfr_i), out_file='%s.out' % tfr_stem, err_file='%s.err' % tfr_stem, queue='standard', mem=15000, time='12:0:0') write_jobs.append(j) # update tfr_i += 1 tfr_start += options.seqs_per_tfr tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end) if options.run_local: util.exec_par(write_jobs, options.processes, verbose=True) else: slurm.multi_run(write_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5) ################################################################ # stats ################################################################ stats_dict = {} stats_dict['num_targets'] = targets_df.shape[0] stats_dict['train_seqs'] = len(train_mseqs) stats_dict['valid_seqs'] = len(valid_mseqs) stats_dict['test_seqs'] = len(test_mseqs) stats_dict['seq_length'] = options.seq_length stats_dict['pool_width'] = options.pool_width stats_dict['crop_bp'] = options.crop_bp stats_dict['diagonal_offset'] = options.diagonal_offset target1_length = options.seq_length - 2 * options.crop_bp target1_length = target1_length // options.pool_width target1_length = target1_length - options.diagonal_offset target_length = target1_length * (target1_length + 1) // 2 stats_dict['target_length'] = target_length with open('%s/statistics.json' % options.out_dir, 'w') as stats_json_out: json.dump(stats_dict, stats_json_out, indent=4)
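################################################################
# Worked check of the target_length arithmetic above: with the
# defaults seq_length=131072, crop_bp=0, pool_width=128, and
# diagonal_offset=2, the contact map has 1024 bins per side, 1022
# after dropping the diagonal band, and the stored upper triangle
# flattens to 1022*1023/2 = 522753 entries.
################################################################
seq_length, crop_bp, pool_width, diagonal_offset = 131072, 0, 128, 2
target1_length = (seq_length - 2 * crop_bp) // pool_width  # 1024 bins
target1_length -= diagonal_offset                          # 1022 usable
target_length = target1_length * (target1_length + 1) // 2
assert target_length == 522753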
def main(): usage = "usage: %prog [options] <fasta_file> <targets_file>" parser = OptionParser(usage) parser.add_option( "-b", dest="blacklist_bed", help="Set blacklist nucleotides to a baseline value.", ) parser.add_option( "--break", dest="break_t", default=786432, type="int", help="Break in half contigs above length [Default: %default]", ) # parser.add_option('-c', dest='clip', # default=None, type='float', # help='Clip target values to have minimum [Default: %default]') parser.add_option( "-d", dest="sample_pct", default=1.0, type="float", help="Down-sample the segments", ) parser.add_option( "-g", dest="gaps_file", help="Genome assembly gaps BED [Default: %default]" ) parser.add_option( "-l", dest="seq_length", default=131072, type="int", help="Sequence length [Default: %default]", ) parser.add_option( "--limit", dest="limit_bed", help="Limit to segments that overlap regions in a BED file", ) parser.add_option( "--local", dest="run_local", default=False, action="store_true", help="Run jobs locally as opposed to on SLURM [Default: %default]", ) parser.add_option( "-o", dest="out_dir", default="data_out", help="Output directory [Default: %default]", ) parser.add_option( "-p", dest="processes", default=None, type="int", help="Number parallel processes [Default: %default]", ) parser.add_option( "-r", dest="seqs_per_tfr", default=256, type="int", help="Sequences per TFRecord file [Default: %default]", ) parser.add_option( "--seed", dest="seed", default=44, type="int", help="Random seed [Default: %default]", ) parser.add_option( "--stride_train", dest="stride_train", default=1.0, type="float", help="Stride to advance train sequences [Default: seq_length]", ) parser.add_option( "--stride_test", dest="stride_test", default=1.0, type="float", help="Stride to advance valid and test sequences [Default: seq_length]", ) parser.add_option( "--soft", dest="soft_clip", default=False, action="store_true", help="Soft clip values, applying sqrt to the execess above the threshold [Default: %default]", ) parser.add_option( "-t", dest="test_pct_or_chr", default=0.05, type="str", help="Proportion of the data for testing [Default: %default]", ) parser.add_option("-u", dest="umap_bed", help="Unmappable regions in BED format") parser.add_option( "--umap_t", dest="umap_t", default=0.3, type="float", help="Remove sequences with more than this unmappable bin % [Default: %default]", ) parser.add_option( "--umap_set", dest="umap_set", default=None, type="float", help="Set unmappable regions to this percentile in the sequences' distribution of values", ) parser.add_option( "-w", dest="pool_width", default=128, type="int", help="Sum pool width [Default: %default]", ) parser.add_option( "-v", dest="valid_pct_or_chr", default=0.05, type="str", help="Proportion of the data for validation [Default: %default]", ) (options, args) = parser.parse_args() if len(args) != 2: parser.error("Must provide FASTA and sample coverage labels and paths.") else: fasta_file = args[0] targets_file = args[1] random.seed(options.seed) np.random.seed(options.seed) if not os.path.isdir(options.out_dir): os.mkdir(options.out_dir) if options.stride_train <= 0 or options.stride_train > 1: parser.error("Train stride =%f must be in [0,1]" % options.stride_train) if options.stride_test <= 0 or options.stride_test > 1: parser.error("Test stride =%f must be in [0,1]" % options.stride_test) ################################################################ # define genomic contigs ################################################################ 
chrom_contigs = genome.load_chromosomes(fasta_file) # remove gaps if options.gaps_file: chrom_contigs = genome.split_contigs(chrom_contigs, options.gaps_file) # ditch the chromosomes for contigs contigs = [] for chrom in chrom_contigs: contigs += [ Contig(chrom, ctg_start, ctg_end) for ctg_start, ctg_end in chrom_contigs[chrom] ] # limit to a BED file if options.limit_bed is not None: contigs = limit_contigs(contigs, options.limit_bed) # filter for large enough contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length] # break up large contigs if options.break_t is not None: contigs = break_large_contigs(contigs, options.break_t) # print contigs to BED file ctg_bed_file = "%s/contigs.bed" % options.out_dir write_seqs_bed(ctg_bed_file, contigs) ################################################################ # divide between train/valid/test ################################################################ try: # convert to float pct valid_pct = float(options.valid_pct_or_chr) test_pct = float(options.test_pct_or_chr) assert 0 <= valid_pct <= 1 assert 0 <= test_pct <= 1 # divide by pct contig_sets = divide_contigs_pct(contigs, test_pct, valid_pct) except (ValueError, AssertionError): # divide by chr valid_chr = options.valid_pct_or_chr test_chr = options.test_pct_or_chr contig_sets = divide_contigs_chr(contigs, test_chr, valid_chr) train_contigs, valid_contigs, test_contigs = contig_sets # rejoin broken contigs within set train_contigs = rejoin_large_contigs(train_contigs) valid_contigs = rejoin_large_contigs(valid_contigs) test_contigs = rejoin_large_contigs(test_contigs) ################################################################ # define model sequences ################################################################ # stride sequences across contig train_mseqs = contig_sequences( train_contigs, options.seq_length, options.stride_train, label="train" ) valid_mseqs = contig_sequences( valid_contigs, options.seq_length, options.stride_test, label="valid" ) test_mseqs = contig_sequences( test_contigs, options.seq_length, options.stride_test, label="test" ) # shuffle random.shuffle(train_mseqs) random.shuffle(valid_mseqs) random.shuffle(test_mseqs) # down-sample if options.sample_pct < 1.0: train_mseqs = random.sample( train_mseqs, int(options.sample_pct * len(train_mseqs)) ) valid_mseqs = random.sample( valid_mseqs, int(options.sample_pct * len(valid_mseqs)) ) test_mseqs = random.sample( test_mseqs, int(options.sample_pct * len(test_mseqs)) ) # merge mseqs = train_mseqs + valid_mseqs + test_mseqs ################################################################ # mappability ################################################################ if options.umap_bed is not None: if shutil.which("bedtools") is None: print("Install Bedtools to annotate unmappable sites", file=sys.stderr) exit(1) # annotate unmappable positions mseqs_unmap = annotate_unmap( mseqs, options.umap_bed, options.seq_length, options.pool_width ) # filter unmappable mseqs_map_mask = mseqs_unmap.mean(axis=1, dtype="float64") < options.umap_t mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]] mseqs_unmap = mseqs_unmap[mseqs_map_mask, :] # write to file unmap_npy = "%s/mseqs_unmap.npy" % options.out_dir np.save(unmap_npy, mseqs_unmap) # write sequences to BED seqs_bed_file = "%s/sequences.bed" % options.out_dir write_seqs_bed(seqs_bed_file, mseqs, True) ################################################################ # read sequence coverage values 
################################################################ # read target datasets targets_df = pd.read_table(targets_file, index_col=0) seqs_cov_dir = "%s/seqs_cov" % options.out_dir if not os.path.isdir(seqs_cov_dir): os.mkdir(seqs_cov_dir) read_jobs = [] for ti in range(targets_df.shape[0]): genome_cov_file = targets_df["file"].iloc[ti] seqs_cov_stem = "%s/%d" % (seqs_cov_dir, ti) seqs_cov_file = "%s.h5" % seqs_cov_stem clip_ti = None if "clip" in targets_df.columns: clip_ti = targets_df["clip"].iloc[ti] scale_ti = 1 if "scale" in targets_df.columns: scale_ti = targets_df["scale"].iloc[ti] if os.path.isfile(seqs_cov_file): print("Skipping existing %s" % seqs_cov_file, file=sys.stderr) else: cmd = "basenji_data_read.py" cmd += " -w %d" % options.pool_width cmd += " -u %s" % targets_df["sum_stat"].iloc[ti] if clip_ti is not None: cmd += " -c %f" % clip_ti if options.soft_clip: cmd += " --soft" cmd += " -s %f" % scale_ti if options.blacklist_bed: cmd += " -b %s" % options.blacklist_bed cmd += " %s" % genome_cov_file cmd += " %s" % seqs_bed_file cmd += " %s" % seqs_cov_file if options.run_local: cmd += " &> %s.err" % seqs_cov_stem read_jobs.append(cmd) else: j = slurm.Job( cmd, name="read_t%d" % ti, out_file="%s.out" % seqs_cov_stem, err_file="%s.err" % seqs_cov_stem, queue="standard", mem=15000, time="12:0:0", ) read_jobs.append(j) if options.run_local: util.exec_par(read_jobs, options.processes, verbose=True) else: slurm.multi_run( read_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5 ) ################################################################ # write TF Records ################################################################ # copy targets file shutil.copy(targets_file, "%s/targets.txt" % options.out_dir) # initialize TF Records dir tfr_dir = "%s/tfrecords" % options.out_dir if not os.path.isdir(tfr_dir): os.mkdir(tfr_dir) write_jobs = [] for tvt_set in ["train", "valid", "test"]: tvt_set_indexes = [i for i in range(len(mseqs)) if mseqs[i].label == tvt_set] tvt_set_start = tvt_set_indexes[0] tvt_set_end = tvt_set_indexes[-1] + 1 tfr_i = 0 tfr_start = tvt_set_start tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end) while tfr_start <= tvt_set_end: tfr_stem = "%s/%s-%d" % (tfr_dir, tvt_set, tfr_i) cmd = "basenji_data_write.py" cmd += " -s %d" % tfr_start cmd += " -e %d" % tfr_end if options.umap_bed is not None: cmd += " -u %s" % unmap_npy if options.umap_set is not None: cmd += " --umap_set %f" % options.umap_set cmd += " %s" % fasta_file cmd += " %s" % seqs_bed_file cmd += " %s" % seqs_cov_dir cmd += " %s.tfr" % tfr_stem if options.run_local: cmd += " &> %s.err" % tfr_stem write_jobs.append(cmd) else: j = slurm.Job( cmd, name="write_%s-%d" % (tvt_set, tfr_i), out_file="%s.out" % tfr_stem, err_file="%s.err" % tfr_stem, queue="standard", mem=15000, time="12:0:0", ) write_jobs.append(j) # update tfr_i += 1 tfr_start += options.seqs_per_tfr tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end) if options.run_local: util.exec_par(write_jobs, options.processes, verbose=True) else: slurm.multi_run( write_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5 )
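################################################################
# Illustrative sketch (assumed strategy, not taken from the
# original): divide_contigs_pct above splits contigs so that held-out
# nucleotides approximate test_pct and valid_pct. One plausible
# reading is a greedy longest-first assignment toward the target
# proportions:
################################################################
def divide_contigs_pct(contigs, test_pct, valid_pct):
    """Greedily assign contigs to train/valid/test by length proportion."""
    total_nt = sum(ctg.end - ctg.start for ctg in contigs)
    goal_nt = {"test": test_pct * total_nt, "valid": valid_pct * total_nt}
    assigned_nt = {"train": 0, "valid": 0, "test": 0}
    contig_sets = {"train": [], "valid": [], "test": []}
    # place the largest contigs first so proportions converge smoothly
    for ctg in sorted(contigs, key=lambda c: c.end - c.start, reverse=True):
        ctg_nt = ctg.end - ctg.start
        if assigned_nt["test"] + ctg_nt <= goal_nt["test"]:
            label = "test"
        elif assigned_nt["valid"] + ctg_nt <= goal_nt["valid"]:
            label = "valid"
        else:
            label = "train"
        contig_sets[label].append(ctg)
        assigned_nt[label] += ctg_nt
    return contig_sets["train"], contig_sets["valid"], contig_sets["test"]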