def bigwig_write(snp, seq_len, preds, model, bw_file, genome_file):
  bw_open = bigwig_open(bw_file, genome_file)

  seq_chrom = snp.chrom
  seq_start = snp.pos - seq_len // 2

  bw_chroms = [seq_chrom] * len(preds)
  bw_starts = [
      int(seq_start + model.batch_buffer + bi * model.target_pool)
      for bi in range(len(preds))
  ]
  bw_ends = [int(bws + model.target_pool) for bws in bw_starts]

  preds_list = [float(p) for p in preds]
  bw_open.addEntries(bw_chroms, bw_starts, ends=bw_ends, values=preds_list)

  bw_open.close()
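# The bigwig_open helper called throughout these scripts is not defined in this
# file. Below is a minimal sketch of what it presumably does, assuming pyBigWig
# and a UCSC-style chromosome lengths file (chrom<TAB>length per line). The
# behavior is inferred from its call sites above, not from shown source.
import pyBigWig

def bigwig_open(bw_file, genome_file):
  """Open a BigWig for writing and add a header of chromosome lengths."""
  bw_out = pyBigWig.open(bw_file, 'w')

  chrom_sizes = []
  for line in open(genome_file):
    a = line.split()
    chrom_sizes.append((a[0], int(a[1])))
  bw_out.addHeader(chrom_sizes)

  return bw_out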
def score_write(sess, model, options, seqs_1hot, seqs_chrom, seqs_start):
  ''' Compute scores and write them as BigWigs for a set of sequences. '''

  for si in range(seqs_1hot.shape[0]):
    # initialize batcher
    batcher = basenji.batcher.Batcher(seqs_1hot[si:si + 1],
                                      batch_size=model.batch_size,
                                      pool_width=model.target_pool)

    # get layer representations
    t0 = time.time()
    print('Computing gradients.', end='', flush=True)
    _, _, _, batch_grads, batch_reprs, _ = model.gradients(
        sess,
        batcher,
        rc=options.rc,
        shifts=options.shifts,
        mc_n=options.mc_n,
        return_all=True)
    print(' Done in %ds.' % (time.time() - t0), flush=True)

    # only layer
    batch_reprs = batch_reprs[0]
    batch_grads = batch_grads[0]

    # increase precision
    batch_reprs = batch_reprs.astype('float32')
    batch_grads = batch_grads.astype('float32')

    # S (sequences) x T (targets) x P (seq position) x U (units layer i) x E (ensembles)
    print('batch_grads', batch_grads.shape)
    pooled_length = batch_grads.shape[2]

    # S (sequences) x P (seq position) x U (units layer i) x E (ensembles)
    print('batch_reprs', batch_reprs.shape)

    # write bigwigs
    t0 = time.time()
    print('Writing BigWigs.', end='', flush=True)

    # for each target
    for tii in range(len(options.target_indexes)):
      ti = options.target_indexes[tii]

      # compute scores
      if options.norm is None:
        batch_grads_scores = np.multiply(
            batch_reprs[0], batch_grads[0, tii, :, :, :]).sum(axis=1)
      else:
        batch_grads_scores = np.multiply(batch_reprs[0],
                                         batch_grads[0, tii, :, :, :])
        batch_grads_scores = np.power(np.abs(batch_grads_scores), options.norm)
        batch_grads_scores = batch_grads_scores.sum(axis=1)
        batch_grads_scores = np.power(batch_grads_scores, 1. / options.norm)

      # compute score statistics
      batch_grads_mean = batch_grads_scores.mean(axis=1)
      if options.norm is None:
        batch_grads_pval = ttest_1samp(batch_grads_scores, 0, axis=1)[1]
      else:
        batch_grads_pval = ttest_1samp(batch_grads_scores, 0, axis=1)[1]
        # batch_grads_pval = chi2(df=)
        batch_grads_pval /= 2

      # open bigwig
      bws_file = '%s/s%d_t%d_scores.bw' % (options.out_dir, si, ti)
      bwp_file = '%s/s%d_t%d_pvals.bw' % (options.out_dir, si, ti)
      bws_open = bigwig_open(bws_file, options.genome_file)
      # bwp_open = bigwig_open(bwp_file, options.genome_file)

      # specify bigwig locations and values
      bw_chroms = [seqs_chrom[si]] * pooled_length
      bw_starts = [
          int(seqs_start[si] + pi * model.target_pool)
          for pi in range(pooled_length)
      ]
      bw_ends = [int(bws + model.target_pool) for bws in bw_starts]
      bws_values = [float(bgs) for bgs in batch_grads_mean]
      # bwp_values = [float(bgp) for bgp in batch_grads_pval]

      # write
      bws_open.addEntries(bw_chroms, bw_starts, ends=bw_ends, values=bws_values)
      # bwp_open.addEntries(bw_chroms, bw_starts, ends=bw_ends, values=bwp_values)

      # close
      bws_open.close()
      # bwp_open.close()

    print(' Done in %ds.' % (time.time() - t0), flush=True)
    gc.collect()
def main(): usage = "usage: %prog [options] <params_file> <model_file> <genes_hdf5_file>" parser = OptionParser(usage) parser.add_option( "-g", dest="genome_file", default="%s/data/human.hg19.genome" % os.environ["BASENJIDIR"], help="Chromosome lengths file [Default: %default]", ) parser.add_option("-l", dest="gene_list", help="Process only gene ids in the given file") parser.add_option( "-o", dest="out_dir", default="grad_mapg", help="Output directory [Default: %default]", ) parser.add_option("-t", dest="target_indexes", default=None, help="Target indexes to plot") (options, args) = parser.parse_args() if len(args) != 3: parser.error("Must provide parameters, model, and genomic position") else: params_file = args[0] model_file = args[1] genes_hdf5_file = args[2] if not os.path.isdir(options.out_dir): os.mkdir(options.out_dir) ################################################################# # reads in genes HDF5 gene_data = genedata.GeneData(genes_hdf5_file) # subset gene sequences genes_subset = set() if options.gene_list: for line in open(options.gene_list): genes_subset.add(line.rstrip()) gene_data.subset_genes(genes_subset) print("Filtered to %d sequences" % gene_data.num_seqs) ####################################################### # model parameters and placeholders job = params.read_job_params(params_file) job["seq_length"] = gene_data.seq_length job["seq_depth"] = gene_data.seq_depth job["target_pool"] = gene_data.pool_width if "num_targets" not in job: print( "Must specify number of targets (num_targets) in the parameters file.", file=sys.stderr, ) exit(1) # set target indexes if options.target_indexes is not None: options.target_indexes = [ int(ti) for ti in options.target_indexes.split(",") ] target_subset = options.target_indexes else: options.target_indexes = list(range(job["num_targets"])) target_subset = None # build model model = seqnn.SeqNN() model.build(job, target_subset=target_subset) # determine latest pre-dilated layer cnn_dilation = np.array([cp.dilation for cp in model.hp.cnn_params]) dilated_mask = cnn_dilation > 1 dilated_indexes = np.where(dilated_mask)[0] pre_dilated_layer = np.min(dilated_indexes) print("Pre-dilated layer: %d" % pre_dilated_layer) # build gradients ops t0 = time.time() print("Building target/position-specific gradient ops.", end="") model.build_grads_genes(gene_data.gene_seqs, layers=[pre_dilated_layer]) print(" Done in %ds" % (time.time() - t0), flush=True) ####################################################### # acquire gradients # initialize saver saver = tf.train.Saver() with tf.Session() as sess: # load variables into session saver.restore(sess, model_file) for si in range(gene_data.num_seqs): # initialize batcher batcher_si = batcher.Batcher( gene_data.seqs_1hot[si:si + 1], batch_size=model.hp.batch_size, pool_width=model.hp.target_pool, ) # get layer representations t0 = time.time() print("Computing gradients.", end="", flush=True) batch_grads, batch_reprs = model.gradients_genes( sess, batcher_si, gene_data.gene_seqs[si:si + 1]) print(" Done in %ds." 
% (time.time() - t0), flush=True) # only layer batch_reprs = batch_reprs[0] batch_grads = batch_grads[0] # G (TSSs) x T (targets) x P (seq position) x U (Units layer i) print("batch_grads", batch_grads.shape) pooled_length = batch_grads.shape[2] # S (sequences) x P (seq position) x U (Units layer i) print("batch_reprs", batch_reprs.shape) # write bigwigs t0 = time.time() print("Writing BigWigs.", end="", flush=True) # for each TSS for tss_i in range(batch_grads.shape[0]): tss = gene_data.gene_seqs[si].tss_list[tss_i] # for each target for tii in range(len(options.target_indexes)): ti = options.target_indexes[tii] # dot representation and gradient batch_grads_score = np.multiply( batch_reprs[0], batch_grads[tss_i, tii, :, :]).sum(axis=1) # open bigwig bw_file = "%s/%s-%s_t%d.bw" % ( options.out_dir, tss.gene_id, tss.identifier, ti, ) bw_open = bigwig_open(bw_file, options.genome_file) # access gene sequence information seq_chrom = gene_data.gene_seqs[si].chrom seq_start = gene_data.gene_seqs[si].start # specify bigwig locations and values bw_chroms = [seq_chrom] * pooled_length bw_starts = [ int(seq_start + li * model.hp.target_pool) for li in range(pooled_length) ] bw_ends = [ int(bws + model.hp.target_pool) for bws in bw_starts ] bw_values = [float(bgs) for bgs in batch_grads_score] # write bw_open.addEntries(bw_chroms, bw_starts, ends=bw_ends, values=bw_values) # close bw_open.close() print(" Done in %ds." % (time.time() - t0), flush=True) gc.collect()
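# Illustrative restatement (not part of the script above): the per-position
# value written to each TSS/target BigWig is simply the dot product, at every
# pooled position, of the layer representation with the gradient of the chosen
# target with respect to that representation. saliency_scores is a hypothetical
# name introduced here only for illustration.
import numpy as np

def saliency_scores(reprs, grads):
  """reprs: (positions, units); grads: (positions, units) for one TSS/target.

  Returns one score per pooled position, matching batch_grads_score above.
  """
  return np.multiply(reprs, grads).sum(axis=1)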
def main():
  usage = 'usage: %prog [options] <params_file> <model_file> <genes_hdf5_file> <vcf_file>'
  parser = OptionParser(usage)
  parser.add_option('-a', dest='all_sed', default=False, action='store_true',
                    help='Print all variant-gene pairs, as opposed to only nonzero [Default: %default]')
  parser.add_option('-b', dest='batch_size', default=None, type='int',
                    help='Batch size [Default: %default]')
  parser.add_option('-c', dest='csv', default=False, action='store_true',
                    help='Print table as CSV [Default: %default]')
  parser.add_option('-g', dest='genome_file',
                    default='%s/assembly/human.hg19.genome' % os.environ['HG19'],
                    help='Chromosome lengths file [Default: %default]')
  parser.add_option('-o', dest='out_dir', default='sed',
                    help='Output directory for tables and plots [Default: %default]')
  parser.add_option('-p', dest='processes', default=None, type='int',
                    help='Number of processes, passed by multi script')
  parser.add_option('--rc', dest='rc', default=False, action='store_true',
                    help='Average the forward and reverse complement predictions when testing [Default: %default]')
  parser.add_option('--ti', dest='track_indexes',
                    help='Comma-separated list of target indexes to output BigWig tracks')
  parser.add_option('-x', dest='transcript_table', default=False, action='store_true',
                    help='Print transcript table in addition to gene [Default: %default]')
  parser.add_option('-w', dest='tss_width', default=1, type='int',
                    help='Width of bins considered to quantify TSS transcription [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) == 4:
    # single worker
    params_file = args[0]
    model_file = args[1]
    genes_hdf5_file = args[2]
    vcf_file = args[3]

  elif len(args) == 6:
    # multi worker
    options_pkl_file = args[0]
    params_file = args[1]
    model_file = args[2]
    genes_hdf5_file = args[3]
    vcf_file = args[4]
    worker_index = int(args[5])

    # load options
    options_pkl = open(options_pkl_file, 'rb')
    options = pickle.load(options_pkl)
    options_pkl.close()

    # update output directory
    options.out_dir = '%s/job%d' % (options.out_dir, worker_index)

  else:
    parser.error(
        'Must provide parameters and model files, genes HDF5 file, and QTL VCF file')

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  if options.track_indexes is None:
    options.track_indexes = []
  else:
    options.track_indexes = [int(ti) for ti in options.track_indexes.split(',')]
    if not os.path.isdir('%s/tracks' % options.out_dir):
      os.mkdir('%s/tracks' % options.out_dir)

  #################################################################
  # read in genes HDF5

  gene_data = basenji.genes.GeneData(genes_hdf5_file)

  # filter for worker sequences
  if options.processes is not None:
    gene_data.worker(worker_index, options.processes)

  #################################################################
  # prep SNPs

  # load SNPs
  snps = basenji.vcf.vcf_snps(vcf_file)

  # intersect w/ segments
  print('Intersecting gene sequences with SNPs...', flush=True, end='')
  seqs_snps = basenji.vcf.intersect_seqs_snps(
      vcf_file, gene_data.seq_coords, vision_p=0.5)
  print('done', flush=True)

  #################################################################
  # determine the SNP sequences needed

  seqs_snps_list = []
  for seq_i in range(gene_data.num_seqs):
    seq_chrom, seq_start, seq_end = gene_data.seq_coords[seq_i]

    if seqs_snps[seq_i]:
      # add major allele
      seqs_snps_list.append((seq_i, None, None))

      # add minor alleles
      for snp_i in seqs_snps[seq_i]:
        # determine SNP position wrt sequence
        snp_seq_pos = snps[snp_i].pos - 1 - seq_start

        # update primary sequence to use major allele
        basenji.dna_io.hot1_set(gene_data.seqs_1hot[seq_i], snp_seq_pos,
                                snps[snp_i].ref_allele)
        assert (basenji.dna_io.hot1_get(gene_data.seqs_1hot[seq_i],
                                        snp_seq_pos) == snps[snp_i].ref_allele)

        # append descriptive tuple to list
        seqs_snps_list.append((seq_i, snp_seq_pos, snps[snp_i].alt_alleles[0]))

  #################################################################
  # setup model

  job = basenji.dna_io.read_job_params(params_file)

  job['batch_length'] = gene_data.seq_length
  job['seq_depth'] = gene_data.seq_depth
  job['target_pool'] = gene_data.pool_width

  if 'num_targets' not in job and gene_data.num_targets is not None:
    job['num_targets'] = gene_data.num_targets

  if 'num_targets' not in job:
    print('Must specify number of targets (num_targets) in the parameters file.',
          file=sys.stderr)
    exit(1)

  # build model
  model = basenji.seqnn.SeqNN()
  model.build(job)

  #################################################################
  # compute, collect, and print SEDs

  header_cols = ('rsid', 'ref', 'alt', 'gene', 'tss_dist', 'target', 'ref_pred',
                 'alt_pred', 'sed', 'ser')

  if options.csv:
    sed_gene_out = open('%s/sed_gene.csv' % options.out_dir, 'w')
    print(','.join(header_cols), file=sed_gene_out)
    if options.transcript_table:
      sed_tx_out = open('%s/sed_tx.csv' % options.out_dir, 'w')
      print(','.join(header_cols), file=sed_tx_out)
  else:
    sed_gene_out = open('%s/sed_gene.txt' % options.out_dir, 'w')
    print(' '.join(header_cols), file=sed_gene_out)
    if options.transcript_table:
      sed_tx_out = open('%s/sed_tx.txt' % options.out_dir, 'w')
      print(' '.join(header_cols), file=sed_tx_out)

  # helper variables
  adj = options.tss_width // 2
  pred_buffer = model.batch_buffer // model.target_pool

  # initialize saver
  saver = tf.train.Saver()

  with tf.Session() as sess:
    # load variables into session
    saver.restore(sess, model_file)

    # initialize prediction stream
    seq_preds = PredStream(sess, model, gene_data.seqs_1hot, seqs_snps_list,
                           128, options.rc)

    # prediction index
    pi = 0

    for seq_i in range(gene_data.num_seqs):
      if seqs_snps[seq_i]:
        # get reference prediction (LxT)
        ref_preds = seq_preds[pi]
        pi += 1

        for snp_i in seqs_snps[seq_i]:
          snp = snps[snp_i]

          # get alternate prediction (LxT)
          alt_preds = seq_preds[pi]
          pi += 1

          # initialize gene data structures
          gene_pos_preds = {}  # gene -> pos -> (ref_preds, alt_preds)
          snp_dist_gene = {}

          # process transcripts
          for transcript, tx_pos in gene_data.seq_transcripts[seq_i]:
            # get gene id
            gene = gene_data.transcript_genes[transcript]

            # compute distance between SNP and TSS
            tx_gpos = gene_data.seq_coords[seq_i][1] + (
                tx_pos + 0.5) * model.target_pool
            snp_dist = abs(tx_gpos - snp.pos)
            if gene in snp_dist_gene:
              snp_dist_gene[gene] = min(snp_dist_gene[gene], snp_dist)
            else:
              snp_dist_gene[gene] = snp_dist

            # compute transcript pos in predictions
            tx_pos_buf = tx_pos - pred_buffer

            # hash transcript positions and predictions to gene id
            for tx_pos_i in range(tx_pos_buf - adj, tx_pos_buf + adj + 1):
              gene_pos_preds.setdefault(gene, {})[tx_pos_i] = (
                  ref_preds[tx_pos_i, :], alt_preds[tx_pos_i, :])

            # accumulate transcript predictions by (possibly) summing adjacent positions
            ap = alt_preds[tx_pos_buf - adj:tx_pos_buf + adj + 1, :].sum(axis=0)
            rp = ref_preds[tx_pos_buf - adj:tx_pos_buf + adj + 1, :].sum(axis=0)

            # compute SED scores
            snp_tx_sed = ap - rp
            snp_tx_ser = np.log2(ap + 1) - np.log2(rp + 1)

            # print rows to transcript table
            if options.transcript_table:
              for ti in range(ref_preds.shape[1]):
                if options.all_sed or not np.isclose(
                    snp_tx_sed[ti], 0, atol=1e-4):
                  cols = (snp.rsid, basenji.vcf.cap_allele(snp.ref_allele),
                          basenji.vcf.cap_allele(snp.alt_alleles[0]),
                          transcript, snp_dist, gene_data.target_labels[ti],
                          rp[ti], ap[ti], snp_tx_sed[ti], snp_tx_ser[ti])
                  if options.csv:
                    print(','.join([str(c) for c in cols]), file=sed_tx_out)
                  else:
                    print(
                        '%-13s %s %5s %12s %5d %12s %6.4f %6.4f %7.4f %7.4f' %
                        cols,
                        file=sed_tx_out)

          # process genes
          for gene in gene_pos_preds:
            gene_str = gene
            if gene in gene_data.multi_seq_genes:
              gene_str = '%s_multi' % gene

            # sum gene preds across positions
            gene_rp = np.zeros(ref_preds.shape[1])
            gene_ap = np.zeros(alt_preds.shape[1])
            for pos_i in gene_pos_preds[gene]:
              pos_rp, pos_ap = gene_pos_preds[gene][pos_i]
              gene_rp += pos_rp
              gene_ap += pos_ap

            # compute SED scores
            snp_gene_sed = gene_ap - gene_rp
            snp_gene_ser = np.log2(gene_ap + 1) - np.log2(gene_rp + 1)

            # print rows to gene table
            for ti in range(ref_preds.shape[1]):
              if options.all_sed or not np.isclose(
                  snp_gene_sed[ti], 0, atol=1e-4):
                cols = [
                    snp.rsid,
                    basenji.vcf.cap_allele(snp.ref_allele),
                    basenji.vcf.cap_allele(snp.alt_alleles[0]), gene_str,
                    snp_dist_gene[gene], gene_data.target_labels[ti],
                    gene_rp[ti], gene_ap[ti], snp_gene_sed[ti],
                    snp_gene_ser[ti]
                ]
                if options.csv:
                  print(','.join([str(c) for c in cols]), file=sed_gene_out)
                else:
                  print(
                      '%-13s %s %5s %12s %5d %12s %6.4f %6.4f %7.4f %7.4f' %
                      tuple(cols),
                      file=sed_gene_out)

          # print tracks
          for ti in options.track_indexes:
            ref_bw_file = '%s/tracks/%s_%s_t%d_ref.bw' % (options.out_dir,
                                                          snp.rsid, seq_i, ti)
            alt_bw_file = '%s/tracks/%s_%s_t%d_alt.bw' % (options.out_dir,
                                                          snp.rsid, seq_i, ti)
            diff_bw_file = '%s/tracks/%s_%s_t%d_diff.bw' % (options.out_dir,
                                                            snp.rsid, seq_i, ti)

            ref_bw_open = bigwig_open(ref_bw_file, options.genome_file)
            alt_bw_open = bigwig_open(alt_bw_file, options.genome_file)
            diff_bw_open = bigwig_open(diff_bw_file, options.genome_file)

            seq_chrom, seq_start, seq_end = gene_data.seq_coords[seq_i]
            bw_chroms = [seq_chrom] * ref_preds.shape[0]
            bw_starts = [
                int(seq_start + model.batch_buffer + bi * model.target_pool)
                for bi in range(ref_preds.shape[0])
            ]
            bw_ends = [int(bws + model.target_pool) for bws in bw_starts]

            ref_values = [float(p) for p in ref_preds[:, ti]]
            ref_bw_open.addEntries(bw_chroms, bw_starts, ends=bw_ends,
                                   values=ref_values)

            alt_values = [float(p) for p in alt_preds[:, ti]]
            alt_bw_open.addEntries(bw_chroms, bw_starts, ends=bw_ends,
                                   values=alt_values)

            diff_values = [
                alt_values[vi] - ref_values[vi]
                for vi in range(len(ref_values))
            ]
            diff_bw_open.addEntries(bw_chroms, bw_starts, ends=bw_ends,
                                    values=diff_values)

            ref_bw_open.close()
            alt_bw_open.close()
            diff_bw_open.close()

          # clean up
          gc.collect()

  sed_gene_out.close()
  if options.transcript_table:
    sed_tx_out.close()
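# Illustrative restatement (not part of the script above): the per-target SED
# and SER values printed to the tables reduce to the arithmetic below, where
# ref_pred and alt_pred are the per-target prediction sums around a TSS (or
# over a gene's TSS positions). sed_ser is a hypothetical helper name.
import numpy as np

def sed_ser(ref_pred, alt_pred):
  """SNP expression difference (SED) and log2 expression ratio (SER)."""
  sed = alt_pred - ref_pred
  ser = np.log2(alt_pred + 1) - np.log2(ref_pred + 1)
  return sed, ser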
def main():
  usage = 'usage: %prog [options] <params_file> <model_file> <genes_hdf5_file>'
  parser = OptionParser(usage)
  parser.add_option('-g', dest='genome_file',
                    default='%s/assembly/human.hg19.genome' % os.environ['HG19'],
                    help='Chromosome lengths file [Default: %default]')
  parser.add_option('-l', dest='gene_list',
                    help='Process only gene ids in the given file')
  parser.add_option('-o', dest='out_dir', default='grad_map',
                    help='Output directory [Default: %default]')
  parser.add_option('-t', dest='target_indexes', default=None,
                    help='Target indexes to plot')
  (options, args) = parser.parse_args()

  if len(args) != 3:
    parser.error('Must provide parameters, model, and genes HDF5 file')
  else:
    params_file = args[0]
    model_file = args[1]
    genes_hdf5_file = args[2]

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  #################################################################
  # read in genes HDF5

  gene_data = basenji.genedata.GeneData(genes_hdf5_file)

  # subset gene sequences
  genes_subset = set()
  if options.gene_list:
    for line in open(options.gene_list):
      genes_subset.add(line.rstrip())

    gene_data.subset_genes(genes_subset)
    print('Filtered to %d sequences' % gene_data.num_seqs)

  #######################################################
  # model parameters and placeholders

  job = basenji.dna_io.read_job_params(params_file)

  job['seq_length'] = gene_data.seq_length
  job['seq_depth'] = gene_data.seq_depth
  job['target_pool'] = gene_data.pool_width

  if 'num_targets' not in job:
    print('Must specify number of targets (num_targets) in the parameters file.',
          file=sys.stderr)
    exit(1)

  # set target indexes
  if options.target_indexes is not None:
    options.target_indexes = [int(ti) for ti in options.target_indexes.split(',')]
    target_subset = options.target_indexes
  else:
    options.target_indexes = list(range(job['num_targets']))
    target_subset = None

  # build model
  model = basenji.seqnn.SeqNN()
  model.build(job, target_subset=target_subset)

  # determine latest pre-dilated layer
  dilated_mask = np.array(model.cnn_dilation) > 1
  dilated_indexes = np.where(dilated_mask)[0]
  pre_dilated_layer = np.min(dilated_indexes)
  print('Pre-dilated layer: %d' % pre_dilated_layer)

  # build gradients ops
  t0 = time.time()
  print('Building target/position-specific gradient ops.', end='')
  model.build_grads_genes(gene_data.gene_seqs, layers=[pre_dilated_layer])
  print(' Done in %ds' % (time.time() - t0), flush=True)

  #######################################################
  # acquire gradients

  # initialize saver
  saver = tf.train.Saver()

  with tf.Session() as sess:
    # load variables into session
    saver.restore(sess, model_file)

    for si in range(gene_data.num_seqs):
      # initialize batcher
      batcher = basenji.batcher.Batcher(gene_data.seqs_1hot[si:si + 1],
                                        batch_size=model.batch_size,
                                        pool_width=model.target_pool)

      # get layer representations
      t0 = time.time()
      print('Computing gradients.', end='', flush=True)
      batch_grads, batch_reprs = model.gradients_genes(
          sess, batcher, gene_data.gene_seqs[si:si + 1])
      print(' Done in %ds.' % (time.time() - t0), flush=True)

      # only layer
      batch_reprs = batch_reprs[0]
      batch_grads = batch_grads[0]

      # G (TSSs) x T (targets) x P (seq position) x U (units layer i)
      print('batch_grads', batch_grads.shape)
      pooled_length = batch_grads.shape[2]

      # S (sequences) x P (seq position) x U (units layer i)
      print('batch_reprs', batch_reprs.shape)

      # write bigwigs
      t0 = time.time()
      print('Writing BigWigs.', end='', flush=True)

      # for each TSS
      for tss_i in range(batch_grads.shape[0]):
        tss = gene_data.gene_seqs[si].tss_list[tss_i]

        # for each target
        for tii in range(len(options.target_indexes)):
          ti = options.target_indexes[tii]

          # dot representation and gradient
          batch_grads_score = np.multiply(
              batch_reprs[0], batch_grads[tss_i, tii, :, :]).sum(axis=1)

          # open bigwig
          bw_file = '%s/%s-%s_t%d.bw' % \
              (options.out_dir, tss.gene_id, tss.identifier, ti)
          bw_open = bigwig_open(bw_file, options.genome_file)

          # access gene sequence information
          seq_chrom = gene_data.gene_seqs[si].chrom
          seq_start = gene_data.gene_seqs[si].start

          # specify bigwig locations and values
          bw_chroms = [seq_chrom] * pooled_length
          bw_starts = [
              int(seq_start + li * model.target_pool)
              for li in range(pooled_length)
          ]
          bw_ends = [int(bws + model.target_pool) for bws in bw_starts]
          bw_values = [float(bgs) for bgs in batch_grads_score]

          # write
          bw_open.addEntries(bw_chroms, bw_starts, ends=bw_ends, values=bw_values)

          # close
          bw_open.close()

      print(' Done in %ds.' % (time.time() - t0), flush=True)
      gc.collect()
def main():
  usage = 'usage: %prog [options] <params_file> <model_file> <genes_hdf5_file>'
  parser = OptionParser(usage)
  parser.add_option('-g', dest='genome_file',
                    default='%s/assembly/human.hg19.genome' % os.environ['HG19'],
                    help='Chromosome lengths file [Default: %default]')
  parser.add_option('-l', dest='transcript_list',
                    help='Process only transcript ids in the given file')
  parser.add_option('-o', dest='out_dir', default='grad_map',
                    help='Output directory [Default: %default]')
  parser.add_option('-t', dest='target_indexes', default=None,
                    help='Target indexes to plot')
  (options, args) = parser.parse_args()

  if len(args) != 3:
    parser.error('Must provide parameters, model, and genes HDF5 file')
  else:
    params_file = args[0]
    model_file = args[1]
    genes_hdf5_file = args[2]

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  #################################################################
  # read in genes HDF5

  gene_data = basenji.genes.GeneData(genes_hdf5_file)

  # subset transcripts
  transcripts_subset = set()
  if options.transcript_list:
    for line in open(options.transcript_list):
      transcripts_subset.add(line.rstrip())

    gene_data.subset_transcripts(transcripts_subset)
    print('Filtered to %d sequences' % gene_data.num_seqs)

  #######################################################
  # model parameters and placeholders

  job = basenji.dna_io.read_job_params(params_file)

  job['batch_length'] = gene_data.seq_length
  job['seq_depth'] = gene_data.seq_depth
  job['target_pool'] = gene_data.pool_width
  job['save_reprs'] = True

  if 'num_targets' not in job:
    print('Must specify number of targets (num_targets) in the parameters file.',
          file=sys.stderr)
    exit(1)

  # build model
  model = basenji.seqnn.SeqNN()
  model.build(job)

  # determine final pooling layer
  post_pooling_layer = len(model.cnn_pool) - 1

  #######################################################
  # acquire gradients

  # set target indexes
  if options.target_indexes is not None:
    options.target_indexes = [int(ti) for ti in options.target_indexes.split(',')]
  else:
    options.target_indexes = list(range(job['num_targets']))

  # initialize saver
  saver = tf.train.Saver()

  with tf.Session() as sess:
    # load variables into session
    saver.restore(sess, model_file)

    si = 0
    while si < gene_data.num_seqs:
      # initialize batcher
      # batcher = basenji.batcher.Batcher(seqs_1hot[si:si+model.batch_size], batch_size=model.batch_size, pool_width=model.target_pool)
      batcher = basenji.batcher.Batcher(gene_data.seqs_1hot[si:si + 1],
                                        batch_size=model.batch_size,
                                        pool_width=model.target_pool)

      # determine transcript positions
      transcript_positions = set()
      # for bi in range(model.batch_size):  # TEMP
      for bi in range(1):
        if si + bi < len(gene_data.seq_transcripts):
          for transcript, tx_pos in gene_data.seq_transcripts[si + bi]:
            transcript_positions.add(tx_pos)
      transcript_positions = sorted(list(transcript_positions))

      # get layer representations
      t0 = time.time()
      print('Computing gradients.', end='', flush=True)
      batch_grads, batch_reprs = model.gradients_pos(
          sess, batcher, transcript_positions, options.target_indexes,
          post_pooling_layer)
      print(' Done in %ds.' % (time.time() - t0), flush=True)

      # only layer
      batch_reprs = batch_reprs[0]
      batch_grads = batch_grads[0]

      # (B sequences) x (P pooled seq len) x (F filters) x (G gene positions) x (T targets)
      print('batch_grads', batch_grads.shape)
      print('batch_reprs', batch_reprs.shape)

      # (B sequences) x (P pooled seq len) x (G gene positions) x (T targets)
      pooled_length = batch_grads.shape[1]

      # write bigwigs
      t0 = time.time()
      print('Writing BigWigs.', end='', flush=True)

      # for bi in range(model.batch_size):  # TEMP
      for bi in range(1):
        sbi = si + bi
        if sbi < gene_data.num_seqs:
          positions_written = set()

          for transcript, tx_pos in gene_data.seq_transcripts[sbi]:
            # has this transcript position been written?
            if tx_pos not in positions_written:
              # which gene position is this tx_pos?
              gi = 0
              while transcript_positions[gi] != tx_pos:
                gi += 1

              # for each target
              for tii in range(len(options.target_indexes)):
                ti = options.target_indexes[tii]

                # dot representation and gradient
                batch_grads_score = np.multiply(
                    batch_reprs[bi], batch_grads[bi, :, :, gi, tii]).sum(axis=1)

                bw_file = '%s/%s_t%d.bw' % (options.out_dir, transcript, ti)
                bw_open = bigwig_open(bw_file, options.genome_file)

                seq_chrom, seq_start, seq_end = gene_data.seq_coords[sbi]
                bw_chroms = [seq_chrom] * pooled_length
                bw_starts = [
                    int(seq_start + li * model.target_pool)
                    for li in range(pooled_length)
                ]
                bw_ends = [int(bws + model.target_pool) for bws in bw_starts]
                bw_values = [float(bgs) for bgs in batch_grads_score]

                bw_open.addEntries(bw_chroms, bw_starts, ends=bw_ends,
                                   values=bw_values)
                bw_open.close()

              positions_written.add(tx_pos)

      print(' Done in %ds.' % (time.time() - t0), flush=True)
      gc.collect()

      # advance through sequences
      # si += model.batch_size
      si += 1
def score_write(sess, model, options, target_indexes, seqs_1hot, seqs_chrom,
                seqs_start):
  ''' Compute scores and write them as BigWigs for a set of sequences. '''

  num_seqs = seqs_1hot.shape[0]
  num_targets = len(target_indexes)

  # initialize scores HDF5
  scores_h5_file = '%s/scores.h5' % options.out_dir
  scores_h5_out = h5py.File(scores_h5_file, 'w')

  for si in range(num_seqs):
    # initialize batcher
    batcher_si = batcher.Batcher(seqs_1hot[si:si + 1],
                                 batch_size=model.hp.batch_size,
                                 pool_width=model.hp.target_pool)

    # get layer representations
    t0 = time.time()
    print('Computing gradients.', end='', flush=True)
    _, _, _, batch_grads, batch_reprs, _ = model.gradients(
        sess,
        batcher_si,
        rc=options.rc,
        shifts=options.shifts,
        mc_n=options.mc_n,
        return_all=True)
    print(' Done in %ds.' % (time.time() - t0), flush=True)

    # only layer
    batch_reprs = batch_reprs[0]
    batch_grads = batch_grads[0]

    # increase precision
    batch_reprs = batch_reprs.astype('float32')
    batch_grads = batch_grads.astype('float32')

    # S (sequences) x T (targets) x P (seq position) x U (units layer i) x E (ensembles)
    print('batch_grads', batch_grads.shape)

    # S (sequences) x P (seq position) x U (units layer i) x E (ensembles)
    print('batch_reprs', batch_reprs.shape)

    preds_length = batch_reprs.shape[1]

    if 'score' not in scores_h5_out:
      # initialize scores
      scores_h5_out.create_dataset('score',
                                   shape=(num_seqs, preds_length, num_targets),
                                   dtype='float16')
      scores_h5_out.create_dataset('pvalue',
                                   shape=(num_seqs, preds_length, num_targets),
                                   dtype='float16')

    # compute scores and write bigwigs
    t0 = time.time()
    print('Computing and writing scores.', end='', flush=True)

    # for each target
    for tii in range(len(target_indexes)):
      ti = target_indexes[tii]

      # representation x gradient
      batch_grads_scores = np.multiply(batch_reprs[0],
                                       batch_grads[0, tii, :, :, :])

      if options.norm is None:
        # sum across filters
        batch_grads_scores = batch_grads_scores.sum(axis=1)
      else:
        # raise to power
        batch_grads_scores = np.power(np.abs(batch_grads_scores), options.norm)

        # sum across filters
        batch_grads_scores = batch_grads_scores.sum(axis=1)

        # normalize w/ 1/power
        batch_grads_scores = np.power(batch_grads_scores, 1. / options.norm)

      # mean across ensemble
      batch_grads_mean = batch_grads_scores.mean(axis=1)

      # compute p-values
      if options.norm is None:
        batch_grads_pval = ttest_1samp(batch_grads_scores, 0, axis=1)[1]
      else:
        batch_grads_pval = ttest_1samp(batch_grads_scores, 0, axis=1)[1]
        # batch_grads_pval = chi2(df=)
        batch_grads_pval /= 2

      # write to HDF5
      scores_h5_out['score'][si, :, tii] = batch_grads_mean.astype('float16')
      scores_h5_out['pvalue'][si, :, tii] = batch_grads_pval.astype('float16')

      if options.bigwig:
        # open bigwig
        bws_file = '%s/s%d_t%d_scores.bw' % (options.out_dir, si, ti)
        bwp_file = '%s/s%d_t%d_pvals.bw' % (options.out_dir, si, ti)
        bws_open = bigwig_open(bws_file, options.genome_file)
        # bwp_open = bigwig_open(bwp_file, options.genome_file)

        # specify bigwig locations and values
        bw_chroms = [seqs_chrom[si]] * preds_length
        bw_starts = [
            int(seqs_start[si] + pi * model.hp.target_pool)
            for pi in range(preds_length)
        ]
        bw_ends = [int(bws + model.hp.target_pool) for bws in bw_starts]
        bws_values = [float(bgs) for bgs in batch_grads_mean]
        # bwp_values = [float(bgp) for bgp in batch_grads_pval]

        # write
        bws_open.addEntries(bw_chroms, bw_starts, ends=bw_ends,
                            values=bws_values)
        # bwp_open.addEntries(bw_chroms, bw_starts, ends=bw_ends, values=bwp_values)

      # close
      if options.bigwig:
        bws_open.close()
        # bwp_open.close()

    print(' Done in %ds.' % (time.time() - t0), flush=True)
    gc.collect()

  scores_h5_out.close()
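# Illustrative restatement (not part of the script above): the reduction that
# score_write applies per target. With norm=None the representation-times-
# gradient products are summed over the unit axis; otherwise an L-p norm over
# units is taken instead, and the ensemble mean is computed afterwards.
# grad_repr_scores is a hypothetical name introduced only for illustration.
import numpy as np

def grad_repr_scores(reprs, grads, norm=None):
  """reprs: (positions, units, ensembles); grads: same shape, for one target.

  Returns per-position, per-ensemble scores of shape (positions, ensembles),
  matching batch_grads_scores above before the ensemble mean.
  """
  scores = np.multiply(reprs, grads)
  if norm is None:
    scores = scores.sum(axis=1)
  else:
    scores = np.power(np.abs(scores), norm).sum(axis=1)
    scores = np.power(scores, 1. / norm)
  return scores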