#!/usr/bin/env python
# Shared imports assumed by the script excerpts below. Helper functions that
# are called but not defined here (plot_filter_heat, seq_logo, get_real_pred,
# gc, message, etc.) come from the surrounding Basset scripts.
from optparse import OptionParser
import math
import os
import random
import subprocess

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from scipy.stats import spearmanr

import dna_io
from dna_io import vecs2dna

# module-level seaborn palette referenced below (assumed)
sns_colors = sns.color_palette('deep')


def main():
    usage = 'usage: %prog [options] <hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option('-s', dest='set', default='test',
                      help='Set (train/valid/test) [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide data HDF5 file')
    else:
        hdf5_file = args[0]

    # load one-hot coded sequences from HDF5
    hdf5_in = h5py.File(hdf5_file, 'r')
    seqs_1hot = np.array(hdf5_in['%s_in' % options.set])
    try:
        seq_headers = np.array(hdf5_in['test_headers'])
    except KeyError:
        seq_headers = None
    hdf5_in.close()

    # convert to ACGT sequences
    seqs = dna_io.vecs2dna(seqs_1hot)

    for i, seq in enumerate(seqs):
        if seq_headers is None:
            header = 'seq%d' % i
        else:
            header = seq_headers[i]
        print '>%s\n%s' % (header, seq)
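# A minimal sketch (not part of the script above) of the one-hot layout that
# dna_io.vecs2dna() decodes: (num_seqs, 4, 1, seq_len), with rows ordered
# A, C, G, T as elsewhere in these scripts. The demo name and values are
# hypothetical.
def _demo_vecs2dna():
    vec = np.zeros((1, 4, 1, 4), dtype='float16')
    for li, ni in enumerate([0, 1, 2, 3]):  # one-hot 'A', 'C', 'G', 'T'
        vec[0, ni, 0, li] = 1
    print dna_io.vecs2dna(vec)  # expected: ['ACGT']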
def plot_weight_logo(filter_weights, seq_tensor, norm_beta, norm_gamma,
                     norm_mean, norm_var, out_prefix, maxpct_t=0.0):
    # convolve the filter against every k-mer; result is a 1D array
    # with one score per k-mer
    kmer_conv = np.tensordot(filter_weights, seq_tensor, axes=((0, 1), (1, 2)))

    # batch normalization using the trained parameters
    kmer_conv_norm = (kmer_conv - norm_mean) * norm_beta / np.sqrt(norm_var + 0.00001) + norm_gamma

    # ReLU
    kmer_conv_relu = [0 if val < 0 else val for val in kmer_conv_norm]

    # calculate a ReLU cutoff for the weblogo
    if maxpct_t == 0:
        relu_act = 0
    else:
        all_outs = np.ravel(kmer_conv_relu)
        all_outs_mean = all_outs.mean()
        all_outs_norm = all_outs - all_outs_mean
        relu_act = maxpct_t * all_outs_norm.max() + all_outs_mean

    # find which k-mers pass the cutoff
    kmer_ok_index = [i for i in range(len(kmer_conv_relu))
                     if kmer_conv_relu[i] > relu_act]

    if len(kmer_ok_index) > 0:
        kmer_ok = np.array([seq_tensor[i, :, :] for i in kmer_ok_index])  # shape (N, 4, k)
        kmer_ok_4D = np.expand_dims(kmer_ok, axis=2)  # shape (N, 4, 1, k)
        kmer_seq = dna_io.vecs2dna(kmer_ok_4D)

        kmer_fasta = [">" + str(i) + "\n" + seq + "\n"
                      for i, seq in zip(range(len(kmer_seq)), kmer_seq)]
        kmer_fasta_out = open("%s_activated_kmers.fa" % out_prefix, "w")
        kmer_fasta_out.writelines(kmer_fasta)
        kmer_fasta_out.close()

        if relu_act > 0:
            subprocess.call("weblogo -X NO -Y NO -F pdf --resolution 300 --errorbars NO --fineprint '' -C '#CB2026' A A -C '#34459C' C C -C '#FBB116' G G -C '#0C8040' T T < %s_activated_kmers.fa > %s_weight_LOGO.pdf" % (out_prefix, out_prefix), shell=True)
        else:
            subprocess.call("weblogo -U probability -F pdf --resolution 300 --errorbars NO --fineprint '' -C '#CB2026' A A -C '#34459C' C C -C '#FBB116' G G -C '#0C8040' T T < %s_activated_kmers.fa > %s_weight_LOGO.pdf" % (out_prefix, out_prefix), shell=True)
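# A hypothetical call to plot_weight_logo() with synthetic shapes, to make the
# expected inputs concrete: one 4 x 5 filter scored against 50 random one-hot
# 5-mers, with the trained batch-norm statistics stood in by dummy scalars.
# Requires the weblogo command-line tool on the PATH.
def _demo_plot_weight_logo():
    k, n = 5, 50
    filter_weights = np.random.randn(4, k)
    seq_tensor = np.zeros((n, 4, k))
    for si in range(n):
        for li in range(k):
            seq_tensor[si, random.randint(0, 3), li] = 1
    plot_weight_logo(filter_weights, seq_tensor, norm_beta=1.0, norm_gamma=0.0,
                     norm_mean=0.0, norm_var=1.0, out_prefix='demo_filter',
                     maxpct_t=0.5)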
def parse_filter_scores_multiple_base_hdf5(scores_hdf5_file):
    ### single motif scores = [activation for each filter][base_seq]
    ### paired motif scores = activation for [filter1][filter2][base_seq]
    ### seqs = motif that is used to represent each filter
    scores_hdf5_in = h5py.File(scores_hdf5_file, 'r')
    preds = np.array(scores_hdf5_in['preds'])
    seq_vecs = scores_hdf5_in['seqs']
    # print preds.shape
    # print seq_vecs.shape
    seqs = dna_io.vecs2dna(seq_vecs)
    scores_hdf5_in.close()

    assert (NUM_BASE_SEQS + NUM_BASE_SEQS * NUM_SEQS +
            NUM_BASE_SEQS * NUM_SEQS * NUM_SEQS == len(preds))
    num_seqs = NUM_SEQS

    base_seqs = []  # was never initialized in the original; filled below
    base_scores = []
    single_motif_scores = []
    paired_motif_scores = []
    for i in range(num_seqs):
        single_motif_scores.append([False] * NUM_BASE_SEQS)
        paired_motif_scores.append([])
        for j in range(num_seqs):
            paired_motif_scores[i].append([False] * NUM_BASE_SEQS)

    z = 0
    for i in range(NUM_BASE_SEQS):
        base_scores.append(preds[i])
        base_seqs.append(seqs[i])
        z += 1

    motif_seqs = []
    for i in range(NUM_SEQS):
        motif_seqs.append(seqs[z][300:319])
        for j in range(NUM_BASE_SEQS):
            single_motif_scores[i][j] = preds[z]
            z += 1

    for i in range(num_seqs):
        for j in range(num_seqs):
            for k in range(NUM_BASE_SEQS):
                paired_motif_scores[i][j][k] = preds[z]
                z += 1

    return (base_seqs, base_scores, single_motif_scores, paired_motif_scores,
            motif_seqs)
def parse_filter_scores_hdf5(scores_hdf5_file):
    ### single motif scores = [activation for each filter]
    ### paired motif scores = activation for [filter1][filter2][offset]
    ### seqs = motif that is used to represent each filter
    scores_hdf5_in = h5py.File(scores_hdf5_file, 'r')
    preds = np.array(scores_hdf5_in['preds'])
    seq_vecs = scores_hdf5_in['seqs']
    print preds.shape
    print seq_vecs.shape
    seqs = dna_io.vecs2dna(seq_vecs)
    scores_hdf5_in.close()

    # num_seqs = len(seqs)
    assert (NUM_SEQS + WINDOW_SIZE * NUM_SEQS * NUM_SEQS == len(preds))
    num_seqs = NUM_SEQS
    window_size = WINDOW_SIZE
    # window_size = (len(preds) - num_seqs) / (num_seqs * num_seqs)

    single_motif_scores = []
    paired_motif_scores = []
    for i in range(num_seqs):
        paired_motif_scores.append([])
        for j in range(num_seqs):
            paired_motif_scores[i].append([False] * window_size)

    for i in range(num_seqs):
        single_motif_scores.append(preds[i])

    z = num_seqs
    for i in range(num_seqs):
        for j in range(num_seqs):
            for k in range(window_size):
                paired_motif_scores[i][j][k] = preds[z]
                z += 1

    return (single_motif_scores, paired_motif_scores, seqs)
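# A sketch of the flat prediction ordering the parser above assumes, with the
# offset index varying fastest: the first NUM_SEQS rows hold single-motif
# scores, and the remainder hold the paired scores. The helper name is
# hypothetical; it simply inverts the loop order used above.
def paired_score_index(i, j, k, num_seqs=NUM_SEQS, window_size=WINDOW_SIZE):
    # row of preds holding the score for filter i paired with filter j at offset k
    return num_seqs + (i * num_seqs + j) * window_size + k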
def main():
    usage = 'usage: %prog [options] <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='targets_file', default=None,
                      help='File labeling targets in the second column [Default: %default]')
    parser.add_option('-c', dest='center_nt', default=50, type='int',
                      help='Center nt to consider kmers from [Default: %default]')
    parser.add_option('-d', dest='model_out_file', default=None,
                      help='Pre-computed model output table.')
    parser.add_option('-k', dest='kmer', default=8, type='int',
                      help='K-mer length [Default: %default]')
    parser.add_option('-l', dest='seq_len', default=1000, type='int',
                      help='Input sequence length [Default: %default]')
    parser.add_option('-n', dest='num_seqs', default=100000, type='int',
                      help='Number of sequences to predict [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-r', dest='rc', default=False, action='store_true',
                      help='Consider k-mers w/ their reverse complements [Default: %default]')
    parser.add_option('-t', dest='targets', default=None,
                      help='Comma-separated list of targets to analyze in more depth [Default: %default]')
    parser.add_option('--top', dest='top_num', default=100, type='int',
                      help='Number of sequences with which to make a multiple sequence alignment')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide Basset model file.')
    else:
        model_file = args[0]

    random.seed(2)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.model_out_file is not None:
        # re-use the sequences from a previous run
        seq_dna = []
        for line in open('%s/seqs.fa' % options.out_dir):
            if line[0] == '>':
                seq_dna.append('')
            else:
                seq_dna[-1] += line.rstrip()

    else:
        #################################################################
        # generate random sequences
        #################################################################
        seq_vecs = np.zeros((options.num_seqs, 4, 1, options.seq_len), dtype='float16')
        for si in range(options.num_seqs):
            for li in range(options.seq_len):
                ni = random.randint(0, 3)
                seq_vecs[si, ni, 0, li] = 1

        # create a new HDF5 file
        seq_hdf5_file = '%s/seqs.h5' % options.out_dir
        seq_hdf5_out = h5py.File(seq_hdf5_file, 'w')
        seq_hdf5_out.create_dataset('test_in', data=seq_vecs)
        seq_hdf5_out.close()

        # get fasta
        seq_dna = vecs2dna(seq_vecs)

        # print to file
        fasta_out = open('%s/seqs.fa' % options.out_dir, 'w')
        for i in range(len(seq_dna)):
            print >> fasta_out, '>%d\n%s' % (i, seq_dna[i])
        fasta_out.close()

        #################################################################
        # Torch predict
        #################################################################
        options.model_out_file = '%s/model_out.txt' % options.out_dir
        torch_cmd = 'basset_predict.lua -scores %s %s %s' % (model_file, seq_hdf5_file, options.model_out_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

        # clean up sequence HDF5
        os.remove(seq_hdf5_file)

    # load scores
    seq_scores = np.loadtxt(options.model_out_file, dtype='float32')

    # read target labels
    if options.targets_file:
        target_labels = [line.split()[1] for line in open(options.targets_file)]
    else:
        target_labels = ['t%d' % (ti + 1) for ti in range(seq_scores.shape[1])]

    if options.targets is None:
        options.targets = range(seq_scores.shape[1])
    else:
        options.targets = [int(ti) for ti in options.targets.split(',')]

    #################################################################
    # process and output
    #################################################################
    kmers_start = (options.seq_len - options.center_nt) / 2

    for ti in options.targets:
        print 'Working on target %d' % ti

        ##############################################
        # hash scores by k-mer
        ##############################################
        kmer_scores_raw = {}
        for si in range(len(seq_dna)):
            # get score
            sscore = seq_scores[si, ti]

            # hash to each center k-mer
            for ki in range(kmers_start, kmers_start + options.center_nt):
                kmer = seq_dna[si][ki:ki + options.kmer]
                if options.rc:
                    kmer = consider_rc(kmer)
                kmer_scores_raw.setdefault(kmer, []).append(sscore)

        ##############################################
        # compute means and print table
        ##############################################
        table_out = open('%s/table%d.txt' % (options.out_dir, ti), 'w')
        kmer_means_raw = {}
        for kmer in kmer_scores_raw:
            kmer_means_raw[kmer] = np.mean(kmer_scores_raw[kmer])
            kmer_n = len(kmer_scores_raw[kmer])
            cols = (kmer, kmer_n, kmer_means_raw[kmer],
                    np.std(kmer_scores_raw[kmer]) / math.sqrt(kmer_n))
            print >> table_out, '%s %4d %6.3f %6.3f' % cols
        table_out.close()

        ##############################################
        # plot density
        ##############################################
        plt.figure()
        sns.distplot(kmer_means_raw.values(), kde=False)
        plt.savefig('%s/density%d.pdf' % (options.out_dir, ti))
        plt.close()

        ##############################################
        # top k-mers distance matrix
        ##############################################
        kmer_means = {}
        kmer_means_mean = np.mean(kmer_means_raw.values())
        for kmer in kmer_means_raw:
            kmer_means[kmer] = kmer_means_raw[kmer] - kmer_means_mean

        # sort k-mers by score
        scores_kmers = [(kmer_means[kmer], kmer) for kmer in kmer_means]
        scores_kmers.sort(reverse=True)

        # take top k-mers
        top_kmers = []
        top_kmers_scores = []
        for score, kmer in scores_kmers[:options.top_num]:
            top_kmers.append(kmer)
            top_kmers_scores.append(score)
        top_kmers = np.array(top_kmers)
        top_kmers_scores = np.array(top_kmers_scores)

        # compute distance matrix
        top_kmers_dists = np.zeros((options.top_num, options.top_num))
        for i in range(options.top_num):
            for j in range(i + 1, options.top_num):
                if options.rc:
                    top_kmers_dists[i, j] = kmer_distance_rc(top_kmers[i], top_kmers[j])
                else:
                    top_kmers_dists[i, j] = kmer_distance(top_kmers[i], top_kmers[j])
                top_kmers_dists[j, i] = top_kmers_dists[i, j]

        # clip the distances
        np.clip(top_kmers_dists, 0, 3, out=top_kmers_dists)

        # plot
        plot_kmer_dists(top_kmers_dists, top_kmers_scores, top_kmers,
                        '%s/top_kmers_heat%d.pdf' % (options.out_dir, ti))

        # cluster and plot
        cluster_kmer_dists(top_kmers_dists, top_kmers_scores, top_kmers,
                           '%s/top_kmers_clust%d.pdf' % (options.out_dir, ti))
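# consider_rc() is called above but defined elsewhere in the repo; a minimal
# sketch under the assumption that it canonicalizes a k-mer to the
# lexicographically smaller of itself and its reverse complement (the actual
# helper may differ).
_RC_MAP = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

def consider_rc_sketch(kmer):
    kmer_rc = ''.join([_RC_MAP[nt] for nt in reversed(kmer)])
    return min(kmer, kmer_rc)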
def main():
    usage = 'usage: %prog [options] <model_file> <test_hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='model_hdf5_file', default=None,
                      help='Pre-computed model output as HDF5.')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-m', dest='meme_db',
                      default='%s/data/motifs/Homo_sapiens.meme' % os.environ['BASSETDIR'],
                      help='MEME database used to annotate motifs')
    parser.add_option('-s', dest='sample', default=None, type='int',
                      help='Sample sequences from the test set [Default: %default]')
    parser.add_option('-t', dest='trim_filters', default=False, action='store_true',
                      help='Trim uninformative positions off the filter ends [Default: %default]')
    parser.add_option('--skip-heat', dest='skip_heat', default=False, action='store_true',
                      help='Skip plotting heat maps of filters')
    parser.add_option('--skip-logo', dest='skip_logo', default=False, action='store_true',
                      help='Skip Weblogo plots for filters')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and test data in HDF5 format.')
    else:
        model_file = args[0]
        test_hdf5_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # load data
    #################################################################
    # load sequences
    test_hdf5_in = h5py.File(test_hdf5_file, 'r')
    seq_vecs = np.array(test_hdf5_in['test_in'])
    # print seq_vecs.shape
    # print "with numpy", seq_vecs.nbytes
    seq_targets = np.array(test_hdf5_in['test_out'])
    try:
        target_names = list(test_hdf5_in['target_labels'])
    except KeyError:
        target_names = ['t%d' % ti for ti in range(seq_targets.shape[1])]
    test_hdf5_in.close()

    #################################################################
    # sample
    #################################################################
    if options.model_hdf5_file is not None:
        print 'Model outs file specified; not resampling. Using the sample sequences from the model outs file.'
    else:
        if options.sample is not None:
            # choose sampled indexes
            sample_i = np.array(random.sample(xrange(seq_vecs.shape[0]), options.sample))

            # filter
            seq_vecs = seq_vecs[sample_i]
            seq_targets = seq_targets[sample_i]

            # create a new HDF5 file
            sample_hdf5_file = '%s/sample.h5' % options.out_dir
            sample_hdf5_out = h5py.File(sample_hdf5_file, 'w')
            print seq_vecs.shape
            sample_hdf5_out.create_dataset('test_in', data=seq_vecs)
            sample_hdf5_out.close()

            # update test HDF5
            test_hdf5_file = sample_hdf5_file
            print 'Finished creating sample file'

    #################################################################
    # Torch predict
    #################################################################
    if options.model_hdf5_file is None:
        print 'No model hdf5 file specified'
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_motifs_predict.lua %s %s %s' % (model_file, test_hdf5_file, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    # load model output
    model_hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    filter_weights = np.array(model_hdf5_in['weights'])
    filter_outs = np.array(model_hdf5_in['outs'])
    # seq_vecs = model_hdf5_in['sample_seqs']
    seqs = dna_io.vecs2dna(seq_vecs)
    model_hdf5_in.close()

    # store useful variables
    num_filters = filter_weights.shape[0]
    filter_size = filter_weights.shape[2]

    #################################################################
    # individual filter plots
    #################################################################
    # also save information contents
    filters_ic = []
    meme_out_file = meme_intro('%s/filters_meme.txt' % options.out_dir, seqs)

    for f in range(num_filters):
        print 'Filter %d' % f

        # plot filter parameters as a heatmap
        if not options.skip_heat:
            plot_filter_heat(filter_weights[f, :, :],
                             '%s/filter%d_heat.pdf' % (options.out_dir, f))

        # plot weblogo of high scoring outputs
        if not options.skip_logo:
            plot_filter_logo(filter_outs[:, f, :], filter_size, seqs,
                             '%s/filter%d_logo' % (options.out_dir, f), maxpct_t=0.5)

        # make a PWM for the filter
        filter_pwm, nsites = make_filter_pwm('%s/filter%d_logo.fa' % (options.out_dir, f))

        if nsites < 10:
            # no information
            filters_ic.append(0)
            print 'No information'
        else:
            # compute and save information content
            filters_ic.append(info_content(filter_pwm))

            # add to the meme motif file
            meme_add(meme_out_file, f, filter_pwm, nsites, options.trim_filters)

    meme_out_file.close()

    #################################################################
    # annotate filters
    #################################################################
    # run tomtom
    subprocess.call('tomtom -dist pearson -thresh 0.1 -oc %s/tomtom %s/filters_meme.txt %s'
                    % (options.out_dir, options.out_dir, options.meme_db), shell=True)

    # read in annotations
    filter_names = name_filters(num_filters, '%s/tomtom/tomtom.txt' % options.out_dir, options.meme_db)

    #################################################################
    # print a table of information
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    # print header for later pandas reading
    header_cols = ('', 'consensus', 'annotation', 'ic', 'mean', 'std')
    print >> table_out, '%3s %19s %10s %5s %6s %6s' % header_cols

    for f in range(num_filters):
        # collapse to a consensus motif
        consensus = filter_motif(filter_weights[f, :, :])

        # grab annotation
        annotation = '.'
        name_pieces = filter_names[f].split('_')
        if len(name_pieces) > 1:
            annotation = name_pieces[1]

        # plot density of filter output scores
        fmean, fstd = plot_score_density(np.ravel(filter_outs[:, f, :]),
                                         '%s/filter%d_dens.pdf' % (options.out_dir, f))

        row_cols = (f, consensus, annotation, filters_ic[f], fmean, fstd)
        print >> table_out, '%-3d %19s %10s %5.2f %6.4f %6.4f' % row_cols

    table_out.close()

    #################################################################
    # global filter plots
    #################################################################
    # plot filter-sequence heatmap
    plot_filter_seq_heat(filter_outs, '%s/filter_seqs.pdf' % options.out_dir)

    # plot filter-segment heatmap
    plot_filter_seg_heat(filter_outs, '%s/filter_segs.pdf' % options.out_dir)
    plot_filter_seg_heat(filter_outs, '%s/filter_segs_raw.pdf' % options.out_dir, whiten=False)

    # plot filter-target correlation heatmap
    plot_target_corr(filter_outs, seq_targets, filter_names, target_names,
                     '%s/filter_target_cors_mean.pdf' % options.out_dir, 'mean')
    plot_target_corr(filter_outs, seq_targets, filter_names, target_names,
                     '%s/filter_target_cors_max.pdf' % options.out_dir, 'max')
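# info_content() is used above but defined elsewhere; a common definition,
# shown as a sketch under a uniform background (the repo's version may apply
# pseudocounts or a GC-adjusted background): summed per-position information
# of a positions-by-ACGT PWM.
def info_content_sketch(pwm, eps=1e-9):
    pwm = np.array(pwm, dtype='float64')
    ic = 0.0
    for row in pwm:  # one row per position, columns A, C, G, T
        ic += 2.0 + np.sum(row * np.log2(row + eps))
    return ic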
def main():
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file',
                      help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('-d', dest='model_hdf5_file', default=None,
                      help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-g', dest='gain_height', default=False, action='store_true',
                      help='Nucleotide heights determined by the max of loss and gain [Default: %default]')
    parser.add_option('-m', dest='min_limit', default=0.1, type='float',
                      help='Minimum heatmap limit [Default: %default]')
    parser.add_option('-n', dest='center_nt', default=0, type='int',
                      help='Center nt to mutate and plot in the heat map [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat',
                      help='Output directory [Default: %default]')
    parser.add_option('-p', dest='print_table_all', default=False, action='store_true',
                      help='Print all targets to the table [Default: %default]')
    parser.add_option('-r', dest='rng_seed', default=1, type='float',
                      help='Random number generator seed [Default: %default]')
    parser.add_option('-s', dest='sample', default=None, type='int',
                      help='Sample sequences from the test set [Default: %default]')
    parser.add_option('-t', dest='targets', default='0',
                      help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF5 file)')
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(options.rng_seed)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(input_file, options.input_activity_file,
                                                       mean_norm=False, whiten=False,
                                                       permute=False, sort=False)

            # read in target names
            target_labels = open(options.input_activity_file).readline().strip().split('\t')
        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            seq_headers = [seq_headers[i] for i in sample_i]  # list, so index elementwise
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] / 4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            try:  # TEMP
                seq_headers = np.array(hdf5_in['test_headers'])
                target_labels = np.array(hdf5_in['target_labels'])
            except KeyError:
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5' % options.out_dir
                h5f = h5py.File(model_input_hdf5, 'w')
                h5f.create_dataset('test_in', data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_sat_predict.lua -center_nt %d %s %s %s' % (options.center_nt, model_file,
                                                                       model_input_hdf5, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    seq_mod_preds = np.array(hdf5_in['seq_mod_preds'])
    hdf5_in.close()

    # trim seqs to match seq_mod_preds length
    seq_len = len(seqs[0])
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < seq_len:
        delta_start = (seq_len - delta_len) / 2
        for i in range(len(seqs)):
            seqs[i] = seqs[i][delta_start:delta_start + delta_len]

    # decide which cells to plot
    if options.targets == '-1':
        plot_targets = xrange(seq_mod_preds.shape[3])
    else:
        plot_targets = [int(ci) for ci in options.targets.split(',')]

    #################################################################
    # plot
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    rdbu = sns.color_palette('RdBu_r', 10)
    nts = 'ACGT'

    for si in range(seq_mod_preds.shape[0]):
        try:
            header = seq_headers[si]
        except TypeError:
            header = 'seq%d' % si
        seq = seqs[si]

        # plot some descriptive heatmaps for each individual cell type
        for ci in plot_targets:
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            # compute matrices
            norm_matrix = seq_mod_preds_cell - real_pred_cell
            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            minmax_matrix = np.vstack([min_scores - real_pred_cell,
                                       max_scores - real_pred_cell])

            # prepare figure
            sns.set(style='white', font_scale=0.5)
            sns.axes_style({'axes.linewidth': 1})
            heat_cols = 400
            sad_start = 1
            sad_end = 323
            logo_start = 0
            logo_end = 324
            fig = plt.figure(figsize=(20, 3))
            ax_logo = plt.subplot2grid((3, heat_cols), (0, logo_start), colspan=(logo_end - logo_start))
            ax_sad = plt.subplot2grid((3, heat_cols), (1, sad_start), colspan=(sad_end - sad_start))
            ax_heat = plt.subplot2grid((3, heat_cols), (2, 0), colspan=heat_cols)

            # print a WebLogo of the sequence
            vlim = max(options.min_limit, abs(minmax_matrix).max())
            if options.gain_height:
                seq_heights = 0.25 + 1.75 / vlim * (abs(minmax_matrix).max(axis=0))
            else:
                seq_heights = 0.25 + 1.75 / vlim * (-minmax_matrix[0])
            logo_eps = '%s/%s_c%d_seq.eps' % (options.out_dir, header_filename(header), ci)
            seq_logo(seq, seq_heights, logo_eps)

            # add to figure
            logo_png = '%s.png' % logo_eps[:-4]
            subprocess.call('convert -density 300 %s %s' % (logo_eps, logo_png), shell=True)
            logo = Image.open(logo_png)
            ax_logo.imshow(logo)
            ax_logo.set_axis_off()

            # plot loss and gain SAD scores
            ax_sad.plot(-minmax_matrix[0], c=rdbu[0], label='loss', linewidth=1)
            ax_sad.plot(minmax_matrix[1], c=rdbu[-1], label='gain', linewidth=1)
            ax_sad.set_xlim(0, minmax_matrix.shape[1])
            ax_sad.legend()
            # ax_sad.grid(True, linestyle=':')
            for axis in ['top', 'bottom', 'left', 'right']:
                ax_sad.spines[axis].set_linewidth(0.5)

            # plot real-normalized scores
            vlim = max(options.min_limit, abs(norm_matrix).max())
            sns.heatmap(norm_matrix, linewidths=0, cmap='RdBu_r',
                        vmin=-vlim, vmax=vlim, xticklabels=False, ax=ax_heat)
            ax_heat.yaxis.set_ticklabels('TGCA', rotation='horizontal')  # , size=10)

            # save final figure
            plt.tight_layout()
            plt.savefig('%s/%s_c%d_heat.pdf' % (options.out_dir, header.replace(':', '_'), ci), dpi=300)
            plt.close()

        #################################################################
        # print table of nt variability for each cell
        #################################################################
        print_targets = plot_targets
        if options.print_table_all:
            print_targets = range(seq_mod_preds.shape[3])

        for ci in print_targets:
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            loss_matrix = real_pred_cell - min_scores
            gain_matrix = max_scores - real_pred_cell

            for pos in range(seq_mod_preds_cell.shape[1]):
                cols = [header, delta_start + pos, ci, loss_matrix[pos], gain_matrix[pos]]
                print >> table_out, '\t'.join([str(c) for c in cols])

    table_out.close()
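# Example invocation (hypothetical script name and paths): saturation
# mutagenesis of the central 200 nt of each input sequence, plotting
# target 0 only.
#
#   basset_sat.py -n 200 -t 0 -o sat_out model_best.th seqs.fa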
def main():
    usage = 'usage: %prog [options] <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='targets_file', default=None,
                      help='File labeling targets in the second column [Default: %default]')
    parser.add_option('-c', dest='center_nt', default=50, type='int',
                      help='Center nt to consider kmers from [Default: %default]')
    parser.add_option('-d', dest='model_out_file', default=None,
                      help='Pre-computed model output table.')
    parser.add_option('-k', dest='kmer', default=8, type='int',
                      help='K-mer length [Default: %default]')
    parser.add_option('-l', dest='seq_len', default=1000, type='int',
                      help='Input sequence length [Default: %default]')
    parser.add_option('-n', dest='num_seqs', default=100000, type='int',
                      help='Number of sequences to predict [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-r', dest='rc', default=False, action='store_true',
                      help='Consider k-mers w/ their reverse complements [Default: %default]')
    parser.add_option('-t', dest='targets', default=None,
                      help='Comma-separated list of targets to analyze in more depth [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide Basset model file.')
    else:
        model_file = args[0]

    random.seed(2)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # generate random sequences
    #################################################################
    seq_vecs = np.zeros((options.num_seqs, 4, 1, options.seq_len), dtype='float16')
    for si in range(options.num_seqs):
        for li in range(options.seq_len):
            ni = random.randint(0, 3)
            seq_vecs[si, ni, 0, li] = 1

    # create a new HDF5 file
    seq_hdf5_file = '%s/seqs.h5' % options.out_dir
    seq_hdf5_out = h5py.File(seq_hdf5_file, 'w')
    seq_hdf5_out.create_dataset('test_in', data=seq_vecs)
    seq_hdf5_out.close()

    # get fasta
    seq_dna = vecs2dna(seq_vecs)

    #################################################################
    # Torch predict
    #################################################################
    if options.model_out_file is None:
        options.model_out_file = '%s/model_out.txt' % options.out_dir
        torch_cmd = 'basset_predict.lua -scores %s %s %s' % (model_file, seq_hdf5_file, options.model_out_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    # load scores
    seq_scores = np.loadtxt(options.model_out_file, dtype='float32')

    # read target labels
    if options.targets_file:
        target_labels = [line.split()[1] for line in open(options.targets_file)]
    else:
        target_labels = ['t%d' % (ti + 1) for ti in range(seq_scores.shape[1])]

    if options.targets is None:
        options.targets = range(seq_scores.shape[1])

    #################################################################
    # process and output
    #################################################################
    kmers_start = (options.seq_len - options.center_nt) / 2

    for ti in options.targets:
        ##############################################
        # hash scores by k-mer
        ##############################################
        kmer_scores = {}
        for si in range(len(seq_dna)):
            # get score
            sscore = seq_scores[si, ti]

            # hash to each center k-mer
            for ki in range(kmers_start, kmers_start + options.center_nt):
                kmer = seq_dna[si][ki:ki + options.kmer]
                if options.rc:
                    kmer = consider_rc(kmer)
                kmer_scores.setdefault(kmer, []).append(sscore)

        ##############################################
        # print table
        ##############################################
        table_out = open('%s/table%d.txt' % (options.out_dir, ti), 'w')
        for kmer in kmer_scores:
            cols = (kmer, len(kmer_scores[kmer]), np.mean(kmer_scores[kmer]),
                    np.std(kmer_scores[kmer]) / math.sqrt(len(kmer_scores[kmer])))
            print >> table_out, '%s %4d %6.3f %6.3f' % cols
        table_out.close()
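# Example invocation (hypothetical script name and paths): score 100,000
# random 1 kb sequences with a trained model and tabulate the mean predicted
# activity of every central 8-mer.
#
#   basset_kmers.py -k 8 -n 100000 -o kmer_out model_best.th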
def main():
    usage = "usage: %prog [options] <motif> <model_file> <test_hdf5_file>"
    parser = OptionParser(usage)
    parser.add_option("-d", dest="model_hdf5_file", default=None,
                      help="Pre-computed model output as HDF5.")
    parser.add_option("-f", dest="filters", default=None,
                      help="Filters to plot length analysis [Default: %default]")
    parser.add_option("-o", dest="out_dir", default=".")
    parser.add_option("-p", dest="pool", default=False, action="store_true",
                      help="Take representation after pooling [Default: %default]")
    parser.add_option("-s", dest="sample", default=None, type="int",
                      help="Sequences to sample [Default: %default]")
    parser.add_option("-t", dest="targets", default=None,
                      help="Comma-separated list of targets to analyze in more depth [Default: %default]")
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error("Must provide motif, Basset model file, and test data in HDF5 format.")
    else:
        motif = args[0]
        model_file = args[1]
        test_hdf5_file = args[2]

    random.seed(2)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # load data
    #################################################################
    # load sequences
    test_hdf5_in = h5py.File(test_hdf5_file, "r")
    seq_vecs = np.array(test_hdf5_in["test_in"])
    seq_targets = np.array(test_hdf5_in["test_out"])
    seq_headers = np.array(test_hdf5_in["test_headers"])
    target_labels = np.array(test_hdf5_in["target_labels"])
    test_hdf5_in.close()

    #################################################################
    # sample
    #################################################################
    if options.sample is not None and options.sample < seq_vecs.shape[0]:
        # choose sampled indexes
        sample_i = np.array(random.sample(xrange(seq_vecs.shape[0]), options.sample))

        # filter
        seq_vecs = seq_vecs[sample_i]
        seq_targets = seq_targets[sample_i]
        seq_headers = seq_headers[sample_i]

        # create a new HDF5 file
        sample_hdf5_file = "%s/sample.h5" % options.out_dir
        sample_hdf5_out = h5py.File(sample_hdf5_file, "w")
        sample_hdf5_out.create_dataset("test_in", data=seq_vecs)
        sample_hdf5_out.close()

        # update test HDF5
        test_hdf5_file = sample_hdf5_file

    #################################################################
    # write in motif
    #################################################################
    # this code must match the Torch code
    seq_len = seq_vecs.shape[3]
    seq_mid = int(math.floor(seq_len / 2.0 - len(motif) / 2.0) - 1)  # int() for use as an index
    for si in range(seq_vecs.shape[0]):
        for pi in range(len(motif)):
            one_hot_set(seq_vecs[si], seq_mid + pi, motif[pi])

    # get fasta
    seq_dna = vecs2dna(seq_vecs)

    #################################################################
    # Torch predict
    #################################################################
    if options.model_hdf5_file is None:
        pool_str = ""
        if options.pool:
            pool_str = "-pool"

        options.model_hdf5_file = "%s/model_out.h5" % options.out_dir
        torch_cmd = "basset_anchor_predict.lua %s %s %s %s %s" % (pool_str, motif, model_file,
                                                                  test_hdf5_file, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    # load model output
    model_hdf5_in = h5py.File(options.model_hdf5_file, "r")
    pre_preds = np.array(model_hdf5_in["pre_preds"])
    preds = np.array(model_hdf5_in["preds"])
    scores = np.array(model_hdf5_in["scores"])
    seq_filter_outs = np.array(model_hdf5_in["filter_outs"])
    pre_seq_filter_outs = np.array(model_hdf5_in["pre_filter_outs"])
    model_hdf5_in.close()

    # pre-process
    seq_filter_means = seq_filter_outs.mean(axis=2)
    filter_means = seq_filter_means.mean(axis=0)
    filter_msds = seq_filter_means.std(axis=0) + 1e-6

    num_seqs = seq_filter_means.shape[0]
    num_filters = seq_filter_means.shape[1]
    num_targets = len(target_labels)

    if options.filters is None:
        options.filters = range(num_filters)
    else:
        options.filters = [int(fi) for fi in options.filters.split(",")]

    if options.targets is None:
        options.targets = range(num_targets)
    else:
        options.targets = [int(ti) for ti in options.targets.split(",")]

    #################################################################
    # scatter plot prediction changes
    #################################################################
    sns.set(style="ticks", font_scale=1.5)
    lim_eps = 0.02

    for ti in options.targets:
        if num_seqs > 500:
            isample = np.array(random.sample(range(num_seqs), 500))
        else:
            isample = np.array(range(num_seqs))

        plt.figure(figsize=(8, 8))
        g = sns.jointplot(pre_preds[isample, ti], preds[isample, ti],
                          color="black", stat_func=None, alpha=0.5, space=0)

        ax = g.ax_joint
        ax.plot([0, 1], [0, 1], c="black", linewidth=1, linestyle="--")
        ax.set_xlim((0 - lim_eps, 1 + lim_eps))
        ax.set_ylim((0 - lim_eps, 1 + lim_eps))
        ax.set_xlabel("Pre-insertion accessibility")
        ax.set_ylabel("Post-insertion accessibility")
        ax.grid(True, linestyle=":")

        ax_x = g.ax_marg_x
        ax_x.set_title(target_labels[ti])

        plt.savefig("%s/scatter_t%d.pdf" % (options.out_dir, ti))
        plt.close()

    #################################################################
    # plot sequences
    #################################################################
    for ti in options.targets:
        # sort sequences by score
        seqsi = np.argsort(scores[:, ti])[::-1]

        # print a fasta file with uniformly sampled sequences
        unif_i = np.array([int(sp) for sp in np.arange(0, num_seqs, num_seqs / 200.0)])
        seqsi_uniform = seqsi[unif_i]
        fasta_out = open("%s/seqs_t%d.fa" % (options.out_dir, ti), "w")
        for si in seqsi_uniform:
            print >> fasta_out, ">%s_gc%.2f_p%.2f\n%s" % (seq_headers[si], gc(seq_dna[si]),
                                                          preds[si, ti], seq_dna[si])
        fasta_out.close()

        # print their filter/pos activations to a table;
        # this is slow and big, and only needed when hunting
        # for a specific example
        table_out = open("%s/seqs_t%d_table.txt" % (options.out_dir, ti), "w")
        for si in seqsi_uniform:
            for fi in range(num_filters):
                for pi in range(seq_filter_outs.shape[2]):
                    cols = (seq_headers[si], fi, pi, seq_filter_outs[si, fi, pi])
                    print >> table_out, "%-25s %3d %3d %5.2f" % cols
        table_out.close()

        # sample fewer for heat map
        unif_i = np.array([int(sp) for sp in np.arange(0, num_seqs, num_seqs / 200.0)])
        seqsi_uniform = seqsi[unif_i]

        """ these kinda suck
        # plot heat map
        plt.figure()
        n = 20
        ax_sf = plt.subplot2grid((1,n), (0,0), colspan=n-1)
        ax_ss = plt.subplot2grid((1,n), (0,n-1))

        # filter heat
        sf_norm = seq_filter_means[seqsi_uniform,:] - filter_means
        # sf_norm = np.divide(seq_filter_means[seqsi_uniform,:] - filter_means, filter_msds)
        sns.heatmap(sf_norm, vmin=-.04, vmax=.04, xticklabels=False, yticklabels=False, ax=ax_sf)

        # scores heat
        sns.heatmap(scores[seqsi_uniform,ti].reshape(-1,1), xticklabels=False, yticklabels=False, ax=ax_ss)

        # this crashed the program, and I don't know why
        # plt.tight_layout()
        plt.savefig('%s/seqs_t%d.pdf' % (options.out_dir, ti))
        plt.close()
        """

    #################################################################
    # filter mean correlations
    #################################################################
    # compute and print
    table_out = open("%s/table.txt" % options.out_dir, "w")
    filter_target_cors = np.zeros((num_filters, num_targets))
    for fi in range(num_filters):
        for ti in range(num_targets):
            cor, p = spearmanr(seq_filter_means[:, fi], scores[:, ti])
            cols = (fi, ti, cor, p)
            print >> table_out, "%-3d %3d %6.3f %6.1e" % cols
            if np.isnan(cor):
                cor = 0
            filter_target_cors[fi, ti] = cor
    table_out.close()

    # plot
    ftc_df = pd.DataFrame(filter_target_cors, columns=target_labels)
    plt.figure()
    g = sns.clustermap(ftc_df)
    for tick in g.ax_heatmap.get_xticklabels():
        tick.set_rotation(-45)
        tick.set_horizontalalignment("left")
        tick.set_fontsize(3)
    for tick in g.ax_heatmap.get_yticklabels():
        tick.set_fontsize(3)
    plt.savefig("%s/filters_targets.pdf" % options.out_dir)
    plt.close()

    #################################################################
    # filter position correlation
    #################################################################
    sns.set(style="ticks", font_scale=1.7)

    table_out = open("%s/filter_pos.txt" % options.out_dir, "w")
    for fi in options.filters:
        for ti in options.targets:
            print "Plotting f%d versus t%d" % (fi, ti)

            # compute correlations
            pos_cors = []
            pos_cors_pre = []
            nans = 0
            for pi in range(seq_filter_outs.shape[2]):
                # motif correlation
                cor, p = spearmanr(seq_filter_outs[:, fi, pi], preds[:, ti])
                if np.isnan(cor):
                    cor = 0
                    p = 1
                    nans += 1
                pos_cors.append(cor)

                # pre correlation
                cor_pre, p_pre = spearmanr(pre_seq_filter_outs[:, fi, pi], pre_preds[:, ti])
                if np.isnan(cor_pre):
                    cor_pre = 0
                    p_pre = 1
                pos_cors_pre.append(cor_pre)

                cols = (fi, pi, ti, cor, p, cor_pre, p_pre)
                print >> table_out, "%-3d %3d %3d %6.3f %6.1e %6.3f %6.1e" % cols

            if nans < 50:
                # plot
                # df_pc = pd.DataFrame({'Position':range(len(pos_cors)), 'Correlation':pos_cors})
                plt.figure(figsize=(9, 6))
                plt.title(target_labels[ti])
                # sns.regplot(x='Position', y='Correlation', data=df_pc, lowess=True)
                plt.scatter(range(len(pos_cors)), pos_cors_pre,
                            c=sns_colors[2], alpha=0.8, linewidths=0,
                            label="Before motif insertion")
                plt.scatter(range(len(pos_cors)), pos_cors,
                            c=sns_colors[1], alpha=0.8, linewidths=0,
                            label="After motif insertion")
                plt.axhline(y=0, linestyle="--", c="grey", linewidth=1)
                ax = plt.gca()
                ax.set_xlim(0, len(pos_cors))
                ax.set_xlabel("Position")
                ax.set_ylabel("Activation vs Prediction Correlation")
                ax.grid(True, linestyle=":")
                sns.despine()
                plt.legend()
                plt.tight_layout()
                plt.savefig("%s/f%d_t%d.pdf" % (options.out_dir, fi, ti))
                plt.close()
    table_out.close()
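# one_hot_set() is called above but defined elsewhere; a minimal sketch under
# the assumption that it overwrites one position of a (4, 1, seq_len) one-hot
# sequence with the given nucleotide.
def one_hot_set_sketch(seq_vec, pos, nt):
    seq_vec[:, 0, pos] = 0
    seq_vec["ACGT".index(nt.upper()), 0, pos] = 1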
def main():
    usage = "usage: %prog [options] <model_file> <input_file>"
    parser = OptionParser(usage)
    parser.add_option("-a", dest="input_activity_file",
                      help="Optional activity table corresponding to an input FASTA file")
    parser.add_option("-d", dest="model_hdf5_file", default=None,
                      help="Pre-computed model output as HDF5 [Default: %default]")
    parser.add_option("-g", dest="gain_height", default=False, action="store_true",
                      help="Nucleotide heights determined by the max of loss and gain [Default: %default]")
    parser.add_option("-m", dest="min_limit", default=0.1, type="float",
                      help="Minimum heatmap limit [Default: %default]")
    parser.add_option("-n", dest="center_nt", default=200, type="int",
                      help="Center nt to mutate and plot in the heat map [Default: %default]")
    parser.add_option("-o", dest="out_dir", default="heat",
                      help="Output directory [Default: %default]")
    parser.add_option("-s", dest="sample", default=None, type="int",
                      help="Sample sequences from the test set [Default: %default]")
    parser.add_option("-t", dest="targets", default="0",
                      help="Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF5 file)")
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == ">":
                seq_headers.append(line[1:].rstrip())
                seqs.append("")
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = "%s/model_in.h5" % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(input_file, options.input_activity_file,
                                                       mean_norm=False, whiten=False,
                                                       permute=False, sort=False)

            # read in target names
            target_labels = open(options.input_activity_file).readline().strip().split("\t")
        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            seq_headers = [seq_headers[i] for i in sample_i]  # list, so index elementwise
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] / 4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, "w")
        h5f.create_dataset("test_in", data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, "r")
            seqs_1hot = np.array(hdf5_in["test_in"])
            targets = np.array(hdf5_in["test_out"])
            try:  # TEMP
                seq_headers = np.array(hdf5_in["test_headers"])
                target_labels = np.array(hdf5_in["target_labels"])
            except KeyError:
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = "%s/model_in.h5" % options.out_dir
                h5f = h5py.File(model_input_hdf5, "w")
                h5f.create_dataset("test_in", data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error("Could not parse input file as FASTA or HDF5.")

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = "%s/model_out.h5" % options.out_dir
        torch_cmd = "basset_sat_predict.lua -center_nt %d %s %s %s" % (options.center_nt, model_file,
                                                                       model_input_hdf5, options.model_hdf5_file)
        if subprocess.call(torch_cmd, shell=True):
            message("Error running basset_sat_predict.lua", "error")

    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, "r")
    seq_mod_preds = np.array(hdf5_in["seq_mod_preds"])
    hdf5_in.close()

    # trim seqs to match seq_mod_preds length
    seq_len = len(seqs[0])
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < seq_len:
        delta_start = (seq_len - delta_len) / 2
        for i in range(len(seqs)):
            seqs[i] = seqs[i][delta_start:delta_start + delta_len]

    # decide which cells to plot
    if options.targets == "-1":
        plot_targets = xrange(seq_mod_preds.shape[3])
    else:
        plot_targets = [int(ci) for ci in options.targets.split(",")]

    #################################################################
    # plot
    #################################################################
    table_out = open("%s/table.txt" % options.out_dir, "w")

    rdbu = sns.color_palette("RdBu_r", 10)
    nts = "ACGT"

    for si in range(seq_mod_preds.shape[0]):
        try:
            header = seq_headers[si]
        except TypeError:
            header = "seq%d" % si
        seq = seqs[si]

        # plot some descriptive heatmaps for each individual cell type
        for ci in plot_targets:
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            # compute matrices
            norm_matrix = seq_mod_preds_cell - real_pred_cell
            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            minmax_matrix = np.vstack([min_scores - real_pred_cell,
                                       max_scores - real_pred_cell])

            # prepare figure
            sns.set(style="white", font_scale=0.5)
            sns.axes_style({"axes.linewidth": 1})
            heat_cols = 400
            sad_start = 1
            sad_end = 323
            logo_start = 0
            logo_end = 324
            fig = plt.figure(figsize=(20, 3))
            ax_logo = plt.subplot2grid((3, heat_cols), (0, logo_start), colspan=(logo_end - logo_start))
            ax_sad = plt.subplot2grid((3, heat_cols), (1, sad_start), colspan=(sad_end - sad_start))
            ax_heat = plt.subplot2grid((3, heat_cols), (2, 0), colspan=heat_cols)

            # print a WebLogo of the sequence
            vlim = max(options.min_limit, abs(minmax_matrix).max())
            if options.gain_height:
                seq_heights = 0.25 + 1.75 / vlim * (abs(minmax_matrix).max(axis=0))
            else:
                seq_heights = 0.25 + 1.75 / vlim * (-minmax_matrix[0])
            logo_eps = "%s/%s_c%d_seq.eps" % (options.out_dir, header_filename(header), ci)
            seq_logo(seq, seq_heights, logo_eps)

            # add to figure
            logo_png = "%s.png" % logo_eps[:-4]
            logo_cmd = "convert -density 300 %s %s" % (logo_eps, logo_png)
            if subprocess.call(logo_cmd, shell=True):
                message("Error running convert", "error")
            logo = Image.open(logo_png)
            ax_logo.imshow(logo)
            ax_logo.set_axis_off()

            # plot loss and gain SAD scores
            ax_sad.plot(-minmax_matrix[0], c=rdbu[0], label="loss", linewidth=1)
            ax_sad.plot(minmax_matrix[1], c=rdbu[-1], label="gain", linewidth=1)
            ax_sad.set_xlim(0, minmax_matrix.shape[1])
            ax_sad.legend()
            # ax_sad.grid(True, linestyle=':')
            for axis in ["top", "bottom", "left", "right"]:
                ax_sad.spines[axis].set_linewidth(0.5)

            # plot real-normalized scores
            vlim = max(options.min_limit, abs(norm_matrix).max())
            sns.heatmap(norm_matrix, linewidths=0, cmap="RdBu_r",
                        vmin=-vlim, vmax=vlim, xticklabels=False, ax=ax_heat)
            ax_heat.yaxis.set_ticklabels("TGCA", rotation="horizontal")  # , size=10)

            # save final figure
            plt.tight_layout()
            plt.savefig("%s/%s_c%d_heat.pdf" % (options.out_dir, header.replace(":", "_"), ci), dpi=300)
            plt.close()

        #################################################################
        # print table of nt variability for each cell
        #################################################################
        for ci in range(seq_mod_preds.shape[3]):
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            loss_matrix = real_pred_cell - min_scores
            gain_matrix = max_scores - real_pred_cell

            for pos in range(seq_mod_preds_cell.shape[1]):
                cols = [header, delta_start + pos, ci, loss_matrix[pos], gain_matrix[pos]]
                print >> table_out, "\t".join([str(c) for c in cols])

    table_out.close()
def parse_interaction_scores_hdf5(scores_hdf5_file):
    ### single motif scores = [activation for each filter][base_seq]
    ### paired motif scores = activation for [filter1][filter2][base_seq]
    ### motif_seqs = motif that is used to represent each filter

    def get_base_seqs(seqs):
        return seqs[:NUM_BASE_SEQS]

    def get_motifs(seqs):
        motif_seqs = []
        z = NUM_BASE_SEQS
        for i in range(NUM_MOTIFS):
            motif_seqs.append(seqs[z][300:319])
            z += NUM_BASE_SEQS  # skip this motif's base-sequence block
        return motif_seqs

    def get_base_scores(preds):
        base_scores = []
        for i in range(NUM_BASE_SEQS):
            base_scores.append(preds[i])
        return base_scores

    def get_single_motif_scores(preds):
        single_motif_scores = []
        z = NUM_BASE_SEQS
        for i in range(NUM_MOTIFS):
            single_motif_scores.append([False] * NUM_BASE_SEQS)
            for j in range(NUM_BASE_SEQS):
                single_motif_scores[i][j] = preds[z]
                z += 1
        return single_motif_scores

    def get_single_motif_scores_offset(preds):
        z = NUM_BASE_SEQS + NUM_BASE_SEQS * NUM_MOTIFS
        single_motif_scores = []
        for i in range(NUM_MOTIFS):
            single_motif_scores.append([False] * NUM_BASE_SEQS)
            for j in range(NUM_BASE_SEQS):
                single_motif_scores[i][j] = preds[z]  # was appending to the outer list
                z += 1
        return single_motif_scores

    def get_paired_motif_scores(preds):
        paired_motif_scores = []
        for i in range(NUM_MOTIFS):
            paired_motif_scores.append([])
            for j in range(NUM_MOTIFS):
                paired_motif_scores[i].append([False] * NUM_BASE_SEQS)
        z = NUM_BASE_SEQS + 2 * NUM_BASE_SEQS * NUM_MOTIFS
        for i in range(NUM_MOTIFS):
            for j in range(NUM_MOTIFS):
                for k in range(NUM_BASE_SEQS):
                    paired_motif_scores[i][j][k] = preds[z]
                    z += 1
        return paired_motif_scores

    ### Read in file
    scores_hdf5_in = h5py.File(scores_hdf5_file, "r")
    preds = np.array(scores_hdf5_in["preds"])
    seq_vecs = scores_hdf5_in["seqs"]
    seqs = dna_io.vecs2dna(seq_vecs)
    scores_hdf5_in.close()

    ### Make sure global variables are set properly
    assert NUM_BASE_SEQS + 2 * NUM_BASE_SEQS * NUM_MOTIFS + NUM_BASE_SEQS * NUM_MOTIFS * NUM_MOTIFS == len(preds)

    base_seqs = get_base_seqs(seqs)
    base_scores = get_base_scores(preds)
    single_motif_scores = get_single_motif_scores(preds)
    single_motif_scores_offset = get_single_motif_scores_offset(preds)  # was calling get_single_motif_scores
    paired_motif_scores = get_paired_motif_scores(preds)
    motif_seqs = get_motifs(seqs)

    return (base_seqs, base_scores, single_motif_scores,
            single_motif_scores_offset, paired_motif_scores, motif_seqs)
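# A sketch of the flat prediction layout the parser above assumes, with the
# base-sequence index varying fastest: base sequences first, then two
# single-motif blocks, then all motif pairs. The helper name is hypothetical;
# it inverts the loop order used above.
def interaction_pred_index(block, i=0, j=0, k=0, nb=NUM_BASE_SEQS, nm=NUM_MOTIFS):
    if block == "base":
        return k
    elif block == "single":
        return nb + i * nb + k
    elif block == "single_offset":
        return nb + nb * nm + i * nb + k
    else:  # "paired"
        return nb + 2 * nb * nm + (i * nm + j) * nb + k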
def main():
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file',
                      help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('-d', dest='model_hdf5_file', default=None,
                      help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat',
                      help='Output directory [Default: %default]')
    parser.add_option('-r', dest='rng_seed', default=1, type='float',
                      help='Random number generator seed [Default: %default]')
    parser.add_option('-s', dest='sample', default=None, type='int',
                      help='Sample sequences from the test set [Default: %default]')
    parser.add_option('-t', dest='targets', default='0',
                      help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF5 file)')
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(options.rng_seed)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(input_file, options.input_activity_file,
                                                       mean_norm=False, whiten=False,
                                                       permute=False, sort=False)

            # read in target names
            target_labels = open(options.input_activity_file).readline().strip().split('\t')
        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            seq_headers = [seq_headers[i] for i in sample_i]  # list, so index elementwise
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] / 4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            try:  # TEMP
                seq_headers = np.array(hdf5_in['test_headers'])
                target_labels = np.array(hdf5_in['target_labels'])
            except KeyError:
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5' % options.out_dir
                h5f = h5py.File(model_input_hdf5, 'w')
                h5f.create_dataset('test_in', data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_net_predict.lua %s %s %s' % (model_file, model_input_hdf5, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    reprs = []
    l = 1
    while 'reprs%d' % l in hdf5_in.keys():
        reprs.append(np.array(hdf5_in['reprs%d' % l]))
        l += 1
    hdf5_in.close()

    #################################################################
    # plot
    #################################################################
    print len(reprs)
    for l in range(len(reprs)):
        for si in range(len(seq_headers)):
            plt.figure()

            # just write the sequence out above it;
            # an influence version may ultimately be more useful
            print reprs[l][si].shape
            sns.heatmap(reprs[l][si], linewidths=0, xticklabels=False)

            plt.savefig('%s/%s_l%d.pdf' % (options.out_dir, header_filename(seq_headers[si]), l))
            plt.close()
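# header_filename() is used above but defined elsewhere; a minimal sketch
# assuming it only sanitizes a FASTA header into a safe file name (the actual
# helper may strip additional characters).
def header_filename_sketch(header):
    return header.replace(':', '_').replace('/', '_').replace('|', '_')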
def main():
    # assumes file-level imports: os, random, subprocess, h5py, numpy as np,
    # matplotlib.pyplot as plt, seaborn as sns, dna_io, OptionParser, and the
    # package helper header_filename
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file', help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-r', dest='rng_seed', default=1, type='float', help='Random number generator seed [Default: %default]')
    parser.add_option('-s', dest='sample', default=None, type='int', help='Sample sequences from the test set [Default: %default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF5 file)')
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(options.rng_seed)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False)

            # read in target names
            target_labels = open(options.input_activity_file).readline().strip().split('\t')

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            # seq_headers is a Python list here, so index it explicitly
            seq_headers = [seq_headers[i] for i in sample_i]
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1]/4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            try:  # TEMP
                seq_headers = np.array(hdf5_in['test_headers'])
                target_labels = np.array(hdf5_in['target_labels'])
            except KeyError:
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5' % options.out_dir
                h5f = h5py.File(model_input_hdf5, 'w')
                h5f.create_dataset('test_in', data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_net_predict.lua %s %s %s' % (model_file, model_input_hdf5, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    reprs = []
    l = 1
    while 'reprs%d' % l in hdf5_in.keys():
        reprs.append(np.array(hdf5_in['reprs%d' % l]))
        l += 1
    hdf5_in.close()

    #################################################################
    # plot
    #################################################################
    print 'Loaded %d representation layers' % len(reprs)
    num_seqs = reprs[0].shape[0] if len(reprs) > 0 else 0
    for l in range(len(reprs)):
        for si in range(num_seqs):
            if seq_headers is not None:
                header = seq_headers[si]
            else:
                header = 'seq%d' % si

            plt.figure()

            # TODO: annotate the heat map with the sequence itself
            # (or an influence-weighted version of it)

            # debug: representation shape for this sequence
            print reprs[l][si].shape

            sns.heatmap(reprs[l][si], linewidths=0, xticklabels=False)
            plt.savefig('%s/%s_l%d.pdf' % (options.out_dir, header_filename(header), l))
            plt.close()
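# NOTE: the plotting loop above relies on a header_filename helper defined
# elsewhere in the package. A minimal sketch of a plausible implementation
# (an assumption, not the package's actual helper): sanitize a FASTA header
# such as 'chr1:100-600(+)' into a filesystem-safe file name.
import re

def header_filename(header):
    # keep only filesystem-safe characters; everything else becomes '_'
    return re.sub(r'[^A-Za-z0-9._-]', '_', header)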
def main():
    # assumes file-level imports: os, math, random, subprocess, h5py,
    # numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns,
    # scipy.stats.spearmanr, plus package helpers one_hot_set, vecs2dna, gc,
    # and an sns_colors palette
    usage = 'usage: %prog [options] <motif> <model_file> <test_hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5.')
    parser.add_option('-f', dest='filters', default=None, help='Comma-separated list of filters for the position analysis [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-p', dest='pool', default=False, action='store_true', help='Take representation after pooling [Default: %default]')
    parser.add_option('-s', dest='sample', default=None, type='int', help='Sequences to sample [Default: %default]')
    parser.add_option('-t', dest='targets', default=None, help='Comma-separated list of targets to analyze in more depth [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide motif, Basset model file, and test data in HDF5 format.')
    else:
        motif = args[0]
        model_file = args[1]
        test_hdf5_file = args[2]

    random.seed(2)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # load data
    #################################################################
    # load sequences
    test_hdf5_in = h5py.File(test_hdf5_file, 'r')
    seq_vecs = np.array(test_hdf5_in['test_in'])
    seq_targets = np.array(test_hdf5_in['test_out'])
    seq_headers = np.array(test_hdf5_in['test_headers'])
    target_labels = np.array(test_hdf5_in['target_labels'])
    test_hdf5_in.close()

    #################################################################
    # sample
    #################################################################
    if options.sample is not None and options.sample < seq_vecs.shape[0]:
        # choose sampled indexes
        sample_i = np.array(random.sample(xrange(seq_vecs.shape[0]), options.sample))

        # filter
        seq_vecs = seq_vecs[sample_i]
        seq_targets = seq_targets[sample_i]
        seq_headers = seq_headers[sample_i]

        # create a new HDF5 file
        sample_hdf5_file = '%s/sample.h5' % options.out_dir
        sample_hdf5_out = h5py.File(sample_hdf5_file, 'w')
        sample_hdf5_out.create_dataset('test_in', data=seq_vecs)
        sample_hdf5_out.close()

        # update test HDF5
        test_hdf5_file = sample_hdf5_file

    #################################################################
    # write in motif
    #################################################################
    # this code must match the Torch code
    seq_len = seq_vecs.shape[3]
    # cast to int for array indexing
    seq_mid = int(math.floor(seq_len/2.0 - len(motif)/2.0)) - 1
    for si in range(seq_vecs.shape[0]):
        for pi in range(len(motif)):
            one_hot_set(seq_vecs[si], seq_mid+pi, motif[pi])

    # get fasta
    seq_dna = vecs2dna(seq_vecs)

    #################################################################
    # Torch predict
    #################################################################
    if options.model_hdf5_file is None:
        pool_str = ''
        if options.pool:
            pool_str = '-pool'

        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_anchor_predict.lua %s %s %s %s %s' % (pool_str, motif, model_file, test_hdf5_file, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    # load model output
    model_hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    pre_preds = np.array(model_hdf5_in['pre_preds'])
    preds = np.array(model_hdf5_in['preds'])
    scores = np.array(model_hdf5_in['scores'])
    seq_filter_outs = np.array(model_hdf5_in['filter_outs'])
    pre_seq_filter_outs = np.array(model_hdf5_in['pre_filter_outs'])
    model_hdf5_in.close()

    # pre-process
    seq_filter_means = seq_filter_outs.mean(axis=2)
    filter_means = seq_filter_means.mean(axis=0)
    filter_msds = seq_filter_means.std(axis=0) + 1e-6

    num_seqs = seq_filter_means.shape[0]
    num_filters = seq_filter_means.shape[1]
    num_targets = len(target_labels)

    if options.filters is None:
        options.filters = range(num_filters)
    else:
        options.filters = [int(fi) for fi in options.filters.split(',')]

    if options.targets is None:
        options.targets = range(num_targets)
    else:
        options.targets = [int(ti) for ti in options.targets.split(',')]

    #################################################################
    # scatter plot prediction changes
    #################################################################
    sns.set(style='ticks', font_scale=1.5)
    lim_eps = 0.02

    for ti in options.targets:
        if num_seqs > 500:
            isample = np.array(random.sample(range(num_seqs), 500))
        else:
            isample = np.array(range(num_seqs))

        plt.figure(figsize=(8, 8))
        g = sns.jointplot(pre_preds[isample, ti], preds[isample, ti], color='black', stat_func=None, alpha=0.5, space=0)

        ax = g.ax_joint
        ax.plot([0, 1], [0, 1], c='black', linewidth=1, linestyle='--')
        ax.set_xlim((0-lim_eps, 1+lim_eps))
        ax.set_ylim((0-lim_eps, 1+lim_eps))
        ax.set_xlabel('Pre-insertion accessibility')
        ax.set_ylabel('Post-insertion accessibility')
        ax.grid(True, linestyle=':')

        ax_x = g.ax_marg_x
        ax_x.set_title(target_labels[ti])

        plt.savefig('%s/scatter_t%d.pdf' % (options.out_dir, ti))
        plt.close()

    #################################################################
    # plot sequences
    #################################################################
    for ti in options.targets:
        # sort sequences by score
        seqsi = np.argsort(scores[:, ti])[::-1]

        # print a fasta file with uniformly sampled sequences
        unif_i = np.array([int(sp) for sp in np.arange(0, num_seqs, num_seqs/200.0)])
        seqsi_uniform = seqsi[unif_i]
        fasta_out = open('%s/seqs_t%d.fa' % (options.out_dir, ti), 'w')
        for si in seqsi_uniform:
            print >> fasta_out, '>%s_gc%.2f_p%.2f\n%s' % (seq_headers[si], gc(seq_dna[si]), preds[si, ti], seq_dna[si])
        fasta_out.close()

        # print their filter/pos activations to a table
        # (this table is large and slow to write; it's mainly useful for
        # tracking down specific examples)
        table_out = open('%s/seqs_t%d_table.txt' % (options.out_dir, ti), 'w')
        for si in seqsi_uniform:
            for fi in range(num_filters):
                for pi in range(seq_filter_outs.shape[2]):
                    cols = (seq_headers[si], fi, pi, seq_filter_outs[si, fi, pi])
                    print >> table_out, '%-25s %3d %3d %5.2f' % cols
        table_out.close()

        # re-sample sequences for the (disabled) heat map below
        unif_i = np.array([int(sp) for sp in np.arange(0, num_seqs, num_seqs/200.0)])
        seqsi_uniform = seqsi[unif_i]

        # the following draft heat map was visually unsatisfying and
        # plt.tight_layout() crashed for unknown reasons, so the block is
        # disabled and kept only for reference
        '''
        # plot heat map
        plt.figure()
        n = 20
        ax_sf = plt.subplot2grid((1,n), (0,0), colspan=n-1)
        ax_ss = plt.subplot2grid((1,n), (0,n-1))

        # filter heat
        sf_norm = seq_filter_means[seqsi_uniform,:] - filter_means
        # sf_norm = np.divide(seq_filter_means[seqsi_uniform,:] - filter_means, filter_msds)
        sns.heatmap(sf_norm, vmin=-.04, vmax=.04, xticklabels=False, yticklabels=False, ax=ax_sf)

        # scores heat
        sns.heatmap(scores[seqsi_uniform,ti].reshape(-1,1), xticklabels=False, yticklabels=False, ax=ax_ss)

        # plt.tight_layout()
        plt.savefig('%s/seqs_t%d.pdf' % (options.out_dir, ti))
        plt.close()
        '''

    #################################################################
    # filter mean correlations
    #################################################################
    # compute and print
    table_out = open('%s/table.txt' % options.out_dir, 'w')
    filter_target_cors = np.zeros((num_filters, num_targets))
    for fi in range(num_filters):
        for ti in range(num_targets):
            cor, p = spearmanr(seq_filter_means[:, fi], scores[:, ti])
            cols = (fi, ti, cor, p)
            print >> table_out, '%-3d %3d %6.3f %6.1e' % cols
            if np.isnan(cor):
                cor = 0
            filter_target_cors[fi, ti] = cor
    table_out.close()

    # plot
    ftc_df = pd.DataFrame(filter_target_cors, columns=target_labels)
    plt.figure()
    g = sns.clustermap(ftc_df)
    for tick in g.ax_heatmap.get_xticklabels():
        tick.set_rotation(-45)
        tick.set_horizontalalignment('left')
        tick.set_fontsize(3)
    for tick in g.ax_heatmap.get_yticklabels():
        tick.set_fontsize(3)
    plt.savefig('%s/filters_targets.pdf' % options.out_dir)
    plt.close()

    #################################################################
    # filter position correlation
    #################################################################
    sns.set(style='ticks', font_scale=1.7)

    table_out = open('%s/filter_pos.txt' % options.out_dir, 'w')
    for fi in options.filters:
        for ti in options.targets:
            print 'Plotting f%d versus t%d' % (fi, ti)

            # compute correlations
            pos_cors = []
            pos_cors_pre = []
            nans = 0
            for pi in range(seq_filter_outs.shape[2]):
                # motif correlation
                cor, p = spearmanr(seq_filter_outs[:, fi, pi], preds[:, ti])
                if np.isnan(cor):
                    cor = 0
                    p = 1
                    nans += 1
                pos_cors.append(cor)

                # pre correlation
                cor_pre, p_pre = spearmanr(pre_seq_filter_outs[:, fi, pi], pre_preds[:, ti])
                if np.isnan(cor_pre):
                    cor_pre = 0
                    p_pre = 1
                pos_cors_pre.append(cor_pre)

                cols = (fi, pi, ti, cor, p, cor_pre, p_pre)
                print >> table_out, '%-3d %3d %3d %6.3f %6.1e %6.3f %6.1e' % cols

            if nans < 50:
                # plot
                # df_pc = pd.DataFrame({'Position': range(len(pos_cors)), 'Correlation': pos_cors})
                plt.figure(figsize=(9, 6))
                plt.title(target_labels[ti])
                # sns.regplot(x='Position', y='Correlation', data=df_pc, lowess=True)
                plt.scatter(range(len(pos_cors)), pos_cors_pre, c=sns_colors[2], alpha=0.8, linewidths=0, label='Before motif insertion')
                plt.scatter(range(len(pos_cors)), pos_cors, c=sns_colors[1], alpha=0.8, linewidths=0, label='After motif insertion')
                plt.axhline(y=0, linestyle='--', c='grey', linewidth=1)
                ax = plt.gca()
                ax.set_xlim(0, len(pos_cors))
                ax.set_xlabel('Position')
                ax.set_ylabel('Activation vs Prediction Correlation')
                ax.grid(True, linestyle=':')
                sns.despine()
                plt.legend()
                plt.tight_layout()
                plt.savefig('%s/f%d_t%d.pdf' % (options.out_dir, fi, ti))
                plt.close()
    table_out.close()
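# NOTE: the motif-insertion step above calls a one_hot_set helper defined
# elsewhere in the package. A minimal sketch under the assumption that it
# overwrites one column of the (4, 1, seq_len) one-hot tensor, with rows
# ordered A, C, G, T as in dna_io (an assumption, not the actual helper):
def one_hot_set(seq_vec, pos, nt):
    nt_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    # zero the column at pos, then set the row matching the nucleotide
    seq_vec[:, 0, pos] = 0
    seq_vec[nt_index[nt.upper()], 0, pos] = 1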
def main():
    # assumes file-level imports: os, math, random, subprocess, h5py,
    # numpy as np, matplotlib.pyplot as plt, seaborn as sns, plus package
    # helpers vecs2dna, consider_rc, kmer_distance, kmer_distance_rc,
    # plot_kmer_dists, cluster_kmer_dists
    usage = 'usage: %prog [options] <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='targets_file', default=None, help='File labeling targets in the second column [Default: %default]')
    parser.add_option('-c', dest='center_nt', default=50, type='int', help='Center nt to consider k-mers from [Default: %default]')
    parser.add_option('-d', dest='model_out_file', default=None, help='Pre-computed model output table.')
    parser.add_option('-k', dest='kmer', default=8, type='int', help='K-mer length [Default: %default]')
    parser.add_option('-l', dest='seq_len', default=1000, type='int', help='Input sequence length [Default: %default]')
    parser.add_option('-n', dest='num_seqs', default=100000, type='int', help='Number of sequences to predict [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-r', dest='rc', default=False, action='store_true', help='Consider k-mers w/ their reverse complements [Default: %default]')
    parser.add_option('-t', dest='targets', default=None, help='Comma-separated list of targets to analyze in more depth [Default: %default]')
    parser.add_option('--top', dest='top_num', default=100, type='int', help='Number of top k-mers to compare in the distance matrix [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide Basset model file.')
    else:
        model_file = args[0]

    random.seed(2)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.model_out_file is not None:
        seq_dna = []
        for line in open('%s/seqs.fa' % options.out_dir):
            if line[0] == '>':
                seq_dna.append('')
            else:
                seq_dna[-1] += line.rstrip()

    else:
        #################################################################
        # generate random sequences
        #################################################################
        # random sequences
        seq_vecs = np.zeros((options.num_seqs, 4, 1, options.seq_len), dtype='float16')
        for si in range(options.num_seqs):
            for li in range(options.seq_len):
                ni = random.randint(0, 3)
                seq_vecs[si, ni, 0, li] = 1

        # create a new HDF5 file
        seq_hdf5_file = '%s/seqs.h5' % options.out_dir
        seq_hdf5_out = h5py.File(seq_hdf5_file, 'w')
        seq_hdf5_out.create_dataset('test_in', data=seq_vecs)
        seq_hdf5_out.close()

        # get fasta
        seq_dna = vecs2dna(seq_vecs)

        # print to file
        fasta_out = open('%s/seqs.fa' % options.out_dir, 'w')
        for i in range(len(seq_dna)):
            print >> fasta_out, '>%d\n%s' % (i, seq_dna[i])
        fasta_out.close()

        #################################################################
        # Torch predict
        #################################################################
        options.model_out_file = '%s/model_out.txt' % options.out_dir
        torch_cmd = 'basset_predict.lua -scores %s %s %s' % (model_file, seq_hdf5_file, options.model_out_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

        # clean up sequence HDF5
        os.remove(seq_hdf5_file)

    # load scores
    seq_scores = np.loadtxt(options.model_out_file, dtype='float32')

    # read target labels
    if options.targets_file:
        target_labels = [line.split()[1] for line in open(options.targets_file)]
    else:
        target_labels = ['t%d' % (ti+1) for ti in range(seq_scores.shape[1])]

    if options.targets is None:
        options.targets = range(seq_scores.shape[1])
    else:
        options.targets = [int(ti) for ti in options.targets.split(',')]

    #################################################################
    # process and output
    #################################################################
    kmers_start = (options.seq_len - options.center_nt) / 2

    for ti in options.targets:
        print 'Working on target %d' % ti

        ##############################################
        # hash scores by k-mer
        ##############################################
        kmer_scores_raw = {}
        for si in range(len(seq_dna)):
            # get score
            sscore = seq_scores[si, ti]

            # hash to each center k-mer
            for ki in range(kmers_start, kmers_start + options.center_nt):
                kmer = seq_dna[si][ki:ki + options.kmer]
                if options.rc:
                    kmer = consider_rc(kmer)
                kmer_scores_raw.setdefault(kmer, []).append(sscore)

        ##############################################
        # compute means and print table
        ##############################################
        table_out = open('%s/table%d.txt' % (options.out_dir, ti), 'w')
        kmer_means_raw = {}
        for kmer in kmer_scores_raw:
            kmer_means_raw[kmer] = np.mean(kmer_scores_raw[kmer])
            kmer_n = len(kmer_scores_raw[kmer])
            # count, mean, and standard error of the mean
            cols = (kmer, kmer_n, kmer_means_raw[kmer], np.std(kmer_scores_raw[kmer])/math.sqrt(kmer_n))
            print >> table_out, '%s %4d %6.3f %6.3f' % cols
        table_out.close()

        ##############################################
        # plot density
        ##############################################
        plt.figure()
        sns.distplot(kmer_means_raw.values(), kde=False)
        plt.savefig('%s/density%d.pdf' % (options.out_dir, ti))
        plt.close()

        ##############################################
        # top k-mers distance matrix
        ##############################################
        kmer_means = {}
        kmer_means_mean = np.mean(kmer_means_raw.values())
        for kmer in kmer_means_raw:
            kmer_means[kmer] = kmer_means_raw[kmer] - kmer_means_mean

        # sort k-mers by mean score
        scores_kmers = [(kmer_means[kmer], kmer) for kmer in kmer_means]
        scores_kmers.sort(reverse=True)

        # take top k-mers
        top_kmers = []
        top_kmers_scores = []
        for score, kmer in scores_kmers[:options.top_num]:
            top_kmers.append(kmer)
            top_kmers_scores.append(score)
        top_kmers = np.array(top_kmers)
        top_kmers_scores = np.array(top_kmers_scores)

        # compute distance matrix
        top_kmers_dists = np.zeros((options.top_num, options.top_num))
        for i in range(options.top_num):
            for j in range(i+1, options.top_num):
                if options.rc:
                    top_kmers_dists[i, j] = kmer_distance_rc(top_kmers[i], top_kmers[j])
                else:
                    top_kmers_dists[i, j] = kmer_distance(top_kmers[i], top_kmers[j])
                top_kmers_dists[j, i] = top_kmers_dists[i, j]

        # clip the distances
        np.clip(top_kmers_dists, 0, 3, out=top_kmers_dists)

        # plot
        plot_kmer_dists(top_kmers_dists, top_kmers_scores, top_kmers, '%s/top_kmers_heat%d.pdf' % (options.out_dir, ti))

        # cluster and plot
        cluster_kmer_dists(top_kmers_dists, top_kmers_scores, top_kmers, '%s/top_kmers_clust%d.pdf' % (options.out_dir, ti))
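# NOTE: kmer_distance and kmer_distance_rc are external helpers. Since the
# matrix above is clipped to a maximum of 3, a Hamming distance is a
# plausible reading; a sketch under that assumption, with the rc variant
# taking the minimum over both orientations of the second k-mer:
def kmer_distance(kmer1, kmer2):
    # Hamming distance between equal-length k-mers
    return sum(1 for a, b in zip(kmer1, kmer2) if a != b)

def kmer_distance_rc(kmer1, kmer2):
    # minimum distance over kmer2 and its reverse complement
    rc_map = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    kmer2_rc = ''.join(rc_map[nt] for nt in reversed(kmer2))
    return min(kmer_distance(kmer1, kmer2), kmer_distance(kmer1, kmer2_rc))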
def main(): usage = "usage: %prog [options] <model_file> <profile_file> <input_file>" parser = OptionParser(usage) parser.add_option( "-a", dest="input_activity_file", help="Optional activity table corresponding to an input FASTA file" ) parser.add_option( "--all", dest="all_data", default=False, action="store_true", help="Search all training/valid/test sequences. By default we search only the test set. [Default: %default]", ) parser.add_option( "--cuda", dest="cuda", default=False, action="store_true", help="Run on GPGPU [Default: %default]" ) parser.add_option( "--cudnn", dest="cudnn", default=False, action="store_true", help="Run on GPGPU w/cuDNN [Default: %default]" ) parser.add_option( "-d", dest="model_out_file", default=None, help="Pre-computed model predictions output table [Default: %default]", ) parser.add_option( "-e", dest="norm_even", default=False, action="store_true", help="Normalize the weights for the positive and negative datasets to be even [Default: %default]", ) parser.add_option("-f", dest="font_heat", default=6, type="int", help="Heat map axis font size [Default: %default]") parser.add_option( "-n", dest="num_dissect", default=10, type="int", help="Dissect the top n hits [Default: %default]" ) parser.add_option("-o", dest="out_dir", default="profile", help="Output directory [Default: %default]") parser.add_option( "-r", dest="norm_preds", default=False, action="store_true", help="Normalize predictions to have equal frequency [Default: %default]", ) parser.add_option( "-z", dest="weight_zero", default=1.0, type="float", help="Adjust the weights for the zero samples by this value [Default: %default]", ) (options, args) = parser.parse_args() if len(args) != 3: parser.error( "Must provide Basset model file, activity profile file, and input sequences (as a FASTA file or test data in an HDF file)" ) else: model_file = args[0] profile_file = args[1] input_file = args[2] if not os.path.isdir(options.out_dir): os.mkdir(options.out_dir) ################################################################# # parse input file ################################################################# try: # input_file is FASTA # load sequences and headers seqs = [] seq_headers = [] for line in open(input_file): if line[0] == ">": seq_headers.append(line[1:].rstrip()) seqs.append("") else: seqs[-1] += line.rstrip() # convert to arrays seqs = np.array(seqs) seq_headers = np.array(seq_headers) model_input_hdf5 = "%s/model_in.h5" % options.out_dir if options.input_activity_file: # one hot code seqs_1hot, targets = dna_io.load_data_1hot( input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False ) else: # load sequences seqs_1hot = dna_io.load_sequences(input_file, permute=False) targets = None # reshape sequences for torch seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] / 4)) # write as test data to a HDF5 file h5f = h5py.File(model_input_hdf5, "w") h5f.create_dataset("test_in", data=seqs_1hot) h5f.close() except (IOError, IndexError, UnicodeDecodeError): # input_file is HDF5 try: model_input_hdf5 = input_file # load (sampled) test data from HDF5 hdf5_in = h5py.File(input_file, "r") seqs_1hot = np.array(hdf5_in["test_in"]) targets = np.array(hdf5_in["test_out"]) seq_headers = np.array([h.decode("UTF-8") for h in hdf5_in["test_headers"]]) hdf5_in.close() # convert to ACGT sequences seqs = dna_io.vecs2dna(seqs_1hot) except IOError: parser.error("Could not parse input file as FASTA or HDF5.") 
################################################################# # Torch predict modifications ################################################################# # GPU options (needed below, too) gpgpu_str = "" if options.cudnn: gpgpu_str = "-cudnn" elif options.cuda: gpgpu_str = "-cuda" if options.model_out_file is None: options.model_out_file = "%s/preds.txt" % options.out_dir torch_cmd = "basset_predict.lua -mc_n 10 -rc %s %s %s %s" % ( gpgpu_str, model_file, model_input_hdf5, options.model_out_file, ) print(torch_cmd) subprocess.call(torch_cmd, shell=True) # read in predictions seqs_preds = np.loadtxt(options.model_out_file) num_targets = seqs_preds.shape[1] ################################################################# # parse profile file ################################################################# activity_profile, profile_weights, profile_mask, target_labels = load_profile( profile_file, num_targets, options.norm_even, options.weight_zero ) # normalize predictions if options.norm_preds: pred_means = seqs_preds.mean(axis=0) # save to file for basset_refine.py np.save("%s/pred_means" % options.out_dir, pred_means) # aim for profile weighted average aim_mean = np.average(pred_means[profile_mask], weights=profile_weights[profile_mask]) # normalize for ti in range(seqs_preds.shape[1]): ratio_ti = pred_means[ti] / aim_mean if profile_mask[ti] and (ratio_ti < 1 / 4 or ratio_ti > 4): print( "WARNING: target %d with mean %.4f differs 4-fold from the median %.3f" % (ti, pred_means[ti], aim_mean), file=sys.stderr, ) seqs_preds[:, ti] = znorm(seqs_preds[:, ti], pred_means[ti], aim_mean) ################################################################# # plot clustered heat map limited to relevant targets ################################################################# seqs_preds_prof = seqs_preds[:, profile_mask] seqs_preds_var = seqs_preds_prof.var(axis=1) seqs_sort_var = np.argsort(seqs_preds_var)[::-1] # heat map plt.figure() g = sns.clustermap( np.transpose(seqs_preds_prof[seqs_sort_var[:1500]]), metric="cosine", linewidths=0, yticklabels=target_labels[profile_mask], xticklabels=False, ) plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) for label in g.ax_heatmap.yaxis.get_majorticklabels(): label.set_fontsize(options.font_heat) plt.savefig("%s/heat_clust.pdf" % options.out_dir) plt.close() # dimension reduction # model_pca = PCA(n_components=50) # spp_pca = model.fit_transform(np.transpose(seqs_preds_prof)) # model = TSNE(n_components=2, perplexity=5, metric='euclidean') # spp_dr = model.fit_transform(spp_pca) model = PCA(n_components=2) spp_dr = model.fit_transform(np.transpose(seqs_preds_prof)) plt.figure() plt.scatter(spp_dr[:, 0], spp_dr[:, 1], c="black", s=5) target_labels_prof_concise = [tl.split(":")[-1] for tl in target_labels[profile_mask]] for label, x, y, activity in zip( target_labels_prof_concise, spp_dr[:, 0], spp_dr[:, 1], activity_profile[profile_mask] ): plt.annotate(label, xy=(x, y), size=10, color=sns.color_palette("deep")[int(activity)]) plt.savefig("%s/dim_red.pdf" % options.out_dir) plt.close() ################################################################# # compute profile distances ################################################################# # compute prediction distances seqs_pdists = [] for si in range(seqs_preds.shape[0]): # sd = np.power(seqs_preds[si,profile_mask]-activity_profile[profile_mask], 2).sum() sd = log_loss( activity_profile[profile_mask], seqs_preds[si, profile_mask], sample_weight=profile_weights[profile_mask] ) 
seqs_pdists.append(sd) seqs_pdists = np.array(seqs_pdists) # obtain sorted indexes seqs_sort_dist = np.argsort(seqs_pdists) # compute target distances seqs_tdists = [] for si in range(seqs_preds.shape[0]): tdists = np.absolute(targets[si, profile_mask] - activity_profile[profile_mask]) tdists_weight = np.multiply(tdists, profile_weights[profile_mask]) td = tdists_weight.sum() seqs_tdists.append(td) seqs_tdists = np.array(seqs_tdists) # print as table table_out = open("%s/table.txt" % options.out_dir, "w") for si in seqs_sort_dist: cols = [si, seqs_pdists[si], seqs_tdists[si]] + list(seqs_preds[si, profile_mask]) print("\t".join([str(c) for c in cols]), file=table_out) table_out.close() ################################################################# # plot sorted heat map ################################################################# plt.figure() g = sns.clustermap( np.transpose(seqs_preds_prof[seqs_sort_dist[:1000]]), col_cluster=False, metric="cosine", linewidths=0, yticklabels=target_labels[profile_mask], xticklabels=False, ) plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) for label in g.ax_heatmap.yaxis.get_majorticklabels(): label.set_fontsize(options.font_heat) plt.savefig("%s/heat_rank.pdf" % options.out_dir) plt.close() ################################################################# # dissect the top hits ################################################################# satmut_targets = ",".join([str(ti) for ti in range(len(activity_profile)) if profile_mask[ti]]) if gpgpu_str != "": gpgpu_str = "-%s" % gpgpu_str for ni in range(options.num_dissect): si = seqs_sort_dist[ni] # print FASTA fasta_file = "%s/seq%d.fa" % (options.out_dir, ni) fasta_out = open(fasta_file, "w") print(">%s\n%s" % (seq_headers[si], seqs[si]), file=fasta_out) fasta_out.close() # saturated mutagenesis cmd = "basset_sat.py %s --mc_n 10 -n 500 -o %s/satmut%d -t %s %s %s" % ( gpgpu_str, options.out_dir, ni, satmut_targets, model_file, fasta_file, ) subprocess.call(cmd, shell=True) # predictions and targets heat profile_sort = np.argsort(activity_profile[profile_mask]) heat_mat = np.array([activity_profile[profile_mask], targets[si, profile_mask], seqs_preds_prof[si]]) heat_mat = heat_mat[:, profile_sort] plt.figure() ax = sns.heatmap( np.transpose(heat_mat), yticklabels=target_labels[profile_mask][profile_sort], xticklabels=["Desired", "Experiment", "Prediction"], ) plt.setp(ax.xaxis.get_majorticklabels(), rotation=-45) plt.setp(ax.yaxis.get_majorticklabels(), rotation=-0) for label in ax.yaxis.get_majorticklabels(): label.set_fontsize(options.font_heat) plt.savefig("%s/heat%d.pdf" % (options.out_dir, ni)) plt.close()
def main():
    # assumes file-level imports: os, random, subprocess, h5py, numpy as np,
    # matplotlib.pyplot as plt, seaborn as sns, dna_io, PIL's Image, plus
    # package helpers get_real_pred, seq_logo, header_filename
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file', help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-g', dest='gain_height', default=False, action='store_true', help='Nucleotide heights determined by the max of loss and gain [Default: %default]')
    parser.add_option('-m', dest='min_limit', default=0.1, type='float', help='Minimum heatmap limit [Default: %default]')
    parser.add_option('-n', dest='center_nt', default=200, type='int', help='Center nt to mutate and plot in the heat map [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-s', dest='sample', default=None, type='int', help='Sample sequences from the test set [Default: %default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF5 file)')
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False)

            # read in target names
            target_labels = open(options.input_activity_file).readline().strip().split('\t')

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            # seq_headers is a Python list here, so index it explicitly
            seq_headers = [seq_headers[i] for i in sample_i]
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1]/4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            try:  # TEMP
                seq_headers = np.array(hdf5_in['test_headers'])
                target_labels = np.array(hdf5_in['target_labels'])
            except KeyError:
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5' % options.out_dir
                h5f = h5py.File(model_input_hdf5, 'w')
                h5f.create_dataset('test_in', data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_sat_predict.lua -center_nt %d %s %s %s' % (options.center_nt, model_file, model_input_hdf5, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    seq_mod_preds = np.array(hdf5_in['seq_mod_preds'])
    hdf5_in.close()

    # trim seqs to match seq_mod_preds length
    seq_len = len(seqs[0])
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < seq_len:
        delta_start = (seq_len - delta_len) / 2
        for i in range(len(seqs)):
            seqs[i] = seqs[i][delta_start:delta_start + delta_len]

    # decide which cells to plot
    if options.targets == '-1':
        plot_targets = xrange(seq_mod_preds.shape[3])
    else:
        plot_targets = [int(ci) for ci in options.targets.split(',')]

    #################################################################
    # plot
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    rdbu = sns.color_palette('RdBu_r', 10)

    nts = 'ACGT'
    for si in range(seq_mod_preds.shape[0]):
        try:
            header = seq_headers[si]
        except TypeError:
            header = 'seq%d' % si
        seq = seqs[si]

        # plot some descriptive heatmaps for each individual cell type
        for ci in plot_targets:
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            # compute matrices
            norm_matrix = seq_mod_preds_cell - real_pred_cell
            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            minmax_matrix = np.vstack([min_scores - real_pred_cell, max_scores - real_pred_cell])

            # prepare figure
            sns.set(style='white', font_scale=0.5)
            sns.axes_style({'axes.linewidth': 1})
            heat_cols = 400
            sad_start = 1
            sad_end = 323
            logo_start = 0
            logo_end = 324
            fig = plt.figure(figsize=(20, 3))
            ax_logo = plt.subplot2grid((3, heat_cols), (0, logo_start), colspan=(logo_end - logo_start))
            ax_sad = plt.subplot2grid((3, heat_cols), (1, sad_start), colspan=(sad_end - sad_start))
            ax_heat = plt.subplot2grid((3, heat_cols), (2, 0), colspan=heat_cols)

            # print a WebLogo of the sequence
            vlim = max(options.min_limit, abs(minmax_matrix).max())
            if options.gain_height:
                seq_heights = 0.25 + 1.75/vlim * (abs(minmax_matrix).max(axis=0))
            else:
                seq_heights = 0.25 + 1.75/vlim * (-minmax_matrix[0])
            logo_eps = '%s/%s_c%d_seq.eps' % (options.out_dir, header_filename(header), ci)
            seq_logo(seq, seq_heights, logo_eps)

            # add to figure
            logo_png = '%s.png' % logo_eps[:-4]
            subprocess.call('convert -density 300 %s %s' % (logo_eps, logo_png), shell=True)
            logo = Image.open(logo_png)
            ax_logo.imshow(logo)
            ax_logo.set_axis_off()

            # plot loss and gain SAD scores
            ax_sad.plot(-minmax_matrix[0], c=rdbu[0], label='loss', linewidth=1)
            ax_sad.plot(minmax_matrix[1], c=rdbu[-1], label='gain', linewidth=1)
            ax_sad.set_xlim(0, minmax_matrix.shape[1])
            ax_sad.legend()
            # ax_sad.grid(True, linestyle=':')
            for axis in ['top', 'bottom', 'left', 'right']:
                ax_sad.spines[axis].set_linewidth(0.5)

            # plot real-normalized scores
            vlim = max(options.min_limit, abs(norm_matrix).max())
            sns.heatmap(norm_matrix, linewidths=0, cmap='RdBu_r', vmin=-vlim, vmax=vlim, xticklabels=False, ax=ax_heat)
            # labels reversed because the heatmap y-axis runs bottom to top
            ax_heat.yaxis.set_ticklabels('TGCA', rotation='horizontal')  # , size=10)

            # save final figure
            plt.tight_layout()
            plt.savefig('%s/%s_c%d_heat.pdf' % (options.out_dir, header.replace(':', '_'), ci), dpi=300)
            plt.close()

        #################################################################
        # print table of nt variability for each cell
        #################################################################
        for ci in range(seq_mod_preds.shape[3]):
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            loss_matrix = real_pred_cell - min_scores
            gain_matrix = max_scores - real_pred_cell

            for pos in range(seq_mod_preds_cell.shape[1]):
                cols = [header, delta_start + pos, ci, loss_matrix[pos], gain_matrix[pos]]
                print >> table_out, '\t'.join([str(c) for c in cols])

    table_out.close()
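# NOTE: seq_mod_preds_cell holds one prediction per (nucleotide, position)
# substitution, so the reference prediction can be recovered by reading the
# entries that match the true sequence. A sketch of get_real_pred under that
# assumption (row order A, C, G, T as in dna_io; both are assumptions, not
# the package's actual helper):
def get_real_pred(seq_mod_preds_cell, seq):
    nt_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    # entries whose substitution equals the true nucleotide are predictions
    # for the unmodified sequence; average them for a stable estimate
    real_preds = [seq_mod_preds_cell[nt_index[nt], pi]
                  for pi, nt in enumerate(seq) if nt in nt_index]
    return sum(real_preds) / float(len(real_preds))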
def main():
    # assumes file-level imports: from __future__ import print_function; os,
    # sys, subprocess, h5py, numpy as np, matplotlib.pyplot as plt, seaborn
    # as sns, dna_io, sklearn's PCA and log_loss, plus package helpers
    # load_profile and znorm
    usage = 'usage: %prog [options] <model_file> <profile_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file', help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('--all', dest='all_data', default=False, action='store_true', help='Search all training/valid/test sequences. By default we search only the test set. [Default: %default]')
    parser.add_option('--cuda', dest='cuda', default=False, action='store_true', help='Run on GPGPU [Default: %default]')
    parser.add_option('--cudnn', dest='cudnn', default=False, action='store_true', help='Run on GPGPU w/ cuDNN [Default: %default]')
    parser.add_option('-d', dest='model_out_file', default=None, help='Pre-computed model predictions output table [Default: %default]')
    parser.add_option('-e', dest='norm_even', default=False, action='store_true', help='Normalize the weights for the positive and negative datasets to be even [Default: %default]')
    parser.add_option('-f', dest='font_heat', default=6, type='int', help='Heat map axis font size [Default: %default]')
    parser.add_option('-n', dest='num_dissect', default=10, type='int', help='Dissect the top n hits [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='profile', help='Output directory [Default: %default]')
    parser.add_option('-r', dest='norm_preds', default=False, action='store_true', help='Normalize predictions to have equal frequency [Default: %default]')
    parser.add_option('-z', dest='weight_zero', default=1.0, type='float', help='Adjust the weights for the zero samples by this value [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide Basset model file, activity profile file, and input sequences (as a FASTA file or test data in an HDF5 file)')
    else:
        model_file = args[0]
        profile_file = args[1]
        input_file = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        # convert to arrays
        seqs = np.array(seqs)
        seq_headers = np.array(seq_headers)

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False)
        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] / 4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError, UnicodeDecodeError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            seq_headers = np.array([h.decode('UTF-8') for h in hdf5_in['test_headers']])
            hdf5_in.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    # GPU options (needed below, too)
    gpgpu_str = ''
    if options.cudnn:
        gpgpu_str = '-cudnn'
    elif options.cuda:
        gpgpu_str = '-cuda'

    if options.model_out_file is None:
        options.model_out_file = '%s/preds.txt' % options.out_dir

        torch_cmd = 'basset_predict.lua -mc_n 10 -rc %s %s %s %s' % (gpgpu_str, model_file, model_input_hdf5, options.model_out_file)
        print(torch_cmd)
        subprocess.call(torch_cmd, shell=True)

    # read in predictions
    seqs_preds = np.loadtxt(options.model_out_file)
    num_targets = seqs_preds.shape[1]

    #################################################################
    # parse profile file
    #################################################################
    activity_profile, profile_weights, profile_mask, target_labels = load_profile(profile_file, num_targets, options.norm_even, options.weight_zero)

    # normalize predictions
    if options.norm_preds:
        pred_means = seqs_preds.mean(axis=0)

        # save to file for basset_refine.py
        np.save('%s/pred_means' % options.out_dir, pred_means)

        # aim for profile weighted average
        aim_mean = np.average(pred_means[profile_mask], weights=profile_weights[profile_mask])

        # normalize
        for ti in range(seqs_preds.shape[1]):
            # use 0.25 rather than 1/4, which is 0 under classic division
            ratio_ti = pred_means[ti] / aim_mean
            if profile_mask[ti] and (ratio_ti < 0.25 or ratio_ti > 4):
                print('WARNING: target %d with mean %.4f differs 4-fold from the weighted mean %.3f' % (ti, pred_means[ti], aim_mean), file=sys.stderr)
            seqs_preds[:, ti] = znorm(seqs_preds[:, ti], pred_means[ti], aim_mean)

    #################################################################
    # plot clustered heat map limited to relevant targets
    #################################################################
    seqs_preds_prof = seqs_preds[:, profile_mask]
    seqs_preds_var = seqs_preds_prof.var(axis=1)
    seqs_sort_var = np.argsort(seqs_preds_var)[::-1]

    # heat map
    plt.figure()
    g = sns.clustermap(np.transpose(seqs_preds_prof[seqs_sort_var[:1500]]), metric='cosine', linewidths=0, yticklabels=target_labels[profile_mask], xticklabels=False)
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    for label in g.ax_heatmap.yaxis.get_majorticklabels():
        label.set_fontsize(options.font_heat)
    plt.savefig('%s/heat_clust.pdf' % options.out_dir)
    plt.close()

    # dimension reduction
    # (a t-SNE on top of a 50-component PCA is left here as an alternative)
    # model_pca = PCA(n_components=50)
    # spp_pca = model_pca.fit_transform(np.transpose(seqs_preds_prof))
    # model = TSNE(n_components=2, perplexity=5, metric='euclidean')
    # spp_dr = model.fit_transform(spp_pca)
    model = PCA(n_components=2)
    spp_dr = model.fit_transform(np.transpose(seqs_preds_prof))

    plt.figure()
    plt.scatter(spp_dr[:, 0], spp_dr[:, 1], c='black', s=5)
    target_labels_prof_concise = [tl.split(':')[-1] for tl in target_labels[profile_mask]]
    for label, x, y, activity in zip(target_labels_prof_concise, spp_dr[:, 0], spp_dr[:, 1], activity_profile[profile_mask]):
        plt.annotate(label, xy=(x, y), size=10, color=sns.color_palette('deep')[int(activity)])
    plt.savefig('%s/dim_red.pdf' % options.out_dir)
    plt.close()

    #################################################################
    # compute profile distances
    #################################################################
    # compute prediction distances
    seqs_pdists = []
    for si in range(seqs_preds.shape[0]):
        # sd = np.power(seqs_preds[si,profile_mask]-activity_profile[profile_mask], 2).sum()
        sd = log_loss(activity_profile[profile_mask], seqs_preds[si, profile_mask], sample_weight=profile_weights[profile_mask])
        seqs_pdists.append(sd)
    seqs_pdists = np.array(seqs_pdists)

    # obtain sorted indexes
    seqs_sort_dist = np.argsort(seqs_pdists)

    # compute target distances
    seqs_tdists = []
    for si in range(seqs_preds.shape[0]):
        tdists = np.absolute(targets[si, profile_mask] - activity_profile[profile_mask])
        tdists_weight = np.multiply(tdists, profile_weights[profile_mask])
        td = tdists_weight.sum()
        seqs_tdists.append(td)
    seqs_tdists = np.array(seqs_tdists)

    # print as table
    table_out = open('%s/table.txt' % options.out_dir, 'w')
    for si in seqs_sort_dist:
        cols = [si, seqs_pdists[si], seqs_tdists[si]] + list(seqs_preds[si, profile_mask])
        print('\t'.join([str(c) for c in cols]), file=table_out)
    table_out.close()

    #################################################################
    # plot sorted heat map
    #################################################################
    plt.figure()
    g = sns.clustermap(np.transpose(seqs_preds_prof[seqs_sort_dist[:1000]]), col_cluster=False, metric='cosine', linewidths=0, yticklabels=target_labels[profile_mask], xticklabels=False)
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    for label in g.ax_heatmap.yaxis.get_majorticklabels():
        label.set_fontsize(options.font_heat)
    plt.savefig('%s/heat_rank.pdf' % options.out_dir)
    plt.close()

    #################################################################
    # dissect the top hits
    #################################################################
    satmut_targets = ','.join([str(ti) for ti in range(len(activity_profile)) if profile_mask[ti]])

    if gpgpu_str != '':
        gpgpu_str = '-%s' % gpgpu_str

    for ni in range(options.num_dissect):
        si = seqs_sort_dist[ni]

        # print FASTA
        fasta_file = '%s/seq%d.fa' % (options.out_dir, ni)
        fasta_out = open(fasta_file, 'w')
        print('>%s\n%s' % (seq_headers[si], seqs[si]), file=fasta_out)
        fasta_out.close()

        # saturated mutagenesis
        cmd = 'basset_sat.py %s --mc_n 10 -n 500 -o %s/satmut%d -t %s %s %s' % (gpgpu_str, options.out_dir, ni, satmut_targets, model_file, fasta_file)
        subprocess.call(cmd, shell=True)

        # predictions and targets heat
        profile_sort = np.argsort(activity_profile[profile_mask])
        heat_mat = np.array([activity_profile[profile_mask], targets[si, profile_mask], seqs_preds_prof[si]])
        heat_mat = heat_mat[:, profile_sort]

        plt.figure()
        ax = sns.heatmap(np.transpose(heat_mat), yticklabels=target_labels[profile_mask][profile_sort], xticklabels=['Desired', 'Experiment', 'Prediction'])
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=-45)
        plt.setp(ax.yaxis.get_majorticklabels(), rotation=0)
        for label in ax.yaxis.get_majorticklabels():
            label.set_fontsize(options.font_heat)
        plt.savefig('%s/heat%d.pdf' % (options.out_dir, ni))
        plt.close()
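# NOTE: load_profile and znorm come from elsewhere in the package. As an
# illustration only, here is one way a profile file with
# 'index<TAB>label<TAB>activity' rows (a hypothetical format) could be parsed
# into the four arrays the call above expects; the real parser and its
# weighting scheme may differ. Assumes numpy is imported as np.
def load_profile(profile_file, num_targets, norm_even=False, weight_zero=1.0):
    activity_profile = np.zeros(num_targets)
    profile_weights = np.zeros(num_targets)
    profile_mask = np.zeros(num_targets, dtype=bool)
    target_labels = np.array(['t%d' % ti for ti in range(num_targets)], dtype=object)

    for line in open(profile_file):
        a = line.split()
        ti, label, activity = int(a[0]), a[1], float(a[2])
        activity_profile[ti] = activity
        profile_mask[ti] = True
        target_labels[ti] = label
        # down- or up-weight the zero-activity targets
        profile_weights[ti] = weight_zero if activity == 0 else 1.0

    if norm_even:
        # balance total weight between positive and zero targets
        pos_mask = profile_mask & (activity_profile > 0)
        zero_mask = profile_mask & (activity_profile == 0)
        if pos_mask.sum() and zero_mask.sum():
            profile_weights[zero_mask] *= float(pos_mask.sum()) / zero_mask.sum()

    return activity_profile, profile_weights, profile_mask, target_labels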
def main():
    # assumes file-level imports: os, random, subprocess, h5py, numpy as np,
    # dna_io, plus the basset_motifs helper functions referenced below
    usage = 'usage: %prog [options] <model_file> <test_hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='act_t', default=0.5, type='float', help='Activation threshold (as proportion of max) to consider for PWM [Default: %default]')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5.')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-m', dest='meme_db', default='%s/data/motifs/Homo_sapiens.meme' % os.environ['BASSETDIR'], help='MEME database used to annotate motifs')
    parser.add_option('-p', dest='plot_heats', default=False, action='store_true', help='Plot heat maps describing filter activations in the test sequences [Default: %default]')
    parser.add_option('-s', dest='sample', default=None, type='int', help='Sample sequences from the test set [Default: %default]')
    parser.add_option('-t', dest='trim_filters', default=False, action='store_true', help='Trim uninformative positions off the filter ends [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and test data in HDF5 format.')
    else:
        model_file = args[0]
        test_hdf5_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # load data
    #################################################################
    # load sequences
    test_hdf5_in = h5py.File(test_hdf5_file, 'r')
    seq_vecs = np.array(test_hdf5_in['test_in'])
    seq_targets = np.array(test_hdf5_in['test_out'])
    try:
        target_names = list(test_hdf5_in['target_labels'])
    except KeyError:
        target_names = ['t%d' % ti for ti in range(seq_targets.shape[1])]
    test_hdf5_in.close()

    #################################################################
    # sample
    #################################################################
    if options.sample is not None:
        # choose sampled indexes
        sample_i = np.array(random.sample(xrange(seq_vecs.shape[0]), options.sample))

        # filter
        seq_vecs = seq_vecs[sample_i]
        seq_targets = seq_targets[sample_i]

        # create a new HDF5 file
        sample_hdf5_file = '%s/sample.h5' % options.out_dir
        sample_hdf5_out = h5py.File(sample_hdf5_file, 'w')
        sample_hdf5_out.create_dataset('test_in', data=seq_vecs)
        sample_hdf5_out.close()

        # update test HDF5
        test_hdf5_file = sample_hdf5_file

    # convert to letters
    seqs = dna_io.vecs2dna(seq_vecs)

    #################################################################
    # Torch predict
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_motifs_predict.lua %s %s %s' % (model_file, test_hdf5_file, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    # load model output
    model_hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    filter_weights = np.array(model_hdf5_in['weights'])
    filter_outs = np.array(model_hdf5_in['outs'])
    model_hdf5_in.close()

    # store useful variables
    num_filters = filter_weights.shape[0]
    filter_size = filter_weights.shape[2]

    #################################################################
    # individual filter plots
    #################################################################
    # also save information contents
    filters_ic = []
    meme_out = meme_intro('%s/filters_meme.txt' % options.out_dir, seqs)

    for f in range(num_filters):
        print 'Filter %d' % f

        # plot filter parameters as a heatmap
        plot_filter_heat(filter_weights[f, :, :], '%s/filter%d_heat.pdf' % (options.out_dir, f))

        # write possum motif file
        filter_possum(filter_weights[f, :, :], 'filter%d' % f, '%s/filter%d_possum.txt' % (options.out_dir, f), options.trim_filters)

        # plot weblogo of high scoring outputs
        plot_filter_logo(filter_outs[:, f, :], filter_size, seqs, '%s/filter%d_logo' % (options.out_dir, f), maxpct_t=options.act_t)

        # make a PWM for the filter
        filter_pwm, nsites = make_filter_pwm('%s/filter%d_logo.fa' % (options.out_dir, f))

        if nsites < 10:
            # no information
            filters_ic.append(0)
        else:
            # compute and save information content
            filters_ic.append(info_content(filter_pwm))

            # add to the meme motif file
            meme_add(meme_out, f, filter_pwm, nsites, options.trim_filters)

    meme_out.close()

    #################################################################
    # annotate filters
    #################################################################
    # run tomtom
    subprocess.call('tomtom -dist pearson -thresh 0.1 -oc %s/tomtom %s/filters_meme.txt %s' % (options.out_dir, options.out_dir, options.meme_db), shell=True)

    # read in annotations
    filter_names = name_filters(num_filters, '%s/tomtom/tomtom.txt' % options.out_dir, options.meme_db)

    #################################################################
    # print a table of information
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    # print header for later pandas reading
    header_cols = ('', 'consensus', 'annotation', 'ic', 'mean', 'std')
    print >> table_out, '%3s %19s %10s %5s %6s %6s' % header_cols

    for f in range(num_filters):
        # collapse to a consensus motif
        consensus = filter_motif(filter_weights[f, :, :])

        # grab annotation
        annotation = '.'
        name_pieces = filter_names[f].split('_')
        if len(name_pieces) > 1:
            annotation = name_pieces[1]

        # plot density of filter output scores
        fmean, fstd = plot_score_density(np.ravel(filter_outs[:, f, :]), '%s/filter%d_dens.pdf' % (options.out_dir, f))

        row_cols = (f, consensus, annotation, filters_ic[f], fmean, fstd)
        print >> table_out, '%-3d %19s %10s %5.2f %6.4f %6.4f' % row_cols

    table_out.close()

    #################################################################
    # global filter plots
    #################################################################
    if options.plot_heats:
        # plot filter-sequence heatmap
        plot_filter_seq_heat(filter_outs, '%s/filter_seqs.pdf' % options.out_dir)

        # plot filter-segment heatmap
        plot_filter_seg_heat(filter_outs, '%s/filter_segs.pdf' % options.out_dir)
        plot_filter_seg_heat(filter_outs, '%s/filter_segs_raw.pdf' % options.out_dir, whiten=False)

        # plot filter-target correlation heatmap
        plot_target_corr(filter_outs, seq_targets, filter_names, target_names, '%s/filter_target_cors_mean.pdf' % options.out_dir, 'mean')
        plot_target_corr(filter_outs, seq_targets, filter_names, target_names, '%s/filter_target_cors_max.pdf' % options.out_dir, 'max')
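# NOTE: info_content is another helper defined outside this excerpt. For a
# DNA position weight matrix, information content is conventionally the sum
# over positions of (2 bits minus the Shannon entropy of that position). A
# sketch under that convention, assuming each row of the PWM holds the four
# nucleotide probabilities at one position (both are assumptions, not the
# package's actual helper):
import math

def info_content(pwm, pseudocount=1e-9):
    ic = 0.0
    for row in pwm:
        # Shannon entropy in bits, with a pseudocount guarding log(0)
        entropy = -sum(p * math.log(p + pseudocount, 2) for p in row)
        ic += 2.0 - entropy
    return ic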
def main():
    # assumes file-level imports: os, math, random, subprocess, h5py,
    # numpy as np, plus package helpers vecs2dna and consider_rc
    usage = 'usage: %prog [options] <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='targets_file', default=None, help='File labeling targets in the second column [Default: %default]')
    parser.add_option('-c', dest='center_nt', default=50, type='int', help='Center nt to consider k-mers from [Default: %default]')
    parser.add_option('-d', dest='model_out_file', default=None, help='Pre-computed model output table.')
    parser.add_option('-k', dest='kmer', default=8, type='int', help='K-mer length [Default: %default]')
    parser.add_option('-l', dest='seq_len', default=1000, type='int', help='Input sequence length [Default: %default]')
    parser.add_option('-n', dest='num_seqs', default=100000, type='int', help='Number of sequences to predict [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-r', dest='rc', default=False, action='store_true', help='Consider k-mers w/ their reverse complements [Default: %default]')
    parser.add_option('-t', dest='targets', default=None, help='Comma-separated list of targets to analyze in more depth [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide Basset model file.')
    else:
        model_file = args[0]

    random.seed(2)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # generate random sequences
    #################################################################
    # random sequences
    seq_vecs = np.zeros((options.num_seqs, 4, 1, options.seq_len), dtype='float16')
    for si in range(options.num_seqs):
        for li in range(options.seq_len):
            ni = random.randint(0, 3)
            seq_vecs[si, ni, 0, li] = 1

    # create a new HDF5 file
    seq_hdf5_file = '%s/seqs.h5' % options.out_dir
    seq_hdf5_out = h5py.File(seq_hdf5_file, 'w')
    seq_hdf5_out.create_dataset('test_in', data=seq_vecs)
    seq_hdf5_out.close()

    # get fasta
    seq_dna = vecs2dna(seq_vecs)

    #################################################################
    # Torch predict
    #################################################################
    if options.model_out_file is None:
        options.model_out_file = '%s/model_out.txt' % options.out_dir
        torch_cmd = 'basset_predict.lua -scores %s %s %s' % (model_file, seq_hdf5_file, options.model_out_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    # load scores
    seq_scores = np.loadtxt(options.model_out_file, dtype='float32')

    # read target labels
    if options.targets_file:
        target_labels = [line.split()[1] for line in open(options.targets_file)]
    else:
        target_labels = ['t%d' % (ti+1) for ti in range(seq_scores.shape[1])]

    if options.targets is None:
        options.targets = range(seq_scores.shape[1])
    else:
        options.targets = [int(ti) for ti in options.targets.split(',')]

    #################################################################
    # process and output
    #################################################################
    kmers_start = (options.seq_len - options.center_nt) / 2

    for ti in options.targets:
        ##############################################
        # hash scores by k-mer
        ##############################################
        kmer_scores = {}
        for si in range(len(seq_dna)):
            # get score
            sscore = seq_scores[si, ti]

            # hash to each center k-mer
            for ki in range(kmers_start, kmers_start + options.center_nt):
                kmer = seq_dna[si][ki:ki + options.kmer]
                if options.rc:
                    kmer = consider_rc(kmer)
                kmer_scores.setdefault(kmer, []).append(sscore)

        ##############################################
        # print table
        ##############################################
        table_out = open('%s/table%d.txt' % (options.out_dir, ti), 'w')
        for kmer in kmer_scores:
            # count, mean, and standard error of the mean
            cols = (kmer, len(kmer_scores[kmer]), np.mean(kmer_scores[kmer]), np.std(kmer_scores[kmer])/math.sqrt(len(kmer_scores[kmer])))
            print >> table_out, '%s %4d %6.3f %6.3f' % cols
        table_out.close()
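# NOTE: with -r, each k-mer is hashed together with its reverse complement
# via the external consider_rc helper. A plausible implementation picks a
# canonical representative, e.g. the lexicographic minimum of the k-mer and
# its reverse complement (a sketch, not the package's actual helper):
def consider_rc(kmer):
    rc_map = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    kmer_rc = ''.join(rc_map[nt] for nt in reversed(kmer))
    # canonicalize so a k-mer and its reverse complement share one hash key
    return min(kmer, kmer_rc)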