# Shared imports for the Basset scripts collected below (assumed from usage;
# the original file headers were not included in this excerpt).
import os
import random
import subprocess
import sys
from optparse import OptionParser

import h5py
import matplotlib.pyplot as plt
import numpy as np
import numpy.random as npr
import pandas as pd
import seaborn as sns
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.metrics import log_loss

import dna_io

# NOTE: the profile script further below uses print() with a file= argument,
# so under Python 2 it additionally needs `from __future__ import print_function`.


def main():
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file',
                      help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('-d', dest='model_hdf5_file', default=None,
                      help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-g', dest='gain_height', default=False, action='store_true',
                      help='Nucleotide heights determined by the max of loss and gain [Default: %default]')
    parser.add_option('-m', dest='min_limit', default=0.1, type='float',
                      help='Minimum heatmap limit [Default: %default]')
    parser.add_option('-n', dest='center_nt', default=200, type='int',
                      help='Center nt to mutate and plot in the heat map [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat',
                      help='Output directory [Default: %default]')
    parser.add_option('-s', dest='sample', default=None, type='int',
                      help='Sample sequences from the test set [Default: %default]')
    parser.add_option('-t', dest='targets', default='0',
                      help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and input sequences '
                     '(as a FASTA file or test data in an HDF5 file)')
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file, options.input_activity_file,
                mean_norm=False, whiten=False, permute=False, sort=False)

            # read in target names
            target_labels = open(options.input_activity_file).readline().strip().split('\t')
        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = random.sample(xrange(seqs_1hot.shape[0]), options.sample)
            seqs_1hot = seqs_1hot[np.array(sample_i)]
            # seqs and seq_headers are plain lists here, so subset them with
            # comprehensions rather than numpy fancy indexing
            seqs = [seqs[i] for i in sample_i]
            seq_headers = [seq_headers[i] for i in sample_i]
            if targets is not None:
                targets = targets[np.array(sample_i)]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] / 4))

        # write as test data to an HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5
        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            try:  # TEMP
                seq_headers = np.array(hdf5_in['test_headers'])
                target_labels = np.array(hdf5_in['target_labels'])
            except KeyError:
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                targets = targets[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5' % options.out_dir
                h5f = h5py.File(model_input_hdf5, 'w')
                h5f.create_dataset('test_in', data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_sat_predict.lua -center_nt %d %s %s %s' % (
            options.center_nt, model_file, model_input_hdf5, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    seq_mod_preds = np.array(hdf5_in['seq_mod_preds'])
    hdf5_in.close()

    # trim seqs to match seq_mod_preds length
    seq_len = len(seqs[0])
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < seq_len:
        delta_start = (seq_len - delta_len) / 2
        for i in range(len(seqs)):
            seqs[i] = seqs[i][delta_start:delta_start + delta_len]

    # decide which cells to plot
    if options.targets == '-1':
        plot_targets = xrange(seq_mod_preds.shape[3])
    else:
        plot_targets = [int(ci) for ci in options.targets.split(',')]

    #################################################################
    # plot
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    rdbu = sns.color_palette('RdBu_r', 10)

    nts = 'ACGT'
    for si in range(seq_mod_preds.shape[0]):
        try:
            header = seq_headers[si]
        except TypeError:
            header = 'seq%d' % si
        seq = seqs[si]

        # plot some descriptive heatmaps for each individual cell type
        for ci in plot_targets:
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            # compute matrices
            norm_matrix = seq_mod_preds_cell - real_pred_cell
            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            minmax_matrix = np.vstack([min_scores - real_pred_cell,
                                       max_scores - real_pred_cell])

            # prepare figure
            sns.set(style='white', font_scale=0.5)
            sns.axes_style({'axes.linewidth': 1})
            heat_cols = 400
            sad_start = 1
            sad_end = 323
            logo_start = 0
            logo_end = 324
            fig = plt.figure(figsize=(20, 3))
            ax_logo = plt.subplot2grid((3, heat_cols), (0, logo_start),
                                       colspan=(logo_end - logo_start))
            ax_sad = plt.subplot2grid((3, heat_cols), (1, sad_start),
                                      colspan=(sad_end - sad_start))
            ax_heat = plt.subplot2grid((3, heat_cols), (2, 0), colspan=heat_cols)

            # print a WebLogo of the sequence
            vlim = max(options.min_limit, abs(minmax_matrix).max())
            if options.gain_height:
                seq_heights = 0.25 + 1.75 / vlim * abs(minmax_matrix).max(axis=0)
            else:
                seq_heights = 0.25 + 1.75 / vlim * (-minmax_matrix[0])
            logo_eps = '%s/%s_c%d_seq.eps' % (options.out_dir, header_filename(header), ci)
            seq_logo(seq, seq_heights, logo_eps)

            # add to figure
            logo_png = '%s.png' % logo_eps[:-4]
            subprocess.call('convert -density 300 %s %s' % (logo_eps, logo_png), shell=True)
            logo = Image.open(logo_png)
            ax_logo.imshow(logo)
            ax_logo.set_axis_off()

            # plot loss and gain SAD scores
            ax_sad.plot(-minmax_matrix[0], c=rdbu[0], label='loss', linewidth=1)
            ax_sad.plot(minmax_matrix[1], c=rdbu[-1], label='gain', linewidth=1)
            ax_sad.set_xlim(0, minmax_matrix.shape[1])
            ax_sad.legend()
            # ax_sad.grid(True, linestyle=':')
            for axis in ['top', 'bottom', 'left', 'right']:
                ax_sad.spines[axis].set_linewidth(0.5)

            # plot real-normalized scores
            vlim = max(options.min_limit, abs(norm_matrix).max())
            sns.heatmap(norm_matrix, linewidths=0, cmap='RdBu_r',
                        vmin=-vlim, vmax=vlim, xticklabels=False, ax=ax_heat)
            ax_heat.yaxis.set_ticklabels('TGCA', rotation='horizontal')

            # save final figure
            plt.tight_layout()
            plt.savefig('%s/%s_c%d_heat.pdf' % (options.out_dir, header.replace(':', '_'), ci),
                        dpi=300)
            plt.close()

        #################################################################
        # print table of nt variability for each cell
        #################################################################
        for ci in range(seq_mod_preds.shape[3]):
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            loss_matrix = real_pred_cell - seq_mod_preds_cell.min(axis=0)
            gain_matrix = seq_mod_preds_cell.max(axis=0) - real_pred_cell

            for pos in range(seq_mod_preds_cell.shape[1]):
                cols = [header, delta_start + pos, ci, loss_matrix[pos], gain_matrix[pos]]
                print >> table_out, '\t'.join([str(c) for c in cols])

    table_out.close()
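#################################################################
# The main() above calls get_real_pred() and header_filename(), which are
# not defined in this excerpt. The sketches below are plausible stand-ins
# under stated assumptions, not necessarily Basset's exact implementations.
#################################################################
def get_real_pred(seq_mod_preds_cell, seq):
    # Assumed behavior: seq_mod_preds_cell is a 4 x L matrix holding one
    # prediction per possible nucleotide at each position; the entry whose
    # row matches the reference nucleotide at any non-N position equals the
    # unmodified sequence's prediction.
    si = 0
    while seq[si] == 'N':
        si += 1
    ni = 'ACGT'.index(seq[si])
    return seq_mod_preds_cell[ni, si]


def header_filename(header):
    # Assumed behavior: sanitize a FASTA header for use in output file names.
    return header.replace(':', '_').replace('/', '_')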
def main():
    usage = 'usage: %prog [options] <fasta_file> <targets_file> <out_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='add_features_file', default=None,
                      help='Table of additional features')
    parser.add_option('-b', dest='batch_size', default=None, type='int',
                      help='Align sizes with batch size')
    parser.add_option('-c', dest='counts', default=False, action='store_true',
                      help='Validation and test proportions are given as raw counts [Default: %default]')
    parser.add_option('-e', dest='extend_length', type='int', default=None,
                      help='Extend all sequences to this length [Default: %default]')
    parser.add_option('-r', dest='permute', default=False, action='store_true',
                      help='Permute sequences [Default: %default]')
    parser.add_option('-s', dest='random_seed', default=1, type='int',
                      help='numpy.random seed [Default: %default]')
    parser.add_option('-t', dest='test_pct', default=0, type='float',
                      help='Test % [Default: %default]')
    parser.add_option('-v', dest='valid_pct', default=0, type='float',
                      help='Validation % [Default: %default]')
    parser.add_option('--vt', dest='valid_test', default=False, action='store_true',
                      help='Use validation as test, too [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide fasta file, targets file, and an output prefix')
    else:
        fasta_file = args[0]
        targets_file = args[1]
        out_file = args[2]

    # seed rng before shuffle
    npr.seed(options.random_seed)

    #################################################################
    # load data
    #################################################################
    seqs, targets = dna_io.load_data_1hot(fasta_file, targets_file,
                                          extend_len=options.extend_length,
                                          mean_norm=False, whiten=False,
                                          permute=False, sort=False)

    # reshape sequences for torch
    seqs = seqs.reshape((seqs.shape[0], 4, 1, seqs.shape[1] / 4))

    # read headers
    headers = []
    for line in open(fasta_file):
        if line[0] == '>':
            headers.append(line[1:].rstrip())
    headers = np.array(headers)

    # read labels
    target_labels = open(targets_file).readline().strip().split('\t')

    # read additional features
    if options.add_features_file:
        df_add = pd.read_table(options.add_features_file, index_col=0)
        df_add = df_add.astype(np.float32, copy=False)

    # permute
    if options.permute:
        order = npr.permutation(seqs.shape[0])
        seqs = seqs[order]
        targets = targets[order]
        headers = headers[order]
        if options.add_features_file:
            df_add = df_add.iloc[order]

    # check proper sum
    if options.counts:
        assert options.test_pct + options.valid_pct <= seqs.shape[0]
    else:
        assert options.test_pct + options.valid_pct <= 1.0

    #################################################################
    # divide data
    #################################################################
    if options.counts:
        test_count = int(options.test_pct)
        valid_count = int(options.valid_pct)
    else:
        test_count = int(0.5 + options.test_pct * seqs.shape[0])
        valid_count = int(0.5 + options.valid_pct * seqs.shape[0])

    train_count = seqs.shape[0] - test_count - valid_count
    train_count = batch_round(train_count, options.batch_size)
    print >> sys.stderr, '%d training sequences' % train_count

    test_count = batch_round(test_count, options.batch_size)
    print >> sys.stderr, '%d test sequences' % test_count

    valid_count = batch_round(valid_count, options.batch_size)
    print >> sys.stderr, '%d validation sequences' % valid_count

    i = 0
    train_seqs, train_targets = seqs[i:i + train_count, :], targets[i:i + train_count, :]
    i += train_count
    valid_seqs, valid_targets, valid_headers = (seqs[i:i + valid_count, :],
                                                targets[i:i + valid_count, :],
                                                headers[i:i + valid_count])
    i += valid_count
    test_seqs, test_targets, test_headers = (seqs[i:i + test_count, :],
                                             targets[i:i + test_count, :],
                                             headers[i:i + test_count])

    if options.add_features_file:
        i = 0
        train_add = df_add.iloc[i:i + train_count]
        i += train_count
        valid_add = df_add.iloc[i:i + valid_count]
        i += valid_count
        test_add = df_add.iloc[i:i + test_count]

    #################################################################
    # construct hdf5 representation
    #################################################################
    h5f = h5py.File(out_file, 'w')
    h5f.create_dataset('target_labels', data=target_labels)

    if train_count > 0:
        h5f.create_dataset('train_in', data=train_seqs)
        h5f.create_dataset('train_out', data=train_targets)

    if valid_count > 0:
        h5f.create_dataset('valid_in', data=valid_seqs)
        h5f.create_dataset('valid_out', data=valid_targets)

    if test_count > 0:
        h5f.create_dataset('test_in', data=test_seqs)
        h5f.create_dataset('test_out', data=test_targets)
        h5f.create_dataset('test_headers', data=test_headers)
    elif options.valid_test:
        h5f.create_dataset('test_in', data=valid_seqs)
        h5f.create_dataset('test_out', data=valid_targets)
        h5f.create_dataset('test_headers', data=valid_headers)

    if options.add_features_file:
        h5f.create_dataset('add_labels', data=list(df_add.columns))

        if train_count > 0:
            h5f.create_dataset('train_add', data=train_add.as_matrix())
        if valid_count > 0:
            h5f.create_dataset('valid_add', data=valid_add.as_matrix())
        if test_count > 0:
            h5f.create_dataset('test_add', data=test_add.as_matrix())
        elif options.valid_test:
            h5f.create_dataset('test_add', data=valid_add.as_matrix())

    h5f.close()
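#################################################################
# batch_round() is called above but not defined in this excerpt. A minimal
# sketch of the assumed behavior: shrink a partition so its size is a
# multiple of the batch size, leaving it untouched when no batch size was
# requested. Basset's actual implementation may differ in rounding details.
#################################################################
def batch_round(count, batch_size):
    if batch_size is not None:
        count -= count % batch_size
    return count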
# NOTE: this fragment exits immediately; everything below the quit() call is
# unreachable legacy code, kept as found.
quit()

# filenames
fasta_file = args[0]
targets_file = args[1]
target_labels_file = args[2]
train_x_file = args[3]
train_y_file = args[4]
val_x_file = args[5]
val_y_file = args[6]
test_x_file = args[7]
test_y_file = args[8]

# get data
print "getting data"
seqs, targets = dna_io.load_data_1hot(fasta_file, targets_file,
                                      extend_len=None, mean_norm=False,
                                      whiten=False, permute=False, sort=False)
assert seqs.shape[0] == targets.shape[0]
seqs = seqs.reshape((seqs.shape[0], 4, seqs.shape[1] / 4))  # shape = (dataset_size, 4, 600)
seqs = np.transpose(seqs, (0, 2, 1))  # make shape = (dataset_size, 600, 4)

# get an array of the cell types
print "getting target labels"
target_labels = []
with open(targets_file, "r") as target_file:
    target_labels = target_file.readline().strip().split("\t")

# permute data if need be
if options.permute:
    pass  # (source truncated here)
def main():
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file',
                      help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('-d', dest='model_hdf5_file', default=None,
                      help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-g', dest='gain_height', default=False, action='store_true',
                      help='Nucleotide heights determined by the max of loss and gain [Default: %default]')
    parser.add_option('-m', dest='min_limit', default=0.1, type='float',
                      help='Minimum heatmap limit [Default: %default]')
    parser.add_option('-n', dest='center_nt', default=0, type='int',
                      help='Center nt to mutate and plot in the heat map [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat',
                      help='Output directory [Default: %default]')
    parser.add_option('-p', dest='print_table_all', default=False, action='store_true',
                      help='Print all targets to the table [Default: %default]')
    parser.add_option('-r', dest='rng_seed', default=1, type='float',
                      help='Random number generator seed [Default: %default]')
    parser.add_option('-s', dest='sample', default=None, type='int',
                      help='Sample sequences from the test set [Default: %default]')
    parser.add_option('-t', dest='targets', default='0',
                      help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and input sequences '
                     '(as a FASTA file or test data in an HDF5 file)')
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(options.rng_seed)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file, options.input_activity_file,
                mean_norm=False, whiten=False, permute=False, sort=False)

            # read in target names
            target_labels = open(options.input_activity_file).readline().strip().split('\t')
        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = random.sample(xrange(seqs_1hot.shape[0]), options.sample)
            seqs_1hot = seqs_1hot[np.array(sample_i)]
            # seqs and seq_headers are plain lists here, so subset them with
            # comprehensions rather than numpy fancy indexing
            seqs = [seqs[i] for i in sample_i]
            seq_headers = [seq_headers[i] for i in sample_i]
            if targets is not None:
                targets = targets[np.array(sample_i)]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] / 4))

        # write as test data to an HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5
        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            try:  # TEMP
                seq_headers = np.array(hdf5_in['test_headers'])
                target_labels = np.array(hdf5_in['target_labels'])
            except KeyError:
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                targets = targets[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5' % options.out_dir
                h5f = h5py.File(model_input_hdf5, 'w')
                h5f.create_dataset('test_in', data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_sat_predict.lua -center_nt %d %s %s %s' % (
            options.center_nt, model_file, model_input_hdf5, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    seq_mod_preds = np.array(hdf5_in['seq_mod_preds'])
    hdf5_in.close()

    # trim seqs to match seq_mod_preds length
    seq_len = len(seqs[0])
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < seq_len:
        delta_start = (seq_len - delta_len) / 2
        for i in range(len(seqs)):
            seqs[i] = seqs[i][delta_start:delta_start + delta_len]

    # decide which cells to plot
    if options.targets == '-1':
        plot_targets = xrange(seq_mod_preds.shape[3])
    else:
        plot_targets = [int(ci) for ci in options.targets.split(',')]

    #################################################################
    # plot
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    rdbu = sns.color_palette("RdBu_r", 10)

    nts = 'ACGT'
    for si in range(seq_mod_preds.shape[0]):
        try:
            header = seq_headers[si]
        except TypeError:
            header = 'seq%d' % si
        seq = seqs[si]

        # plot some descriptive heatmaps for each individual cell type
        for ci in plot_targets:
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            # compute matrices
            norm_matrix = seq_mod_preds_cell - real_pred_cell
            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            minmax_matrix = np.vstack([min_scores - real_pred_cell,
                                       max_scores - real_pred_cell])

            # prepare figure
            sns.set(style='white', font_scale=0.5)
            sns.axes_style({'axes.linewidth': 1})
            heat_cols = 400
            sad_start = 1
            sad_end = 323
            logo_start = 0
            logo_end = 324
            fig = plt.figure(figsize=(20, 3))
            ax_logo = plt.subplot2grid((3, heat_cols), (0, logo_start),
                                       colspan=(logo_end - logo_start))
            ax_sad = plt.subplot2grid((3, heat_cols), (1, sad_start),
                                      colspan=(sad_end - sad_start))
            ax_heat = plt.subplot2grid((3, heat_cols), (2, 0), colspan=heat_cols)

            # print a WebLogo of the sequence
            vlim = max(options.min_limit, abs(minmax_matrix).max())
            if options.gain_height:
                seq_heights = 0.25 + 1.75 / vlim * abs(minmax_matrix).max(axis=0)
            else:
                seq_heights = 0.25 + 1.75 / vlim * (-minmax_matrix[0])
            logo_eps = '%s/%s_c%d_seq.eps' % (options.out_dir, header_filename(header), ci)
            seq_logo(seq, seq_heights, logo_eps)

            # add to figure
            logo_png = '%s.png' % logo_eps[:-4]
            subprocess.call('convert -density 300 %s %s' % (logo_eps, logo_png), shell=True)
            logo = Image.open(logo_png)
            ax_logo.imshow(logo)
            ax_logo.set_axis_off()

            # plot loss and gain SAD scores
            ax_sad.plot(-minmax_matrix[0], c=rdbu[0], label='loss', linewidth=1)
            ax_sad.plot(minmax_matrix[1], c=rdbu[-1], label='gain', linewidth=1)
            ax_sad.set_xlim(0, minmax_matrix.shape[1])
            ax_sad.legend()
            # ax_sad.grid(True, linestyle=':')
            for axis in ['top', 'bottom', 'left', 'right']:
                ax_sad.spines[axis].set_linewidth(0.5)

            # plot real-normalized scores
            vlim = max(options.min_limit, abs(norm_matrix).max())
            sns.heatmap(norm_matrix, linewidths=0, cmap='RdBu_r',
                        vmin=-vlim, vmax=vlim, xticklabels=False, ax=ax_heat)
            ax_heat.yaxis.set_ticklabels('TGCA', rotation='horizontal')

            # save final figure
            plt.tight_layout()
            plt.savefig('%s/%s_c%d_heat.pdf' % (options.out_dir, header.replace(':', '_'), ci),
                        dpi=300)
            plt.close()

        #################################################################
        # print table of nt variability for each cell
        #################################################################
        print_targets = plot_targets
        if options.print_table_all:
            print_targets = range(seq_mod_preds.shape[3])

        for ci in print_targets:
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            loss_matrix = real_pred_cell - seq_mod_preds_cell.min(axis=0)
            gain_matrix = seq_mod_preds_cell.max(axis=0) - real_pred_cell

            for pos in range(seq_mod_preds_cell.shape[1]):
                cols = [header, delta_start + pos, ci, loss_matrix[pos], gain_matrix[pos]]
                print >> table_out, '\t'.join([str(c) for c in cols])

    table_out.close()
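#################################################################
# seq_logo() is called above but not defined in this excerpt; Basset's real
# implementation renders the logo via WebLogo, which the pipeline then
# rasterizes with ImageMagick's convert. The sketch below is only a crude
# matplotlib stand-in so the code above stays runnable: it draws each
# nucleotide with a font size scaled by its height. It is an assumption,
# not the actual renderer.
#################################################################
NT_COLORS = {'A': 'green', 'C': 'blue', 'G': 'orange', 'T': 'red'}


def seq_logo(seq, seq_heights, out_eps):
    fig, ax = plt.subplots(figsize=(max(1, len(seq) / 10), 2))
    for i, nt in enumerate(seq):
        # letter size proportional to the computed height at this position
        ax.text(i, 0, nt, color=NT_COLORS.get(nt, 'black'),
                fontsize=6 * seq_heights[i], family='monospace',
                ha='center', va='bottom')
    ax.set_xlim(-1, len(seq))
    ax.set_ylim(0, 2.5)
    ax.set_axis_off()
    fig.savefig(out_eps, format='eps')
    plt.close(fig)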
def main():
    usage = 'usage: %prog [options] <model_file> <profile_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file',
                      help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('--all', dest='all_data', default=False, action='store_true',
                      help='Search all training/valid/test sequences. By default we search only the test set. [Default: %default]')
    parser.add_option('--cuda', dest='cuda', default=False, action='store_true',
                      help='Run on GPGPU [Default: %default]')
    parser.add_option('--cudnn', dest='cudnn', default=False, action='store_true',
                      help='Run on GPGPU w/cuDNN [Default: %default]')
    parser.add_option('-d', dest='model_out_file', default=None,
                      help='Pre-computed model predictions output table [Default: %default]')
    parser.add_option('-e', dest='norm_even', default=False, action='store_true',
                      help='Normalize the weights for the positive and negative datasets to be even [Default: %default]')
    parser.add_option('-f', dest='font_heat', default=6, type='int',
                      help='Heat map axis font size [Default: %default]')
    parser.add_option('-n', dest='num_dissect', default=10, type='int',
                      help='Dissect the top n hits [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='profile',
                      help='Output directory [Default: %default]')
    parser.add_option('-r', dest='norm_preds', default=False, action='store_true',
                      help='Normalize predictions to have equal frequency [Default: %default]')
    parser.add_option('-z', dest='weight_zero', default=1.0, type='float',
                      help='Adjust the weights for the zero samples by this value [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide Basset model file, activity profile file, and input '
                     'sequences (as a FASTA file or test data in an HDF5 file)')
    else:
        model_file = args[0]
        profile_file = args[1]
        input_file = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        # convert to arrays
        seqs = np.array(seqs)
        seq_headers = np.array(seq_headers)

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file, options.input_activity_file,
                mean_norm=False, whiten=False, permute=False, sort=False)
        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] / 4))

        # write as test data to an HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError, UnicodeDecodeError):
        # input_file is HDF5
        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            seq_headers = np.array([h.decode('UTF-8') for h in hdf5_in['test_headers']])
            hdf5_in.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    # GPU options (needed below, too)
    gpgpu_str = ''
    if options.cudnn:
        gpgpu_str = '-cudnn'
    elif options.cuda:
        gpgpu_str = '-cuda'

    if options.model_out_file is None:
        options.model_out_file = '%s/preds.txt' % options.out_dir

        torch_cmd = 'basset_predict.lua -mc_n 10 -rc %s %s %s %s' % (
            gpgpu_str, model_file, model_input_hdf5, options.model_out_file)
        print(torch_cmd)
        subprocess.call(torch_cmd, shell=True)

    # read in predictions
    seqs_preds = np.loadtxt(options.model_out_file)
    num_targets = seqs_preds.shape[1]

    #################################################################
    # parse profile file
    #################################################################
    activity_profile, profile_weights, profile_mask, target_labels = load_profile(
        profile_file, num_targets, options.norm_even, options.weight_zero)

    # normalize predictions
    if options.norm_preds:
        pred_means = seqs_preds.mean(axis=0)

        # save to file for basset_refine.py
        np.save('%s/pred_means' % options.out_dir, pred_means)

        # aim for profile weighted average
        aim_mean = np.average(pred_means[profile_mask],
                              weights=profile_weights[profile_mask])

        # normalize
        for ti in range(seqs_preds.shape[1]):
            ratio_ti = pred_means[ti] / aim_mean
            # (0.25 rather than 1/4, which is zero under Python 2 integer division)
            if profile_mask[ti] and (ratio_ti < 0.25 or ratio_ti > 4):
                print('WARNING: target %d with mean %.4f differs 4-fold from the weighted mean %.3f'
                      % (ti, pred_means[ti], aim_mean), file=sys.stderr)
            seqs_preds[:, ti] = znorm(seqs_preds[:, ti], pred_means[ti], aim_mean)

    #################################################################
    # plot clustered heat map limited to relevant targets
    #################################################################
    seqs_preds_prof = seqs_preds[:, profile_mask]
    seqs_preds_var = seqs_preds_prof.var(axis=1)
    seqs_sort_var = np.argsort(seqs_preds_var)[::-1]

    # heat map
    plt.figure()
    g = sns.clustermap(np.transpose(seqs_preds_prof[seqs_sort_var[:1500]]),
                       metric='cosine', linewidths=0,
                       yticklabels=target_labels[profile_mask], xticklabels=False)
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    for label in g.ax_heatmap.yaxis.get_majorticklabels():
        label.set_fontsize(options.font_heat)
    plt.savefig('%s/heat_clust.pdf' % options.out_dir)
    plt.close()

    # dimension reduction
    # model_pca = PCA(n_components=50)
    # spp_pca = model.fit_transform(np.transpose(seqs_preds_prof))
    # model = TSNE(n_components=2, perplexity=5, metric='euclidean')
    # spp_dr = model.fit_transform(spp_pca)
    model = PCA(n_components=2)
    spp_dr = model.fit_transform(np.transpose(seqs_preds_prof))
    plt.figure()
    plt.scatter(spp_dr[:, 0], spp_dr[:, 1], c='black', s=5)
    target_labels_prof_concise = [tl.split(':')[-1] for tl in target_labels[profile_mask]]
    for label, x, y, activity in zip(target_labels_prof_concise, spp_dr[:, 0], spp_dr[:, 1],
                                     activity_profile[profile_mask]):
        plt.annotate(label, xy=(x, y), size=10,
                     color=sns.color_palette('deep')[int(activity)])
    plt.savefig('%s/dim_red.pdf' % options.out_dir)
    plt.close()

    #################################################################
    # compute profile distances
    #################################################################
    # compute prediction distances
    seqs_pdists = []
    for si in range(seqs_preds.shape[0]):
        # sd = np.power(seqs_preds[si,profile_mask]-activity_profile[profile_mask], 2).sum()
        sd = log_loss(activity_profile[profile_mask], seqs_preds[si, profile_mask],
                      sample_weight=profile_weights[profile_mask])
        seqs_pdists.append(sd)
    seqs_pdists = np.array(seqs_pdists)

    # obtain sorted indexes
    seqs_sort_dist = np.argsort(seqs_pdists)

    # compute target distances
    seqs_tdists = []
    for si in range(seqs_preds.shape[0]):
        tdists = np.absolute(targets[si, profile_mask] - activity_profile[profile_mask])
        tdists_weight = np.multiply(tdists, profile_weights[profile_mask])
        td = tdists_weight.sum()
        seqs_tdists.append(td)
    seqs_tdists = np.array(seqs_tdists)

    # print as table
    table_out = open('%s/table.txt' % options.out_dir, 'w')
    for si in seqs_sort_dist:
        cols = [si, seqs_pdists[si], seqs_tdists[si]] + list(seqs_preds[si, profile_mask])
        print('\t'.join([str(c) for c in cols]), file=table_out)
    table_out.close()

    #################################################################
    # plot sorted heat map
    #################################################################
    plt.figure()
    g = sns.clustermap(np.transpose(seqs_preds_prof[seqs_sort_dist[:1000]]),
                       col_cluster=False, metric='cosine', linewidths=0,
                       yticklabels=target_labels[profile_mask], xticklabels=False)
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    for label in g.ax_heatmap.yaxis.get_majorticklabels():
        label.set_fontsize(options.font_heat)
    plt.savefig('%s/heat_rank.pdf' % options.out_dir)
    plt.close()

    #################################################################
    # dissect the top hits
    #################################################################
    satmut_targets = ','.join([str(ti) for ti in range(len(activity_profile))
                               if profile_mask[ti]])

    if gpgpu_str != '':
        gpgpu_str = '-%s' % gpgpu_str

    for ni in range(options.num_dissect):
        si = seqs_sort_dist[ni]

        # print FASTA
        fasta_file = '%s/seq%d.fa' % (options.out_dir, ni)
        fasta_out = open(fasta_file, 'w')
        print('>%s\n%s' % (seq_headers[si], seqs[si]), file=fasta_out)
        fasta_out.close()

        # saturated mutagenesis
        cmd = 'basset_sat.py %s --mc_n 10 -n 500 -o %s/satmut%d -t %s %s %s' % (
            gpgpu_str, options.out_dir, ni, satmut_targets, model_file, fasta_file)
        subprocess.call(cmd, shell=True)

        # predictions and targets heat
        profile_sort = np.argsort(activity_profile[profile_mask])
        heat_mat = np.array([activity_profile[profile_mask],
                             targets[si, profile_mask],
                             seqs_preds_prof[si]])
        heat_mat = heat_mat[:, profile_sort]

        plt.figure()
        ax = sns.heatmap(np.transpose(heat_mat),
                         yticklabels=target_labels[profile_mask][profile_sort],
                         xticklabels=['Desired', 'Experiment', 'Prediction'])
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=-45)
        plt.setp(ax.yaxis.get_majorticklabels(), rotation=0)
        for label in ax.yaxis.get_majorticklabels():
            label.set_fontsize(options.font_heat)
        plt.savefig('%s/heat%d.pdf' % (options.out_dir, ni))
        plt.close()
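#################################################################
# znorm() is called in the normalization loop above but not defined in this
# excerpt. A minimal sketch of one plausible mean-matching normalization:
# rescale a target's prediction column so its mean moves from ti_mean to
# aim_mean, clipping to keep valid probabilities. Basset's actual znorm may
# implement a different transform.
#################################################################
def znorm(preds_ti, ti_mean, aim_mean):
    return np.clip(preds_ti * (aim_mean / ti_mean), 0, 1)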
def main():
    usage = 'usage: %prog [options] <fasta_file> <targets_file> <out_file>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide fasta file, targets file, and an output prefix')
    else:
        fasta_file = args[0]
        targets_file = args[1]
        out_file = args[2]

    seqs, targets = dna_io.load_data_1hot(fasta_file, targets_file)
    seqs = seqs.reshape((seqs.shape[0], 4, 1, seqs.shape[1] / 4))

    target_labels = open(targets_file).readline().strip().split('\t')

    # shuffle sequences and targets together
    order = npr.permutation(seqs.shape[0])
    seqs = seqs[order]
    targets = targets[order]

    # partition into 20 roughly equal folds; the last fold absorbs the remainder
    fold_size = int(seqs.shape[0] / 20)
    x = 0
    kseqs = []
    ktargets = []
    for i in range(19):
        kseqs.append(seqs[x:x + fold_size, :].tolist())
        ktargets.append(targets[x:x + fold_size, :].tolist())
        x += fold_size
    kseqs.append(seqs[x:seqs.shape[0], :].tolist())
    ktargets.append(targets[x:seqs.shape[0], :].tolist())

    # write one HDF5 file per fold: fold i is validation, all others training
    for i in range(20):
        name = out_file + str(i + 1) + '.h5'
        valid_seqs = kseqs[i]
        valid_targets = ktargets[i]

        # seed the training set with the first fold that is not i,
        # then append the rest
        first_j = 1 if i == 0 else 0
        train_seqs = kseqs[first_j][:]
        train_targets = ktargets[first_j][:]
        for j in range(20):
            if j != i and j != first_j:
                train_seqs += kseqs[j][:]
                train_targets += ktargets[j][:]

        h5f = h5py.File(name, 'w')
        h5f.create_dataset('target_labels', data=target_labels)
        h5f.create_dataset('train_in', data=train_seqs)
        h5f.create_dataset('train_out', data=train_targets)
        h5f.create_dataset('valid_in', data=valid_seqs)
        h5f.create_dataset('valid_out', data=valid_targets)
        h5f.close()
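#################################################################
# Hypothetical usage of the fold splitter above: if it was run with
# out_file='folds', each of folds1.h5 .. folds20.h5 can be read back as
# shown below (the 'folds' prefix is an example, not from the source).
#################################################################
# with h5py.File('folds1.h5', 'r') as h5f:
#     train_in = h5f['train_in'][:]
#     train_out = h5f['train_out'][:]
#     valid_in = h5f['valid_in'][:]
#     valid_out = h5f['valid_out'][:]
#     print train_in.shape, valid_in.shape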
def main():
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file',
                      help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('-d', dest='model_hdf5_file', default=None,
                      help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat',
                      help='Output directory [Default: %default]')
    parser.add_option('-r', dest='rng_seed', default=1, type='float',
                      help='Random number generator seed [Default: %default]')
    parser.add_option('-s', dest='sample', default=None, type='int',
                      help='Sample sequences from the test set [Default: %default]')
    parser.add_option('-t', dest='targets', default='0',
                      help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and input sequences '
                     '(as a FASTA file or test data in an HDF5 file)')
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(options.rng_seed)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file, options.input_activity_file,
                mean_norm=False, whiten=False, permute=False, sort=False)

            # read in target names
            target_labels = open(options.input_activity_file).readline().strip().split('\t')
        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = random.sample(xrange(seqs_1hot.shape[0]), options.sample)
            seqs_1hot = seqs_1hot[np.array(sample_i)]
            # seqs and seq_headers are plain lists here, so subset them with
            # comprehensions rather than numpy fancy indexing
            seqs = [seqs[i] for i in sample_i]
            seq_headers = [seq_headers[i] for i in sample_i]
            if targets is not None:
                targets = targets[np.array(sample_i)]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] / 4))

        # write as test data to an HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5
        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            try:  # TEMP
                seq_headers = np.array(hdf5_in['test_headers'])
                target_labels = np.array(hdf5_in['target_labels'])
            except KeyError:
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                targets = targets[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5' % options.out_dir
                h5f = h5py.File(model_input_hdf5, 'w')
                h5f.create_dataset('test_in', data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_net_predict.lua %s %s %s' % (
            model_file, model_input_hdf5, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    #################################################################
    # load hidden layer representations
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    reprs = []
    l = 1
    while 'reprs%d' % l in hdf5_in.keys():
        reprs.append(np.array(hdf5_in['reprs%d' % l]))
        l += 1
    hdf5_in.close()

    #################################################################
    # plot
    #################################################################
    print len(reprs)
    for l in range(len(reprs)):
        for si in range(len(seq_headers)):
            plt.figure()
            # just write the sequence out above it
            # or maybe I'll ultimately want to write an
            # influence version. yea probably.
            print reprs[l][si].shape
            sns.heatmap(reprs[l][si], linewidths=0, xticklabels=False)
            plt.savefig('%s/%s_l%d.pdf' % (options.out_dir,
                                           header_filename(seq_headers[si]), l))
            plt.close()
def main(): usage = "usage: %prog [options] <model_file> <profile_file> <input_file>" parser = OptionParser(usage) parser.add_option( "-a", dest="input_activity_file", help="Optional activity table corresponding to an input FASTA file" ) parser.add_option( "--all", dest="all_data", default=False, action="store_true", help="Search all training/valid/test sequences. By default we search only the test set. [Default: %default]", ) parser.add_option( "--cuda", dest="cuda", default=False, action="store_true", help="Run on GPGPU [Default: %default]" ) parser.add_option( "--cudnn", dest="cudnn", default=False, action="store_true", help="Run on GPGPU w/cuDNN [Default: %default]" ) parser.add_option( "-d", dest="model_out_file", default=None, help="Pre-computed model predictions output table [Default: %default]", ) parser.add_option( "-e", dest="norm_even", default=False, action="store_true", help="Normalize the weights for the positive and negative datasets to be even [Default: %default]", ) parser.add_option("-f", dest="font_heat", default=6, type="int", help="Heat map axis font size [Default: %default]") parser.add_option( "-n", dest="num_dissect", default=10, type="int", help="Dissect the top n hits [Default: %default]" ) parser.add_option("-o", dest="out_dir", default="profile", help="Output directory [Default: %default]") parser.add_option( "-r", dest="norm_preds", default=False, action="store_true", help="Normalize predictions to have equal frequency [Default: %default]", ) parser.add_option( "-z", dest="weight_zero", default=1.0, type="float", help="Adjust the weights for the zero samples by this value [Default: %default]", ) (options, args) = parser.parse_args() if len(args) != 3: parser.error( "Must provide Basset model file, activity profile file, and input sequences (as a FASTA file or test data in an HDF file)" ) else: model_file = args[0] profile_file = args[1] input_file = args[2] if not os.path.isdir(options.out_dir): os.mkdir(options.out_dir) ################################################################# # parse input file ################################################################# try: # input_file is FASTA # load sequences and headers seqs = [] seq_headers = [] for line in open(input_file): if line[0] == ">": seq_headers.append(line[1:].rstrip()) seqs.append("") else: seqs[-1] += line.rstrip() # convert to arrays seqs = np.array(seqs) seq_headers = np.array(seq_headers) model_input_hdf5 = "%s/model_in.h5" % options.out_dir if options.input_activity_file: # one hot code seqs_1hot, targets = dna_io.load_data_1hot( input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False ) else: # load sequences seqs_1hot = dna_io.load_sequences(input_file, permute=False) targets = None # reshape sequences for torch seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] / 4)) # write as test data to a HDF5 file h5f = h5py.File(model_input_hdf5, "w") h5f.create_dataset("test_in", data=seqs_1hot) h5f.close() except (IOError, IndexError, UnicodeDecodeError): # input_file is HDF5 try: model_input_hdf5 = input_file # load (sampled) test data from HDF5 hdf5_in = h5py.File(input_file, "r") seqs_1hot = np.array(hdf5_in["test_in"]) targets = np.array(hdf5_in["test_out"]) seq_headers = np.array([h.decode("UTF-8") for h in hdf5_in["test_headers"]]) hdf5_in.close() # convert to ACGT sequences seqs = dna_io.vecs2dna(seqs_1hot) except IOError: parser.error("Could not parse input file as FASTA or HDF5.") 
################################################################# # Torch predict modifications ################################################################# # GPU options (needed below, too) gpgpu_str = "" if options.cudnn: gpgpu_str = "-cudnn" elif options.cuda: gpgpu_str = "-cuda" if options.model_out_file is None: options.model_out_file = "%s/preds.txt" % options.out_dir torch_cmd = "basset_predict.lua -mc_n 10 -rc %s %s %s %s" % ( gpgpu_str, model_file, model_input_hdf5, options.model_out_file, ) print(torch_cmd) subprocess.call(torch_cmd, shell=True) # read in predictions seqs_preds = np.loadtxt(options.model_out_file) num_targets = seqs_preds.shape[1] ################################################################# # parse profile file ################################################################# activity_profile, profile_weights, profile_mask, target_labels = load_profile( profile_file, num_targets, options.norm_even, options.weight_zero ) # normalize predictions if options.norm_preds: pred_means = seqs_preds.mean(axis=0) # save to file for basset_refine.py np.save("%s/pred_means" % options.out_dir, pred_means) # aim for profile weighted average aim_mean = np.average(pred_means[profile_mask], weights=profile_weights[profile_mask]) # normalize for ti in range(seqs_preds.shape[1]): ratio_ti = pred_means[ti] / aim_mean if profile_mask[ti] and (ratio_ti < 1 / 4 or ratio_ti > 4): print( "WARNING: target %d with mean %.4f differs 4-fold from the median %.3f" % (ti, pred_means[ti], aim_mean), file=sys.stderr, ) seqs_preds[:, ti] = znorm(seqs_preds[:, ti], pred_means[ti], aim_mean) ################################################################# # plot clustered heat map limited to relevant targets ################################################################# seqs_preds_prof = seqs_preds[:, profile_mask] seqs_preds_var = seqs_preds_prof.var(axis=1) seqs_sort_var = np.argsort(seqs_preds_var)[::-1] # heat map plt.figure() g = sns.clustermap( np.transpose(seqs_preds_prof[seqs_sort_var[:1500]]), metric="cosine", linewidths=0, yticklabels=target_labels[profile_mask], xticklabels=False, ) plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) for label in g.ax_heatmap.yaxis.get_majorticklabels(): label.set_fontsize(options.font_heat) plt.savefig("%s/heat_clust.pdf" % options.out_dir) plt.close() # dimension reduction # model_pca = PCA(n_components=50) # spp_pca = model.fit_transform(np.transpose(seqs_preds_prof)) # model = TSNE(n_components=2, perplexity=5, metric='euclidean') # spp_dr = model.fit_transform(spp_pca) model = PCA(n_components=2) spp_dr = model.fit_transform(np.transpose(seqs_preds_prof)) plt.figure() plt.scatter(spp_dr[:, 0], spp_dr[:, 1], c="black", s=5) target_labels_prof_concise = [tl.split(":")[-1] for tl in target_labels[profile_mask]] for label, x, y, activity in zip( target_labels_prof_concise, spp_dr[:, 0], spp_dr[:, 1], activity_profile[profile_mask] ): plt.annotate(label, xy=(x, y), size=10, color=sns.color_palette("deep")[int(activity)]) plt.savefig("%s/dim_red.pdf" % options.out_dir) plt.close() ################################################################# # compute profile distances ################################################################# # compute prediction distances seqs_pdists = [] for si in range(seqs_preds.shape[0]): # sd = np.power(seqs_preds[si,profile_mask]-activity_profile[profile_mask], 2).sum() sd = log_loss( activity_profile[profile_mask], seqs_preds[si, profile_mask], sample_weight=profile_weights[profile_mask] ) 
        seqs_pdists.append(sd)
    seqs_pdists = np.array(seqs_pdists)

    # obtain sorted indexes
    seqs_sort_dist = np.argsort(seqs_pdists)

    # compute target distances
    seqs_tdists = []
    for si in range(seqs_preds.shape[0]):
        tdists = np.absolute(targets[si, profile_mask] - activity_profile[profile_mask])
        tdists_weight = np.multiply(tdists, profile_weights[profile_mask])
        td = tdists_weight.sum()
        seqs_tdists.append(td)
    seqs_tdists = np.array(seqs_tdists)

    # print as table
    table_out = open("%s/table.txt" % options.out_dir, "w")
    for si in seqs_sort_dist:
        cols = [si, seqs_pdists[si], seqs_tdists[si]] + list(seqs_preds[si, profile_mask])
        print("\t".join([str(c) for c in cols]), file=table_out)
    table_out.close()

    #################################################################
    # plot sorted heat map
    #################################################################
    plt.figure()
    g = sns.clustermap(
        np.transpose(seqs_preds_prof[seqs_sort_dist[:1000]]),
        col_cluster=False,
        metric="cosine",
        linewidths=0,
        yticklabels=target_labels[profile_mask],
        xticklabels=False,
    )
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    for label in g.ax_heatmap.yaxis.get_majorticklabels():
        label.set_fontsize(options.font_heat)
    plt.savefig("%s/heat_rank.pdf" % options.out_dir)
    plt.close()

    #################################################################
    # dissect the top hits
    #################################################################
    satmut_targets = ",".join([str(ti) for ti in range(len(activity_profile)) if profile_mask[ti]])

    if gpgpu_str != "":
        gpgpu_str = "-%s" % gpgpu_str

    for ni in range(options.num_dissect):
        si = seqs_sort_dist[ni]

        # print FASTA
        fasta_file = "%s/seq%d.fa" % (options.out_dir, ni)
        fasta_out = open(fasta_file, "w")
        print(">%s\n%s" % (seq_headers[si], seqs[si]), file=fasta_out)
        fasta_out.close()

        # saturated mutagenesis
        cmd = "basset_sat.py %s --mc_n 10 -n 500 -o %s/satmut%d -t %s %s %s" % (
            gpgpu_str,
            options.out_dir,
            ni,
            satmut_targets,
            model_file,
            fasta_file,
        )
        subprocess.call(cmd, shell=True)

        # predictions and targets heat
        profile_sort = np.argsort(activity_profile[profile_mask])
        heat_mat = np.array([activity_profile[profile_mask], targets[si, profile_mask], seqs_preds_prof[si]])
        heat_mat = heat_mat[:, profile_sort]

        plt.figure()
        ax = sns.heatmap(
            np.transpose(heat_mat),
            yticklabels=target_labels[profile_mask][profile_sort],
            xticklabels=["Desired", "Experiment", "Prediction"],
        )
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=-45)
        plt.setp(ax.yaxis.get_majorticklabels(), rotation=0)
        for label in ax.yaxis.get_majorticklabels():
            label.set_fontsize(options.font_heat)
        plt.savefig("%s/heat%d.pdf" % (options.out_dir, ni))
        plt.close()
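# NOTE: load_profile and znorm are called above but defined elsewhere in the
# script; the sketches below are assumptions inferred from the call sites,
# not the original implementations. The assumed profile file format is one
# whitespace-delimited line per target: <target_index> <target_label>
# <activity>, where activity is 0 or 1 and unlisted targets fall outside
# the mask.
def load_profile(profile_file, num_targets, norm_even=False, weight_zero=1.0):
    activity_profile = np.full(num_targets, np.nan)
    target_labels = np.array(["t%d" % ti for ti in range(num_targets)], dtype=object)
    for line in open(profile_file):
        a = line.split()
        ti = int(a[0])
        target_labels[ti] = a[1]
        activity_profile[ti] = float(a[2])

    # mask of targets with a specified desired activity
    profile_mask = ~np.isnan(activity_profile)

    # weight the masked targets equally by default
    profile_weights = np.zeros(num_targets)
    profile_weights[profile_mask] = 1.0

    if norm_even:
        # balance the total weight between positive and negative targets
        pos_mask = profile_mask & (activity_profile == 1)
        neg_mask = profile_mask & (activity_profile == 0)
        if pos_mask.sum() > 0:
            profile_weights[pos_mask] = 1.0 / pos_mask.sum()
        if neg_mask.sum() > 0:
            profile_weights[neg_mask] = 1.0 / neg_mask.sum()

    # adjust the weight on the zero (negative) targets
    profile_weights[profile_mask & (activity_profile == 0)] *= weight_zero

    return activity_profile, profile_weights, profile_mask, target_labels


# Minimal sketch of the normalization step, assuming a simple mean-ratio
# rescale toward the profile-wide aim; the real znorm may operate on a
# different scale (e.g. logits or z-scores).
def znorm(preds, pred_mean, aim_mean):
    # rescale the column so its mean moves to aim_mean, clipping to keep
    # the values interpretable as probabilities for log_loss above
    return np.clip(preds * (aim_mean / pred_mean), 0, 1)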
def main():
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='input_activity_file',
        help='Optional activity table corresponding to an input FASTA file')
    parser.add_option(
        '-d',
        dest='model_hdf5_file',
        default=None,
        help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat',
                      help='Output directory [Default: %default]')
    parser.add_option('-r', dest='rng_seed', default=1, type='float',
                      help='Random number generator seed [Default: %default]')
    parser.add_option(
        '-s',
        dest='sample',
        default=None,
        type='int',
        help='Sample sequences from the test set [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets',
        default='0',
        help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(
            'Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF file)')
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(options.rng_seed)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file,
                options.input_activity_file,
                mean_norm=False,
                whiten=False,
                permute=False,
                sort=False)

            # read in target names
            target_labels = open(
                options.input_activity_file).readline().strip().split('\t')

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(
                random.sample(xrange(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            # convert the header list to an array before fancy indexing
            seq_headers = np.array(seq_headers)[sample_i]
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape(
            (seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] / 4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            try:  # TEMP
                seq_headers = np.array(hdf5_in['test_headers'])
                target_labels = np.array(hdf5_in['target_labels'])
            except KeyError:
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(
                    random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5' % options.out_dir
                h5f = h5py.File(model_input_hdf5, 'w')
                h5f.create_dataset('test_in', data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_net_predict.lua %s %s %s' % (
            model_file, model_input_hdf5, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    #################################################################
    # load layer representations
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    reprs = []
    l = 1
    while 'reprs%d' % l in hdf5_in.keys():
        reprs.append(np.array(hdf5_in['reprs%d' % l]))
        l += 1
    hdf5_in.close()

    #################################################################
    # plot
    #################################################################
    print len(reprs)
    for l in range(len(reprs)):
        for si in range(len(seq_headers)):
            plt.figure()

            # TODO: annotate the heat map with the sequence itself;
            # an influence-weighted version may ultimately be more useful.
            print reprs[l][si].shape
            sns.heatmap(reprs[l][si], linewidths=0, xticklabels=False)
            plt.savefig('%s/%s_l%d.pdf' % (options.out_dir,
                                           header_filename(seq_headers[si]), l))
            plt.close()
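# NOTE: header_filename is called above but not shown in this excerpt. A
# minimal sketch of the assumed behavior: sanitize a FASTA header into a
# string safe to embed in the PDF file names built above.
def header_filename(header):
    # keep alphanumerics; map any other character (':', '/', whitespace, ...)
    # to an underscore so the header can serve as a file name component
    return ''.join(c if c.isalnum() else '_' for c in header)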