def dna_length_1hot(seq, length):
    ''' Center-crop or N-pad a sequence to a target length, then 1hot code it.

    Attrs:
        seq (str) : DNA sequence
        length (int) : desired sequence length

    Return:
        seq_1hot : result of dna_one_hot applied to the adjusted sequence
        seq (str) : the adjusted sequence
    '''
    # positive when the sequence is too long, negative when too short
    excess = len(seq) - length

    if excess > 0:
        # crop around the center; an odd surplus loses the extra base at the back
        offset = excess // 2
        seq = seq[offset:offset + length]
    elif excess < 0:
        # pad with N's on both sides; an odd deficit puts the extra N at the back
        deficit = -excess
        pad_front = deficit // 2
        pad_back = deficit - pad_front
        seq = 'N' * pad_front + seq + 'N' * pad_back

    return dna_one_hot(seq), seq
def dna_length_1hot(seq, length):
    ''' Fit a sequence to the requested length and return its 1hot coding.

    Longer sequences are trimmed symmetrically around their center; shorter
    ones are flanked with N's. A sequence already at the requested length is
    passed through unchanged.

    Attrs:
        seq (str) : DNA sequence
        length (int) : desired sequence length

    Return:
        seq_1hot : result of dna_one_hot applied to the adjusted sequence
        seq (str) : the adjusted sequence
    '''
    current_len = len(seq)

    if current_len > length:
        # keep the central window of the requested size
        window_start = (current_len - length) // 2
        window_end = window_start + length
        seq = seq[window_start:window_end]

    elif current_len < length:
        # split the missing bases between the two flanks
        missing = length - current_len
        left_pad = 'N' * (missing // 2)
        right_pad = 'N' * (missing - len(left_pad))
        seq = left_pad + seq + right_pad

    seq_1hot = dna_one_hot(seq)
    return seq_1hot, seq
def snps_seq1_fixed(snps, genome_fasta, seq_len):
    ''' Produce an array of one hot coded sequences for a list of SNPs.

    For every SNP whose reference allele matches the genome, the reference
    sequence is emitted first, followed by one sequence per alternate allele.
    SNPs whose reference allele disagrees with the genome are skipped with a
    warning on stderr.

    Attrs:
        snps [SNP] : list of SNPs; each must provide chrom, pos, rsid,
                     ref_allele, alt_alleles and longest_alt()
        genome_fasta (str) : genome FASTA file
        seq_len (int) : sequence length to code

    Return:
        seq_vecs (array) : one hot coded sequences surrounding the SNPs
        seqs [str] : sequences
        seq_headers [str] : headers for sequences
        snps [SNP] : list of SNPs without unmatched entries
    '''
    # integer division: these values are used as slice bounds and as genome
    # coordinates, so they must stay ints under Python 3
    left_len = seq_len // 2 - 1
    right_len = seq_len // 2

    # open genome FASTA
    genome = pysam.Fastafile(genome_fasta)

    # one hot coded vectors, sequence strings, and sequence names
    seq_vecs_list = []
    seqs = []
    seq_headers = []

    # snps without unmatched entries
    snps_cleaned = []

    try:
        for snp in snps:
            # specify positions in GFF-style 1-based
            # NOTE(review): longest_alt() is presumably the length of the
            # longest alt allele, so the fetched window is sized for the
            # alleles substituted below; confirm against the SNP class
            seq_start = snp.pos - left_len
            seq_end = (snp.pos + right_len + len(snp.ref_allele)
                       - snp.longest_alt())

            # extract sequence as BED style
            seq = genome.fetch(snp.chrom, seq_start - 1, seq_end).upper()

            # verify that ref allele matches ref sequence
            seq_ref = seq[left_len:left_len + len(snp.ref_allele)]
            if seq_ref != snp.ref_allele:
                print('WARNING: skipping %s because reference allele does not match reference genome: %s vs %s' % (
                    snp.rsid, snp.ref_allele, seq_ref), file=sys.stderr)
                continue

            snps_cleaned.append(snp)

            # one hot code ref allele
            seq_vecs_list.append(dna_one_hot(seq[:seq_len], seq_len))
            seqs.append(seq[:seq_len])

            # name ref allele
            seq_headers.append('%s_%s' % (snp.rsid, cap_allele(snp.ref_allele)))

            for alt_al in snp.alt_alleles:
                # remove ref allele and include alt allele
                seq_alt = (seq[:left_len] + alt_al
                           + seq[left_len + len(snp.ref_allele):])

                # one hot code
                seq_vecs_list.append(dna_one_hot(seq_alt, seq_len))
                seqs.append(seq_alt)

                # name
                seq_headers.append('%s_%s' % (snp.rsid, cap_allele(alt_al)))
    finally:
        # release the FASTA handle even if a fetch fails
        genome.close()

    # stack
    seq_vecs = np.vstack(seq_vecs_list)

    return seq_vecs, seqs, seq_headers, snps_cleaned
def main():
    ''' Greedily refine a sequence toward a target activity profile.

    Command line: [options] <model_file> <profile_file> <fasta_file>

    Steps:
      1. Read the input FASTA, one hot code it, and make initial predictions.
      2. Load the target profile (and optionally normalize predictions using
         the means file given by -r).
      3. Repeatedly run saturated mutagenesis via basset_sat_predict.lua under
         $BASSETDIR/src and accept a single-nucleotide mutation whose log loss
         against the profile beats the current one by the proportion given by
         -s, until no mutation qualifies.
      4. Write final_table.txt, final_heat.pdf (only if at least one mutation
         was accepted) and final_seq.fa to the output directory, then launch
         basset_sat.py on the refined sequence.

    The -a option is parsed but not used by this function.
    '''
    usage = 'usage: %prog [options] <model_file> <profile_file> <fasta_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file',
                      help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('-e', dest='norm_even', default=False, action='store_true',
                      help='Normalize the weights for the positive and negative datasets to be even [Default: %default]')
    parser.add_option('--cuda', dest='cuda', default=False, action='store_true',
                      help='Run on GPGPU [Default: %default]')
    parser.add_option('--cudnn', dest='cudnn', default=False, action='store_true',
                      help='Run on GPGPU w/cuDNN [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='refine',
                      help='Output directory [Default: %default]')
    parser.add_option('-r', dest='norm_preds_file', default=None,
                      help='Prediction means file used to normalize predictions to have equal frequency')
    parser.add_option('-s', dest='early_stop', default=.05, type='float',
                      help='Proportion by which the mutation must improve to be accepted [Default: %default]')
    parser.add_option('-z', dest='weight_zero', default=1.0, type='float',
                      help='Adjust the weights for the zero samples by this value [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide Basset model file, activity profile file, and sequence FASTA file')
    else:
        model_file = args[0]
        profile_file = args[1]
        input_file = args[2]

    # create the output directory (and any missing parents)
    os.makedirs(options.out_dir, exist_ok=True)

    # single-dash flag for the Torch script; cuDNN takes precedence over CUDA
    gpgpu_str = ''
    if options.cudnn:
        gpgpu_str = '-cudnn'
    elif options.cuda:
        gpgpu_str = '-cuda'

    #################################################################
    # prep sequence
    #################################################################

    # load sequence; every non-header line is concatenated into one sequence
    # and the last header line seen is kept. The header falls back to an
    # empty string so a FASTA without a '>' line does not fail at the very end.
    header = ''
    seq_parts = []
    with open(input_file) as fasta_in:
        for line in fasta_in:
            if line[0] == '>':
                header = line[1:].rstrip()
            else:
                seq_parts.append(line.rstrip())
    seq = ''.join(seq_parts)

    # convert to one hot coding
    seq_1hot = dna_io.dna_one_hot(seq)
    seq_1hot = np.reshape(seq_1hot, (1, 4, 1, len(seq)))

    # make initial predictions
    seq_preds = predict_seq(model_file, seq_1hot, gpgpu_str, options.out_dir)
    num_targets = seq_preds.shape[0]

    #################################################################
    # prep profile
    #################################################################
    activity_profile, profile_weights, profile_mask, target_labels = load_profile(
        profile_file, num_targets, options.norm_even, options.weight_zero)

    # normalize predictions
    if options.norm_preds_file is not None:
        pred_means = np.load(options.norm_preds_file)

        # aim for profile weighted average
        aim_mean = np.average(pred_means[profile_mask],
                              weights=profile_weights[profile_mask])

        # normalize
        for ti in range(num_targets):
            seq_preds[ti] = znorm(seq_preds[ti], pred_means[ti], aim_mean)

    #################################################################
    # iteratively refine
    #################################################################
    nts = 'ACGT'

    # a mutation is accepted only if its distance, inflated by this factor,
    # is still below the best distance so far (1.05 at the default -s of .05)
    accept_factor = 1.0 + options.early_stop

    local_max = False
    refined_profile_list = [seq_preds[profile_mask]]
    ri = 1
    while not local_max:
        print('Refinement stage %d' % ri, flush=True)

        # write sequence to HDF5
        seq_hdf5_file = '%s/seq%d.h5' % (options.out_dir, ri)
        with h5py.File(seq_hdf5_file, 'w') as seq_hdf5_out:
            seq_hdf5_out.create_dataset('test_in', data=seq_1hot)

        # perform saturated mutagenesis
        # NOTE(review): paths are interpolated unquoted into a shell command;
        # file names containing spaces or shell metacharacters will break it
        sat_hdf5_file = '%s/satmut%d.h5' % (options.out_dir, ri)
        torch_cmd = '%s/src/basset_sat_predict.lua %s -rc %s %s %s' % (
            os.environ['BASSETDIR'], gpgpu_str, model_file, seq_hdf5_file,
            sat_hdf5_file)
        subprocess.call(torch_cmd, shell=True)

        # read results into 4 x L x T
        with h5py.File(sat_hdf5_file, 'r') as sat_hdf5_in:
            seq_mod_preds = np.array(sat_hdf5_in['seq_mod_preds'])
        seq_mod_preds = seq_mod_preds.squeeze()

        # normalize
        if options.norm_preds_file is not None:
            for ti in range(seq_mod_preds.shape[2]):
                seq_mod_preds[:, :, ti] = znorm(seq_mod_preds[:, :, ti],
                                                pred_means[ti], aim_mean)

        # find sequence prediction
        # NOTE(review): get_real_nt presumably returns a (nucleotide, position)
        # index pair whose entry holds the unmutated prediction; confirm
        ni, li = get_real_nt(seq)

        # set to min
        seq_dist = log_loss(activity_profile[profile_mask],
                            seq_mod_preds[ni, li, profile_mask],
                            sample_weight=profile_weights[profile_mask])
        min_dist = seq_dist
        min_entry = (li, ni)
        local_max = True

        # consider mutated sequences
        for li in range(len(seq)):
            for ni in range(4):
                # skip the nucleotide already present at this position
                if seq_1hot[0, ni, 0, li] == 0:
                    # compute distance
                    mut_dist = log_loss(
                        activity_profile[profile_mask],
                        seq_mod_preds[ni, li, profile_mask],
                        sample_weight=profile_weights[profile_mask])

                    # compare to min; the running best is updated as we go,
                    # so later candidates must beat it by the same margin
                    if mut_dist * accept_factor < min_dist:
                        local_max = False
                        min_dist = mut_dist
                        min_entry = (li, ni)

        # update
        if local_max:
            print(' Maximized')
        else:
            # update trace
            li, ni = min_entry
            print(' Mutate %d %s --> %s' % (li, seq[li], nts[ni]))
            print(' Distance decreases from %.3f to %.3f' % (seq_dist, min_dist),
                  flush=True)

            # update sequence; one_hot_set's return value is unused, so it is
            # expected to modify seq_1hot in place
            seq = seq[:li] + nts[ni] + seq[li + 1:]
            dna_io.one_hot_set(seq_1hot[0], li, nts[ni])

            # save profile
            refined_profile_list.append(seq_mod_preds[ni, li, profile_mask])

        ri += 1

    #################################################################
    # finish
    #################################################################
    refined_profiles = np.array(refined_profile_list)

    # print refinement table: stage, target index, predicted value
    with open('%s/final_table.txt' % options.out_dir, 'w') as table_out:
        for ri in range(refined_profiles.shape[0]):
            # pi walks the masked-in targets only
            pi = 0
            for ti in range(num_targets):
                if profile_mask[ti]:
                    cols = (ri, ti, refined_profiles[ri, pi])
                    print('%-3d %3d %.3f' % cols, file=table_out)
                    pi += 1

    # heat map (skipped when no mutation was accepted)
    if len(refined_profile_list) > 1:
        plt.figure()
        g = sns.clustermap(np.transpose(refined_profiles),
                           col_cluster=False,
                           metric='euclidean',
                           linewidths=0,
                           yticklabels=target_labels[profile_mask],
                           xticklabels=False)
        plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
        plt.savefig('%s/final_heat.pdf' % options.out_dir)
        plt.close()

    # output sequence
    final_fasta_file = '%s/final_seq.fa' % options.out_dir
    with open(final_fasta_file, 'w') as final_fasta_out:
        print('>%s\n%s' % (header, seq), file=final_fasta_out)

    # perform a new saturated mutagenesis
    satmut_targets = ','.join(
        [str(ti) for ti in range(len(activity_profile)) if profile_mask[ti]])

    # basset_sat.py takes the double-dash form of the GPU flag
    if gpgpu_str != '':
        gpgpu_str = '-%s' % gpgpu_str

    cmd = 'basset_sat.py %s -n 500 -o %s/final_satmut -t %s %s %s' % (
        gpgpu_str, options.out_dir, satmut_targets, model_file,
        final_fasta_file)
    subprocess.call(cmd, shell=True)
def snps_seq1(snps, genome_fasta, seq_len):
    ''' Produce an array of one hot coded sequences for a list of SNPs.

    For every SNP whose reference allele matches the genome, the reference
    sequence is emitted first, followed by one sequence per alternate allele.
    SNPs whose reference allele disagrees with the genome are skipped with a
    warning on stderr.

    Attrs:
        snps [SNP] : list of SNPs; each must provide chrom, pos, rsid,
                     ref_allele, alt_alleles and longest_alt()
        genome_fasta (str) : genome FASTA file
        seq_len (int) : sequence length to code

    Return:
        seq_vecs (array) : one hot coded sequences surrounding the SNPs
        seqs [str] : sequences
        seq_headers [str] : headers for sequences
    '''
    # integer division: these values are used as slice bounds and as genome
    # coordinates, so they must stay ints under Python 3
    left_len = seq_len // 2 - 1
    right_len = seq_len // 2

    # open genome FASTA
    genome = pysam.Fastafile(genome_fasta)

    # initialize one hot coded vector list
    seq_vecs_list = []

    # save sequence strings, too
    seqs = []

    # name sequences
    seq_headers = []

    try:
        for snp in snps:
            # specify positions in GFF-style 1-based
            # NOTE(review): longest_alt() is presumably the length of the
            # longest alt allele; confirm against the SNP class
            seq_start = snp.pos - left_len
            seq_end = (snp.pos + right_len + len(snp.ref_allele)
                       - snp.longest_alt())

            # extract sequence as BED style
            seq = genome.fetch(snp.chrom, seq_start - 1, seq_end).upper()

            # verify that ref allele matches ref sequence
            seq_ref = seq[left_len:left_len + len(snp.ref_allele)]
            if seq_ref != snp.ref_allele:
                print('WARNING: skipping %s because reference allele does not match reference genome: %s vs %s' % (
                    snp.rsid, snp.ref_allele, seq_ref), file=sys.stderr)
                continue

            # one hot code ref allele
            seq_vecs_list.append(dna_one_hot(seq[:seq_len], seq_len))
            seqs.append(seq[:seq_len])

            # name ref allele
            seq_headers.append('%s_%s' % (snp.rsid, cap_allele(snp.ref_allele)))

            for alt_al in snp.alt_alleles:
                # remove ref allele and include alt allele
                seq_alt = (seq[:left_len] + alt_al
                           + seq[left_len + len(snp.ref_allele):])

                # one hot code
                seq_vecs_list.append(dna_one_hot(seq_alt, seq_len))
                seqs.append(seq_alt)

                # name
                seq_headers.append('%s_%s' % (snp.rsid, cap_allele(alt_al)))
    finally:
        # release the FASTA handle even if a fetch fails
        genome.close()

    # stack
    seq_vecs = np.vstack(seq_vecs_list)

    return seq_vecs, seqs, seq_headers
def main():
    ''' Greedily refine a sequence toward a target activity profile.

    Command line: [options] <model_file> <profile_file> <fasta_file>

    Steps:
      1. Read the input FASTA, one hot code it, and make initial predictions.
      2. Load the target profile (and optionally normalize predictions using
         the means file given by -r).
      3. Repeatedly run saturated mutagenesis via basset_sat_predict.lua
         (resolved by the shell) and accept a single-nucleotide mutation whose
         log loss against the profile beats the current one by the proportion
         given by -s, until no mutation qualifies.
      4. Write final_table.txt, final_heat.pdf (only if at least one mutation
         was accepted) and final_seq.fa to the output directory, then launch
         basset_sat.py on the refined sequence.

    The -a option is parsed but not used by this function.
    '''
    usage = 'usage: %prog [options] <model_file> <profile_file> <fasta_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file', help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('-e', dest='norm_even', default=False, action='store_true', help='Normalize the weights for the positive and negative datasets to be even [Default: %default]')
    parser.add_option('--cuda', dest='cuda', default=False, action='store_true', help='Run on GPGPU [Default: %default]')
    parser.add_option('--cudnn', dest='cudnn', default=False, action='store_true', help='Run on GPGPU w/cuDNN [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='refine', help='Output directory [Default: %default]')
    parser.add_option('-r', dest='norm_preds_file', default=None, help='Prediction means file used to normalize predictions to have equal frequency')
    parser.add_option('-s', dest='early_stop', default=.05, type='float', help='Proportion by which the mutation must improve to be accepted [Default: %default]')
    parser.add_option('-z', dest='weight_zero', default=1.0, type='float', help='Adjust the weights for the zero samples by this value [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide Basset model file, activity profile file, and sequence FASTA file')
    else:
        model_file = args[0]
        profile_file = args[1]
        input_file = args[2]

    # create the output directory (and any missing parents)
    os.makedirs(options.out_dir, exist_ok=True)

    # single-dash flag for the Torch script; cuDNN takes precedence over CUDA
    gpgpu_str = ''
    if options.cudnn:
        gpgpu_str = '-cudnn'
    elif options.cuda:
        gpgpu_str = '-cuda'

    #################################################################
    # prep sequence
    #################################################################

    # load sequence; every non-header line is concatenated into one sequence
    # and the last header line seen is kept. The header falls back to an
    # empty string so a FASTA without a '>' line does not fail at the very end.
    header = ''
    seq_chunks = []
    with open(input_file) as fasta_in:
        for line in fasta_in:
            if line[0] == '>':
                header = line[1:].rstrip()
            else:
                seq_chunks.append(line.rstrip())
    seq = ''.join(seq_chunks)

    # convert to one hot coding
    seq_1hot = dna_io.dna_one_hot(seq)
    seq_1hot = np.reshape(seq_1hot, (1,4,1,len(seq)))

    # make initial predictions
    seq_preds = predict_seq(model_file, seq_1hot, gpgpu_str, options.out_dir)
    num_targets = seq_preds.shape[0]

    #################################################################
    # prep profile
    #################################################################
    activity_profile, profile_weights, profile_mask, target_labels = load_profile(profile_file, num_targets, options.norm_even, options.weight_zero)

    # normalize predictions
    if options.norm_preds_file is not None:
        pred_means = np.load(options.norm_preds_file)

        # aim for profile weighted average
        aim_mean = np.average(pred_means[profile_mask], weights=profile_weights[profile_mask])

        # normalize
        for ti in range(num_targets):
            seq_preds[ti] = znorm(seq_preds[ti], pred_means[ti], aim_mean)

    #################################################################
    # iteratively refine
    #################################################################
    nts = 'ACGT'

    # a mutation is accepted only if its distance, inflated by this factor,
    # is still below the best distance so far (1.05 at the default -s of .05)
    accept_factor = 1.0 + options.early_stop

    local_max = False
    refined_profile_list = [seq_preds[profile_mask]]
    ri = 1
    while not local_max:
        print('Refinement stage %d' % ri, flush=True)

        # write sequence to HDF5
        seq_hdf5_file = '%s/seq%d.h5' % (options.out_dir,ri)
        with h5py.File(seq_hdf5_file, 'w') as seq_hdf5_out:
            seq_hdf5_out.create_dataset('test_in', data=seq_1hot)

        # perform saturated mutagenesis
        # NOTE(review): paths are interpolated unquoted into a shell command;
        # file names containing spaces or shell metacharacters will break it
        sat_hdf5_file = '%s/satmut%d.h5' % (options.out_dir,ri)
        torch_cmd = 'basset_sat_predict.lua %s -rc %s %s %s' % (gpgpu_str, model_file, seq_hdf5_file, sat_hdf5_file)
        subprocess.call(torch_cmd, shell=True)

        # read results into 4 x L x T
        with h5py.File(sat_hdf5_file, 'r') as sat_hdf5_in:
            seq_mod_preds = np.array(sat_hdf5_in['seq_mod_preds'])
        seq_mod_preds = seq_mod_preds.squeeze()

        # normalize
        if options.norm_preds_file is not None:
            for ti in range(seq_mod_preds.shape[2]):
                seq_mod_preds[:,:,ti] = znorm(seq_mod_preds[:,:,ti], pred_means[ti], aim_mean)

        # find sequence prediction
        # NOTE(review): get_real_nt presumably returns a (nucleotide, position)
        # index pair whose entry holds the unmutated prediction; confirm
        ni, li = get_real_nt(seq)

        # set to min
        seq_dist = log_loss(activity_profile[profile_mask], seq_mod_preds[ni,li,profile_mask], sample_weight=profile_weights[profile_mask])
        min_dist = seq_dist
        min_entry = (li,ni)
        local_max = True

        # consider mutated sequences
        for li in range(len(seq)):
            for ni in range(4):
                # skip the nucleotide already present at this position
                if seq_1hot[0,ni,0,li] == 0:
                    # compute distance
                    mut_dist = log_loss(activity_profile[profile_mask], seq_mod_preds[ni,li,profile_mask], sample_weight=profile_weights[profile_mask])

                    # compare to min; the running best is updated as we go,
                    # so later candidates must beat it by the same margin
                    if mut_dist*accept_factor < min_dist:
                        local_max = False
                        min_dist = mut_dist
                        min_entry = (li,ni)

        # update
        if local_max:
            print(' Maximized')
        else:
            # update trace
            li, ni = min_entry
            print(' Mutate %d %s --> %s' % (li, seq[li], nts[ni]))
            print(' Distance decreases from %.3f to %.3f' % (seq_dist, min_dist), flush=True)

            # update sequence; one_hot_set's return value is unused, so it is
            # expected to modify seq_1hot in place
            seq = seq[:li] + nts[ni] + seq[li+1:]
            dna_io.one_hot_set(seq_1hot[0], li, nts[ni])

            # save profile
            refined_profile_list.append(seq_mod_preds[ni,li,profile_mask])

        ri += 1

    #################################################################
    # finish
    #################################################################
    refined_profiles = np.array(refined_profile_list)

    # print refinement table: stage, target index, predicted value
    with open('%s/final_table.txt' % options.out_dir, 'w') as table_out:
        for ri in range(refined_profiles.shape[0]):
            # pi walks the masked-in targets only
            pi = 0
            for ti in range(num_targets):
                if profile_mask[ti]:
                    cols = (ri, ti, refined_profiles[ri,pi])
                    print('%-3d %3d %.3f' % cols, file=table_out)
                    pi += 1

    # heat map (skipped when no mutation was accepted)
    if len(refined_profile_list) > 1:
        plt.figure()
        g = sns.clustermap(np.transpose(refined_profiles), col_cluster=False, metric='euclidean', linewidths=0, yticklabels=target_labels[profile_mask], xticklabels=False)
        plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
        plt.savefig('%s/final_heat.pdf' % options.out_dir)
        plt.close()

    # output sequence
    final_fasta_file = '%s/final_seq.fa' % options.out_dir
    with open(final_fasta_file, 'w') as final_fasta_out:
        print('>%s\n%s' % (header, seq), file=final_fasta_out)

    # perform a new saturated mutagenesis
    satmut_targets = ','.join([str(ti) for ti in range(len(activity_profile)) if profile_mask[ti]])

    # basset_sat.py takes the double-dash form of the GPU flag
    if gpgpu_str != '':
        gpgpu_str = '-%s' % gpgpu_str

    cmd = 'basset_sat.py %s -n 500 -o %s/final_satmut -t %s %s %s' % (gpgpu_str, options.out_dir, satmut_targets, model_file, final_fasta_file)
    subprocess.call(cmd, shell=True)