def kmer_kernel(seq, k, canonical=True):
    """Build a normalized k-mer count vector for a DNA sequence.

    Every k-mer free of 'N' is counted; when canonical is True the
    reverse-complement strand is counted as well and the counts are
    collapsed onto canonical k-mers via dna.canonical_kmers. Counts are
    normalized by the sum of squared counts (a plain sum is the
    commented-out alternative below).

    Args:
      seq: DNA sequence string.
      k: k-mer length.
      canonical: count both strands and collapse to canonical k-mers.

    Returns:
      dict mapping k-mer -> normalized count (float).
    """
    counts = {}
    if canonical:
        seq_rc = dna.rc(seq)

    last_start = len(seq) - k + 1
    for i in range(last_start):
        kmer = seq[i:i + k]
        if kmer.find('N') == -1:
            counts[kmer] = counts.get(kmer, 0) + 1
            if canonical:
                kmer_rc = seq_rc[i:i + k]
                counts[kmer_rc] = counts.get(kmer_rc, 0) + 1

    if canonical:
        counts = dna.canonical_kmers(counts)

    # normalize by sum of squared counts
    # kmer_sum = float(sum(counts.values()))
    norm = float(sum(np.square(list(counts.values()))))

    vec = {}
    for kmer in counts:
        vec[kmer] = counts[kmer] / norm
    return vec
def header_gff(header, seq, gff_file, options): header_seqs = {} for line in open(gff_file): a = line.split('\t') a[-1] = a[-1].rstrip() if (not options.exon or a[2] == 'exon') and a[0] == header: kv = gff.gtf_kv(a[8]) #head_id = kv.get(options.header_key,a[8]+'_'+a[0]+':'+a[3]+'-'+a[4]) head_id = kv.get(options.header_key,a[8]) if options.gene_too: head_id += ' gene=%s' % kv.get('gene_id','') feat_start = int(a[3]) feat_end = int(a[4]) feat_seq = seq[feat_start-1:feat_end] if a[6] == '+': header_seqs[head_id] = header_seqs.get(head_id,'') + feat_seq else: header_seqs[head_id] = dna.rc(feat_seq) + header_seqs.get(head_id,'') for header in header_seqs: print '>%s' % header if options.split_lines: i = 0 while i < len(header_seqs[header]): print header_seqs[header][i:i+60] i += 60 else: print header_seqs[header]
def header_gff(header, seq, gff_file, options): header_seqs = {} for line in open(gff_file): a = line.split('\t') a[-1] = a[-1].rstrip() if (not options.exon or a[2] == 'exon') and a[0] == header: try: kv = gff.gtf_kv(a[8]) except: kv = {} head_id = kv.get(options.header_key,a[0]+':'+a[3]+'-'+a[4]) #head_id = kv.get(options.header_key,a[8]) if options.gene_too: head_id += ' gene=%s' % kv.get('gene_id','') feat_start = int(a[3]) feat_end = int(a[4]) feat_seq = seq[feat_start-1:feat_end] if a[6] == '+': header_seqs[head_id] = header_seqs.get(head_id,'') + feat_seq else: header_seqs[head_id] = dna.rc(feat_seq) + header_seqs.get(head_id,'') for header in header_seqs: print '>%s' % header if options.split_lines: i = 0 while i < len(header_seqs[header]): print header_seqs[header][i:i+60] i += 60 else: print header_seqs[header]
def kmer_kernel(seq, k, canonical=True):
    """Build a normalized k-mer count vector for a DNA sequence.

    Counts every k-mer that contains no 'N'; when canonical is True the
    reverse-complement strand is counted as well and the counts are
    collapsed onto canonical k-mers via dna.canonical_kmers. Counts are
    normalized by the sum of squared counts (a plain sum is the
    commented-out alternative below).

    Args:
      seq: DNA sequence string.
      k: k-mer length.
      canonical: count both strands and collapse to canonical k-mers.

    Returns:
      dict mapping k-mer -> normalized count (float).
    """
    kmer_counts = {}
    if canonical:
        seq_rc = dna.rc(seq)

    for i in range(len(seq) - k + 1):
        kmer = seq[i:i + k]
        if 'N' not in kmer:
            kmer_counts[kmer] = kmer_counts.get(kmer, 0) + 1
        if canonical:
            # FIX: seq_rc[i:i+k] is NOT rc(seq[i:i+k]) — rc reverses
            # coordinates, so this window comes from elsewhere in the
            # sequence and needs its own 'N' filter. Previously an
            # N-containing rc window could be counted whenever the
            # forward k-mer at i happened to be N-free.
            kmer_rc = seq_rc[i:i + k]
            if 'N' not in kmer_rc:
                kmer_counts[kmer_rc] = kmer_counts.get(kmer_rc, 0) + 1

    if canonical:
        kmer_counts = dna.canonical_kmers(kmer_counts)

    # normalize by sum of squared counts
    # kmer_sum = float(sum(kmer_counts.values()))
    kmer_sum = float(sum(np.square(list(kmer_counts.values()))))

    vec = {}
    for kmer in kmer_counts:
        vec[kmer] = kmer_counts[kmer] / kmer_sum
    return vec
def header_bed(header, seq, bed_file, options): for line in open(bed_file): a = line.split('\t') a[-1] = a[-1].rstrip() if a[0] == header: feat_start = int(a[1]) feat_end = int(a[2]) feat_strand = '+' if len(a) > 5 and a[5] == '-': feat_strand = '-' feat_header = '' if len(a) > 3 and a[3] != '.': feat_header = a[3] + ':' feat_header += '%s:%d-%d:%s' % (header,feat_start,feat_end,feat_strand) if feat_strand == '+': feat_seq = seq[feat_start:feat_end] else: feat_seq = dna.rc(seq[feat_start:feat_end]) #print '>%s\n%s' % (feat_header, feat_seq) print '>%s' % feat_header i = 0 while i < len(feat_seq): print feat_seq[i:i+60] i += 60
def make_te_read_fastas(te_gff, bam_file, read_tes, out_dir, stranded, max_reads):
    """Write BAM reads overlapping transposable elements to per-TE FASTA files.

    For every dfam TE named in te_gff, two files are created in out_dir
    ({te}_fwd.fa and {te}_rev.fa). Reads from bam_file that read_tes maps
    to a TE are written to the fwd file when read and TE strands agree
    (or when stranded is False), otherwise to the rev file; at most
    max_reads-1 reads are written per file, but all are counted.

    Args:
      te_gff: GFF file whose attribute column carries a 'dfam' key.
      bam_file: BAM alignment file, read with pysam.
      read_tes: dict qname -> {dfam_te: (read_strand, te_strand) or None}.
      out_dir: output directory for the FASTA files.
      stranded: if True, separate reads by strand agreement.
      max_reads: cap on reads printed per output file.

    Returns:
      dict (dfam_te, orient) -> renormalization factor (total/max_reads,
      floored at 1.0), only for files with more than 10 reads.
    """
    # open TE read fasta files
    te_fastas = {}
    for line in open(te_gff):
        a = line.split('\t')
        dfam_te = gff.gtf_kv(a[8])['dfam']
        if not (dfam_te,'fwd') in te_fastas:
            te_fastas[(dfam_te,'fwd')] = open('%s/%s_fwd.fa' % (out_dir,dfam_te), 'w')
            te_fastas[(dfam_te,'rev')] = open('%s/%s_rev.fa' % (out_dir,dfam_te), 'w')

    # initialize counters for total reads
    te_totals = {}
    for dfam_te, orient in te_fastas:
        te_totals[dfam_te, orient] = 0

    # print reads to fasta files
    for aligned_read in pysam.Samfile(bam_file, 'rb'):
        this_read_tes = read_tes.get(aligned_read.qname,{})
        for dfam_te in this_read_tes.keys():
            # entries are set to None below once printed, so a multi-mapped
            # read is emitted at most once per TE
            if this_read_tes[dfam_te] != None:
                (rstrand, tstrand) = this_read_tes[dfam_te]

                # only print if we match the read strand
                if (aligned_read.is_reverse and rstrand == '-') or (not aligned_read.is_reverse and rstrand == '+'):
                    # TE determines reversal
                    if tstrand == '+':
                        rseq = aligned_read.seq
                    else:
                        rseq = dna.rc(aligned_read.seq)

                    # count, and print
                    # NOTE(review): '< max_reads' after incrementing means at
                    # most max_reads-1 reads are printed — confirm intended.
                    if not stranded or rstrand == tstrand:
                        te_totals[(dfam_te,'fwd')] += 1
                        if te_totals[(dfam_te,'fwd')] < max_reads:
                            print >> te_fastas[(dfam_te,'fwd')], '>%s\n%s' % (aligned_read.qname,rseq)
                    else:
                        te_totals[(dfam_te,'rev')] += 1
                        if te_totals[(dfam_te,'rev')] < max_reads:
                            print >> te_fastas[(dfam_te,'rev')], '>%s\n%s' % (aligned_read.qname,rseq)

                    # specify printed
                    this_read_tes[dfam_te] = None

    # post-process fasta files
    te_renorm = {}
    for dfam_te, orient in te_fastas:
        # close
        te_fastas[(dfam_te, orient)].close()

        # return renormalization factors
        if te_totals[(dfam_te,orient)] > 10:
            te_renorm[(dfam_te,orient)] = max(1.0, te_totals[(dfam_te,orient)]/float(max_reads))

    return te_renorm
def process_chr(chrom, seq, promoters, out_fa, out_gff, promoter_length, acgt_t):
    """Write promoter FASTA and GFF records for one chromosome.

    For each promoter, a promoter_length window starting at prom.start
    is extracted (reverse complemented on the minus strand) and written
    to out_fa/out_gff when its ACGT fraction exceeds acgt_t.
    """
    # grab promoters
    for prom in promoters:
        window = seq[prom.start:prom.start + promoter_length]
        if prom.strand != '+':
            window = dna.rc(window)

        # skip promoters with too many ambiguous characters
        if acgt_pct(window) > acgt_t:
            print >> out_fa, '>%s\n%s' % (prom.gtf_kv['transcript_id'], window)

            # GFF is 1-based, inclusive
            fields = [chrom, '.', 'promoter',
                      str(prom.start + 1),
                      str(prom.start + promoter_length + 1 - 1),
                      '.', prom.strand, '.',
                      gff.kv_gtf(prom.gtf_kv)]
            print >> out_gff, '\t'.join(fields)
def header_bed(header, seq, bed_file, options): for line in open(bed_file): a = line.split('\t') a[-1] = a[-1].rstrip() if a[0] == header: # determine start and end feat_start = int(a[1]) feat_end = int(a[2]) if options.length_match: feat_mid = int(0.5 * feat_start + 0.5 * feat_end) feat_start = feat_mid - options.length_match / 2 feat_end = feat_mid + options.length_match / 2 # determine strand feat_strand = '+' if len(a) > 5 and a[5] == '-': feat_strand = '-' # determine header if options.add_coords_header: feat_header = '%s:%s-%s:%s' % (header, a[1], a[2], feat_strand) else: feat_header = '' if len(a) > 3 and a[3] != '.': feat_header = a[3] # determine sequence feat_seq = '' # if negative index, start with N's if feat_start < 0: feat_seq += 'N' * (-feat_start) feat_start = 0 # grab the genome sequence feat_seq += seq[feat_start:feat_end] # if it's too short, extend with N's if options.length_match and len(feat_seq) < options.length_match: feat_seq += 'N' * (options.length_match - len(feat_seq)) # reverse complement if feat_strand == '-': feat_seq = dna.rc(seq[feat_start:feat_end]) #print '>%s\n%s' % (feat_header, feat_seq) print '>%s' % feat_header i = 0 while i < len(feat_seq): print feat_seq[i:i + 60] i += 60
def header_bed(header, seq, bed_file, options): for line in open(bed_file): a = line.split('\t') a[-1] = a[-1].rstrip() if a[0] == header: # determine start and end feat_start = int(a[1]) feat_end = int(a[2]) if options.length_match: feat_mid = int(0.5*feat_start + 0.5*feat_end) feat_start = feat_mid - options.length_match/2 feat_end = feat_mid + options.length_match/2 # determine strand feat_strand = '+' if len(a) > 5 and a[5] == '-': feat_strand = '-' # determine header if options.add_coords_header: feat_header = '%s:%s-%s:%s' % (header,a[1],a[2],feat_strand) else: feat_header = '' if len(a) > 3 and a[3] != '.': feat_header = a[3] # determine sequence feat_seq = '' # if negative index, start with N's if feat_start < 0: feat_seq += 'N'*(-feat_start) feat_start = 0 # grab the genome sequence feat_seq += seq[feat_start:feat_end] # if it's too short, extend with N's if options.length_match and len(feat_seq) < options.length_match: feat_seq += 'N'*(options.length_match - len(feat_seq)) # reverse complement if feat_strand == '-': feat_seq = dna.rc(seq[feat_start:feat_end]) #print '>%s\n%s' % (feat_header, feat_seq) print '>%s' % feat_header i = 0 while i < len(feat_seq): print feat_seq[i:i+60] i += 60
def process_chrom(transcripts_gtf, chrom, seq, transcript_seqs, transcript_genes):
    """Accumulate spliced transcript sequences for one chromosome.

    Scans transcripts_gtf for rows on `chrom`, appending each exon's
    sequence to transcript_seqs[transcript_id] (minus-strand exons are
    reverse complemented and prepended), and records each transcript's
    gene id in transcript_genes. Both dicts are mutated in place.
    """
    # find chr transcripts
    for line in open(transcripts_gtf):
        fields = line.split('\t')
        if fields[0] != chrom:
            continue

        attrs = gff.gtf_kv(fields[8])
        tid = attrs['transcript_id']
        gid = attrs['gene_id']

        # GTF coordinates are 1-based, inclusive
        exon_seq = seq[int(fields[3]) - 1:int(fields[4])]

        so_far = transcript_seqs.get(tid, '')
        if fields[6] == '+':
            transcript_seqs[tid] = so_far + exon_seq
        else:
            transcript_seqs[tid] = dna.rc(exon_seq) + so_far

        transcript_genes[tid] = gid
def process_chr(chrom, seq, promoters, out_fa, out_gff, promoter_length, acgt_t):
    """Emit promoter FASTA and GFF entries for one chromosome.

    Each promoter contributes a promoter_length window starting at
    prom.start (reverse complemented off the plus strand); windows whose
    ACGT fraction does not exceed acgt_t are skipped.
    """
    # grab promoters
    for prom in promoters:
        raw = seq[prom.start:prom.start + promoter_length]
        prom_seq = raw if prom.strand == '+' else dna.rc(raw)

        if acgt_pct(prom_seq) <= acgt_t:
            # too many ambiguous bases; skip
            continue

        print >> out_fa, '>%s\n%s' % (prom.gtf_kv['transcript_id'], prom_seq)

        # GFF is 1-based, inclusive
        gff_dat = [chrom, '.', 'promoter',
                   str(prom.start + 1),
                   str(prom.start + promoter_length + 1 - 1),
                   '.', prom.strand, '.',
                   gff.kv_gtf(prom.gtf_kv)]
        print >> out_gff, '\t'.join(gff_dat)
def header_bed_id(header, seq, bed_file, options): header_seqs = {} for line in open(bed_file): a = line.split('\t') a[-1] = a[-1].rstrip() if a[0] == header: feat_start = int(a[1]) feat_end = int(a[2]) head_id = a[3] feat_seq = seq[feat_start:feat_end] if a[5] == '+': header_seqs[head_id] = header_seqs.get(head_id,'') + feat_seq else: header_seqs[head_id] = dna.rc(feat_seq) + header_seqs.get(head_id,'') for head_id in header_seqs: print '>%s\n%s' % (head_id,header_seqs[head_id])
def header_bed_id(header, seq, bed_file, options): header_seqs = {} for line in open(bed_file): a = line.split('\t') a[-1] = a[-1].rstrip() if a[0] == header: feat_start = int(a[1]) feat_end = int(a[2]) head_id = a[3] feat_seq = seq[feat_start:feat_end] if a[5] == '+': header_seqs[head_id] = header_seqs.get(head_id, '') + feat_seq else: header_seqs[head_id] = dna.rc(feat_seq) + header_seqs.get( head_id, '') for head_id in header_seqs: print '>%s\n%s' % (head_id, header_seqs[head_id])
def main():
    """Map a human lncRNA transcript onto other genomes via the hg19 MSA.

    Looks up the transcript's exons in the lncRNA GTF, projects them
    through the UCSC hg19 46-way multiz alignment (pygr/worldbase),
    merges nearby alignment blocks into exons, and writes one GTF and
    one FASTA file per sufficiently-aligned target genome chromosome.
    """
    usage = 'usage: %prog [options] <transcript id>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='align_t',
        type='float',
        default=0.01,
        help='Minimum % of the transcript that must align [Default: %default]')
    parser.add_option(
        '-l',
        dest='lncrna_gtf',
        default='/Users/dk/research/common/data/lncrna/lnc_catalog.gtf',
        help='lncRNA gtf file [Default: %default]')
    parser.add_option(
        '-m',
        dest='merge_t',
        type='int',
        default=40,
        help=
        'Minimum distance between alignment blocks to merge into a single exon [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide transcript id and genome')
    else:
        transcript_id = args[0]

    # get human genome
    hg19 = worldbase.Bio.Seq.Genome.HUMAN.hg19()

    # get gene exon intervals
    gene_ivals = []
    transcript_length = 0
    for line in open(options.lncrna_gtf):
        a = line.split('\t')
        kv = gff.gtf_kv(a[8])
        if kv['transcript_id'] == transcript_id:
            chrom = a[0]
            start = int(a[3])
            end = int(a[4])
            # strand encoded as +1/-1 for comparison with pygr's
            # block.orientation below
            strand = 1 * (a[6] == '+') - 1 * (
                a[6] == '-')  # assuming all orientations are the same
            gene_id = kv['gene_id']
            gene_ivals.append(hg19[chrom][start:end])
            transcript_length += end - start

    # get hg19 msa
    msa = worldbase.Bio.MSA.UCSC.hg19_multiz46way()

    # map returned sequences back to genome name
    idDict = ~(msa.seqDict)

    # hash alignments by genome
    genome_blocks = {}
    for gi in gene_ivals:
        for src, dest, edg in msa[gi].edges():
            genome_blocks.setdefault(idDict[dest], []).append(dest)
            #print repr(gi), repr(src), repr(dest), idDict[dest], edg.length()

    # check for enough alignment
    # (Python 2: .keys() returns a list copy, so deleting while
    # iterating is safe here)
    for gen_chr in genome_blocks.keys():
        aligned_nt = sum([b.stop - b.start for b in genome_blocks[gen_chr]])
        print gen_chr, aligned_nt, float(aligned_nt) / transcript_length
        if aligned_nt < options.align_t * transcript_length:
            del genome_blocks[gen_chr]

    # for each genome
    worldbase_genomes = worldbase.dir('Bio.Seq.Genome')
    for gen_chr in genome_blocks:
        # Python 2 cmp-style sort by block coordinates
        genome_blocks[gen_chr].sort(block_cmp)

        # make gtf lines / merge alignments
        b = genome_blocks[gen_chr][0]
        # boolean multiply chooses '+' or '-' relative to the source strand
        gff_strand = '+' * (b.orientation == strand) + '-' * (b.orientation !=
                                                              strand)
        gff_cols = [[
            b.id, 'PygrTransMap', 'exon', b._abs_interval[0] + 1,
            b._abs_interval[1], '.', gff_strand, '.',
            'gene_id "%s"; transcript_id "%s"; exon_number "1";' % (
                gene_id,
                transcript_id,
            )
        ]]
        exon_num = 2
        for i in range(1, len(genome_blocks[gen_chr])):
            # blocks closer than merge_t are fused into one exon
            if gff_cols[-1][4] + options.merge_t >= genome_blocks[gen_chr][
                    i]._abs_interval[0]:
                # merge with prior
                gff_cols[-1][4] = genome_blocks[gen_chr][i]._abs_interval[1]
            else:
                # add new exon
                b = genome_blocks[gen_chr][i]
                gff_cols.append([
                    b.id, 'PygrTransMap', 'exon', b._abs_interval[0] + 1,
                    b._abs_interval[1], '.', gff_strand, '.',
                    'gene_id "%s"; transcript_id "%s"; exon_number "%d";' %
                    (gene_id, transcript_id, exon_num)
                ])
                exon_num += 1

        # print gtf
        gtf_out = open('%s_%s.gtf' % (transcript_id, gen_chr), 'w')
        for gc in gff_cols:
            print >> gtf_out, '\t'.join([str(c) for c in gc])
        gtf_out.close()

        # get genomic sequence
        # gen_chr looks like 'genome.chrom'; split on the first '.'
        gen = gen_chr[:gen_chr.find('.')]
        chrom = gen_chr[gen_chr.find('.') + 1:]
        wb_gen = [wgen for wgen in worldbase_genomes if wgen.find(gen) != -1]
        if len(wb_gen) > 1:
            print >> sys.stderr, 'Detected >1 worldbase genome matching %s' % gen
            print >> sys.stderr, ' '.join(wb_gen)
        gen_seq = worldbase.__call__(wb_gen[0])

        # get transcript sequence
        seq = ''
        for gc in gff_cols:
            seq += str(gen_seq[gc[0]][gc[3] - 1:gc[4]])
        if gff_cols[0][6] == '-':
            seq = dna.rc(seq)

        # print fasta
        fasta_out = open('%s_%s.fa' % (transcript_id, gen_chr), 'w')
        print >> fasta_out, '>%s_gene=%s_%s\n%s' % (transcript_id, gene_id,
                                                    gen_chr, seq)
        fasta_out.close()
def main():
    """Map a mouse lncRNA transcript onto other genomes via the mm9 MSA.

    Looks up the transcript's exons in the lncRNA GTF, projects them
    through the UCSC mm9 30-way multiz alignment (pygr/worldbase),
    merges nearby alignment blocks into exons, and writes one GTF and
    one FASTA file per sufficiently-aligned target genome chromosome.
    """
    usage = 'usage: %prog [options] <transcript id>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='align_t', type='float', default=0.01, help='Minimum % of the transcript that must align [Default: %default]')
    parser.add_option('-l', dest='lncrna_gtf', default='/Users/dk/research/common/data/lncrna_mm9/lnc_catalog.gtf', help='lncRNA gtf file [Default: %default]')
    parser.add_option('-m', dest='merge_t', type='int', default=40, help='Minimum distance between alignment blocks to merge into a single exon [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide transcript id')
    else:
        transcript_id = args[0]

    # get human genome
    mm9 = worldbase.Bio.Seq.Genome.MOUSE.mm9()

    # get gene exon intervals
    gene_ivals = []
    transcript_length = 0
    for line in open(options.lncrna_gtf):
        a = line.split('\t')
        kv = gff.gtf_kv(a[8])
        if kv['transcript_id'] == transcript_id:
            chrom = a[0]
            start = int(a[3])
            end = int(a[4])
            # strand encoded as +1/-1 for comparison with pygr's
            # block.orientation below
            strand = 1*(a[6]=='+') - 1*(a[6]=='-') # assuming all orientations are the same
            gene_id = kv['gene_id']
            gene_ivals.append(mm9[chrom][start:end])
            transcript_length += end-start

    # get mm9 msa
    msa = worldbase.Bio.MSA.UCSC.mm9_multiz30way()

    # map returned sequences back to genome name
    idDict = ~(msa.seqDict)

    # hash alignments by genome
    genome_blocks = {}
    for gi in gene_ivals:
        for src, dest, edg in msa[gi].edges():
            genome_blocks.setdefault(idDict[dest],[]).append(dest)
            #print repr(gi), repr(src), repr(dest), idDict[dest], edg.length()

    # check for enough alignment
    # (Python 2: .keys() returns a list copy, so deleting while
    # iterating is safe here)
    for gen_chr in genome_blocks.keys():
        aligned_nt = sum([b.stop-b.start for b in genome_blocks[gen_chr]])
        print gen_chr, aligned_nt, float(aligned_nt)/transcript_length
        if aligned_nt < options.align_t*transcript_length:
            del genome_blocks[gen_chr]

    # for each genome
    worldbase_genomes = worldbase.dir('Bio.Seq.Genome')
    for gen_chr in genome_blocks:
        # Python 2 cmp-style sort by block coordinates
        genome_blocks[gen_chr].sort(block_cmp)

        # make gtf lines / merge alignments
        b = genome_blocks[gen_chr][0]
        # boolean multiply chooses '+' or '-' relative to the source strand
        gff_strand = '+'*(b.orientation==strand) + '-'*(b.orientation!=strand)
        gff_cols = [[b.id, 'PygrTransMap', 'exon', b._abs_interval[0]+1, b._abs_interval[1], '.', gff_strand, '.', 'gene_id "%s"; transcript_id "%s"; exon_number "1";' % (gene_id, transcript_id,)]]
        exon_num = 2
        for i in range(1,len(genome_blocks[gen_chr])):
            # blocks closer than merge_t are fused into one exon
            if gff_cols[-1][4] + options.merge_t >= genome_blocks[gen_chr][i]._abs_interval[0]:
                # merge with prior
                gff_cols[-1][4] = genome_blocks[gen_chr][i]._abs_interval[1]
            else:
                # add new exon
                b = genome_blocks[gen_chr][i]
                gff_cols.append([b.id, 'PygrTransMap', 'exon', b._abs_interval[0]+1, b._abs_interval[1], '.', gff_strand, '.', 'gene_id "%s"; transcript_id "%s"; exon_number "%d";' % (gene_id, transcript_id, exon_num)])
                exon_num += 1

        # print gtf
        gtf_out = open('%s_%s.gtf' % (transcript_id, gen_chr), 'w')
        for gc in gff_cols:
            print >> gtf_out, '\t'.join([str(c) for c in gc])
        gtf_out.close()

        # get genomic sequence
        # gen_chr looks like 'genome.chrom'; split on the first '.'
        gen = gen_chr[:gen_chr.find('.')]
        chrom = gen_chr[gen_chr.find('.')+1:]
        wb_gen = [wgen for wgen in worldbase_genomes if wgen.find(gen) != -1]
        if len(wb_gen) > 1:
            print >> sys.stderr, 'Detected >1 worldbase genome matching %s' % gen
            print >> sys.stderr, ' '.join(wb_gen)
        gen_seq = worldbase.__call__(wb_gen[0])

        # get transcript sequence
        seq = ''
        for gc in gff_cols:
            seq += str(gen_seq[gc[0]][gc[3]-1:gc[4]])
        if gff_cols[0][6] == '-':
            seq = dna.rc(seq)

        # print fasta
        fasta_out = open('%s_%s.fa' % (transcript_id, gen_chr), 'w')
        print >> fasta_out, '>%s_gene=%s_%s\n%s' % (transcript_id,gene_id,gen_chr,seq)
        fasta_out.close()