Ejemplo n.º 1
0
def kmer_kernel(seq, k, canonical=True):
    """Count the k-mers of ``seq`` and scale the counts into a profile dict.

    Windows that contain an 'N' are skipped.  When ``canonical`` is true the
    reverse complement of every counted window is counted as well, and the
    table is then collapsed with ``dna.canonical_kmers`` (presumably merging
    each k-mer with its reverse complement -- confirm against the ``dna``
    module).

    Args:
        seq: sequence string.
        k: k-mer length.
        canonical: also count the reverse strand and collapse the table.

    Returns:
        dict mapping k-mer -> count divided by the sum of the squared counts.
        Empty when ``seq`` has no N-free window of length ``k``.
    """
    kmer_counts = {}
    seq_len = len(seq)

    if canonical:
        seq_rc = dna.rc(seq)

    for i in range(seq_len - k + 1):
        kmer = seq[i:i+k]
        if 'N' in kmer:
            continue

        kmer_counts[kmer] = kmer_counts.get(kmer, 0) + 1

        if canonical:
            # The reverse complement of seq[i:i+k] lives at the mirrored
            # offset of seq_rc.  Reading seq_rc[i:i+k] instead would pair the
            # 'N' check above with an unrelated window and could count
            # k-mers that contain 'N'.
            rc_start = seq_len - i - k
            kmer_rc = seq_rc[rc_start:rc_start+k]
            kmer_counts[kmer_rc] = kmer_counts.get(kmer_rc, 0) + 1

    if canonical:
        kmer_counts = dna.canonical_kmers(kmer_counts)

    # normalize
    # NOTE(review): the divisor is the sum of squares, not its square root --
    # confirm that this (rather than a unit-length vector) is intended.
    kmer_sum = float(sum(np.square(list(kmer_counts.values()))))

    vec = {}
    for kmer in kmer_counts:
        vec[kmer] = kmer_counts[kmer] / kmer_sum

    return vec
Ejemplo n.º 2
0
def header_gff(header, seq, gff_file, options):
    """Print FASTA records for the GFF/GTF features of one sequence.

    Every feature whose first column equals ``header`` is cut out of ``seq``
    (columns 4/5 are used as 1-based inclusive coordinates) and appended to
    the record named by the ``options.header_key`` attribute, falling back to
    the raw attribute column.  Features on a strand other than '+' are
    reverse complemented with ``dna.rc`` and prepended, so records come out
    in transcript orientation provided the file lists features in ascending
    coordinate order (assumption -- TODO confirm with the callers).

    Args:
        header: name of the sequence held in ``seq``.
        seq: the sequence string.
        gff_file: path of the GFF/GTF file.
        options: object providing ``exon`` (restrict to 'exon' features),
            ``header_key``, ``gene_too`` (append ' gene=<gene_id>' to the
            record name) and ``split_lines`` (wrap output at 60 columns).
    """
    header_seqs = {}

    # Context manager so the handle is closed even if a line fails to parse.
    with open(gff_file) as gff_in:
        for line in gff_in:
            # Comment and blank lines have no columns to index.
            if line.startswith('#') or not line.strip():
                continue

            a = line.split('\t')
            a[-1] = a[-1].rstrip()

            # Sequence name first: lines for other sequences are dropped
            # before any further column is touched.
            if a[0] != header:
                continue
            if options.exon and a[2] != 'exon':
                continue

            kv = gff.gtf_kv(a[8])
            head_id = kv.get(options.header_key, a[8])
            if options.gene_too:
                head_id += ' gene=%s' % kv.get('gene_id', '')

            feat_start = int(a[3])
            feat_end = int(a[4])

            feat_seq = seq[feat_start-1:feat_end]
            if a[6] == '+':
                header_seqs[head_id] = header_seqs.get(head_id, '') + feat_seq
            else:
                header_seqs[head_id] = dna.rc(feat_seq) + header_seqs.get(head_id, '')

    # Single-argument print(...) emits the same text whether print is a
    # statement (Python 2) or a function (Python 3).
    for head_id in header_seqs:
        head_seq = header_seqs[head_id]
        print('>%s' % head_id)
        if options.split_lines:
            for i in range(0, len(head_seq), 60):
                print(head_seq[i:i+60])
        else:
            print(head_seq)
Ejemplo n.º 3
0
def header_gff(header, seq, gff_file, options):
    """Print FASTA records for the GFF/GTF features of one sequence.

    Every feature whose first column equals ``header`` is cut out of ``seq``
    (columns 4/5 are used as 1-based inclusive coordinates) and appended to
    the record named by the ``options.header_key`` attribute.  When the
    attribute column cannot be parsed, or lacks that key, the record is named
    '<seqname>:<start>-<end>'.  Features on a strand other than '+' are
    reverse complemented with ``dna.rc`` and prepended, so records come out
    in transcript orientation provided the file lists features in ascending
    coordinate order (assumption -- TODO confirm with the callers).

    Args:
        header: name of the sequence held in ``seq``.
        seq: the sequence string.
        gff_file: path of the GFF/GTF file.
        options: object providing ``exon`` (restrict to 'exon' features),
            ``header_key``, ``gene_too`` (append ' gene=<gene_id>' to the
            record name) and ``split_lines`` (wrap output at 60 columns).
    """
    header_seqs = {}

    # Context manager so the handle is closed even if a line fails to parse.
    with open(gff_file) as gff_in:
        for line in gff_in:
            # Comment and blank lines have no columns to index.
            if line.startswith('#') or not line.strip():
                continue

            a = line.split('\t')
            a[-1] = a[-1].rstrip()

            # Sequence name first: lines for other sequences are dropped
            # before any further column is touched.
            if a[0] != header:
                continue
            if options.exon and a[2] != 'exon':
                continue

            # Best effort: a missing or unparsable attribute column falls
            # back to a coordinate-based name.  Catch Exception rather than
            # everything so Ctrl-C and SystemExit still propagate.
            try:
                kv = gff.gtf_kv(a[8])
            except Exception:
                kv = {}

            head_id = kv.get(options.header_key, a[0]+':'+a[3]+'-'+a[4])
            if options.gene_too:
                head_id += ' gene=%s' % kv.get('gene_id', '')

            feat_start = int(a[3])
            feat_end = int(a[4])

            feat_seq = seq[feat_start-1:feat_end]
            if a[6] == '+':
                header_seqs[head_id] = header_seqs.get(head_id, '') + feat_seq
            else:
                header_seqs[head_id] = dna.rc(feat_seq) + header_seqs.get(head_id, '')

    # Single-argument print(...) emits the same text whether print is a
    # statement (Python 2) or a function (Python 3).
    for head_id in header_seqs:
        head_seq = header_seqs[head_id]
        print('>%s' % head_id)
        if options.split_lines:
            for i in range(0, len(head_seq), 60):
                print(head_seq[i:i+60])
        else:
            print(head_seq)
Ejemplo n.º 4
0
def kmer_kernel(seq, k, canonical=True):
    """Build a scaled k-mer count profile for ``seq``.

    Windows that contain an 'N' are ignored.  With ``canonical`` set, each
    counted window also contributes its reverse complement, after which the
    table is collapsed by ``dna.canonical_kmers`` (presumably one entry per
    k-mer / reverse-complement pair -- confirm against the ``dna`` module).

    Args:
        seq: sequence string.
        k: k-mer length.
        canonical: also count the reverse strand and collapse the table.

    Returns:
        dict mapping k-mer -> count divided by the sum of the squared counts.
        Empty when ``seq`` has no N-free window of length ``k``.
    """
    kmer_counts = {}
    seq_len = len(seq)

    if canonical:
        seq_rc = dna.rc(seq)

    for i in range(seq_len - k + 1):
        kmer = seq[i:i + k]
        if 'N' in kmer:
            continue

        kmer_counts[kmer] = kmer_counts.get(kmer, 0) + 1

        if canonical:
            # seq[i:i + k] maps to the mirrored window of seq_rc; using the
            # same offset i on both strands would let k-mers containing 'N'
            # slip past the filter above.
            rc_start = seq_len - i - k
            kmer_rc = seq_rc[rc_start:rc_start + k]
            kmer_counts[kmer_rc] = kmer_counts.get(kmer_rc, 0) + 1

    if canonical:
        kmer_counts = dna.canonical_kmers(kmer_counts)

    # normalize
    # NOTE(review): the divisor is the sum of squares, not its square root --
    # confirm that this (rather than a unit-length vector) is intended.
    kmer_sum = float(sum(np.square(list(kmer_counts.values()))))

    vec = {}
    for kmer in kmer_counts:
        vec[kmer] = kmer_counts[kmer] / kmer_sum

    return vec
Ejemplo n.º 5
0
def header_bed(header, seq, bed_file, options):
    """Print a FASTA record for every BED feature on the sequence ``header``.

    Columns 2/3 are used directly as a Python slice of ``seq``.  The record
    name is '<name>:<header>:<start>-<end>:<strand>', where the '<name>:'
    prefix is dropped when column 4 is absent or '.'.  The strand is '-' only
    when column 6 says so; those features are passed through ``dna.rc``.
    Sequences are wrapped at 60 columns.

    Args:
        header: name of the sequence held in ``seq``.
        seq: the sequence string.
        bed_file: path of the BED file.
        options: not used by this function.
    """
    # Context manager so the handle is closed even if a line fails to parse.
    with open(bed_file) as bed_in:
        for line in bed_in:
            a = line.split('\t')
            a[-1] = a[-1].rstrip()

            if a[0] != header:
                continue

            feat_start = int(a[1])
            feat_end = int(a[2])

            feat_strand = '+'
            if len(a) > 5 and a[5] == '-':
                feat_strand = '-'

            feat_header = '%s:%d-%d:%s' % (header, feat_start, feat_end, feat_strand)
            if len(a) > 3 and a[3] != '.':
                feat_header = a[3] + ':' + feat_header

            feat_seq = seq[feat_start:feat_end]
            if feat_strand == '-':
                feat_seq = dna.rc(feat_seq)

            # Single-argument print(...) emits the same text whether print is
            # a statement (Python 2) or a function (Python 3).
            print('>%s' % feat_header)
            for i in range(0, len(feat_seq), 60):
                print(feat_seq[i:i+60])
Ejemplo n.º 6
0
def make_te_read_fastas(te_gff, bam_file, read_tes, out_dir, stranded, max_reads):
    """Write the reads assigned to each TE family into per-family FASTA files.

    For every 'dfam' attribute value found in ``te_gff`` two files are
    created, '<out_dir>/<family>_fwd.fa' and '<out_dir>/<family>_rev.fa'.
    Each alignment in ``bam_file`` is looked up in ``read_tes``; a read is
    written once per family, on the first alignment whose orientation matches
    the recorded read strand, and its entry is then set to None.

    Args:
        te_gff: path of a GFF/GTF file whose attribute column has a 'dfam' key.
        bam_file: path of the BAM file, read through ``pysam.Samfile``.
        read_tes: dict read name -> {family: (read strand, TE strand)}.
            NOTE: entries are overwritten with None as reads are written.
        out_dir: directory that receives the FASTA files.
        stranded: when false every read goes to the 'fwd' file; when true,
            reads whose strand differs from the TE strand go to 'rev'.
        max_reads: at most this many reads are written to each file.

    Returns:
        dict (family, 'fwd'|'rev') -> max(1.0, total / max_reads), for files
        that were assigned more than 10 reads.
    """
    # open TE read fasta files
    te_fastas = {}
    with open(te_gff) as te_gff_in:
        for line in te_gff_in:
            a = line.split('\t')
            dfam_te = gff.gtf_kv(a[8])['dfam']
            if (dfam_te, 'fwd') not in te_fastas:
                te_fastas[(dfam_te, 'fwd')] = open('%s/%s_fwd.fa' % (out_dir, dfam_te), 'w')
                te_fastas[(dfam_te, 'rev')] = open('%s/%s_rev.fa' % (out_dir, dfam_te), 'w')

    # initialize counters for total reads
    te_totals = {}
    for te_key in te_fastas:
        te_totals[te_key] = 0

    try:
        # print reads to fasta files
        for aligned_read in pysam.Samfile(bam_file, 'rb'):
            this_read_tes = read_tes.get(aligned_read.qname, {})

            for dfam_te in this_read_tes.keys():
                strands = this_read_tes[dfam_te]
                if strands is None:
                    # already written for this family
                    continue
                (rstrand, tstrand) = strands

                # only print if we match the read strand
                if aligned_read.is_reverse:
                    read_strand = '-'
                else:
                    read_strand = '+'
                if rstrand != read_strand:
                    continue

                # TE determines reversal
                if tstrand == '+':
                    rseq = aligned_read.seq
                else:
                    rseq = dna.rc(aligned_read.seq)

                if not stranded or rstrand == tstrand:
                    te_key = (dfam_te, 'fwd')
                else:
                    te_key = (dfam_te, 'rev')

                # Count every read but write only the first max_reads of
                # them.  '<=' (not '<'): the counter is incremented before
                # the check, and the factor returned below assumes exactly
                # max_reads reads were written once the cap is hit.
                te_totals[te_key] += 1
                if te_totals[te_key] <= max_reads:
                    te_fastas[te_key].write('>%s\n%s\n' % (aligned_read.qname, rseq))

                # specify printed
                this_read_tes[dfam_te] = None
    finally:
        # Close the outputs even when reading the BAM fails part-way.
        for fasta_out in te_fastas.values():
            fasta_out.close()

    # return renormalization factors
    # NOTE(review): files with 10 or fewer reads get no factor; the meaning
    # of that cut-off is not visible here -- confirm with the caller.
    te_renorm = {}
    for te_key in te_fastas:
        if te_totals[te_key] > 10:
            te_renorm[te_key] = max(1.0, te_totals[te_key] / float(max_reads))

    return te_renorm
Ejemplo n.º 7
0
def process_chr(chrom, seq, promoters, out_fa, out_gff, promoter_length, acgt_t):
    """Write a FASTA record and a GFF line for each promoter passing the filter.

    A window of ``promoter_length`` starting at ``prom.start`` is sliced from
    ``seq`` and passed through ``dna.rc`` unless the strand is '+'.  Windows
    whose ``acgt_pct`` score does not exceed ``acgt_t`` are dropped.

    Args:
        chrom: sequence name written to GFF column 1.
        seq: the chromosome sequence string.
        promoters: iterable of objects with ``start``, ``strand`` and
            ``gtf_kv`` (which must contain 'transcript_id').
        out_fa: open handle receiving the FASTA records.
        out_gff: open handle receiving the GFF lines.
        promoter_length: window length.
        acgt_t: threshold compared against ``acgt_pct`` of the window.
    """
    for prom in promoters:
        win_start = prom.start
        win_end = win_start + promoter_length

        prom_seq = seq[win_start:win_end]
        if prom.strand != '+':
            prom_seq = dna.rc(prom_seq)

        # skip windows whose acgt_pct score does not exceed the threshold
        if not (acgt_pct(prom_seq) > acgt_t):
            continue

        fasta_entry = '>%s\n%s' % (prom.gtf_kv['transcript_id'], prom_seq)
        print >> out_fa, fasta_entry

        # the 0-based half-open window becomes 1-based inclusive GFF columns
        gff_dat = [chrom, '.', 'promoter']
        gff_dat += [str(win_start + 1), str(win_end)]
        gff_dat += ['.', prom.strand, '.', gff.kv_gtf(prom.gtf_kv)]
        print >> out_gff, '\t'.join(gff_dat)
Ejemplo n.º 8
0
def header_bed(header, seq, bed_file, options):
    """Print a FASTA record for every BED feature on the sequence ``header``.

    With ``options.length_match`` set, each feature is replaced by a window
    of exactly that length centred on the feature midpoint; the parts of the
    window that fall outside ``seq`` are filled with 'N'.  Features whose
    column 6 is '-' are passed through ``dna.rc``.  Sequences are wrapped at
    60 columns.

    Args:
        header: name of the sequence held in ``seq``.
        seq: the sequence string.
        bed_file: path of the BED file.
        options: object providing ``length_match`` (window length, falsy to
            keep the BED coordinates) and ``add_coords_header`` (name records
            '<header>:<start>-<end>:<strand>' from the original BED columns
            instead of using column 4).
    """
    # Context manager so the handle is closed even if a line fails to parse.
    with open(bed_file) as bed_in:
        for line in bed_in:
            a = line.split('\t')
            a[-1] = a[-1].rstrip()

            if a[0] != header:
                continue

            # determine start and end
            feat_start = int(a[1])
            feat_end = int(a[2])
            if options.length_match:
                feat_mid = int(0.5 * feat_start + 0.5 * feat_end)
                feat_start = feat_mid - options.length_match // 2
                # Derive the end from the start so that an odd length_match
                # still spans length_match bases; mid +/- half would come up
                # one short and be padded with a spurious 'N'.
                feat_end = feat_start + options.length_match

            # determine strand
            feat_strand = '+'
            if len(a) > 5 and a[5] == '-':
                feat_strand = '-'

            # determine header
            if options.add_coords_header:
                feat_header = '%s:%s-%s:%s' % (header, a[1], a[2], feat_strand)
            elif len(a) > 3 and a[3] != '.':
                feat_header = a[3]
            else:
                feat_header = ''

            # determine sequence; a negative start is filled with N's
            feat_seq = ''
            if feat_start < 0:
                feat_seq = 'N' * (-feat_start)
                feat_start = 0

            # grab the genome sequence
            feat_seq += seq[feat_start:feat_end]

            # if it's too short, extend with N's
            if options.length_match and len(feat_seq) < options.length_match:
                feat_seq += 'N' * (options.length_match - len(feat_seq))

            # Reverse complement the padded sequence itself; re-slicing seq
            # here would throw the N padding away for minus-strand features.
            if feat_strand == '-':
                feat_seq = dna.rc(feat_seq)

            # Single-argument print(...) emits the same text whether print is
            # a statement (Python 2) or a function (Python 3).
            print('>%s' % feat_header)
            for i in range(0, len(feat_seq), 60):
                print(feat_seq[i:i + 60])
Ejemplo n.º 9
0
def header_bed(header, seq, bed_file, options):
    """Print a FASTA record for every BED feature on the sequence ``header``.

    With ``options.length_match`` set, each feature is replaced by a window
    of exactly that length centred on the feature midpoint; the parts of the
    window that fall outside ``seq`` are filled with 'N'.  Features whose
    column 6 is '-' are passed through ``dna.rc``.  Sequences are wrapped at
    60 columns.

    Args:
        header: name of the sequence held in ``seq``.
        seq: the sequence string.
        bed_file: path of the BED file.
        options: object providing ``length_match`` (window length, falsy to
            keep the BED coordinates) and ``add_coords_header`` (name records
            '<header>:<start>-<end>:<strand>' from the original BED columns
            instead of using column 4).
    """
    # Context manager so the handle is closed even if a line fails to parse.
    with open(bed_file) as bed_in:
        for line in bed_in:
            a = line.split('\t')
            a[-1] = a[-1].rstrip()

            if a[0] != header:
                continue

            # determine start and end
            feat_start = int(a[1])
            feat_end = int(a[2])
            if options.length_match:
                feat_mid = int(0.5*feat_start + 0.5*feat_end)
                feat_start = feat_mid - options.length_match//2
                # Derive the end from the start so that an odd length_match
                # still spans length_match bases; mid +/- half would come up
                # one short and be padded with a spurious 'N'.
                feat_end = feat_start + options.length_match

            # determine strand
            feat_strand = '+'
            if len(a) > 5 and a[5] == '-':
                feat_strand = '-'

            # determine header
            if options.add_coords_header:
                feat_header = '%s:%s-%s:%s' % (header, a[1], a[2], feat_strand)
            elif len(a) > 3 and a[3] != '.':
                feat_header = a[3]
            else:
                feat_header = ''

            # determine sequence; a negative start is filled with N's
            feat_seq = ''
            if feat_start < 0:
                feat_seq = 'N'*(-feat_start)
                feat_start = 0

            # grab the genome sequence
            feat_seq += seq[feat_start:feat_end]

            # if it's too short, extend with N's
            if options.length_match and len(feat_seq) < options.length_match:
                feat_seq += 'N'*(options.length_match - len(feat_seq))

            # Reverse complement the padded sequence itself; re-slicing seq
            # here would throw the N padding away for minus-strand features.
            if feat_strand == '-':
                feat_seq = dna.rc(feat_seq)

            # Single-argument print(...) emits the same text whether print is
            # a statement (Python 2) or a function (Python 3).
            print('>%s' % feat_header)
            for i in range(0, len(feat_seq), 60):
                print(feat_seq[i:i+60])
Ejemplo n.º 10
0
def process_chrom(transcripts_gtf, chrom, seq, transcript_seqs, transcript_genes):
    """Accumulate transcript sequences for the GTF features on one chromosome.

    Every line of ``transcripts_gtf`` whose first column equals ``chrom`` is
    cut out of ``seq`` (columns 4/5 are used as 1-based inclusive
    coordinates).  '+' features are appended to the transcript's sequence;
    all others are passed through ``dna.rc`` and prepended, which yields the
    transcript-oriented sequence provided the file lists features in
    ascending coordinate order (assumption -- TODO confirm).

    NOTE(review): the feature type column is not checked, so this presumably
    expects a GTF that contains exon lines only -- confirm with the caller.

    Args:
        transcripts_gtf: path of the GTF file.
        chrom: name of the sequence held in ``seq``.
        seq: the chromosome sequence string.
        transcript_seqs: dict transcript_id -> sequence, updated in place.
        transcript_genes: dict transcript_id -> gene_id, updated in place.
    """
    # Context manager so the handle is closed even if a line fails to parse.
    with open(transcripts_gtf) as gtf_in:
        # find chr transcripts
        for line in gtf_in:
            a = line.split('\t')
            if a[0] != chrom:
                continue

            kv = gff.gtf_kv(a[8])
            tid = kv['transcript_id']
            gid = kv['gene_id']

            exon_start = int(a[3])
            exon_end = int(a[4])

            exon_seq = seq[exon_start-1:exon_end]
            if a[6] == '+':
                transcript_seqs[tid] = transcript_seqs.get(tid, '') + exon_seq
            else:
                transcript_seqs[tid] = dna.rc(exon_seq) + transcript_seqs.get(tid, '')

            transcript_genes[tid] = gid
Ejemplo n.º 11
0
def process_chr(chrom, seq, promoters, out_fa, out_gff, promoter_length,
                acgt_t):
    """Emit FASTA and GFF output for the promoters that pass the ACGT filter.

    Each promoter is the ``promoter_length`` window of ``seq`` beginning at
    ``prom.start``; it is passed through ``dna.rc`` unless the strand is '+'.
    Only windows whose ``acgt_pct`` score is greater than ``acgt_t`` are
    written.

    Args:
        chrom: sequence name written to GFF column 1.
        seq: the chromosome sequence string.
        promoters: iterable of objects with ``start``, ``strand`` and
            ``gtf_kv`` (which must contain 'transcript_id').
        out_fa: open handle receiving the FASTA records.
        out_gff: open handle receiving the GFF lines.
        promoter_length: window length.
        acgt_t: threshold compared against ``acgt_pct`` of the window.
    """
    for prom in promoters:
        left = prom.start
        right = left + promoter_length

        window = seq[left:right]
        prom_seq = window if prom.strand == '+' else dna.rc(window)

        # windows at or below the threshold produce no output at all
        if not (acgt_pct(prom_seq) > acgt_t):
            continue

        record = '>%s\n%s' % (prom.gtf_kv['transcript_id'], prom_seq)
        print >> out_fa, record

        # GFF columns 4/5: the same window, 1-based and end-inclusive
        gff_dat = [
            chrom, '.', 'promoter',
            str(left + 1), str(right),
            '.', prom.strand, '.',
            gff.kv_gtf(prom.gtf_kv),
        ]
        print >> out_gff, '\t'.join(gff_dat)
Ejemplo n.º 12
0
def header_bed_id(header, seq, bed_file, options):
    """Print one FASTA record per BED name, joining that name's features.

    Features on the sequence ``header`` are sliced out of ``seq`` using
    columns 2/3 as a Python slice and grouped by column 4.  '-' features are
    passed through ``dna.rc`` and prepended; everything else is appended, so
    records come out in feature orientation provided the file lists blocks
    in ascending coordinate order (assumption -- TODO confirm).

    Args:
        header: name of the sequence held in ``seq``.
        seq: the sequence string.
        bed_file: path of the BED file; column 4 (name) is required.
        options: not used by this function.
    """
    header_seqs = {}

    # Context manager so the handle is closed even if a line fails to parse.
    with open(bed_file) as bed_in:
        for line in bed_in:
            a = line.split('\t')
            a[-1] = a[-1].rstrip()

            if a[0] != header:
                continue

            feat_start = int(a[1])
            feat_end = int(a[2])
            head_id = a[3]

            feat_seq = seq[feat_start:feat_end]

            # Only an explicit '-' means minus strand, as in header_bed: a
            # missing strand column used to raise IndexError and '.' used to
            # be reverse complemented.
            if len(a) > 5 and a[5] == '-':
                header_seqs[head_id] = dna.rc(feat_seq) + header_seqs.get(head_id, '')
            else:
                header_seqs[head_id] = header_seqs.get(head_id, '') + feat_seq

    # Single-argument print(...) emits the same text whether print is a
    # statement (Python 2) or a function (Python 3).
    for head_id in header_seqs:
        print('>%s\n%s' % (head_id, header_seqs[head_id]))
Ejemplo n.º 13
0
def header_bed_id(header, seq, bed_file, options):
    """Print one FASTA record per BED name, joining that name's features.

    Features on the sequence ``header`` are sliced out of ``seq`` using
    columns 2/3 as a Python slice and grouped by column 4.  '-' features are
    passed through ``dna.rc`` and prepended; everything else is appended, so
    records come out in feature orientation provided the file lists blocks
    in ascending coordinate order (assumption -- TODO confirm).

    Args:
        header: name of the sequence held in ``seq``.
        seq: the sequence string.
        bed_file: path of the BED file; column 4 (name) is required.
        options: not used by this function.
    """
    header_seqs = {}

    # Context manager so the handle is closed even if a line fails to parse.
    with open(bed_file) as bed_in:
        for line in bed_in:
            a = line.split('\t')
            a[-1] = a[-1].rstrip()

            if a[0] != header:
                continue

            feat_start = int(a[1])
            feat_end = int(a[2])
            head_id = a[3]

            feat_seq = seq[feat_start:feat_end]
            so_far = header_seqs.get(head_id, '')

            # Only an explicit '-' means minus strand, as in header_bed: a
            # missing strand column used to raise IndexError and '.' used to
            # be reverse complemented.
            if len(a) > 5 and a[5] == '-':
                header_seqs[head_id] = dna.rc(feat_seq) + so_far
            else:
                header_seqs[head_id] = so_far + feat_seq

    # Single-argument print(...) emits the same text whether print is a
    # statement (Python 2) or a function (Python 3).
    for head_id in header_seqs:
        print('>%s\n%s' % (head_id, header_seqs[head_id]))
Ejemplo n.º 14
0
def main():
    """Map one lncRNA transcript onto the other genomes of the hg19 46-way MSA.

    The transcript's exons are read from the lncRNA GTF and looked up in the
    ``worldbase`` hg19 multiz46way alignment.  For every aligned
    genome/chromosome that covers at least ``-a`` of the transcript, nearby
    alignment blocks are merged into exons and two files are written to the
    working directory: '<transcript>_<genome.chrom>.gtf' and
    '<transcript>_<genome.chrom>.fa'.  A coverage line per genome/chromosome
    is printed to stdout.
    """
    usage = 'usage: %prog [options] <transcript id>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='align_t',
        type='float',
        default=0.01,
        help='Minimum % of the transcript that must align [Default: %default]')
    parser.add_option(
        '-l',
        dest='lncrna_gtf',
        default='/Users/dk/research/common/data/lncrna/lnc_catalog.gtf',
        help='lncRNA gtf file [Default: %default]')
    parser.add_option(
        '-m',
        dest='merge_t',
        type='int',
        default=40,
        help=
        'Minimum distance between alignment blocks to merge into a single exon [Default: %default]'
    )
    (options, args) = parser.parse_args()

    # parser.error() exits; the usage line takes the transcript id only.
    if len(args) != 1:
        parser.error('Must provide transcript id')
    transcript_id = args[0]

    # get human genome
    hg19 = worldbase.Bio.Seq.Genome.HUMAN.hg19()

    # get gene exon intervals
    gene_ivals = []
    transcript_length = 0
    with open(options.lncrna_gtf) as gtf_in:
        for line in gtf_in:
            a = line.split('\t')
            kv = gff.gtf_kv(a[8])
            if kv['transcript_id'] != transcript_id:
                continue

            chrom = a[0]
            start = int(a[3])
            end = int(a[4])
            # assuming all orientations are the same
            strand = 1 * (a[6] == '+') - 1 * (a[6] == '-')
            gene_id = kv['gene_id']

            # GTF coordinates are 1-based and end-inclusive; the slice is
            # 0-based and half-open, so the start shifts down by one
            # (otherwise the first base of every exon is dropped).
            gene_ivals.append(hg19[chrom][start - 1:end])
            transcript_length += end - start + 1

    # get hg19 msa
    msa = worldbase.Bio.MSA.UCSC.hg19_multiz46way()

    # map returned sequences back to genome name
    idDict = ~(msa.seqDict)

    # hash alignments by genome
    genome_blocks = {}
    for gi in gene_ivals:
        for src, dest, edg in msa[gi].edges():
            genome_blocks.setdefault(idDict[dest], []).append(dest)

    # check for enough alignment; iterate over a copy of the keys because
    # entries are deleted inside the loop
    for gen_chr in list(genome_blocks.keys()):
        aligned_nt = sum([b.stop - b.start for b in genome_blocks[gen_chr]])
        print('%s %s %s' % (gen_chr, aligned_nt,
                            float(aligned_nt) / transcript_length))
        if aligned_nt < options.align_t * transcript_length:
            del genome_blocks[gen_chr]

    # for each genome
    worldbase_genomes = worldbase.dir('Bio.Seq.Genome')
    for gen_chr in genome_blocks:
        blocks = genome_blocks[gen_chr]
        blocks.sort(block_cmp)

        # make gtf lines / merge alignments; the strand of the first block
        # is used for the whole transcript
        b = blocks[0]
        if b.orientation == strand:
            gff_strand = '+'
        else:
            gff_strand = '-'
        gff_cols = [[
            b.id, 'PygrTransMap', 'exon', b._abs_interval[0] + 1,
            b._abs_interval[1], '.', gff_strand, '.',
            'gene_id "%s"; transcript_id "%s"; exon_number "1";' %
            (gene_id, transcript_id)
        ]]
        exon_num = 2
        for b in blocks[1:]:
            if gff_cols[-1][4] + options.merge_t >= b._abs_interval[0]:
                # merge with prior; max() keeps the exon from shrinking when
                # this block ends before the exon built so far
                gff_cols[-1][4] = max(gff_cols[-1][4], b._abs_interval[1])
            else:
                # add new exon
                gff_cols.append([
                    b.id, 'PygrTransMap', 'exon', b._abs_interval[0] + 1,
                    b._abs_interval[1], '.', gff_strand, '.',
                    'gene_id "%s"; transcript_id "%s"; exon_number "%d";' %
                    (gene_id, transcript_id, exon_num)
                ])
                exon_num += 1

        # print gtf
        with open('%s_%s.gtf' % (transcript_id, gen_chr), 'w') as gtf_out:
            for gc in gff_cols:
                gtf_out.write('\t'.join([str(c) for c in gc]) + '\n')

        # get genomic sequence; gen_chr looks like '<genome>.<chrom>'
        gen = gen_chr[:gen_chr.find('.')]
        wb_gen = [wgen for wgen in worldbase_genomes if wgen.find(gen) != -1]
        if len(wb_gen) > 1:
            sys.stderr.write('Detected >1 worldbase genome matching %s\n' % gen)
            sys.stderr.write(' '.join(wb_gen) + '\n')
        gen_seq = worldbase.__call__(wb_gen[0])

        # get transcript sequence: exons joined in file order, then reverse
        # complemented as a whole for minus-strand transcripts
        seq = ''.join(
            [str(gen_seq[gc[0]][gc[3] - 1:gc[4]]) for gc in gff_cols])
        if gff_cols[0][6] == '-':
            seq = dna.rc(seq)

        # print fasta
        with open('%s_%s.fa' % (transcript_id, gen_chr), 'w') as fasta_out:
            fasta_out.write('>%s_gene=%s_%s\n%s\n' % (transcript_id, gene_id,
                                                      gen_chr, seq))
Ejemplo n.º 15
0
def main():
    """Map one lncRNA transcript onto the other genomes of the mm9 30-way MSA.

    The transcript's exons are read from the lncRNA GTF and looked up in the
    ``worldbase`` mm9 multiz30way alignment.  For every aligned
    genome/chromosome that covers at least ``-a`` of the transcript, nearby
    alignment blocks are merged into exons and two files are written to the
    working directory: '<transcript>_<genome.chrom>.gtf' and
    '<transcript>_<genome.chrom>.fa'.  A coverage line per genome/chromosome
    is printed to stdout.
    """
    usage = 'usage: %prog [options] <transcript id>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='align_t', type='float', default=0.01, help='Minimum % of the transcript that must align [Default: %default]')
    parser.add_option('-l', dest='lncrna_gtf', default='/Users/dk/research/common/data/lncrna_mm9/lnc_catalog.gtf', help='lncRNA gtf file [Default: %default]')
    parser.add_option('-m', dest='merge_t', type='int', default=40, help='Minimum distance between alignment blocks to merge into a single exon [Default: %default]')
    (options, args) = parser.parse_args()

    # parser.error() exits, so no else branch is needed
    if len(args) != 1:
        parser.error('Must provide transcript id')
    transcript_id = args[0]

    # get mouse genome
    mm9 = worldbase.Bio.Seq.Genome.MOUSE.mm9()

    # get gene exon intervals
    gene_ivals = []
    transcript_length = 0
    with open(options.lncrna_gtf) as gtf_in:
        for line in gtf_in:
            a = line.split('\t')
            kv = gff.gtf_kv(a[8])
            if kv['transcript_id'] != transcript_id:
                continue

            chrom = a[0]
            start = int(a[3])
            end = int(a[4])
            # assuming all orientations are the same
            strand = 1*(a[6] == '+') - 1*(a[6] == '-')
            gene_id = kv['gene_id']

            # GTF coordinates are 1-based and end-inclusive; the slice is
            # 0-based and half-open, so the start shifts down by one
            # (otherwise the first base of every exon is dropped).
            gene_ivals.append(mm9[chrom][start-1:end])
            transcript_length += end - start + 1

    # get mm9 msa
    msa = worldbase.Bio.MSA.UCSC.mm9_multiz30way()

    # map returned sequences back to genome name
    idDict = ~(msa.seqDict)

    # hash alignments by genome
    genome_blocks = {}
    for gi in gene_ivals:
        for src, dest, edg in msa[gi].edges():
            genome_blocks.setdefault(idDict[dest], []).append(dest)

    # check for enough alignment; iterate over a copy of the keys because
    # entries are deleted inside the loop
    for gen_chr in list(genome_blocks.keys()):
        aligned_nt = sum([b.stop-b.start for b in genome_blocks[gen_chr]])
        print('%s %s %s' % (gen_chr, aligned_nt, float(aligned_nt)/transcript_length))
        if aligned_nt < options.align_t*transcript_length:
            del genome_blocks[gen_chr]

    # for each genome
    worldbase_genomes = worldbase.dir('Bio.Seq.Genome')
    for gen_chr in genome_blocks:
        blocks = genome_blocks[gen_chr]
        blocks.sort(block_cmp)

        # make gtf lines / merge alignments; the strand of the first block
        # is used for the whole transcript
        b = blocks[0]
        if b.orientation == strand:
            gff_strand = '+'
        else:
            gff_strand = '-'
        gff_cols = [[b.id, 'PygrTransMap', 'exon', b._abs_interval[0]+1, b._abs_interval[1], '.', gff_strand, '.', 'gene_id "%s"; transcript_id "%s"; exon_number "1";' % (gene_id, transcript_id)]]
        exon_num = 2
        for b in blocks[1:]:
            if gff_cols[-1][4] + options.merge_t >= b._abs_interval[0]:
                # merge with prior; max() keeps the exon from shrinking when
                # this block ends before the exon built so far
                gff_cols[-1][4] = max(gff_cols[-1][4], b._abs_interval[1])
            else:
                # add new exon
                gff_cols.append([b.id, 'PygrTransMap', 'exon', b._abs_interval[0]+1, b._abs_interval[1], '.', gff_strand, '.', 'gene_id "%s"; transcript_id "%s"; exon_number "%d";' % (gene_id, transcript_id, exon_num)])
                exon_num += 1

        # print gtf
        with open('%s_%s.gtf' % (transcript_id, gen_chr), 'w') as gtf_out:
            for gc in gff_cols:
                gtf_out.write('\t'.join([str(c) for c in gc]) + '\n')

        # get genomic sequence; gen_chr looks like '<genome>.<chrom>'
        gen = gen_chr[:gen_chr.find('.')]
        wb_gen = [wgen for wgen in worldbase_genomes if wgen.find(gen) != -1]
        if len(wb_gen) > 1:
            sys.stderr.write('Detected >1 worldbase genome matching %s\n' % gen)
            sys.stderr.write(' '.join(wb_gen) + '\n')
        gen_seq = worldbase.__call__(wb_gen[0])

        # get transcript sequence: exons joined in file order, then reverse
        # complemented as a whole for minus-strand transcripts
        seq = ''.join([str(gen_seq[gc[0]][gc[3]-1:gc[4]]) for gc in gff_cols])
        if gff_cols[0][6] == '-':
            seq = dna.rc(seq)

        # print fasta
        with open('%s_%s.fa' % (transcript_id, gen_chr), 'w') as fasta_out:
            fasta_out.write('>%s_gene=%s_%s\n%s\n' % (transcript_id, gene_id, gen_chr, seq))