Esempio n. 1
0
def check_dir_download(self):
    """Check the download-mode listing of worldbase.

    The listing requested with ``download=True`` must be empty, and it must
    be the same whether no prefix or an empty-string prefix is passed.
    """
    without_prefix = sorted(worldbase.dir(download=True))
    with_empty_prefix = sorted(worldbase.dir('', download=True))
    assert len(without_prefix) == 0
    assert without_prefix == with_empty_prefix
Esempio n. 2
0
def check_dir_download(self):
    """Verify that ``worldbase.dir(download=True)`` lists nothing.

    Also verifies that omitting the prefix and passing ``''`` as the prefix
    give the same (sorted) result.
    """
    listing_default = sorted(worldbase.dir(download=True))
    listing_blank = sorted(worldbase.dir('', download=True))
    assert len(listing_default) == 0
    assert listing_default == listing_blank
Esempio n. 3
0
def check_dir_re(self):
    """Check ``worldbase.dir`` called with a pattern and the ``'r'`` flag.

    Two patterns are exercised: ``^Bio`` must list all five expected
    resources, and ``^Bio\\..+\\.sp`` must list only the two whose last name
    component starts with ``sp``. Results are compared after sorting, so the
    order returned by ``worldbase.dir`` does not matter.
    """
    # NOTE(review): 'r' presumably selects regular-expression matching --
    # confirm against the worldbase.dir signature.
    expected = sorted([
        'Bio.Annotation.annoDB', 'Bio.Annotation.map',
        'Bio.Seq.Swissprot.sp42', 'Bio.Seq.frag', 'Bio.Seq.spmap',
    ])
    found = sorted(worldbase.dir('^Bio', 'r'))
    assert found == expected

    expected = sorted(['Bio.Seq.Swissprot.sp42', 'Bio.Seq.spmap'])
    # Raw string: '\.' is not a valid escape in a normal string literal, so
    # the pattern is spelled as a raw literal (the runtime value is the same).
    found = sorted(worldbase.dir(r'^Bio\..+\.sp', 'r'))
    assert found == expected
Esempio n. 4
0
def check_dir_re(self):
    """Exercise ``worldbase.dir`` with the ``'r'`` flag on two patterns.

    ``^Bio`` is expected to return every one of the five listed resources;
    ``^Bio\\..+\\.sp`` is expected to return just the ``sp42`` and ``spmap``
    entries. Both sides are sorted before comparison.
    """
    # NOTE(review): 'r' presumably means "treat the pattern as a regular
    # expression" -- confirm against the worldbase.dir signature.
    all_bio = [
        'Bio.Annotation.annoDB', 'Bio.Annotation.map',
        'Bio.Seq.Swissprot.sp42', 'Bio.Seq.frag', 'Bio.Seq.spmap'
    ]
    assert sorted(worldbase.dir('^Bio', 'r')) == sorted(all_bio)

    sp_only = ['Bio.Seq.Swissprot.sp42', 'Bio.Seq.spmap']
    # The pattern is written as a raw string because '\.' is an invalid
    # escape sequence in an ordinary literal; the runtime value is unchanged.
    assert sorted(worldbase.dir(r'^Bio\..+\.sp', 'r')) == sorted(sp_only)
Esempio n. 5
0
def check_dir_noargs(self):
    """Check that ``worldbase.dir()`` and ``worldbase.dir('')`` agree.

    Both listings are sorted before comparison so ordering is irrelevant.
    """
    no_args = sorted(worldbase.dir())
    empty_prefix = sorted(worldbase.dir(''))
    assert no_args == empty_prefix
Esempio n. 6
0
def check_dir_noargs(self):
    """Verify that omitting the prefix is the same as passing ``''``.

    The two ``worldbase.dir`` listings are compared in sorted order.
    """
    listing_default = sorted(worldbase.dir())
    listing_blank = sorted(worldbase.dir(''))
    assert listing_default == listing_blank
Esempio n. 7
0
def main():
    """Map one lncRNA transcript from hg19 onto the genomes aligned to it.

    Command line: ``[options] <transcript id>``.

    Steps:
      1. Collect the intervals of the requested transcript from the lncRNA
         GTF file (``-l``) as slices of the worldbase hg19 genome.
      2. Look those intervals up in the UCSC hg19 46-way multiz alignment and
         group the aligned target intervals by the name the alignment's
         sequence dictionary assigns to them.
      3. Print ``<name> <aligned nt> <aligned fraction>`` to stdout for every
         target and discard targets whose aligned length is below
         ``-a`` * transcript length (``-a`` is used as a fraction).
      4. For each remaining target, merge neighbouring blocks into exons and
         write ``<transcript id>_<name>.gtf`` and ``<transcript id>_<name>.fa``
         in the current working directory.

    A wrong number of positional arguments is reported through
    ``parser.error``. A transcript id that is absent from the GTF file, or a
    target with no matching worldbase genome, is reported on stderr.
    """
    usage = 'usage: %prog [options] <transcript id>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='align_t',
        type='float',
        default=0.01,
        help='Minimum % of the transcript that must align [Default: %default]')
    parser.add_option(
        '-l',
        dest='lncrna_gtf',
        default='/Users/dk/research/common/data/lncrna/lnc_catalog.gtf',
        help='lncRNA gtf file [Default: %default]')
    # NOTE: in the merge loop below this value acts as the largest gap
    # between two blocks that are still merged into one exon.
    parser.add_option(
        '-m',
        dest='merge_t',
        type='int',
        default=40,
        help=
        'Minimum distance between alignment blocks to merge into a single exon [Default: %default]'
    )
    (options, args) = parser.parse_args()

    # Exactly one positional argument is accepted (parser.error exits).
    if len(args) != 1:
        parser.error('Must provide transcript id')
    transcript_id = args[0]

    # get human genome
    hg19 = worldbase.Bio.Seq.Genome.HUMAN.hg19()

    # get gene exon intervals
    gene_ivals = []
    transcript_length = 0
    with open(options.lncrna_gtf) as gtf_in:
        for line in gtf_in:
            a = line.split('\t')
            kv = gff.gtf_kv(a[8])
            if kv['transcript_id'] != transcript_id:
                continue
            chrom = a[0]
            # GTF coordinates are 1-based and inclusive at both ends; shift
            # the start so that [start:end] is the equivalent 0-based,
            # half-open slice and end - start is the true feature length.
            start = int(a[3]) - 1
            end = int(a[4])
            # +1 / -1 / 0 for '+' / '-' / anything else;
            # assuming all orientations are the same
            strand = {'+': 1, '-': -1}.get(a[6], 0)
            gene_id = kv['gene_id']

            gene_ivals.append(hg19[chrom][start:end])
            transcript_length += end - start

    if not gene_ivals:
        # Nothing to map; say so instead of finishing silently.
        sys.stderr.write('Transcript id %s not found in %s\n' %
                         (transcript_id, options.lncrna_gtf))
        return

    # get hg19 msa
    msa = worldbase.Bio.MSA.UCSC.hg19_multiz46way()

    # map returned sequences back to genome name
    idDict = ~(msa.seqDict)

    # hash alignments by genome
    genome_blocks = {}
    for gi in gene_ivals:
        for src, dest, edg in msa[gi].edges():
            genome_blocks.setdefault(idDict[dest], []).append(dest)

    # check for enough alignment
    # (iterate over a snapshot of the keys because entries are deleted)
    for gen_chr in list(genome_blocks):
        aligned_nt = sum(b.stop - b.start for b in genome_blocks[gen_chr])
        print('%s %s %s' % (gen_chr, aligned_nt,
                            float(aligned_nt) / transcript_length))
        if aligned_nt < options.align_t * transcript_length:
            del genome_blocks[gen_chr]

    # for each genome
    worldbase_genomes = worldbase.dir('Bio.Seq.Genome')
    for gen_chr in genome_blocks:
        blocks = genome_blocks[gen_chr]
        blocks.sort(block_cmp)

        # make gtf lines / merge alignments
        b = blocks[0]
        # the strand of the first block is used for every exon of this target
        gff_strand = '+' if b.orientation == strand else '-'
        gff_cols = [[
            b.id, 'PygrTransMap', 'exon', b._abs_interval[0] + 1,
            b._abs_interval[1], '.', gff_strand, '.',
            'gene_id "%s"; transcript_id "%s"; exon_number "1";' %
            (gene_id, transcript_id)
        ]]
        exon_num = 2
        for b in blocks[1:]:
            if gff_cols[-1][4] + options.merge_t >= b._abs_interval[0]:
                # merge with prior: extend the previous exon's end
                gff_cols[-1][4] = b._abs_interval[1]
            else:
                # add new exon
                gff_cols.append([
                    b.id, 'PygrTransMap', 'exon', b._abs_interval[0] + 1,
                    b._abs_interval[1], '.', gff_strand, '.',
                    'gene_id "%s"; transcript_id "%s"; exon_number "%d";' %
                    (gene_id, transcript_id, exon_num)
                ])
                exon_num += 1

        # print gtf
        with open('%s_%s.gtf' % (transcript_id, gen_chr), 'w') as gtf_out:
            for gc in gff_cols:
                gtf_out.write('\t'.join([str(c) for c in gc]) + '\n')

        # get genomic sequence
        # genome name = everything before the first '.' (the whole name if
        # there is no '.')
        gen = gen_chr.partition('.')[0]
        wb_gen = [wgen for wgen in worldbase_genomes if gen in wgen]
        if not wb_gen:
            sys.stderr.write(
                'No worldbase genome matching %s; skipping fasta for %s\n' %
                (gen, gen_chr))
            continue
        if len(wb_gen) > 1:
            sys.stderr.write('Detected >1 worldbase genome matching %s\n' %
                             gen)
            sys.stderr.write(' '.join(wb_gen) + '\n')
        gen_seq = worldbase.__call__(wb_gen[0])

        # get transcript sequence
        # (columns 3/4 are 1-based inclusive, hence the -1 on the start)
        seq = ''.join(
            [str(gen_seq[gc[0]][gc[3] - 1:gc[4]]) for gc in gff_cols])
        if gff_cols[0][6] == '-':
            seq = dna.rc(seq)

        # print fasta
        with open('%s_%s.fa' % (transcript_id, gen_chr), 'w') as fasta_out:
            fasta_out.write('>%s_gene=%s_%s\n%s\n' % (transcript_id, gene_id,
                                                      gen_chr, seq))
Esempio n. 8
0
def main():
    """Map one lncRNA transcript from mm9 onto the genomes aligned to it.

    Command line: ``[options] <transcript id>``.

    Steps:
      1. Collect the intervals of the requested transcript from the lncRNA
         GTF file (``-l``) as slices of the worldbase mm9 genome.
      2. Look those intervals up in the UCSC mm9 30-way multiz alignment and
         group the aligned target intervals by the name the alignment's
         sequence dictionary assigns to them.
      3. Print ``<name> <aligned nt> <aligned fraction>`` to stdout for every
         target and discard targets whose aligned length is below
         ``-a`` * transcript length (``-a`` is used as a fraction).
      4. For each remaining target, merge neighbouring blocks into exons and
         write ``<transcript id>_<name>.gtf`` and ``<transcript id>_<name>.fa``
         in the current working directory.

    A wrong number of positional arguments is reported through
    ``parser.error``. A transcript id that is absent from the GTF file, or a
    target with no matching worldbase genome, is reported on stderr.
    """
    usage = 'usage: %prog [options] <transcript id>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='align_t', type='float', default=0.01,
                      help='Minimum % of the transcript that must align [Default: %default]')
    parser.add_option('-l', dest='lncrna_gtf',
                      default='/Users/dk/research/common/data/lncrna_mm9/lnc_catalog.gtf',
                      help='lncRNA gtf file [Default: %default]')
    # NOTE: in the merge loop below this value acts as the largest gap
    # between two blocks that are still merged into one exon.
    parser.add_option('-m', dest='merge_t', type='int', default=40,
                      help='Minimum distance between alignment blocks to merge into a single exon [Default: %default]')
    (options, args) = parser.parse_args()

    # Exactly one positional argument is accepted (parser.error exits).
    if len(args) != 1:
        parser.error('Must provide transcript id')
    transcript_id = args[0]

    # get mouse genome
    mm9 = worldbase.Bio.Seq.Genome.MOUSE.mm9()

    # get gene exon intervals
    gene_ivals = []
    transcript_length = 0
    with open(options.lncrna_gtf) as gtf_in:
        for line in gtf_in:
            a = line.split('\t')
            kv = gff.gtf_kv(a[8])
            if kv['transcript_id'] != transcript_id:
                continue
            chrom = a[0]
            # GTF coordinates are 1-based and inclusive at both ends; shift
            # the start so that [start:end] is the equivalent 0-based,
            # half-open slice and end - start is the true feature length.
            start = int(a[3]) - 1
            end = int(a[4])
            # +1 / -1 / 0 for '+' / '-' / anything else;
            # assuming all orientations are the same
            strand = {'+': 1, '-': -1}.get(a[6], 0)
            gene_id = kv['gene_id']

            gene_ivals.append(mm9[chrom][start:end])
            transcript_length += end - start

    if not gene_ivals:
        # Nothing to map; say so instead of finishing silently.
        sys.stderr.write('Transcript id %s not found in %s\n'
                         % (transcript_id, options.lncrna_gtf))
        return

    # get mm9 msa
    msa = worldbase.Bio.MSA.UCSC.mm9_multiz30way()

    # map returned sequences back to genome name
    idDict = ~(msa.seqDict)

    # hash alignments by genome
    genome_blocks = {}
    for gi in gene_ivals:
        for src, dest, edg in msa[gi].edges():
            genome_blocks.setdefault(idDict[dest], []).append(dest)

    # check for enough alignment
    # (iterate over a snapshot of the keys because entries are deleted)
    for gen_chr in list(genome_blocks):
        aligned_nt = sum(b.stop - b.start for b in genome_blocks[gen_chr])
        print('%s %s %s' % (gen_chr, aligned_nt,
                            float(aligned_nt) / transcript_length))
        if aligned_nt < options.align_t * transcript_length:
            del genome_blocks[gen_chr]

    # for each genome
    worldbase_genomes = worldbase.dir('Bio.Seq.Genome')
    for gen_chr in genome_blocks:
        blocks = genome_blocks[gen_chr]
        blocks.sort(block_cmp)

        # make gtf lines / merge alignments
        b = blocks[0]
        # the strand of the first block is used for every exon of this target
        gff_strand = '+' if b.orientation == strand else '-'
        gff_cols = [[b.id, 'PygrTransMap', 'exon', b._abs_interval[0] + 1,
                     b._abs_interval[1], '.', gff_strand, '.',
                     'gene_id "%s"; transcript_id "%s"; exon_number "1";'
                     % (gene_id, transcript_id)]]
        exon_num = 2
        for b in blocks[1:]:
            if gff_cols[-1][4] + options.merge_t >= b._abs_interval[0]:
                # merge with prior: extend the previous exon's end
                gff_cols[-1][4] = b._abs_interval[1]
            else:
                # add new exon
                gff_cols.append([b.id, 'PygrTransMap', 'exon',
                                 b._abs_interval[0] + 1, b._abs_interval[1],
                                 '.', gff_strand, '.',
                                 'gene_id "%s"; transcript_id "%s"; exon_number "%d";'
                                 % (gene_id, transcript_id, exon_num)])
                exon_num += 1

        # print gtf
        with open('%s_%s.gtf' % (transcript_id, gen_chr), 'w') as gtf_out:
            for gc in gff_cols:
                gtf_out.write('\t'.join([str(c) for c in gc]) + '\n')

        # get genomic sequence
        # genome name = everything before the first '.' (the whole name if
        # there is no '.')
        gen = gen_chr.partition('.')[0]
        wb_gen = [wgen for wgen in worldbase_genomes if gen in wgen]
        if not wb_gen:
            sys.stderr.write(
                'No worldbase genome matching %s; skipping fasta for %s\n'
                % (gen, gen_chr))
            continue
        if len(wb_gen) > 1:
            sys.stderr.write('Detected >1 worldbase genome matching %s\n' % gen)
            sys.stderr.write(' '.join(wb_gen) + '\n')
        gen_seq = worldbase.__call__(wb_gen[0])

        # get transcript sequence
        # (columns 3/4 are 1-based inclusive, hence the -1 on the start)
        seq = ''.join([str(gen_seq[gc[0]][gc[3] - 1:gc[4]]) for gc in gff_cols])
        if gff_cols[0][6] == '-':
            seq = dna.rc(seq)

        # print fasta
        with open('%s_%s.fa' % (transcript_id, gen_chr), 'w') as fasta_out:
            fasta_out.write('>%s_gene=%s_%s\n%s\n'
                            % (transcript_id, gene_id, gen_chr, seq))