Example #1
0
def ParseGFF(fname):
    """get contents from a decent GFF file.
    """
    smap={1:'+' , -1:'-' }
    for cid in ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'ChrM', 'ChrC']: ## change according to the GFF file TODO 
        fh = open(fname, 'rU')
        limit_info = dict(gff_id=[cid], gff_source=['TAIR10']) ## change the source TODO According to the file type 
        for rec in GFF.parse(fh, limit_info=limit_info):
            for each_rec in rec.features:
                if each_rec.type=='gene':
                    for child in each_rec.sub_features:
                        print child.id, 'NULL', rec.id, smap[child.strand], child.type, child.location._start.position, child.location._end.position, each_rec.qualifiers['Note'][0]
                        cnt=1
                        for cod in child.sub_features:
                            fid=None
                            if cod.type=="five_prime_UTR":
                                fid=child.id+'_'+str(cnt)
                                cod.type='UTR5'
                            elif cod.type=="three_prime_UTR":
                                fid=child.id+'_'+str(cnt)
                                cod.type='UTR3'
                            elif cod.type=="CDS":
                                fid=child.id+'_'+str(cnt)
                            if fid:
                                print fid, child.id, rec.id, smap[cod.strand], cod.type, cod.location._start.position, cod.location._end.position,'NULL' 
                                cnt+=1
                    break
        fh.close()
Example #2
0
def __main__():
    """Collect the feature positions of one sequence/source and hand them,
    sorted, to ``WriteSortedGFF``.

    Command line arguments (read from ``sys.argv``):

    1. path of the GFF file
    2. sequence (chromosome) id to keep
    3. source name to keep

    With fewer than three arguments a message is written to stderr and
    ``sys.exit(-1)`` is called.
    """
    try:
        gff_file = sys.argv[1]
        chrid = sys.argv[2]
        source = sys.argv[3]
    except IndexError:
        # Only a too-short argument list can fail here; a bare except would
        # also swallow KeyboardInterrupt/SystemExit.
        sys.stderr.write('Access denied for a GFF file !\n')
        sys.exit(-1)

    # example sources: 'rheMac2_ensGene', 'rheMac2_refGene', 'rheMac2_refSeqAnno', 'rheMac2_transMapAlnUcscGenes'
    limit_parse = dict(gff_id=[chrid], gff_source=[source])

    # Distinct (start, end) pairs of the top-level features; features sharing
    # a position collapse into one entry.
    chrpos = set()
    with open(gff_file) as fh:
        for rec in GFF.parse(fh, limit_info=limit_parse):
            for element in rec.features:
                chrpos.add((element.location._start.position,
                            element.location._end.position))

    WriteSortedGFF(gff_file, limit_parse, sorted(chrpos))
Example #3
0
def ParserCommonType(fname, source):
    """Group ``transcript``/``exon`` features of a GFF file into gene packets.

    Only lines whose (source, type) is ``(source, 'transcript')`` or
    ``(source, 'exon')`` and whose sequence id is one of I, II, III, IV, V, X
    are requested from ``GFF.parse``.  Each top-level feature is treated as a
    transcript of the gene named by its ``gene`` qualifier: the first
    transcript seen for a name creates the gene entry, later ones mark it as
    alternatively spliced and are appended to it.

    :param fname: path of the GFF file.
    :param source: value of the GFF source column to keep.
    :returns: list with ``OrganizePacket(gene)`` for every gene, in the order
        given by ``gid.ordered_items()``.
    """
    gff_type = ['transcript', 'exon']
    # Built as a real list so it can be iterated more than once.
    source_type = [(source, ftype) for ftype in gff_type]
    filter_type = dict(gff_source_type=source_type,
                       gff_id=['I', 'II', 'III', 'IV', 'V', 'X'])
    gene_cnt, genes_data = 1, []
    gid = ordered_dict()
    with open(fname) as gfh:
        for rec in GFF.parse(gfh, limit_info=filter_type):
            for feature in rec.features:
                name = feature.qualifiers['gene'][0]
                if name in gid:
                    gene = gid[name]
                    gene['is_alt_spliced'] = 1
                else:
                    # assumes init_gene() returns a dict holding empty
                    # 'transcripts' and 'exons' lists -- TODO confirm
                    gene = init_gene()
                    gene['id'] = gene_cnt
                    gene['name'] = name
                    gene['chr'] = rec.id
                    gene['source'] = feature.qualifiers['source'][0]
                    # any strand other than +1 is recorded as '-'
                    if feature.strand == 1:
                        gene['strand'] = '+'
                    else:
                        gene['strand'] = '-'
                    gene['is_alt_spliced'] = 0
                    gene_cnt += 1
                gene['transcripts'].append(feature.id)
                # start + 1: presumably converts the parser's 0-based start
                # to a 1-based coordinate -- TODO confirm
                exon_pos = [[xlevel.location._start.position + 1,
                             xlevel.location._end.position]
                            for xlevel in feature.sub_features]
                # Use the strand stored on THIS gene.  The previous code read
                # a variable left over from the most recently created gene,
                # which is wrong when transcripts of different genes are
                # interleaved in the file.
                if (gene['strand'] == '-' and len(exon_pos) > 1
                        and exon_pos[0][0] > exon_pos[-1][0]):
                    # minus-strand exons listed in descending order: flip
                    # them so they are stored ascending
                    exon_pos.reverse()
                gene["exons"].append(exon_pos)
                gid[name] = gene
    for ent in gid.ordered_items():
        genes_data.append(OrganizePacket(ent[1]))
    return genes_data
Example #4
0
def get_annotation(fname, rfname):
    """Parse genome annotation from GFF3 file.

    The GFF file is read once per chromosome id in the hard-coded list,
    restricted to source ``TAIR10``.  Top-level features are sorted into
    per-chromosome dictionaries mapping a position key to '+' or '-':

    * ``oth_featdb``  -- ``gene`` features having at least one child of type
      mRNA/miRNA/ncRNA/snoRNA/snRNA/tRNA, and ``pseudogene`` features;
      key ``(start, end)``
    * ``teg_featdb``  -- ``transposable_element_gene``; key ``(start, end)``
    * ``te_featdb``   -- ``transposable_element``; key ``(id, start, end)``
    * ``ribo_featdb`` -- filled only from ``rfname``, a text file with the
      single-space separated columns ``<chromosome> <start> <end> <strand>``;
      the strand text is stored unchanged.

    :param fname: path of the GFF3 annotation file.
    :param rfname: path of the additional rRNA location file.
    :returns: ``(te_featdb, teg_featdb, ribo_featdb, oth_featdb)``
    :raises KeyError: if a stored feature has a strand other than 1 / -1.
    """
    smap = {1: '+', -1: '-'}
    from Core import GFF
    rna_types = ('mRNA', 'miRNA', 'ncRNA', 'snoRNA', 'snRNA', 'tRNA')
    te_featdb, teg_featdb, ribo_featdb, oth_featdb = {}, {}, {}, {}

    def span_of(feat):
        # (start, end) of a feature as plain ints
        return (int(feat.location._start.position),
                int(feat.location._end.position))

    for cid in ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'ChrM', 'ChrC']:
        sys.stderr.write(cid + ".....\n")
        limit_info = dict(gff_id=[cid], gff_source=['TAIR10'])
        # 'with' closes the handle even when parsing raises
        with open(fname, 'rU') as fh:
            for rec in GFF.parse(fh, limit_info=limit_info):
                for each_rec in rec.features:
                    ftype = each_rec.type
                    if ftype == 'gene':
                        # one matching child is enough; every match would
                        # write the same key/value anyway
                        if any(child.type in rna_types
                               for child in each_rec.sub_features):
                            oth_featdb.setdefault(cid, {})[span_of(each_rec)] = smap[each_rec.strand]
                    elif ftype == 'transposable_element_gene':
                        teg_featdb.setdefault(cid, {})[span_of(each_rec)] = smap[each_rec.strand]
                    elif ftype == 'transposable_element':
                        te_featdb.setdefault(cid, {})[(each_rec.id,) + span_of(each_rec)] = smap[each_rec.strand]
                    elif ftype == 'pseudogene':
                        oth_featdb.setdefault(cid, {})[span_of(each_rec)] = smap[each_rec.strand]
    ## add additional rRNA related location from A. thaliana
    with open(rfname, 'rU') as fh:
        for line in fh:
            if not line.strip():
                # blank (e.g. trailing) lines carry no record; previously
                # they aborted the whole run with an IndexError/ValueError
                continue
            fields = line.strip('\n\r').split(' ')
            ribo_featdb.setdefault(fields[0], {})[(int(fields[1]), int(fields[2]))] = fields[3]
    return te_featdb, teg_featdb, ribo_featdb, oth_featdb
Example #5
0
def get_Feature(fname):
    """Extract genome annotation information from a provided GFF file.
    """
    from Core import GFF
    te_featdb, psd_featdb, cg_featdb, oth_featdb, ribo_featdb = dict(), dict(), dict(), dict(), dict()
    for cid in ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'ChrM', 'ChrC']: ## change according to the GFF file TODO According to the file type add chromosome number automatically.
        print cid
        fh = open(fname, 'rU')
        limit_info = dict(gff_id=[cid], gff_source=['TAIR10']) ## change the source TODO According to the file type add source flag automatically 
        for rec in GFF.parse(fh, limit_info=limit_info):
            for each_rec in rec.features:
                if each_rec.type=='gene':
                    for child in each_rec.sub_features:
                        if child.type=='mRNA':
                            if cid in cg_featdb:
                                cg_featdb[cid][(int(each_rec.location._start.position), 
                                    int(each_rec.location._end.position))]=each_rec.strand
                            else:
                                cg_featdb[cid]={(int(each_rec.location._start.position), 
                                    int(each_rec.location._end.position)):
                                    each_rec.strand}
                        elif each_rec.sub_features[0].type in ['miRNA', 'ncRNA', 'snoRNA', 'snRNA', 'tRNA']:
                            if cid in oth_featdb:
                                oth_featdb[cid][(int(each_rec.location._start.position), 
                                    int(each_rec.location._end.position))]=each_rec.strand
                            else:
                                oth_featdb[cid]={(int(each_rec.location._start.position), 
                                    int(each_rec.location._end.position)):
                                    each_rec.strand}
                        elif each_rec.sub_features[0].type=='rRNA':
                            if cid in ribo_featdb:
                                ribo_featdb[cid][(int(each_rec.location._start.position), 
                                    int(each_rec.location._end.position))]=each_rec.strand
                            else:
                                ribo_featdb[cid]={(int(each_rec.location._start.position), 
                                    int(each_rec.location._end.position)): 
                                    each_rec.strand}
                elif each_rec.type=='pseudogene':
                    if cid in psd_featdb:
                        psd_featdb[cid][(int(each_rec.location._start.position), 
                                int(each_rec.location._end.position))]=each_rec.strand
                    else:
                        psd_featdb[cid]={(int(each_rec.location._start.position), 
                                int(each_rec.location._end.position)): 
                                each_rec.strand}
                elif each_rec.type in ['transposable_element', 'transposable_element_gene']:
                    if cid in te_featdb:
                        te_featdb[cid][(int(each_rec.location._start.position), 
                                int(each_rec.location._end.position))]=each_rec.strand
                    else:
                        te_featdb[cid]={(int(each_rec.location._start.position), 
                                int(each_rec.location._end.position)): 
                                each_rec.strand}
        fh.close()
    return te_featdb, psd_featdb, cg_featdb, oth_featdb, ribo_featdb
Example #6
0
def __main__():
    try:
        gff_fname = sys.argv[1]
        fasta_fname = sys.argv[2]
        gb_fname = sys.argv[3]
    except:
        print __doc__
        sys.exit(-1)
    fasta_rec = SeqIO.to_dict(SeqIO.parse(fasta_fname, "fasta", generic_dna))
    gff_rec = GFF.parse(gff_fname, fasta_rec)
    SeqIO.write(gff_rec, gb_fname, "genbank")
Example #7
0
def __main__():
    try:
        gff_fname = sys.argv[1]
        fasta_fname = sys.argv[2]
        gb_fname = sys.argv[3]
    except: 
        print __doc__
        sys.exit(-1)
    fasta_rec = SeqIO.to_dict(SeqIO.parse(fasta_fname, "fasta", generic_dna))
    gff_rec = GFF.parse(gff_fname, fasta_rec)
    SeqIO.write(gff_rec, gb_fname, "genbank")
Example #8
0
def ParserCommonType(fname, source):
    """Collect transcripts and their exons per gene from a GFF file.

    ``GFF.parse`` is limited to ``transcript`` and ``exon`` lines of the given
    source on the sequence ids I, II, III, IV, V and X.  Top-level features
    are grouped by their ``gene`` qualifier; a gene seen more than once is
    flagged with ``is_alt_spliced = 1``.

    :param fname: path of the GFF file.
    :param source: GFF source column value to keep.
    :returns: list of ``OrganizePacket(gene)`` results, one per gene, ordered
        as ``gid.ordered_items()`` yields them.
    """

    def exon_coordinates(transcript, strand):
        # [start + 1, end] of every sub-feature; for '-' strand transcripts
        # whose exons come in descending order the list is flipped so that it
        # is ascending.
        coords = []
        for xlevel in transcript.sub_features:
            coords.append([xlevel.location._start.position + 1,
                           xlevel.location._end.position])
        if strand == '-' and len(coords) > 1:
            if coords[0][0] > coords[-1][0]:
                coords.reverse()
        return coords

    gff_type = ['transcript', 'exon']
    # a list (not a one-shot iterator) of (source, type) pairs
    source_type = [(source, kind) for kind in gff_type]
    filter_type = dict(gff_source_type=source_type, gff_id=['I', 'II', 'III', 'IV', 'V', 'X'])
    gene_cnt, genes_data = 1, []
    gid = ordered_dict()
    # the context manager releases the handle even if parsing fails
    with open(fname) as gfh:
        for rec in GFF.parse(gfh, limit_info=filter_type):
            for feature in rec.features:
                gene_name = feature.qualifiers['gene'][0]
                if gene_name in gid:
                    # further transcript of a known gene
                    gene = gid[gene_name]
                    gene['is_alt_spliced'] = 1
                else:
                    # first transcript: create the gene entry
                    # (assumes init_gene() supplies 'transcripts' and 'exons'
                    # lists -- TODO confirm)
                    gene = init_gene()
                    gene['id'] = gene_cnt
                    gene['name'] = gene_name
                    gene['chr'] = rec.id
                    gene['source'] = feature.qualifiers['source'][0]
                    # everything that is not strand +1 is treated as '-'
                    gene['strand'] = '+' if feature.strand == 1 else '-'
                    gene['is_alt_spliced'] = 0
                    gene_cnt += 1
                gene['transcripts'].append(feature.id)
                # The gene's own stored strand decides the exon order.  The
                # earlier code reused the strand variable of whichever gene
                # was created last, which is wrong for interleaved genes.
                gene["exons"].append(exon_coordinates(feature, gene['strand']))
                gid[gene_name] = gene
    for ent in gid.ordered_items():
        cand = OrganizePacket(ent[1])
        genes_data.append(cand)
    return genes_data
Example #9
0
def ParseGFF(fname):
    """get contents from a decent GFF file.
    """
    smap = {1: '+', -1: '-'}
    for cid in ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'ChrM',
                'ChrC']:  ## change according to the GFF file TODO
        fh = open(fname, 'rU')
        limit_info = dict(gff_id=[cid], gff_source=[
            'TAIR10'
        ])  ## change the source TODO According to the file type
        for rec in GFF.parse(fh, limit_info=limit_info):
            for each_rec in rec.features:
                if each_rec.type == 'gene':
                    for child in each_rec.sub_features:
                        print child.id, 'NULL', rec.id, smap[
                            child.
                            strand], child.type, child.location._start.position, child.location._end.position, each_rec.qualifiers[
                                'Note'][0]
                        cnt = 1
                        for cod in child.sub_features:
                            fid = None
                            if cod.type == "five_prime_UTR":
                                fid = child.id + '_' + str(cnt)
                                cod.type = 'UTR5'
                            elif cod.type == "three_prime_UTR":
                                fid = child.id + '_' + str(cnt)
                                cod.type = 'UTR3'
                            elif cod.type == "CDS":
                                fid = child.id + '_' + str(cnt)
                            if fid:
                                print fid, child.id, rec.id, smap[
                                    cod.
                                    strand], cod.type, cod.location._start.position, cod.location._end.position, 'NULL'
                                cnt += 1
                    break
        fh.close()
Example #10
0
def __main__():
    """Gather the distinct top-level feature positions of a GFF file and pass
    them in ascending order to ``WriteSortedGFF``.

    ``sys.argv`` must hold the GFF file name, the sequence id and the source
    to select, in that order.  When one of them is missing a message is
    written to stderr and ``sys.exit(-1)`` is called.
    """
    # Explicit length check rather than a catch-all except around the
    # indexing, so unrelated exceptions are not reported as a usage error.
    if len(sys.argv) < 4:
        sys.stderr.write('Access denied for a GFF file !\n')
        sys.exit(-1)
    gff_file, chrid, source = sys.argv[1], sys.argv[2], sys.argv[3]

    # possible sources, e.g.: 'rheMac2_ensGene', 'rheMac2_refGene', 'rheMac2_refSeqAnno', 'rheMac2_transMapAlnUcscGenes'
    limit_parse = dict(gff_id=[chrid], gff_source=[source])

    # unique (start, end) tuples; a set makes the de-duplication explicit
    positions = set()
    with open(gff_file) as fh:  # closed even if the parser raises
        for rec in GFF.parse(fh, limit_info=limit_parse):
            for element in rec.features:
                where = element.location
                positions.add((where._start.position, where._end.position))

    sorted_chrpos = sorted(positions)

    WriteSortedGFF(gff_file, limit_parse, sorted_chrpos)
Example #11
0
def get_Feature(fname):
    """Extract genome annotation information from a provided GFF file.
    """
    from Core import GFF
    global te_featdb; global psd_featdb; global cg_featdb; global oth_featdb; global ribo_featdb
    for cid in ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'ChrM', 'ChrC']: ## change according to the GFF file TODO According to the file type add chromosome number automatically.
        print cid
        fh = open(fname, 'rU')
        limit_info = dict(gff_id=[cid], gff_source=['TAIR10']) ## change the source TODO According to the file type add source flag automatically 
        for rec in GFF.parse(fh, limit_info=limit_info):
            for each_rec in rec.features:
                if each_rec.type=='gene':
                    for child in each_rec.sub_features:
                        if child.type=='mRNA':
                            if cid in cg_featdb:
                                cg_featdb[cid][(int(each_rec.location._start.position), 
                                    int(each_rec.location._end.position))]=each_rec.strand
                            else:
                                cg_featdb[cid]={(int(each_rec.location._start.position), 
                                    int(each_rec.location._end.position)):
                                    each_rec.strand}
                        elif each_rec.sub_features[0].type in ['miRNA', 'ncRNA', 'snoRNA', 'snRNA', 'tRNA']:
                            if cid in oth_featdb:
                                oth_featdb[cid][(int(each_rec.location._start.position), 
                                    int(each_rec.location._end.position))]=each_rec.strand
                            else:
                                oth_featdb[cid]={(int(each_rec.location._start.position), 
                                    int(each_rec.location._end.position)):
                                    each_rec.strand}
                        elif each_rec.sub_features[0].type=='rRNA':
                            if cid in ribo_featdb:
                                ribo_featdb[cid][(int(each_rec.location._start.position), 
                                    int(each_rec.location._end.position))]=each_rec.strand
                            else:
                                ribo_featdb[cid]={(int(each_rec.location._start.position), 
                                    int(each_rec.location._end.position)): 
                                    each_rec.strand}
                elif each_rec.type=='pseudogene':
                    if cid in psd_featdb:
                        psd_featdb[cid][(int(each_rec.location._start.position), 
                                int(each_rec.location._end.position))]=each_rec.strand
                    else:
                        psd_featdb[cid]={(int(each_rec.location._start.position), 
                                int(each_rec.location._end.position)): 
                                each_rec.strand}
                elif each_rec.type in ['transposable_element', 'transposable_element_gene']:
                    if cid in te_featdb:
                        te_featdb[cid][(int(each_rec.location._start.position), 
                                int(each_rec.location._end.position))]=each_rec.strand
                    else:
                        te_featdb[cid]={(int(each_rec.location._start.position), 
                                int(each_rec.location._end.position)): 
                                each_rec.strand}
        fh.close()
    ribo_featdb['Chr3'][(14199917, 14203578)]=1 ## Unannotated rRNA from A thaliana genome. 
    ribo_featdb['Chr2'][(5784, 9683)]=1 ## Unannotated rRNA from A thaliana genome. 
    ribo_featdb['Chr2'][(2821, 3704)]=1 #
    ribo_featdb['Chr3'][(14196614, 14197675)]=1 #
    ribo_featdb['Chr3'][(14194052, 14194611)]=1 #
    ribo_featdb['Chr3'][(14199498, 14199751)]=1 #
    ribo_featdb['Chr3'][(14195564, 14195739)]=1 #
    ribo_featdb['ChrM'][(11426, 11883)]=-1 #
    ribo_featdb['ChrM'][(364594, 365124)]=1 #"""
    return te_featdb, psd_featdb, cg_featdb, oth_featdb, ribo_featdb
Example #12
0
def get_annotation(fname, rfname):
    """Parse genome annotation from GFF3 file.

    Reads ``fname`` once per chromosome id of the hard-coded list (progress
    goes to stderr), restricted to source ``TAIR10``, and sorts the top-level
    features into dictionaries ``chromosome -> {key: '+' or '-'}``:

    * ``te_featdb``  : ``transposable_element``, key ``(id, start, end)``
    * ``teg_featdb`` : ``transposable_element_gene``, key ``(start, end)``
    * ``oth_featdb`` : ``pseudogene``, and ``gene`` features with a child of
      type mRNA, miRNA, ncRNA, snoRNA, snRNA or tRNA; key ``(start, end)``

    ``ribo_featdb`` comes exclusively from ``rfname``: every non-blank line is
    split on single spaces into chromosome, start, end and strand, the strand
    text being stored as it is.

    :param fname: GFF3 annotation file.
    :param rfname: text file with additional rRNA locations.
    :returns: ``(te_featdb, teg_featdb, ribo_featdb, oth_featdb)``
    :raises KeyError: for a stored feature whose strand is neither 1 nor -1.
    """
    smap = {1: '+', -1: '-'}
    from Core import GFF
    te_featdb, teg_featdb, ribo_featdb, oth_featdb = dict(), dict(), dict(
    ), dict()
    transcript_kinds = [
        'mRNA', 'miRNA', 'ncRNA', 'snoRNA', 'snRNA', 'tRNA'
    ]
    # feature types stored unconditionally under a plain (start, end) key
    plain_targets = {
        'transposable_element_gene': teg_featdb,
        'pseudogene': oth_featdb,
    }
    for cid in ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'ChrM', 'ChrC']:
        sys.stderr.write(cid + ".....\n")
        limit_info = dict(gff_id=[cid], gff_source=['TAIR10'])
        with open(fname, 'rU') as fh:  # released even if parsing fails
            for rec in GFF.parse(fh, limit_info=limit_info):
                for each_rec in rec.features:
                    kind = each_rec.type
                    if kind == 'gene':
                        has_transcript = False
                        for child in each_rec.sub_features:
                            if child.type in transcript_kinds:
                                has_transcript = True
                                break
                        if not has_transcript:
                            continue
                        bucket = oth_featdb.setdefault(cid, {})
                        prefix = ()
                    elif kind in plain_targets:
                        bucket = plain_targets[kind].setdefault(cid, {})
                        prefix = ()
                    elif kind == 'transposable_element':
                        bucket = te_featdb.setdefault(cid, {})
                        # transposable elements are keyed by id as well
                        prefix = (each_rec.id,)
                    else:
                        continue
                    key = prefix + (int(each_rec.location._start.position),
                                    int(each_rec.location._end.position))
                    bucket[key] = smap[each_rec.strand]
    ## add additional rRNA related location from A. thaliana
    with open(rfname, 'rU') as fh:
        for line in fh:
            if not line.strip():
                # an empty line (typically at the end of the file) used to
                # abort the run; it holds no location, so skip it
                continue
            chrom, start, end, strand = line.strip('\n\r').split(' ')[:4]
            ribo_featdb.setdefault(chrom, {})[(int(start), int(end))] = strand
    return te_featdb, teg_featdb, ribo_featdb, oth_featdb
Example #13
0
def get_Feature(fname):
    """Extract genome annotation information from a provided GFF file.
    """
    from Core import GFF
    te_featdb, psd_featdb, cg_featdb, oth_featdb, ribo_featdb = dict(), dict(
    ), dict(), dict(), dict()
    for cid in [
            'Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'ChrM', 'ChrC'
    ]:  ## change according to the GFF file TODO According to the file type add chromosome number automatically.
        print cid
        fh = open(fname, 'rU')
        limit_info = dict(
            gff_id=[cid], gff_source=['TAIR10']
        )  ## change the source TODO According to the file type add source flag automatically
        for rec in GFF.parse(fh, limit_info=limit_info):
            for each_rec in rec.features:
                if each_rec.type == 'gene':
                    for child in each_rec.sub_features:
                        if child.type == 'mRNA':
                            if cid in cg_featdb:
                                cg_featdb[cid][(
                                    int(each_rec.location._start.position),
                                    int(each_rec.location._end.position)
                                )] = each_rec.strand
                            else:
                                cg_featdb[cid] = {
                                    (int(each_rec.location._start.position),
                                     int(each_rec.location._end.position)):
                                    each_rec.strand
                                }
                        elif each_rec.sub_features[0].type in [
                                'miRNA', 'ncRNA', 'snoRNA', 'snRNA', 'tRNA'
                        ]:
                            if cid in oth_featdb:
                                oth_featdb[cid][(
                                    int(each_rec.location._start.position),
                                    int(each_rec.location._end.position)
                                )] = each_rec.strand
                            else:
                                oth_featdb[cid] = {
                                    (int(each_rec.location._start.position),
                                     int(each_rec.location._end.position)):
                                    each_rec.strand
                                }
                        elif each_rec.sub_features[0].type == 'rRNA':
                            if cid in ribo_featdb:
                                ribo_featdb[cid][(
                                    int(each_rec.location._start.position),
                                    int(each_rec.location._end.position)
                                )] = each_rec.strand
                            else:
                                ribo_featdb[cid] = {
                                    (int(each_rec.location._start.position),
                                     int(each_rec.location._end.position)):
                                    each_rec.strand
                                }
                elif each_rec.type == 'pseudogene':
                    if cid in psd_featdb:
                        psd_featdb[cid][(int(
                            each_rec.location._start.position),
                                         int(each_rec.location._end.position)
                                         )] = each_rec.strand
                    else:
                        psd_featdb[cid] = {
                            (int(each_rec.location._start.position),
                             int(each_rec.location._end.position)):
                            each_rec.strand
                        }
                elif each_rec.type in [
                        'transposable_element', 'transposable_element_gene'
                ]:
                    if cid in te_featdb:
                        te_featdb[cid][(int(each_rec.location._start.position),
                                        int(each_rec.location._end.position)
                                        )] = each_rec.strand
                    else:
                        te_featdb[cid] = {
                            (int(each_rec.location._start.position),
                             int(each_rec.location._end.position)):
                            each_rec.strand
                        }
        fh.close()
    return te_featdb, psd_featdb, cg_featdb, oth_featdb, ribo_featdb
Example #14
0
def WriteSortedGFF(fname, limit_parse, sorted_pos):

    fgh = open(fname)
    for rec in GFF.parse(fgh, limit_info=limit_parse):
        for sort_pos in sorted_pos:
            for feature in rec.features:
                if sort_pos == (feature.location._start.position, feature.location._end.position):
                    orient = None
                    if feature.strand == 1:
                        orient = '+'
                    else:
                        orient = '-'
                    gline = [rec.id,
                            feature.qualifiers['source'][0],
                            feature.type,
                            str(feature.location._start.position + 1),
                            str(feature.location._end.position),
                            '.',
                            orient,
                            '.',
                            'ID=' + feature.qualifiers['ID'][0] + ';Name=' + feature.qualifiers['Name'][0]
                            ]
                    print '\t'.join(gline)
                    for slevel in feature.sub_features: ## second level feature 
                        orient = None
                        if slevel.strand == 1:
                            orient = '+'
                        else:
                            orient = '-'
                        tline = [rec.id,
                                slevel.qualifiers['source'][0],
                                slevel.type,
                                str(slevel.location._start.position + 1),
                                str(slevel.location._end.position),
                                str(float(slevel.qualifiers['score'][0])),
                                orient, 
                                '.',
                                'ID=' + slevel.qualifiers['ID'][0] + ';Parent=' + slevel.qualifiers['Parent'][0]
                                ]
                        print '\t'.join(tline)
                        for tlevel in slevel.sub_features:
                            orient = None
                            if tlevel.strand == 1:
                                orient = '+'
                            else:
                                orient = '-'
                            xline = []
                            if tlevel.type == 'CDS':
                                xline = [rec.id,
                                        tlevel.qualifiers['source'][0],
                                        tlevel.type,
                                        str(tlevel.location._start.position + 1),
                                        str(tlevel.location._end.position),
                                        '.',
                                        orient,
                                        tlevel.qualifiers['phase'][0],
                                        'Parent=' + tlevel.qualifiers['Parent'][0]
                                        ]
                            else:
                                xline = [rec.id,
                                        tlevel.qualifiers['source'][0],
                                        tlevel.type,
                                        str(tlevel.location._start.position + 1),
                                        str(tlevel.location._end.position),
                                        '.',
                                        orient,
                                        '.',
                                        'Parent=' + tlevel.qualifiers['Parent'][0]
                                        ]
                            print '\t'.join(xline)    
                    #break
            #break ## one sorted position 
    fgh.close()
Example #15
0
def _strand_symbol(strand):
    """Return '+' when *strand* equals 1 and '-' for every other value."""
    return '+' if strand == 1 else '-'


def _print_feature_tree(seq_id, feature):
    """Print *feature* and two levels of its sub-features as tab-separated lines.

    Each line has nine columns: sequence id, source, type, start, end, score,
    strand, phase and attributes. The start column is the stored start
    position plus one (presumably converting the parser's 0-based start back
    to 1-based GFF coordinates -- confirm against the parser in use).
    """
    # Top-level feature: score and phase are always written as '.'.
    print('\t'.join([
        seq_id,
        feature.qualifiers['source'][0],
        feature.type,
        str(feature.location._start.position + 1),
        str(feature.location._end.position),
        '.',
        _strand_symbol(feature.strand),
        '.',
        'ID=' + feature.qualifiers['ID'][0] +
        ';Name=' + feature.qualifiers['Name'][0],
    ]))
    for slevel in feature.sub_features:
        # Second level: the only level whose score qualifier is written out
        # (normalised through float()).
        print('\t'.join([
            seq_id,
            slevel.qualifiers['source'][0],
            slevel.type,
            str(slevel.location._start.position + 1),
            str(slevel.location._end.position),
            str(float(slevel.qualifiers['score'][0])),
            _strand_symbol(slevel.strand),
            '.',
            'ID=' + slevel.qualifiers['ID'][0] +
            ';Parent=' + slevel.qualifiers['Parent'][0],
        ]))
        for tlevel in slevel.sub_features:
            # Third level: only CDS rows carry a phase; all others get '.'.
            if tlevel.type == 'CDS':
                phase = tlevel.qualifiers['phase'][0]
            else:
                phase = '.'
            print('\t'.join([
                seq_id,
                tlevel.qualifiers['source'][0],
                tlevel.type,
                str(tlevel.location._start.position + 1),
                str(tlevel.location._end.position),
                '.',
                _strand_symbol(tlevel.strand),
                phase,
                'Parent=' + tlevel.qualifiers['Parent'][0],
            ]))


def WriteSortedGFF(fname, limit_parse, sorted_pos):
    """Print the features of a GFF file in the order given by *sorted_pos*.

    For every record returned by ``GFF.parse`` the positions in *sorted_pos*
    are walked in order, and each top-level feature whose
    ``(start, end)`` location equals the current position is printed to
    standard output together with its sub-features.

    Arguments:
        fname: path of the GFF file to read.
        limit_parse: passed unchanged to ``GFF.parse`` as ``limit_info``.
        sorted_pos: iterable of ``(start, end)`` pairs; it is iterated once
            per record and compared with ``==`` against a tuple of the
            feature's start and end positions.

    Returns:
        None. Output goes to standard output.

    Raises:
        KeyError: if a printed feature lacks a required qualifier
            ('source', 'ID', 'Name', 'score', 'phase' or 'Parent').
    """
    # The context manager closes the file even when a feature is missing a
    # qualifier or the parser raises part-way through.
    with open(fname) as fgh:
        for rec in GFF.parse(fgh, limit_info=limit_parse):
            for sort_pos in sorted_pos:
                for feature in rec.features:
                    if sort_pos == (feature.location._start.position,
                                    feature.location._end.position):
                        _print_feature_tree(rec.id, feature)
def _record_span(featdb, cid, feature):
    """Store ``feature.strand`` in ``featdb[cid]`` under the feature's (start, end).

    The per-chromosome dictionary is created on first use.
    """
    span = (int(feature.location._start.position),
            int(feature.location._end.position))
    featdb.setdefault(cid, {})[span] = feature.strand


def get_Feature(fname):
    """Extract genome annotation information from a provided GFF file.

    The file is parsed once per chromosome id, restricted to the 'TAIR10'
    source, and every matching top-level feature is filed by type into one of
    five module-level dictionaries. Each dictionary maps a chromosome id to
    ``{(start, end): strand}``:

        cg_featdb    -- 'gene' features that have an 'mRNA' sub-feature
        oth_featdb   -- 'gene' features whose first sub-feature is one of
                        miRNA, ncRNA, snoRNA, snRNA or tRNA
        ribo_featdb  -- 'gene' features whose first sub-feature is 'rRNA',
                        plus a fixed list of manually added rRNA regions
        psd_featdb   -- 'pseudogene' features
        te_featdb    -- 'transposable_element' and
                        'transposable_element_gene' features

    The dictionaries are only updated here, never created, so they must
    already exist at module level when this function is called.

    Arguments:
        fname: path of the GFF file to read.

    Returns:
        The tuple ``(te_featdb, psd_featdb, cg_featdb, oth_featdb,
        ribo_featdb)``.
    """
    from Core import GFF
    global te_featdb
    global psd_featdb
    global cg_featdb
    global oth_featdb
    global ribo_featdb
    other_rna_types = ['miRNA', 'ncRNA', 'snoRNA', 'snRNA', 'tRNA']
    te_types = ['transposable_element', 'transposable_element_gene']
    ## change according to the GFF file TODO According to the file type add chromosome number automatically.
    for cid in ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'ChrM', 'ChrC']:
        print(cid)
        ## change the source TODO According to the file type add source flag automatically
        limit_info = dict(gff_id=[cid], gff_source=['TAIR10'])
        # The file is re-read for every chromosome id; the context manager
        # closes it even if parsing raises. 'rU' is the Python 2
        # universal-newline read mode.
        with open(fname, 'rU') as fh:
            for rec in GFF.parse(fh, limit_info=limit_info):
                for each_rec in rec.features:
                    if each_rec.type == 'gene':
                        children = each_rec.sub_features
                        for child in children:
                            # An mRNA child takes priority; otherwise the gene
                            # is classified by the type of its FIRST child.
                            if child.type == 'mRNA':
                                _record_span(cg_featdb, cid, each_rec)
                            elif children[0].type in other_rna_types:
                                _record_span(oth_featdb, cid, each_rec)
                            elif children[0].type == 'rRNA':
                                _record_span(ribo_featdb, cid, each_rec)
                    elif each_rec.type == 'pseudogene':
                        _record_span(psd_featdb, cid, each_rec)
                    elif each_rec.type in te_types:
                        _record_span(te_featdb, cid, each_rec)
    ## Unannotated rRNA from A thaliana genome, added by hand as
    ## (chromosome, (start, end), strand).
    manual_rrna = [
        ('Chr3', (14199917, 14203578), 1),
        ('Chr2', (5784, 9683), 1),
        ('Chr2', (2821, 3704), 1),
        ('Chr3', (14196614, 14197675), 1),
        ('Chr3', (14194052, 14194611), 1),
        ('Chr3', (14199498, 14199751), 1),
        ('Chr3', (14195564, 14195739), 1),
        ('ChrM', (11426, 11883), -1),
        ('ChrM', (364594, 365124), 1),
    ]
    for cid, span, strand in manual_rrna:
        # setdefault avoids a KeyError when the GFF file contributed no rRNA
        # gene for this chromosome.
        ribo_featdb.setdefault(cid, {})[span] = strand
    return te_featdb, psd_featdb, cg_featdb, oth_featdb, ribo_featdb