Example #1
0
def getGTFcontent(gtf_file):
    """
    Extract GTF features 

    Reads the exon, CDS and stop codon lines of a GTF file and groups
    their coordinates per chromosome, gene and transcript. Start codon
    lines and every other feature type are ignored.

    @args gtf_file: GTF file, passed unchanged to _open_file 

    Returns a nested dictionary:
        {column 1 : {(gene id, column 2) : {transcript id :
            {'exon' : [(start, stop), ..],
             'CDS' : [(start, stop), ..],
             'sp_cod' : [(start, stop), ..],
             'info' : [column 7, column 6, gene name,
                       transcript name, transcript type]}}}}
    (columns 7 and 6 are strand and score in the standard GTF layout)

    Raises AssertionError for a tab separated line with fewer than 8 columns.
    """

    GFH = _open_file(gtf_file)
    gtf_content, recall = dict(), None

    try:
        for rec in GFH:
            rec = rec.strip('\n\r')

            #skip empty line fasta identifier and commented line
            if not rec or rec[0] in ['#', '>']:
                continue
            #skip the genome sequence 
            if '\t' not in rec:
                continue

            parts = rec.split('\t')
            assert len(parts) >= 8, rec

            if re.search(r'^(start_codon|start-codon|startcodon)$', parts[2], re.IGNORECASE):
                continue

            gid = tid = gname = tname = ttype = None

            for attb in parts[-1].split(';'):
                # drop the quotes and split the attribute into key and value
                attb = attb.replace('"', '').split()

                # blank or value-less attributes carry nothing to store; 
                # skipping them avoids an IndexError on attb[0] / attb[1]
                if len(attb) < 2:
                    continue

                if re.search(r'^(gene_id|geneid|name)$', attb[0], re.IGNORECASE):
                    gid = attb[1]
                elif re.search(r'^(transcript_id|transcriptId)$', attb[0], re.IGNORECASE):
                    tid = attb[1]
                elif re.search(r'^(gene_name|genename)$', attb[0], re.IGNORECASE):
                    gname = attb[1]
                elif re.search(r'^(transcript_name|transcriptname)$', attb[0], re.IGNORECASE):
                    tname = attb[1]
                elif re.search(r'^(transcript_type)$', attb[0], re.IGNORECASE):
                    ttype = attb[1]

            if gid == tid: #UCSC GTF files, gene & transcript have same identifier 
                gid = 'Gene:'+str(gid)
                tid = 'Transcript:'+str(tid)

            if tid is None: #JGI GTF file dont have transcript ID for CDS line
                tid = recall

            # decide the target slot first, other lines are not required 
            if re.search(r'^exon$', parts[2], re.IGNORECASE):
                slot = 'exon'
            elif re.search(r'^CDS$', parts[2], re.IGNORECASE):
                slot = 'CDS'
            elif re.search(r'^(stop_codon|stop-codon|stopcodon)$', parts[2], re.IGNORECASE):
                slot = 'sp_cod'
            else:
                continue

            # converted before touching gtf_content, so a bad coordinate 
            # does not leave a half built entry behind
            coord = (int(parts[3]), int(parts[4]))

            #creating feature connections 
            transcripts = gtf_content.setdefault(parts[0], dict()).setdefault((gid, parts[1]), dict())
            if tid not in transcripts:
                # every slot gets its own list: sharing one list between 
                # the slots made a later CDS append show up under sp_cod too
                transcripts[tid] = dict(exon = [],
                                        CDS = [],
                                        sp_cod = [],
                                        info = [parts[6], parts[5], gname, tname, ttype])
            transcripts[tid][slot].append(coord)

            recall = tid #set previous id for CDS line 
    finally:
        # release the handle even when a malformed line aborts the parsing
        GFH.close()

    return gtf_content
Example #2
0
def getGTFcontent(gtf_file):
    """
    Extract GTF features 

    Reads the exon, CDS and stop codon lines of a GTF file and groups
    their coordinates per chromosome, gene and transcript. Start codon
    lines and every other feature type are ignored.

    @args gtf_file: GTF file, passed unchanged to _open_file 

    Returns a nested dictionary:
        {column 1 : {(gene id, column 2) : {transcript id :
            {'exon' : [(start, stop), ..],
             'CDS' : [(start, stop), ..],
             'sp_cod' : [(start, stop), ..],
             'info' : [column 7, column 6, gene name,
                       transcript name, transcript type]}}}}
    (columns 7 and 6 are strand and score in the standard GTF layout)

    Raises AssertionError for a tab separated line with fewer than 8 columns.
    """

    GFH = _open_file(gtf_file)
    gtf_content, recall = dict(), None

    try:
        for rec in GFH:
            rec = rec.strip('\n\r')

            #skip empty line fasta identifier and commented line
            if not rec or rec[0] in ['#', '>']:
                continue
            #skip the genome sequence
            if '\t' not in rec:
                continue

            parts = rec.split('\t')
            assert len(parts) >= 8, rec

            if re.search(r'^(start_codon|start-codon|startcodon)$', parts[2],
                         re.IGNORECASE):
                continue

            gid = tid = gname = tname = ttype = None

            for attb in parts[-1].split(';'):
                # drop the quotes and split the attribute into key and value
                attb = attb.replace('"', '').split()

                # blank or value-less attributes carry nothing to store;
                # skipping them avoids an IndexError on attb[0] / attb[1]
                if len(attb) < 2:
                    continue

                if re.search(r'^(gene_id|geneid|name)$', attb[0],
                             re.IGNORECASE):
                    gid = attb[1]
                elif re.search(r'^(transcript_id|transcriptId)$', attb[0],
                               re.IGNORECASE):
                    tid = attb[1]
                elif re.search(r'^(gene_name|genename)$', attb[0],
                               re.IGNORECASE):
                    gname = attb[1]
                elif re.search(r'^(transcript_name|transcriptname)$', attb[0],
                               re.IGNORECASE):
                    tname = attb[1]
                elif re.search(r'^(transcript_type)$', attb[0], re.IGNORECASE):
                    ttype = attb[1]

            if gid == tid:  #UCSC GTF files, gene & transcript have same identifier
                gid = 'Gene:' + str(gid)
                tid = 'Transcript:' + str(tid)

            if tid is None:  #JGI GTF file dont have transcript ID for CDS line
                tid = recall

            # decide the target slot first, other lines are not required
            if re.search(r'^exon$', parts[2], re.IGNORECASE):
                slot = 'exon'
            elif re.search(r'^CDS$', parts[2], re.IGNORECASE):
                slot = 'CDS'
            elif re.search(r'^(stop_codon|stop-codon|stopcodon)$', parts[2],
                           re.IGNORECASE):
                slot = 'sp_cod'
            else:
                continue

            # converted before touching gtf_content, so a bad coordinate
            # does not leave a half built entry behind
            coord = (int(parts[3]), int(parts[4]))

            #creating feature connections
            genes = gtf_content.setdefault(parts[0], dict())
            transcripts = genes.setdefault((gid, parts[1]), dict())
            if tid not in transcripts:
                # every slot gets its own list: sharing one list between
                # the slots made a later CDS append show up under sp_cod too
                transcripts[tid] = dict(
                    exon=[],
                    CDS=[],
                    sp_cod=[],
                    info=[parts[6], parts[5], gname, tname, ttype])
            transcripts[tid][slot].append(coord)

            recall = tid  #set previous id for CDS line
    finally:
        # release the handle even when a malformed line aborts the parsing
        GFH.close()

    return gtf_content
Example #3
0
    pc_final_map = dict()
    for ptype, ctypes in pc_map.items():
        unique_ctypes = list(set(ctypes))
        unique_ctypes.sort()
        pc_final_map[ptype] = unique_ctypes
    # some cases the GFF file represents a single feature type 
    if not pc_final_map:
        for fid, stypes in parent_sts.items():
            pc_final_map[stypes] = dict()
    # generate a report on feature id mapping in the file 
    print '------------------------------------------------------'
    print 'Parent feature type | Associated child feature type(s)'
    print '------------------------------------------------------'
    for key, value in pc_final_map.items():
        print key[0], key[1]
        for child_to in value:
            print '\t\t|',child_to[0], child_to[1]
        print 
    print '------------------------------------------------------'

if __name__=='__main__':
    # the GFF file name is the first command line argument 
    try:
        gff_file = sys.argv[1]
    except IndexError:
        # only a missing argument is expected here; a bare except would 
        # also hide KeyboardInterrupt and SystemExit
        print("Incorrect arguments supplied")
        print(__doc__)
        sys.exit(-1)
    
    gff_handle = _open_file(gff_file)
    try:
        parent_child_id_map(gff_handle)
    finally:
        # close the input handle even if the report generation fails
        gff_handle.close()
Example #4
0
def gbk_parse(fname):
    """
    Extract genome annotation records from genbank format 

    Every record of the file is reduced to gene, transcript, exon and CDS 
    coordinate tables keyed by the 'gene' qualifier, and the tables are 
    handed to feature_table. 

    @args fname: genbank file, passed unchanged to _open_file 
    """
    # open the file named by the argument; the previous code read the 
    # name gbkfname, which is not defined inside this function 
    fhand = _open_file(fname)
    unk = 1 # threaded through the feature_table calls 

    try:
        for record in SeqIO.parse(fhand, "genbank"):

            gene_tags = dict()
            tx_tags = collections.defaultdict(list) 
            exon = collections.defaultdict(list) 
            cds = collections.defaultdict(list) 
            mol_type, chr_id = None, None 

            for rec in record.features:

                if rec.type == 'source':
                    mol_type = rec.qualifiers['mol_type'][0]
                    try:
                        chr_id = rec.qualifiers['chromosome'][0]
                    except (KeyError, IndexError):
                        # no chromosome qualifier, use the record name 
                        chr_id = record.name 
                    continue 

                strand = '+' if rec.strand>0 else '-'

                # features without a gene qualifier are filed under None 
                try:
                    fid = rec.qualifiers['gene'][0]
                except (KeyError, IndexError):
                    fid = None 

                try:
                    transcript_id = rec.qualifiers['transcript_id'][0]
                except (KeyError, IndexError):
                    transcript_id = None

                if re.search(r'gene', rec.type):
                    gene_tags[fid] = (rec.location._start.position+1, 
                                        rec.location._end.position, 
                                        strand,
                                        rec.type,
                                        rec.qualifiers['note'][0])
                elif rec.type == 'exon':
                    exon[fid].append((rec.location._start.position+1, 
                                        rec.location._end.position))
                elif rec.type=='CDS':
                    cds[fid].append((rec.location._start.position+1, 
                                        rec.location._end.position))
                else: 
                    # get all transcripts 
                    if transcript_id: 
                        tx_tags[fid].append((rec.location._start.position+1,
                                        rec.location._end.position, 
                                        transcript_id,
                                        rec.type))
            # record extracted, generate feature table
            # NOTE(review): strand is the value left by the last non-source 
            # feature; a record without one reuses the previous record's 
            # value (or is unset for the first record) - confirm intent 
            unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk)
    finally:
        # release the handle even when a record fails to parse 
        fhand.close()
    for ptype, ctypes in pc_map.items():
        unique_ctypes = list(set(ctypes))
        unique_ctypes.sort()
        pc_final_map[ptype] = unique_ctypes
    # some cases the GFF file represents a single feature type
    if not pc_final_map:
        for fid, stypes in parent_sts.items():
            pc_final_map[stypes] = dict()
    # generate a report on feature id mapping in the file
    print '------------------------------------------------------'
    print 'Parent feature type | Associated child feature type(s)'
    print '------------------------------------------------------'
    for key, value in pc_final_map.items():
        print key[0], key[1]
        for child_to in value:
            print '\t\t|', child_to[0], child_to[1]
        print
    print '------------------------------------------------------'


if __name__ == '__main__':
    # the GFF file name is the first command line argument
    try:
        gff_file = sys.argv[1]
    except IndexError:
        # only a missing argument is expected here; a bare except would
        # also hide KeyboardInterrupt and SystemExit
        print("Incorrect arguments supplied")
        print(__doc__)
        sys.exit(-1)

    gff_handle = _open_file(gff_file)
    try:
        parent_child_id_map(gff_handle)
    finally:
        # close the input handle even if the report generation fails
        gff_handle.close()
def bed_parse(qfile, source_name):
    """
    Process BED file 

    Prints a GFF3 header followed by, for every usable record, one gene
    line, one transcript line and an exon line per block, with an intron
    line between consecutive blocks.

    @args qfile: BED file, passed unchanged to _open_file
    @args source_name: text written to the second column of every line

    Records are skipped when the last two columns (block sizes and block
    starts) disagree in length or hold fewer entries than the block count
    in the third column from the end.
    Raises AssertionError for a tab separated line with fewer than 12 columns.
    """

    def emit(seq_id, ftype, start, stop, score, strand, attributes):
        # one tab separated output line; the eighth column is always "."
        print("\t".join([
            str(seq_id),
            source_name,
            ftype,
            str(start),
            str(stop),
            score,
            strand,
            ".",
            attributes,
        ]))

    BEDfh = _open_file(qfile)
    print("##gff-version 3")

    try:
        for rec in BEDfh:
            rec = rec.strip("\n\r")

            if not rec or rec[0] in ["#"]:
                continue
            if "\t" not in rec:
                continue

            line = rec.split("\t")
            assert len(line) >= 12, rec

            # the lists may or may not end with a comma; drop the empty
            # tail BEFORE comparing, otherwise "1,2," and "0,5" look
            # inconsistent while "1,2," and "0,5,9" look consistent
            rstart = line[-1].split(",")
            if rstart[-1] == "":
                rstart.pop()
            exon_len = line[-2].split(",")
            if exon_len[-1] == "":
                exon_len.pop()

            # checking the consistency b/w start of exon and number of exons
            if len(rstart) != len(exon_len):
                continue
            block_count = int(line[-3])
            if block_count > len(rstart):
                # fewer entries than announced blocks would run off the lists
                continue

            name, score, strand = line[3], line[4], line[5]
            if strand != "+" and strand != "-":
                strand = "."  # replace the unknown strand with '.'

            st = int(line[1])
            # the BED start is shifted by one for the output start column
            emit(line[0], "gene", st + 1, line[2], score, strand,
                 "ID=Gene:" + name + ";Name=Gene:" + name)
            emit(line[0], "transcript", st + 1, line[2], score, strand,
                 "ID=" + name + ";Name=" + name + ";Parent=Gene:" + name)

            intron_start = None  # set once the first exon is written
            for ex_cnt in range(block_count):
                start = st + int(rstart[ex_cnt]) + 1
                stop = start + int(exon_len[ex_cnt]) - 1

                if ex_cnt > 0:
                    # gap between the previous exon and this one
                    emit(line[0], "intron", intron_start, start - 1, score,
                         strand, "Parent=" + name)

                emit(line[0], "exon", start, stop, score, strand,
                     "Parent=" + name)
                intron_start = stop + 1
    finally:
        # release the handle even when a malformed record aborts the run
        BEDfh.close()
def gbk_parse(fname):
    """
    Extract genome annotation records from genbank format 

    Every record of the file is reduced to gene, transcript, exon and CDS
    coordinate tables keyed by the 'gene' qualifier, and the tables are
    handed to feature_table.

    @args fname: genbank file, passed unchanged to _open_file
    """
    # open the file named by the argument; the previous code read the
    # name gbkfname, which is not defined inside this function
    fhand = _open_file(fname)
    unk = 1  # threaded through the feature_table calls

    try:
        for record in SeqIO.parse(fhand, "genbank"):

            gene_tags = dict()
            tx_tags = collections.defaultdict(list)
            exon = collections.defaultdict(list)
            cds = collections.defaultdict(list)
            mol_type, chr_id = None, None

            for rec in record.features:

                if rec.type == 'source':
                    mol_type = rec.qualifiers['mol_type'][0]
                    try:
                        chr_id = rec.qualifiers['chromosome'][0]
                    except (KeyError, IndexError):
                        # no chromosome qualifier, use the record name
                        chr_id = record.name
                    continue

                strand = '+' if rec.strand > 0 else '-'

                # features without a gene qualifier are filed under None
                try:
                    fid = rec.qualifiers['gene'][0]
                except (KeyError, IndexError):
                    fid = None

                try:
                    transcript_id = rec.qualifiers['transcript_id'][0]
                except (KeyError, IndexError):
                    transcript_id = None

                if re.search(r'gene', rec.type):
                    gene_tags[fid] = (rec.location._start.position + 1,
                                      rec.location._end.position, strand,
                                      rec.type, rec.qualifiers['note'][0])
                elif rec.type == 'exon':
                    exon[fid].append((rec.location._start.position + 1,
                                      rec.location._end.position))
                elif rec.type == 'CDS':
                    cds[fid].append((rec.location._start.position + 1,
                                     rec.location._end.position))
                else:
                    # get all transcripts
                    if transcript_id:
                        tx_tags[fid].append(
                            (rec.location._start.position + 1,
                             rec.location._end.position, transcript_id,
                             rec.type))
            # record extracted, generate feature table
            # NOTE(review): strand is the value left by the last non-source
            # feature; a record without one reuses the previous record's
            # value (or is unset for the first record) - confirm intent
            unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags,
                                cds, exon, unk)
    finally:
        # release the handle even when a record fails to parse
        fhand.close()