Beispiel #1
0
def init():
    """Collect gene sequences from every annotation source, write them out and run BLAST.

    Relies on module-level names defined elsewhere in the file: ``ygap_corr``,
    ``REF``, ``Ref_anno``, ``Ref_seq``, ``ygap_anno``, ``ygap_seq``,
    ``devin_gff``, ``devin_seq``, ``gff_parse``, ``YGAPIterator``,
    ``write_fsa`` and ``run_blast``.

    When ``REF`` is truthy the reference annotation is loaded as well:
    genes classified "Dubious" are remembered and every devin record that
    lists one of them under its "SGD" attribute is dropped.

    Returns:
        0, always.
    """
    def get_scaf2seqid():
        """Return a dict mapping scaffold suffix -> seqid, read from ``ygap_corr``.

        Every non-empty line must hold exactly three tab-separated columns
        (scaffold, seqid, chromosome name); only the part of the scaffold
        name after its last "_" is used as the key.
        """
        scaf2seqid = {}
        # The context manager guarantees the handle is closed.
        with open(ygap_corr) as handle:
            for line in handle:
                # Strip only the line ending, so an empty last column still
                # unpacks into three fields.
                line = line.rstrip("\r\n")
                if not line:
                    continue
                scaf, seqid, _chrname = line.split("\t")
                scaf2seqid[scaf.split("_")[-1]] = seqid
        return scaf2seqid

    def attach_sequence(record, genome, key, start, end, set_coord):
        """Store the strand-aware slice ``genome[key][start-1:end]`` on ``record.seq``.

        ``start``/``end`` are treated as 1-based inclusive positions. For the
        "-" strand the slice is reverse-complemented. When ``set_coord`` is
        true, ``record.coord`` is filled in too. A strand other than "+"/"-"
        is only reported; the record is left without ``seq``/``coord``.
        """
        if record.strand == "+":
            if set_coord:
                record.coord = "(%d..%d)" % (start, end)
            record.seq = genome[key][start - 1:end]
        elif record.strand == "-":
            if set_coord:
                record.coord = "complement(%d..%d)" % (start, end)
            record.seq = genome[key][start - 1:end].reverse_complement()
        else:
            print("Check strand:%s" % record.type)

    # generate blast compare file
    scaf2seqid = get_scaf2seqid()

    refs = []
    # A set gives constant-time membership tests in the devin loop below.
    dubious = set()
    if REF:
        for record in gff_parse.gffIterator(Ref_anno):
            if record.type != "gene":
                continue
            if record.attributes["orf_classification"][0] == "Dubious":
                dubious.add(record.attributes["ID"][0])
                continue
            gene_id = record.attributes["ID"][0]
            # Genes whose ID starts with "Q" are left out of the reference set.
            if gene_id.startswith("Q"):
                continue
            start = int(record.start)
            end = int(record.end)
            record.id = gene_id
            attach_sequence(record, Ref_seq, record.seqid, start, end, True)
            refs.append(record)

    ygap = []
    for record in YGAPIterator(ygap_anno):
        if record.type in ("PROTEIN", ""):
            # ygap_seq is indexed by the integer form of the seqid.
            attach_sequence(record, ygap_seq, int(record.seqid),
                            int(record.start), int(record.end), False)
            ygap.append(record)

    devin = []
    for record in gff_parse.gffIterator(devin_gff):
        if record.type not in ("CDS", "ORF"):
            continue
        if REF:
            # exclude dubious genes
            dubious_hits = [homolog for homolog in record.attributes["SGD"]
                            if homolog in dubious]
            if dubious_hits:
                # Report the homolog(s) that actually matched, not merely the
                # last entry of the SGD list.
                print("find %s is %s " % (record.attributes["Gene"],
                                          "|".join(dubious_hits)))
                continue
        # Keep the numeric index for devin_seq before seqid is replaced by
        # its mapped name.
        scaffold_index = int(record.seqid)
        record.seqid = scaf2seqid[record.seqid]
        start = int(record.start)
        end = int(record.end)
        record.orth = "%s_%s" % (record.score, "|".join(record.attributes["SGD"]))
        attach_sequence(record, devin_seq, scaffold_index, start, end, True)
        devin.append(record)

    if REF:
        write_fsa(ygap, devin, refs)
    else:
        write_fsa(ygap, devin)

    run_blast()
    return 0
Beispiel #2
0
        if line.startswith("#"): continue
        if not line: return
        r= record()
        (r.name,r.strand,r.start,r.end,r.ygob,r.seqid,r.shortname,\
            r.coord,r.orth,r.type,r.pillar,r.tag,r.anno) = line.split("\t")
        r.strand = "+" if r.strand == "1" else "-"
        yield r
    
    
class list(list):
    """List subclass that additionally carries a ``name`` label.

    The class keeps the name ``list`` because code in this file instantiates
    it as ``list()``; be aware that it shadows the builtin for everything
    that follows it in the module.
    """

    def __init__(self, iterable=()):
        """Create the list from an optional ``iterable`` and give it an empty name.

        Delegating to the base initializer keeps ``list(iterable)`` working
        even though this class has taken over the builtin's name.
        """
        super(list, self).__init__(iterable)
        self.name = ""

# Load each annotation set into its own named list.
#
# The loop variable is called ``entry`` rather than ``record``: the
# YGAPIterator body calls ``record()`` lazily, so rebinding the module-level
# name ``record`` in these loops would make that call hit an annotation
# entry instead of the record class.
deven = list()
deven.name = "deven"
for entry in gff_parse.gffIterator(deven_gff):
    deven.append(entry)

augustus = list()
augustus.name = "augustus"
for entry in gff_parse.gffIterator(augustus_gff):
    # Drop the first five characters of the augustus seqid.
    # NOTE(review): presumably a fixed prefix that the other sources lack -- confirm.
    entry.seqid = entry.seqid[5:]
    augustus.append(entry)

ygap = list()
ygap.name = "ygap"
for entry in YGAPIterator(ygap_anno):
    ygap.append(entry)

def compare_gff(template,*gffs):
    temp = [read.seqid+"_"+read.start+"_"+read.end+"_"+read.strand for \
out_put_file = "/Users/bingwang/zen/yeast_anno_pipe/%sONscer.txt"%(sp)

#Prepare: numid2scaf dict
# Maps the numeric id (second column) to the scaffold name (first column);
# scaffold_list keeps the scaffold names in file order.
numid2scaf = {}
scaffold_list = []
# The context manager closes the correspondences file once it has been read.
with open(correspondances_file) as f:
    for line in f:
        # Each line must hold exactly three tab-separated fields; the third
        # one is not used in the visible part of the script.
        scaffold,numid,alpheid = line.strip().split("\t")
        numid2scaf[numid] = scaffold
        scaffold_list.append(scaffold)

#Prepare: scer_genes synteny list
#Prepare: scer2record dict
# scer_genes keeps the accepted gene IDs in annotation order; scer2record
# maps each of those IDs to its annotation record.
scer_genes = []
scer2record = {}
# The context manager closes the reference annotation once it has been read.
with open(ref_file) as ref_handle:
    for record in gff_parse.gffIterator(ref_handle):
        if record.type != "gene":
            continue
        # Dubious ORFs are skipped entirely.
        if record.attributes["orf_classification"][0] == "Dubious":
            continue
        # So are genes whose ID starts with "Q".
        if record.attributes["ID"][0].startswith("Q"):
            continue
        # Kept from the original: these are not used in the visible part of
        # the script, but int() still rejects non-numeric coordinates.
        seqid = record.seqid
        start = int(record.start)
        end = int(record.end)
        record.id = record.attributes["ID"][0]
        scer_genes.append(record.id)
        scer2record[record.id] = record

#Prepare: scer2seub and seub2scer dicts
#Prepare: seub2record dict
# Both mappings start out empty here.
# NOTE(review): the code that fills them (and creates seub2record) lies
# beyond this excerpt -- confirm the key/value direction of each dict there.
scer2seub = {}
seub2scer = {}