def setUp(self, **kwargs):
    """Store a small DNA database, its annotations and their map.

    After the base-class set-up, this builds a BlastDB from
    'dnaseq.fasta', an AnnotationDB of three named intervals on it and an
    NLMSA holding those annotations.  All three are assigned under
    self.pygrData.Bio.Test, a ManyToManyRelation binding 'exons' is
    assigned under self.schema.Bio.Test, and the metabase is committed and
    its cache cleared.  The NLMSA and the sequence database are closed
    before returning, whether or not the set-up succeeded.
    """
    TestBase.setUp(self)
    fasta_path = testutil.datafile('dnaseq.fasta')
    nlmsa_path = testutil.tempdatafile('tryannot')

    # Annotation rows keyed by annotation id; field_index names the tuple
    # position of each attribute.
    slice_rows = {
        1: ('seq1', 5, 10, 'fred'),
        2: ('seq1', -60, -50, 'bob'),
        3: ('seq2', -20, -10, 'mary'),
    }
    field_index = dict(id=0, start=1, stop=2, name=3)

    dna_db = seqdb.BlastDB(fasta_path)
    try:
        dna_db.__doc__ = 'little dna'
        self.pygrData.Bio.Test.dna = dna_db

        annotations = seqdb.AnnotationDB(slice_rows, dna_db,
                                         sliceAttrDict=field_index)
        annotations.__doc__ = 'trivial annotation'
        self.pygrData.Bio.Test.annoDB = annotations

        annot_map = cnestedlist.NLMSA(nlmsa_path, 'w', pairwiseMode=True,
                                      bidirectional=False)
        try:
            for annot_id in annotations:
                annot_map.addAnnotation(annotations[annot_id])
            annot_map.build()
            annot_map.__doc__ = 'trivial map'
            self.pygrData.Bio.Test.map = annot_map

            relation = metabase.ManyToManyRelation(dna_db, annotations,
                                                   bindAttrs=('exons', ))
            self.schema.Bio.Test.map = relation
            self.metabase.commit()
            self.metabase.clear_cache()
        finally:
            # release the NLMSA even when building or saving failed
            annot_map.close()
    finally:
        dna_db.close()
def populate_swissprot():
    "Fill the current worldbase with a small set of swissprot resources"
    # sequence database built from the 'sp_hbb1' data file
    swissprot = seqdb.BlastDB(testutil.datafile('sp_hbb1'))
    swissprot.__doc__ = 'little swissprot'
    worldbase.Bio.Seq.Swissprot.sp42 = swissprot

    # a slice of one sequence, stored as a resource of its own
    hbb = swissprot['HBB1_TORMA']
    fragment = hbb[10:35]
    fragment.__doc__ = 'fragment'
    worldbase.Bio.Seq.frag = fragment

    # mapping from the database onto itself, with a single entry
    self_map = mapping.Mapping(sourceDB=swissprot, targetDB=swissprot)
    self_map[hbb] = swissprot['PRCA_ANAVA']
    self_map.__doc__ = 'map sp to itself'
    worldbase.Bio.Seq.spmap = self_map

    # schema for the mapping, bound as the 'buddy' attribute
    buddy_relation = metabase.OneToManyRelation(swissprot, swissprot,
                                                bindAttrs=('buddy', ))
    worldbase.schema.Bio.Seq.spmap = buddy_relation

    # annotation database holding one interval on HBB1_TORMA
    annotations = seqdb.AnnotationDB({1: ('HBB1_TORMA', 10, 50)}, swissprot,
                                     sliceAttrDict=dict(id=0, start=1,
                                                        stop=2))
    first_exon = annotations[1]

    # temporary location in which the annotation map is written
    scratch_dir = testutil.TempDir('exonAnnot')
    map_path = scratch_dir.subfile('cnested')
    annot_map = cnestedlist.NLMSA(map_path, 'w', pairwiseMode=True,
                                  bidirectional=False)
    annot_map.addAnnotation(first_exon)
    annot_map.build()

    annotations.__doc__ = 'a little annotation db'
    annot_map.__doc__ = 'a little map'
    worldbase.Bio.Annotation.annoDB = annotations
    worldbase.Bio.Annotation.map = annot_map

    # schema for the annotation map, bound as the 'exons' attribute
    exon_relation = metabase.ManyToManyRelation(swissprot, annotations,
                                                bindAttrs=('exons', ))
    worldbase.schema.Bio.Annotation.map = exon_relation
def test_mysqlannot(self):
    """Test building an AnnotationDB from MySQL

    Builds exon, splice and most-conserved-element annotation databases
    from SQL tables, saves them (with their NLMSA maps and schemas) to
    pygr.Data, reloads them, and then regenerates the exon and intron
    conserved-element reports, asserting that each matches the MD5 digest
    of the reference file copied from testInputDir.
    """
    from pygr import seqdb, cnestedlist, sqlgraph
    dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')

    # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS: MYSQL VERSION
    exon_slices = sqlgraph.SQLTableClustered(
        '%s.pygr_refGene_exonAnnot%s_dm2' % (testInputDB,
                                             smallSamplePostfix),
        clusterKey='chromosome', maxCache=0)
    exon_db = seqdb.AnnotationDB(exon_slices, dm2,
                                 sliceAttrDict=dict(id='chromosome',
                                                    gene_id='name',
                                                    exon_id='exon_id'))
    msa = cnestedlist.NLMSA(os.path.join(self.path,
                                         'refGene_exonAnnot_SQL_dm2'),
                            'w', pairwiseMode=True, bidirectional=False)
    for annot_id in exon_db:
        msa.addAnnotation(exon_db[annot_id])
    exon_db.clear_cache()  # not really necessary; cache should autoGC
    exon_slices.clear_cache()
    msa.build()
    exon_db.__doc__ = 'SQL Exon Annotation Database for dm2'
    pygr.Data.addResource('TEST.Annotation.SQL.dm2.exons', exon_db)
    msa.__doc__ = 'SQL NLMSA Exon for dm2'
    pygr.Data.addResource('TEST.Annotation.NLMSA.SQL.dm2.exons', msa)
    exon_schema = pygr.Data.ManyToManyRelation(dm2, exon_db,
                                               bindAttrs=('exon2', ))
    exon_schema.__doc__ = 'SQL Exon Schema for dm2'
    pygr.Data.addSchema('TEST.Annotation.NLMSA.SQL.dm2.exons', exon_schema)

    # BUILD ANNOTATION DATABASE FOR REFSEQ SPLICES: MYSQL VERSION
    splice_slices = sqlgraph.SQLTableClustered(
        '%s.pygr_refGene_spliceAnnot%s_dm2' % (testInputDB,
                                               smallSamplePostfix),
        clusterKey='chromosome', maxCache=0)
    splice_db = seqdb.AnnotationDB(splice_slices, dm2,
                                   sliceAttrDict=dict(id='chromosome',
                                                      gene_id='name',
                                                      splice_id='splice_id'))
    msa = cnestedlist.NLMSA(os.path.join(self.path,
                                         'refGene_spliceAnnot_SQL_dm2'),
                            'w', pairwiseMode=True, bidirectional=False)
    for annot_id in splice_db:
        msa.addAnnotation(splice_db[annot_id])
    splice_db.clear_cache()  # not really necessary; cache should autoGC
    splice_slices.clear_cache()
    msa.build()
    splice_db.__doc__ = 'SQL Splice Annotation Database for dm2'
    pygr.Data.addResource('TEST.Annotation.SQL.dm2.splices', splice_db)
    msa.__doc__ = 'SQL NLMSA Splice for dm2'
    pygr.Data.addResource('TEST.Annotation.NLMSA.SQL.dm2.splices', msa)
    splice_schema = pygr.Data.ManyToManyRelation(dm2, splice_db,
                                                 bindAttrs=('splice2', ))
    splice_schema.__doc__ = 'SQL Splice Schema for dm2'
    pygr.Data.addSchema('TEST.Annotation.NLMSA.SQL.dm2.splices',
                        splice_schema)

    # BUILD ANNOTATION DATABASE FOR MOST CONSERVED ELEMENTS FROM UCSC:
    # MYSQL VERSION
    ucsc_slices = sqlgraph.SQLTableClustered(
        '%s.pygr_phastConsElements15way%s_dm2' % (testInputDB,
                                                  smallSamplePostfix),
        clusterKey='chromosome', maxCache=0)
    ucsc_db = seqdb.AnnotationDB(ucsc_slices, dm2,
                                 sliceAttrDict=dict(id='chromosome',
                                                    gene_id='name',
                                                    ucsc_id='ucsc_id'))
    msa = cnestedlist.NLMSA(os.path.join(self.path,
                                         'phastConsElements15way_SQL_dm2'),
                            'w', pairwiseMode=True, bidirectional=False)
    for annot_id in ucsc_db:
        msa.addAnnotation(ucsc_db[annot_id])
    ucsc_db.clear_cache()  # not really necessary; cache should autoGC
    ucsc_slices.clear_cache()
    msa.build()
    ucsc_db.__doc__ = 'SQL Most Conserved Elements for dm2'
    pygr.Data.addResource('TEST.Annotation.UCSC.SQL.dm2.mostconserved',
                          ucsc_db)
    msa.__doc__ = 'SQL NLMSA for Most Conserved Elements for dm2'
    pygr.Data.addResource(
        'TEST.Annotation.UCSC.NLMSA.SQL.dm2.mostconserved', msa)
    ucsc_schema = pygr.Data.ManyToManyRelation(dm2, ucsc_db,
                                               bindAttrs=('element2', ))
    ucsc_schema.__doc__ = \
        'SQL Schema for UCSC Most Conserved Elements for dm2'
    pygr.Data.addSchema('TEST.Annotation.UCSC.NLMSA.SQL.dm2.mostconserved',
                        ucsc_schema)

    pygr.Data.save()
    pygr.Data.clear_cache()

    # QUERY TO EXON AND SPLICES ANNOTATION DATABASE
    dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')
    exonmsa = pygr.Data.getResource('TEST.Annotation.NLMSA.SQL.dm2.exons')
    splicemsa = \
        pygr.Data.getResource('TEST.Annotation.NLMSA.SQL.dm2.splices')
    conservedmsa = pygr.Data.getResource(
        'TEST.Annotation.UCSC.NLMSA.SQL.dm2.mostconserved')
    exons = pygr.Data.getResource('TEST.Annotation.SQL.dm2.exons')
    splices = pygr.Data.getResource('TEST.Annotation.SQL.dm2.splices')
    mostconserved = \
        pygr.Data.getResource('TEST.Annotation.UCSC.SQL.dm2.mostconserved')

    # OPEN DM2_MULTIZ15WAY NLMSA
    msa = cnestedlist.NLMSA(os.path.join(msaDir, 'dm2_multiz15way'), 'r',
                            trypath=[seqDir])

    exonAnnotFileName = os.path.join(
        testInputDir,
        'Annotation_ConservedElement_Exons%s_dm2.txt' % smallSamplePostfix)
    intronAnnotFileName = os.path.join(
        testInputDir,
        'Annotation_ConservedElement_Introns%s_dm2.txt'
        % smallSamplePostfix)
    newexonAnnotFileName = os.path.join(self.path, 'new_Exons_dm2.txt')
    newintronAnnotFileName = os.path.join(self.path, 'new_Introns_dm2.txt')
    tmpexonAnnotFileName = self.copyFile(exonAnnotFileName)
    tmpintronAnnotFileName = self.copyFile(intronAnnotFileName)

    if smallSampleKey:
        chrList = [smallSampleKey]
    else:
        chrList = sorted(dm2.seqLenDict.keys())

    def write_report(out_path, label, annot_msa, annot_db, id_attr):
        'Write the conserved-element report for one annotation type'
        # label is the first output column ('EXON' or 'INTRON'); id_attr
        # names the attribute ('exon_id' or 'splice_id') used both to
        # order the annotations and to look them up in annot_db.
        outfile = open(out_path, 'w')
        try:
            for chrid in chrList:
                chrom = dm2[chrid]
                try:
                    hits = annot_msa[chrom]
                except KeyError:
                    continue  # nothing to report for this chromosome
                annots = [(getattr(ix, id_attr), ix) for ix in hits.keys()]
                annots.sort()
                for _, annot in annots:
                    saveList = []
                    tmp = annot.sequence
                    real = annot_db[getattr(annot, id_attr)]
                    tmpslice = real.sequence  # FOR REAL COORDINATE
                    wlist1 = (label, chrid, getattr(real, id_attr),
                              real.gene_id, tmpslice.start, tmpslice.stop)
                    try:
                        out1 = conservedmsa[tmp]
                    except KeyError:
                        continue  # nothing to report for this annotation
                    elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
                    elementlist.sort()
                    for _, element in elementlist:
                        if element.stop - element.start < 100:
                            continue
                        # the score is the text after '=' in gene_id
                        score = int(element.gene_id.split('=')[1])
                        if score < 100:
                            continue
                        tmp2 = element.sequence
                        tmpelement = mostconserved[element.ucsc_id]
                        # FOR REAL ELEMENT COORDINATE
                        tmpslice2 = tmpelement.sequence
                        wlist2 = wlist1 + (tmpelement.ucsc_id,
                                           tmpelement.gene_id,
                                           tmpslice2.start,
                                           tmpslice2.stop)
                        # restrict to the annotation/element overlap
                        slicestart = max(tmp.start, tmp2.start)
                        sliceend = min(tmp.stop, tmp2.stop)
                        tmp1 = msa.seqDict['dm2.' + chrid][slicestart:
                                                           sliceend]
                        for src, dest, e in msa[tmp1].edges():
                            if src.stop - src.start < 100:
                                continue
                            palign, pident = e.pAligned(), e.pIdentity()
                            if palign < 0.8 or pident < 0.8:
                                continue
                            palign = '%.2f' % palign
                            pident = '%.2f' % pident
                            wlist3 = wlist2 + ((~msa.seqDict)[src],
                                               str(src), src.start,
                                               src.stop,
                                               (~msa.seqDict)[dest],
                                               str(dest), dest.start,
                                               dest.stop, palign, pident)
                            saveList.append('\t'.join(map(str, wlist3))
                                            + '\n')
                    saveList.sort()
                    outfile.writelines(saveList)
        finally:
            # close (and flush) the report even if a lookup failed
            outfile.close()

    def file_digest(path):
        'Return the MD5 digest of the file at path, closing it afterwards'
        handle = open(path, 'r')
        try:
            return hashlib.md5(handle.read()).digest()
        finally:
            handle.close()

    write_report(newexonAnnotFileName, 'EXON', exonmsa, exons, 'exon_id')
    assert file_digest(tmpexonAnnotFileName) == \
        file_digest(newexonAnnotFileName)

    write_report(newintronAnnotFileName, 'INTRON', splicemsa, splices,
                 'splice_id')
    assert file_digest(tmpintronAnnotFileName) == \
        file_digest(newintronAnnotFileName)
def test_collectionannot(self):
    """Test building an AnnotationDB from file

    Builds exon, splice and most-conserved-element annotation databases
    from tab-separated text files stored into Collection objects, saves
    them (with their NLMSA maps and schemas) to pygr.Data, reloads them,
    and then regenerates the exon and intron conserved-element reports,
    asserting that each matches the MD5 digest of the reference file
    copied from testInputDir.
    """
    from pygr import seqdb, cnestedlist, sqlgraph
    dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')

    def load_annotations(path, slice_db, annot_db, nlmsa):
        'Store every row of the file at path and add it to nlmsa'
        infile = open(path, 'r')
        try:
            for line in infile:
                row = line.split('\t')
                row[1] = int(row[1])  # CONVERT FROM STRING TO INTEGER
                slice_db[row[1]] = row
                # GET THE ANNOTATION OBJECT AND SAVE IT TO GENOME MAPPING
                nlmsa.addAnnotation(annot_db[row[1]])
        finally:
            infile.close()

    # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS
    exon_slices = Collection(
        filename=os.path.join(self.path, 'refGene_exonAnnot_dm2.cdb'),
        intKeys=True, mode='cr', writeback=False)
    exon_db = seqdb.AnnotationDB(exon_slices, dm2,
                                 sliceAttrDict=dict(id=0, exon_id=1,
                                                    orientation=2,
                                                    gene_id=3, start=4,
                                                    stop=5))
    msa = cnestedlist.NLMSA(os.path.join(self.path,
                                         'refGene_exonAnnot_dm2'),
                            'w', pairwiseMode=True, bidirectional=False)
    load_annotations(os.path.join(testInputDir,
                                  'refGene_exonAnnot%s_dm2.txt'
                                  % smallSamplePostfix),
                     exon_slices, exon_db, msa)
    exon_db.clear_cache()  # not really necessary; cache should autoGC
    # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
    exon_slices.close()
    msa.build()  # FINALIZE GENOME ALIGNMENT INDEXES
    exon_db.__doc__ = 'Exon Annotation Database for dm2'
    pygr.Data.addResource('TEST.Annotation.dm2.exons', exon_db)
    msa.__doc__ = 'NLMSA Exon for dm2'
    pygr.Data.addResource('TEST.Annotation.NLMSA.dm2.exons', msa)
    exon_schema = pygr.Data.ManyToManyRelation(dm2, exon_db,
                                               bindAttrs=('exon1', ))
    exon_schema.__doc__ = 'Exon Schema for dm2'
    pygr.Data.addSchema('TEST.Annotation.NLMSA.dm2.exons', exon_schema)

    # BUILD ANNOTATION DATABASE FOR REFSEQ SPLICES
    splice_slices = Collection(
        filename=os.path.join(self.path, 'refGene_spliceAnnot_dm2.cdb'),
        intKeys=True, mode='cr', writeback=False)
    splice_db = seqdb.AnnotationDB(splice_slices, dm2,
                                   sliceAttrDict=dict(id=0, splice_id=1,
                                                      orientation=2,
                                                      gene_id=3, start=4,
                                                      stop=5))
    msa = cnestedlist.NLMSA(os.path.join(self.path,
                                         'refGene_spliceAnnot_dm2'),
                            'w', pairwiseMode=True, bidirectional=False)
    load_annotations(os.path.join(testInputDir,
                                  'refGene_spliceAnnot%s_dm2.txt'
                                  % smallSamplePostfix),
                     splice_slices, splice_db, msa)
    splice_db.clear_cache()  # not really necessary; cache should autoGC
    # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
    splice_slices.close()
    msa.build()  # FINALIZE GENOME ALIGNMENT INDEXES
    splice_db.__doc__ = 'Splice Annotation Database for dm2'
    pygr.Data.addResource('TEST.Annotation.dm2.splices', splice_db)
    msa.__doc__ = 'NLMSA Splice for dm2'
    pygr.Data.addResource('TEST.Annotation.NLMSA.dm2.splices', msa)
    splice_schema = pygr.Data.ManyToManyRelation(dm2, splice_db,
                                                 bindAttrs=('splice1', ))
    splice_schema.__doc__ = 'Splice Schema for dm2'
    pygr.Data.addSchema('TEST.Annotation.NLMSA.dm2.splices', splice_schema)

    # BUILD ANNOTATION DATABASE FOR MOST CONSERVED ELEMENTS FROM UCSC
    ucsc_slices = Collection(
        filename=os.path.join(self.path, 'phastConsElements15way_dm2.cdb'),
        intKeys=True, mode='cr', writeback=False)
    ucsc_db = seqdb.AnnotationDB(ucsc_slices, dm2,
                                 sliceAttrDict=dict(id=0, ucsc_id=1,
                                                    orientation=2,
                                                    gene_id=3, start=4,
                                                    stop=5))
    msa = cnestedlist.NLMSA(os.path.join(self.path,
                                         'phastConsElements15way_dm2'),
                            'w', pairwiseMode=True, bidirectional=False)
    load_annotations(os.path.join(testInputDir,
                                  'phastConsElements15way%s_dm2.txt'
                                  % smallSamplePostfix),
                     ucsc_slices, ucsc_db, msa)
    ucsc_db.clear_cache()  # not really necessary; cache should autoGC
    # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
    ucsc_slices.close()
    msa.build()  # FINALIZE GENOME ALIGNMENT INDEXES
    ucsc_db.__doc__ = 'Most Conserved Elements for dm2'
    pygr.Data.addResource('TEST.Annotation.UCSC.dm2.mostconserved',
                          ucsc_db)
    msa.__doc__ = 'NLMSA for Most Conserved Elements for dm2'
    pygr.Data.addResource('TEST.Annotation.UCSC.NLMSA.dm2.mostconserved',
                          msa)
    ucsc_schema = pygr.Data.ManyToManyRelation(dm2, ucsc_db,
                                               bindAttrs=('element1', ))
    ucsc_schema.__doc__ = 'Schema for UCSC Most Conserved Elements for dm2'
    pygr.Data.addSchema('TEST.Annotation.UCSC.NLMSA.dm2.mostconserved',
                        ucsc_schema)

    pygr.Data.save()
    pygr.Data.clear_cache()  # force resources to reload when requested

    # QUERY TO EXON AND SPLICES ANNOTATION DATABASE
    dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')
    exonmsa = pygr.Data.getResource('TEST.Annotation.NLMSA.dm2.exons')
    splicemsa = pygr.Data.getResource('TEST.Annotation.NLMSA.dm2.splices')
    conservedmsa = \
        pygr.Data.getResource('TEST.Annotation.UCSC.NLMSA.dm2.mostconserved')
    exons = pygr.Data.getResource('TEST.Annotation.dm2.exons')
    splices = pygr.Data.getResource('TEST.Annotation.dm2.splices')
    mostconserved = \
        pygr.Data.getResource('TEST.Annotation.UCSC.dm2.mostconserved')

    # OPEN DM2_MULTIZ15WAY NLMSA
    msa = cnestedlist.NLMSA(os.path.join(msaDir, 'dm2_multiz15way'), 'r',
                            trypath=[seqDir])

    exonAnnotFileName = os.path.join(
        testInputDir,
        'Annotation_ConservedElement_Exons%s_dm2.txt' % smallSamplePostfix)
    intronAnnotFileName = os.path.join(
        testInputDir,
        'Annotation_ConservedElement_Introns%s_dm2.txt'
        % smallSamplePostfix)
    newexonAnnotFileName = os.path.join(self.path, 'new_Exons_dm2.txt')
    newintronAnnotFileName = os.path.join(self.path, 'new_Introns_dm2.txt')
    tmpexonAnnotFileName = self.copyFile(exonAnnotFileName)
    tmpintronAnnotFileName = self.copyFile(intronAnnotFileName)

    if smallSampleKey:
        chrList = [smallSampleKey]
    else:
        chrList = sorted(dm2.seqLenDict.keys())

    def write_report(out_path, label, annot_msa, annot_db, id_attr):
        'Write the conserved-element report for one annotation type'
        # label is the first output column ('EXON' or 'INTRON'); id_attr
        # names the attribute ('exon_id' or 'splice_id') used both to
        # order the annotations and to look them up in annot_db.
        outfile = open(out_path, 'w')
        try:
            for chrid in chrList:
                chrom = dm2[chrid]
                try:
                    hits = annot_msa[chrom]
                except KeyError:
                    continue  # nothing to report for this chromosome
                annots = [(getattr(ix, id_attr), ix) for ix in hits.keys()]
                annots.sort()
                for _, annot in annots:
                    saveList = []
                    tmp = annot.sequence
                    real = annot_db[getattr(annot, id_attr)]
                    tmpslice = real.sequence  # FOR REAL COORDINATE
                    wlist1 = (label, chrid, getattr(real, id_attr),
                              real.gene_id, tmpslice.start, tmpslice.stop)
                    try:
                        out1 = conservedmsa[tmp]
                    except KeyError:
                        continue  # nothing to report for this annotation
                    elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
                    elementlist.sort()
                    for _, element in elementlist:
                        if element.stop - element.start < 100:
                            continue
                        # the score is the text after '=' in gene_id
                        score = int(element.gene_id.split('=')[1])
                        if score < 100:
                            continue
                        tmp2 = element.sequence
                        tmpelement = mostconserved[element.ucsc_id]
                        # FOR REAL ELEMENT COORDINATE
                        tmpslice2 = tmpelement.sequence
                        wlist2 = wlist1 + (tmpelement.ucsc_id,
                                           tmpelement.gene_id,
                                           tmpslice2.start,
                                           tmpslice2.stop)
                        # restrict to the annotation/element overlap
                        slicestart = max(tmp.start, tmp2.start)
                        sliceend = min(tmp.stop, tmp2.stop)
                        tmp1 = msa.seqDict['dm2.' + chrid][slicestart:
                                                           sliceend]
                        for src, dest, e in msa[tmp1].edges():
                            if src.stop - src.start < 100:
                                continue
                            palign, pident = e.pAligned(), e.pIdentity()
                            if palign < 0.8 or pident < 0.8:
                                continue
                            palign = '%.2f' % palign
                            pident = '%.2f' % pident
                            wlist3 = wlist2 + ((~msa.seqDict)[src],
                                               str(src), src.start,
                                               src.stop,
                                               (~msa.seqDict)[dest],
                                               str(dest), dest.start,
                                               dest.stop, palign, pident)
                            saveList.append('\t'.join(map(str, wlist3))
                                            + '\n')
                    saveList.sort()
                    outfile.writelines(saveList)
        finally:
            # close (and flush) the report even if a lookup failed
            outfile.close()

    def file_digest(path):
        'Return the MD5 digest of the file at path, closing it afterwards'
        handle = open(path, 'r')
        try:
            return hashlib.md5(handle.read()).digest()
        finally:
            handle.close()

    write_report(newexonAnnotFileName, 'EXON', exonmsa, exons, 'exon_id')
    assert file_digest(tmpexonAnnotFileName) == \
        file_digest(newexonAnnotFileName)

    write_report(newintronAnnotFileName, 'INTRON', splicemsa, splices,
                 'splice_id')
    assert file_digest(tmpintronAnnotFileName) == \
        file_digest(newintronAnnotFileName)