def test_build(self):
    """Test building an NLMSA and querying results

    Builds the dm2_multiz15way NLMSA from the MAF files, stores it in
    pygr.Data, and writes a tab-separated report of the alignment edges
    found for the two-base windows at both ends of every interval listed
    in the splice-site input file.  The report must have the same MD5 as
    the stored expected output.  The alignment is then dumped to text, its
    binary files are removed and rebuilt from the dump, and the same check
    is repeated on the rebuilt alignment.
    """
    from pygr import seqdb, cnestedlist

    species = 'dm2'
    resource_id = 'TEST.MSA.UCSC.dm2_multiz15way'

    def publish(nlmsa):
        """Save nlmsa into pygr.Data and return the copy fetched back."""
        nlmsa.__doc__ = 'TEST NLMSA for dm2 multiz15way'
        pygr.Data.addResource(resource_id, nlmsa)
        pygr.Data.save()
        return pygr.Data.getResource(resource_id)

    def split_name(msa, interval):
        """Split the interval's seqDict ID at its first '.' in two."""
        full_name = (~msa.seqDict)[interval]
        dot = full_name.index('.')
        return full_name[:dot], full_name[dot + 1:]

    def edge_rows(msa, edges):
        """Return one formatted report line per usable edge."""
        rows = []
        for src, dest, _edge in edges:
            # skip edges where either side is not exactly two letters
            if len(str(src)) != 2 or len(str(dest)) != 2:
                continue
            srcspecies, srcname = split_name(msa, src)
            destspecies, destname = split_name(msa, dest)
            fields = (str(src), srcspecies, srcname, src.start, src.stop,
                      str(dest), destspecies, destname, dest.start,
                      dest.stop)
            rows.append('\t'.join(map(str, fields)) + '\n')
        return rows

    def write_report(msa, inputName, reportName):
        """Query msa for every input interval and write the report."""
        outfile = open(reportName, 'w')
        try:
            infile = open(inputName, 'r')
            try:
                for line in infile:
                    chrid, intstart, intend, nobs = \
                        line.strip().split('\t')
                    # nobs is not used below; it is still converted so
                    # that a malformed column fails as it always did
                    intstart, intend, nobs = \
                        int(intstart), int(intend), int(nobs)
                    windows = ((intstart, intstart + 2),
                               (intend - 2, intend))
                    chrom = msa.seqDict[species + '.' + chrid]
                    sites = [chrom[start:stop] for start, stop in windows]
                    edge_lists = [msa[site].edges() for site in sites]
                    for site, (start, stop), edges in \
                            zip(sites, windows, edge_lists):
                        if len(edges) == 0:  # EMPTY EDGES
                            fields = (str(site), species, chrid, start,
                                      stop, '', '', '', '', '')
                            outfile.write(
                                '\t'.join(map(str, fields)) + '\n')
                    rows = []
                    for edges in edge_lists:
                        rows.extend(edge_rows(msa, edges))
                    # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
                    rows.sort()
                    outfile.writelines(rows)
            finally:
                infile.close()
        finally:
            outfile.close()

    def file_md5(path):
        """Return the MD5 digest of the file's contents."""
        handle = open(path, 'r')
        try:
            return hashlib.md5(handle.read()).digest()
        finally:
            handle.close()

    def check_report(msa, inputPath, expectedPath, reportName):
        """Regenerate the report and compare it with the expected file."""
        tmpInputName = self.copyFile(inputPath)
        tmpOutputName = self.copyFile(expectedPath)
        write_report(msa, tmpInputName, reportName)
        assert file_md5(reportName) == file_md5(tmpOutputName), \
            '%s differs from %s' % (reportName, tmpOutputName)

    genomedict = {}
    for orgstr in msaSpeciesList:
        genomedict[orgstr] = pygr.Data.getResource(
            'TEST.Seq.Genome.' + orgstr)
    uniondict = seqdb.PrefixUnionDict(genomedict)
    if smallSampleKey:
        maflist = (os.path.join(mafDir, smallSampleKey + '.maf'), )
    else:
        maflist = glob.glob(os.path.join(mafDir, '*.maf'))
        maflist.sort()
    msaname = os.path.join(self.path, 'dm2_multiz15way')
    # 500MB VERSION
    msa1 = cnestedlist.NLMSA(msaname, 'w', uniondict, maflist,
                             maxlen=536870912, maxint=22369620)
    msa = publish(msa1)

    outfileName = os.path.join(
        testInputDir, 'splicesite_dm2%s.txt' % smallSamplePostfix)
    outputName = os.path.join(
        testInputDir,
        'splicesite_dm2%s_multiz15way.txt' % smallSamplePostfix)
    check_report(msa, outfileName, outputName,
                 os.path.join(self.path, 'splicesite_new1.txt'))

    # TEXT<->BINARY TEST
    # the file list is taken before the dump, so only the files that
    # exist at this point are removed below
    msafilelist = glob.glob(msaname + '*')
    msa.save_seq_dict()
    cnestedlist.dump_textfile(
        msaname, os.path.join(self.path, 'dm2_multiz15way.txt'))
    for filename in msafilelist:
        os.remove(filename)
    runPath = os.path.realpath(os.curdir)
    os.chdir(self.path)
    try:
        cnestedlist.textfile_to_binaries('dm2_multiz15way.txt')
    finally:
        # always go back, so a failed rebuild cannot leave later tests
        # running in the wrong directory
        os.chdir(runPath)
    msa = publish(cnestedlist.NLMSA(msaname, 'r'))
    check_report(msa, outfileName, outputName,
                 os.path.join(self.path, 'splicesite_new2.txt'))
def test_build(self):
    """Test building an NLMSA and querying results

    Builds the hg18_pairwise5way NLMSA from the net.axt files, stores it
    in pygr.Data, and writes a tab-separated report of the alignment edges
    found for the two-base windows at both ends of every interval listed
    in the splice-site input file.  The report must have the same MD5 as
    the stored expected output.  The alignment is then dumped to text, its
    binary files are removed and rebuilt from the dump, and the same check
    is repeated on the rebuilt alignment.
    """
    from pygr import seqdb, cnestedlist

    species = 'hg18'
    resource_id = 'TEST.MSA.UCSC.hg18_pairwise5way'

    def publish(nlmsa):
        """Save nlmsa into pygr.Data and return the copy fetched back."""
        nlmsa.__doc__ = 'TEST NLMSA for hg18 pairwise5way'
        pygr.Data.addResource(resource_id, nlmsa)
        pygr.Data.save()
        return pygr.Data.getResource(resource_id)

    def split_name(msa, interval):
        """Split the interval's seqDict ID at its first '.' in two."""
        full_name = (~msa.seqDict)[interval]
        dot = full_name.index('.')
        return full_name[:dot], full_name[dot + 1:]

    def edge_rows(msa, edges):
        """Return one formatted report line per usable edge."""
        rows = []
        for src, dest, _edge in edges:
            # skip edges where either side is not exactly two letters
            if len(str(src)) != 2 or len(str(dest)) != 2:
                continue
            srcspecies, srcname = split_name(msa, src)
            destspecies, destname = split_name(msa, dest)
            fields = (str(src), srcspecies, srcname, src.start, src.stop,
                      str(dest), destspecies, destname, dest.start,
                      dest.stop)
            rows.append('\t'.join(map(str, fields)) + '\n')
        return rows

    def write_report(msa, inputName, reportName):
        """Query msa for every input interval and write the report."""
        outfile = open(reportName, 'w')
        try:
            infile = open(inputName, 'r')
            try:
                for line in infile:
                    chrid, intstart, intend, nobs = \
                        line.strip().split('\t')
                    # nobs is not used below; it is still converted so
                    # that a malformed column fails as it always did
                    intstart, intend, nobs = \
                        int(intstart), int(intend), int(nobs)
                    windows = ((intstart, intstart + 2),
                               (intend - 2, intend))
                    chrom = msa.seqDict[species + '.' + chrid]
                    sites = [chrom[start:stop] for start, stop in windows]
                    edge_lists = [msa[site].edges() for site in sites]
                    for site, (start, stop), edges in \
                            zip(sites, windows, edge_lists):
                        if len(edges) == 0:  # EMPTY EDGES
                            fields = (str(site), species, chrid, start,
                                      stop, '', '', '', '', '')
                            outfile.write(
                                '\t'.join(map(str, fields)) + '\n')
                    rows = []
                    for edges in edge_lists:
                        rows.extend(edge_rows(msa, edges))
                    # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
                    rows.sort()
                    outfile.writelines(rows)
            finally:
                infile.close()
        finally:
            outfile.close()

    def file_md5(path):
        """Return the MD5 digest of the file's contents."""
        handle = open(path, 'r')
        try:
            return hashlib.md5(handle.read()).digest()
        finally:
            handle.close()

    def check_report(msa, inputPath, expectedPath, reportBaseName):
        """Regenerate the report and compare it with the expected file."""
        tmpInputName = self.copyFile(inputPath)
        tmpOutputName = self.copyFile(expectedPath)
        tmpNewOutputName = os.path.join(self.path, reportBaseName)
        write_report(msa, tmpInputName, tmpNewOutputName)
        assert file_md5(tmpNewOutputName) == file_md5(tmpOutputName), \
            '%s differs from %s' % (tmpNewOutputName, tmpOutputName)

    genomedict = {}
    for orgstr in msaSpeciesList:
        genomedict[orgstr] = pygr.Data.getResource(
            'TEST.Seq.Genome.' + orgstr)
    uniondict = seqdb.PrefixUnionDict(genomedict)
    if smallSampleKey:
        axtlist = glob.glob(os.path.join(
            axtDir, '*' + os.sep + smallSampleKey + '.*.net.axt'))
    else:
        axtlist = glob.glob(os.path.join(
            axtDir, '*' + os.sep + '*.*.net.axt'))
    # NOTE(review): the flattened original does not show whether this sort
    # sat inside the else branch; both branches yield a list, so it is
    # applied to either result here -- confirm against history
    axtlist.sort()
    msaname = os.path.join(self.path, 'hg18_pairwise5way')
    # 500MB VERSION
    msa1 = cnestedlist.NLMSA(msaname, 'w', uniondict, axtFiles=axtlist,
                             maxlen=536870912, maxint=22369620)
    msa = publish(msa1)

    outfileName = os.path.join(
        testInputDir, 'splicesite_hg18%s.txt' % smallSamplePostfix)
    outputName = os.path.join(
        testInputDir,
        'splicesite_hg18%s_pairwise5way.txt' % smallSamplePostfix)
    check_report(msa, outfileName, outputName, 'splicesite_new1.txt')

    # TEXT<->BINARY TEST
    # the file list is taken before the dump, so only the files that
    # exist at this point are removed below
    msafilelist = glob.glob(msaname + '*')
    msa.save_seq_dict()
    cnestedlist.dump_textfile(
        msaname, os.path.join(self.path, 'hg18_pairwise5way.txt'))
    for filename in msafilelist:
        os.remove(filename)
    runPath = os.path.realpath(os.curdir)
    os.chdir(self.path)
    try:
        cnestedlist.textfile_to_binaries('hg18_pairwise5way.txt')
    finally:
        # always go back, so a failed rebuild cannot leave later tests
        # running in the wrong directory
        os.chdir(runPath)
    msa = publish(cnestedlist.NLMSA(msaname, 'r'))
    check_report(msa, outfileName, outputName, 'splicesite_new2.txt')
def build_test(self):
    """Build the dm2_multiz15way NLMSA and compare query results.

    Builds the alignment from chr4h.maf, stores it in pygr.Data, and
    writes a tab-separated report of the alignment edges found for the
    two-base windows at both ends of every interval listed in the chr4h
    splice-site input file.  The report must have the same MD5 as the
    stored expected output.  The alignment is then dumped to text, its
    binary files are removed and rebuilt from the dump, and the same check
    is repeated on the rebuilt alignment.
    """
    # BUILD NLMSA AND QUERY RESULT COMPARISON
    from pygr import seqdb, cnestedlist
    import glob

    species = 'dm2'
    resource_id = 'TEST.MSA.UCSC.dm2_multiz15way'

    def publish(nlmsa):
        """Save nlmsa into pygr.Data and return the copy fetched back."""
        nlmsa.__doc__ = 'TEST NLMSA for dm2 multiz15way'
        pygr.Data.addResource(resource_id, nlmsa)
        pygr.Data.save()
        return pygr.Data.getResource(resource_id)

    def split_name(msa, interval):
        """Split the interval's seqDict ID at its first '.' in two."""
        full_name = (~msa.seqDict)[interval]
        dot = full_name.index('.')
        return full_name[:dot], full_name[dot + 1:]

    def edge_rows(msa, edges):
        """Return one formatted report line per usable edge."""
        rows = []
        for src, dest, _edge in edges:
            # skip edges where either side is not exactly two letters
            if len(str(src)) != 2 or len(str(dest)) != 2:
                continue
            srcspecies, srcname = split_name(msa, src)
            destspecies, destname = split_name(msa, dest)
            fields = (str(src), srcspecies, srcname, src.start, src.stop,
                      str(dest), destspecies, destname, dest.start,
                      dest.stop)
            rows.append('\t'.join(map(str, fields)) + '\n')
        return rows

    def write_report(msa, inputName, reportName):
        """Query msa for every input interval and write the report."""
        outfile = open(reportName, 'w')
        try:
            infile = open(inputName, 'r')
            try:
                for line in infile:
                    chrid, intstart, intend, nobs = \
                        line.strip().split('\t')
                    # nobs is not used below; it is still converted so
                    # that a malformed column fails as it always did
                    intstart, intend, nobs = \
                        int(intstart), int(intend), int(nobs)
                    windows = ((intstart, intstart + 2),
                               (intend - 2, intend))
                    chrom = msa.seqDict[species + '.' + chrid]
                    sites = [chrom[start:stop] for start, stop in windows]
                    edge_lists = [msa[site].edges() for site in sites]
                    for site, (start, stop), edges in \
                            zip(sites, windows, edge_lists):
                        if len(edges) == 0:  # EMPTY EDGES
                            fields = (str(site), species, chrid, start,
                                      stop, '', '', '', '', '')
                            outfile.write(
                                '\t'.join(map(str, fields)) + '\n')
                    rows = []
                    for edges in edge_lists:
                        rows.extend(edge_rows(msa, edges))
                    # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
                    rows.sort()
                    outfile.writelines(rows)
            finally:
                infile.close()
        finally:
            outfile.close()

    def file_md5(path):
        """Return the MD5 digest of the file's contents."""
        handle = open(path, 'r')
        try:
            return hashlib.md5(handle.read()).digest()
        finally:
            handle.close()

    def check_report(msa, inputPath, expectedPath, reportName):
        """Regenerate the report and compare it with the expected file."""
        tmpInputName = self.copyFile(inputPath)
        tmpOutputName = self.copyFile(expectedPath)
        write_report(msa, tmpInputName, reportName)
        # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
        assert file_md5(reportName) == file_md5(tmpOutputName), \
            '%s differs from %s' % (reportName, tmpOutputName)

    genomedict = {}
    for orgstr in msaSpeciesList:
        genomedict[orgstr] = pygr.Data.getResource(
            'TEST.Seq.Genome.' + orgstr)
    uniondict = seqdb.PrefixUnionDict(genomedict)
    # CHR4H TESTING
    maflist = glob.glob(os.path.join(mafDir, 'chr4h.maf'))
    maflist.sort()
    msaname = os.path.join(self.path, 'dm2_multiz15way')
    # 500MB VERSION
    msa1 = cnestedlist.NLMSA(msaname, 'w', uniondict, maflist,
                             maxlen=536870912, maxint=22369620)
    msa1.save_seq_dict()
    msa = publish(msa1)

    # CHR4H TESTING
    outfileName = os.path.join(testInputDir, 'splicesite_dm2_chr4h.txt')
    outputName = os.path.join(
        testInputDir, 'splicesite_dm2_chr4h_multiz15way.txt')
    check_report(msa, outfileName, outputName,
                 os.path.join(self.path, 'splicesite_new1.txt'))

    # TEXT<->BINARY TEST
    # the file list is taken before the dump, so only the files that
    # exist at this point are removed below
    msafilelist = glob.glob(msaname + '*')
    cnestedlist.dump_textfile(
        msaname, os.path.join(self.path, 'dm2_multiz15way.txt'))
    for filename in msafilelist:
        os.remove(filename)
    runPath = os.path.realpath(os.curdir)
    os.chdir(self.path)
    try:
        cnestedlist.textfile_to_binaries('dm2_multiz15way.txt')
    finally:
        # always go back, so a failed rebuild cannot leave later tests
        # running in the wrong directory
        os.chdir(runPath)
    msa = publish(cnestedlist.NLMSA(msaname, 'r'))
    check_report(msa, outfileName, outputName,
                 os.path.join(self.path, 'splicesite_new2.txt'))
def build_test(self):
    """Build the hg18_multiz28way NLMSA and compare query results.

    Builds the alignment from the MAF files, stores it in pygr.Data, and
    writes a tab-separated report of the alignment edges found for the
    two-base windows at both ends of every interval listed in the
    splice-site input file.  The report must have the same MD5 as the
    stored expected output.  The alignment is then dumped to text, its
    binary files are removed and rebuilt from the dump, and the same check
    is repeated on the rebuilt alignment.
    """
    # BUILD NLMSA AND QUERY RESULT COMPARISON
    from pygr import seqdb, cnestedlist
    # the standalone md5 module is deprecated in favour of hashlib; fall
    # back to it only on interpreters that have no hashlib
    try:
        import hashlib
    except ImportError:
        import md5 as hashlib

    species = "hg18"
    resource_id = "TEST.MSA.UCSC.hg18_multiz28way"

    def publish(nlmsa):
        """Save nlmsa into pygr.Data and return the copy fetched back."""
        nlmsa.__doc__ = "TEST NLMSA for hg18 multiz28way"
        pygr.Data.addResource(resource_id, nlmsa)
        pygr.Data.save()
        return pygr.Data.getResource(resource_id)

    def split_name(msa, interval):
        """Split the interval's seqDict ID at its first '.' in two."""
        full_name = (~msa.seqDict)[interval]
        dot = full_name.index(".")
        return full_name[:dot], full_name[dot + 1:]

    def edge_rows(msa, edges):
        """Return one formatted report line per usable edge."""
        rows = []
        for src, dest, _edge in edges:
            # skip edges where either side is not exactly two letters
            if len(str(src)) != 2 or len(str(dest)) != 2:
                continue
            srcspecies, srcname = split_name(msa, src)
            destspecies, destname = split_name(msa, dest)
            fields = (str(src), srcspecies, srcname, src.start, src.stop,
                      str(dest), destspecies, destname, dest.start,
                      dest.stop)
            rows.append("\t".join(map(str, fields)) + "\n")
        return rows

    def write_report(msa, inputName, reportName):
        """Query msa for every input interval and write the report."""
        outfile = open(reportName, "w")
        try:
            infile = open(inputName, "r")
            try:
                for line in infile:
                    chrid, intstart, intend, nobs = \
                        line.strip().split("\t")
                    # nobs is not used below; it is still converted so
                    # that a malformed column fails as it always did
                    intstart, intend, nobs = \
                        int(intstart), int(intend), int(nobs)
                    windows = ((intstart, intstart + 2),
                               (intend - 2, intend))
                    chrom = msa.seqDict[species + "." + chrid]
                    sites = [chrom[start:stop] for start, stop in windows]
                    edge_lists = [msa[site].edges() for site in sites]
                    for site, (start, stop), edges in \
                            zip(sites, windows, edge_lists):
                        if len(edges) == 0:  # EMPTY EDGES
                            fields = (str(site), species, chrid, start,
                                      stop, "", "", "", "", "")
                            outfile.write(
                                "\t".join(map(str, fields)) + "\n")
                    rows = []
                    for edges in edge_lists:
                        rows.extend(edge_rows(msa, edges))
                    # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
                    rows.sort()
                    outfile.writelines(rows)
            finally:
                infile.close()
        finally:
            outfile.close()

    def file_md5(path):
        """Return the MD5 digest of the file's contents."""
        handle = open(path, "r")
        try:
            return hashlib.md5(handle.read()).digest()
        finally:
            handle.close()

    def check_report(msa, inputPath, expectedPath, reportName):
        """Regenerate the report and compare it with the expected file."""
        tmpInputName = self.copyFile(inputPath)
        tmpOutputName = self.copyFile(expectedPath)
        write_report(msa, tmpInputName, reportName)
        # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
        assert file_md5(reportName) == file_md5(tmpOutputName), \
            "%s differs from %s" % (reportName, tmpOutputName)

    genomedict = {}
    for orgstr in msaSpeciesList:
        genomedict[orgstr] = pygr.Data.getResource(
            "TEST.Seq.Genome." + orgstr)
    uniondict = seqdb.PrefixUnionDict(genomedict)
    if smallSampleKey:
        maflist = (os.path.join(mafDir, smallSampleKey + ".maf"),)
    else:
        maflist = glob.glob(os.path.join(mafDir, "*.maf"))
        maflist.sort()
    msaname = os.path.join(self.path, "hg18_multiz28way")
    # 500MB VERSION
    msa1 = cnestedlist.NLMSA(msaname, "w", uniondict, maflist,
                             maxlen=536870912, maxint=22369620)
    msa1.save_seq_dict()
    msa = publish(msa1)

    outfileName = os.path.join(
        testInputDir, "splicesite_hg18%s.txt" % smallSamplePostfix)
    outputName = os.path.join(
        testInputDir,
        "splicesite_hg18%s_multiz28way.txt" % smallSamplePostfix)
    check_report(msa, outfileName, outputName,
                 os.path.join(self.path, "splicesite_new1.txt"))

    # TEXT<->BINARY TEST
    # the file list is taken before the dump, so only the files that
    # exist at this point are removed below
    msafilelist = glob.glob(msaname + "*")
    cnestedlist.dump_textfile(
        msaname, os.path.join(self.path, "hg18_multiz28way.txt"))
    for filename in msafilelist:
        os.remove(filename)
    runPath = os.path.realpath(os.curdir)
    os.chdir(self.path)
    try:
        cnestedlist.textfile_to_binaries("hg18_multiz28way.txt")
    finally:
        # always go back, so a failed rebuild cannot leave later tests
        # running in the wrong directory
        os.chdir(runPath)
    msa = publish(cnestedlist.NLMSA(msaname, "r"))
    check_report(msa, outfileName, outputName,
                 os.path.join(self.path, "splicesite_new2.txt"))