def test_build_seqLenDict_with_reader(self):
    """Check that a SequenceFileDB can be built through a custom reader.

    The records of the database at ``self.dbfile`` are captured in memory,
    the intermediate files are removed, and the database is then rebuilt
    with a reader that simply hands back the captured records.
    """

    class InfoBag(object):
        """Bare attribute container populated from keyword arguments."""

        def __init__(self, **kw):
            vars(self).update(kw)

    # Capture id, length and sequence text of every record while the
    # original database is still open.
    db = SequenceFileDB(self.dbfile)
    try:
        records = [InfoBag(id=seq_id, length=len(seq), sequence=str(seq))
                   for seq_id, seq in db.items()]
    finally:
        db.close()

    # Remove the existing files so the database has to be recreated.
    self.trash_intermediate_files()

    def my_fake_reader(fp, filename, info_list=records):
        # Ignores its file arguments and returns the captured records.
        return info_list

    # Rebuild using the fake reader, then spot-check both sequences.
    db = SequenceFileDB(self.dbfile, reader=my_fake_reader)
    try:
        for seq_id, prefix in (('seq1', 'atggtgtca'), ('seq2', 'GTGTTGAA')):
            assert str(db.get(seq_id)).startswith(prefix)
    finally:
        db.close()
def test_basic_construction(self):
    """Open the database at ``self.dbfile`` and spot-check both sequences."""
    expected_prefixes = (('seq1', 'atggtgtca'), ('seq2', 'GTGTTGAA'))
    db = SequenceFileDB(self.dbfile)
    try:
        for seq_id, prefix in expected_prefixes:
            assert str(db.get(seq_id)).startswith(prefix)
    finally:
        db.close()
def test_build_seqLenDict_with_bad_reader(self):
    """Check that building a SequenceFileDB from a bad reader fails.

    Same procedure as the good-reader test, except that every captured
    record reports a length of 0; construction is then expected to raise
    ValueError.
    """

    class InfoBag(object):
        """Bare attribute container populated from keyword arguments."""

        def __init__(self, **kw):
            vars(self).update(kw)

    # Capture the records, deliberately recording a zero length for each.
    db = SequenceFileDB(self.dbfile)
    try:
        records = [InfoBag(id=seq_id, length=0, sequence=str(seq))
                   for seq_id, seq in db.items()]
    finally:
        db.close()

    # Remove the existing files so the database has to be recreated.
    self.trash_intermediate_files()

    def my_fake_reader(fp, filename, info_list=records):
        # Ignores its file arguments and returns the captured records.
        return info_list

    try:
        db = SequenceFileDB(self.dbfile, reader=my_fake_reader)
        try:
            # Only reached when construction unexpectedly succeeded.
            assert 0, "should not reach here; db construction should fail!"
        finally:
            db.close()
    except ValueError:
        pass  # ValueError is expected
def test_basic_iadd(self):
    """Add sequences to ``self.db`` with ``+=`` and check the names given."""
    fasta_path = testutil.datafile('dnaseq.fasta')
    first_db = SequenceFileDB(fasta_path)
    try:
        first_seq = first_db['seq1']
        self.db += first_seq
        assert first_seq in self.db

        first_name = (~self.db)[first_seq]
        assert first_name == 'dnaseq.seq1', first_name

        # Second database over the same file, with its filepath munged
        # for testing.
        second_db = SequenceFileDB(fasta_path)
        try:
            second_db.filepath = 'foo'
            second_seq = second_db['seq1']
            self.db += second_seq

            second_name = (~self.db)[second_seq]
            assert second_name == 'foo.seq1', second_name
        finally:
            second_db.close()
    finally:
        first_db.close()
def test_inverse_add_behavior(self):
    """Look up a sequence from a separate database in ``~self.db``.

    Nothing is asserted about the result; the lookup only has to complete
    without raising.
    """
    source_db = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
    try:
        (~self.db)[source_db['seq1']]
    finally:
        # Runs on success and on failure alike.
        source_db.close()
def test_funny_key2(self):
    """Check handling of an ID containing multiple separators.

    Nothing is asserted about the value; the lookup through the
    PrefixUnionDict only has to complete without raising.
    """
    fasta_path = testutil.datafile('funnyseq.fasta')
    seqdb = SequenceFileDB(fasta_path)
    try:
        union = PrefixUnionDict({'prefix': seqdb})
        union['prefix.seq.2.even.longer']
    finally:
        seqdb.close()
def test_cache(self):
    """Sequence slice cache mechanics.

    Registers a cache hint for 'seq1' on behalf of an owner object, checks
    what the cache entry holds before and after a slice is read, and
    finally checks that the entry disappears once the owner is dropped.
    """
    db = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
    try:
        # Cache components.
        hint = {}
        cache_hint = db.cacheHint

        seq1 = db['seq1']

        # _cache is only created on the first cache attempt.
        assert not hasattr(db, '_cache')

        class AnonymousOwner(object):
            pass

        owner = AnonymousOwner()

        # Register the interval of seq1 for caching.
        hint['seq1'] = (seq1.start, seq1.stop)
        cache_hint(hint, owner)
        del hint  # 'owner' now holds reference

        # Before any access only the interval coordinates are stored.
        entry = list(db._cache.values())[0]
        assert len(entry['seq1']) == 2
        del entry

        # Force a cache access; the entry should now also carry the
        # actual string.
        str(seq1[5:10])
        entry = list(db._cache.values())[0]
        assert len(entry['seq1']) == 3

        # Second access, this time served from the stored string.
        str(seq1[5:10])

        # Eliminate all references to the cache proxy dict.
        del owner

        # Not strictly necessary (no islands of circular references, so
        # everything is already deallocated), but that is implementation
        # dependent.
        gc.collect()

        # Cached values should now be gone.
        remaining = list(db._cache.values())
        assert len(remaining) == 0
    finally:
        db.close()
def test_nlmsaslice_cache(self):
    """NLMSASlice sequence caching & removal.

    Checks that retrieving an NLMSASlice creates exactly one cache entry
    on the sequence database and that the entry, and afterwards the
    database's ``_weakValueDict``, empties once references are dropped.
    """
    empty_msg = '_weakValueDict should be empty'

    # autoGC=-1: use pure WeakValueDict...
    db = SequenceFileDB(testutil.datafile('dnaseq.fasta'), autoGC=-1)
    try:
        gc.collect()
        assert len(db._weakValueDict) == 0, empty_msg

        seq1, seq2 = db['seq1'], db['seq2']
        assert len(db._weakValueDict) == 2, \
            '_weakValueDict should have 2 seqs'

        # Build an in-memory pairwise NLMSA referencing both sequences.
        alignment = NLMSA('test', 'memory', db, pairwiseMode=True)
        alignment += seq1
        alignment[seq1] += seq2
        alignment.build()

        assert not hasattr(db, '_cache'), 'should be no cache yet'

        seq1, seq2 = db['seq1'], db['seq2']  # re-retrieve

        # Retrieving an NLMSASlice forces the sequence into the cache.
        interval = seq1[5:10]
        msa_slice = alignment[interval]
        assert len(db._cache.values()) != 0
        entries_before = len(db._cache)
        assert entries_before == 1, \
            "should be exactly one cache entry, not %d" % (entries_before, )

        # Dropping the slice should clean the cache up.
        del msa_slice
        gc.collect()
        assert len(db._cache.values()) == 0
        entries_after = len(db._cache)
        assert entries_after == 0, \
            '%d objects remain; cache memory leak!' % entries_after
        # FAIL because of __dealloc__ error in cnestedlist.NLMSASlice.

        # Drop our own references; the weak-value dict should empty.
        del alignment, interval, seq1, seq2
        gc.collect()
        assert len(db._weakValueDict) == 0, empty_msg
    finally:
        db.close()
def test_headerfile_create_conflict(self):
    """Passing both a header file and a non-empty prefixDict must conflict.

    Constructing a PrefixUnionDict with ``filename`` and ``prefixDict``
    together is expected to raise TypeError.
    """
    subdb = SequenceFileDB(self.dbfile)
    try:
        header_path = testutil.datafile('prefixUnionDict-1.txt')
        conflicting_prefixes = {'foo': subdb}
        try:
            union = PrefixUnionDict(filename=header_path,
                                    prefixDict=conflicting_prefixes)
        except TypeError:
            pass  # expected
        else:
            assert 0, "should not get here"
    finally:
        subdb.close()
def test_nlmsaslice_cache(self):
    """NLMSASlice sequence caching & removal.

    Walks the life cycle of the slice cache: empty before any NLMSASlice
    is retrieved, one entry while a slice is alive, empty again once the
    slice is deleted; finally ``_weakValueDict`` must empty as well.
    """
    fasta_path = testutil.datafile('dnaseq.fasta')
    db = SequenceFileDB(fasta_path, autoGC=-1)  # use pure WeakValueDict...
    try:
        gc.collect()
        weak_count = len(db._weakValueDict)
        assert weak_count == 0, '_weakValueDict should be empty'

        seq1, seq2 = db['seq1'], db['seq2']
        weak_count = len(db._weakValueDict)
        assert weak_count == 2, '_weakValueDict should have 2 seqs'

        # Referencing NLMSA, built in memory in pairwise mode.
        msa = NLMSA('test', 'memory', db, pairwiseMode=True)
        msa += seq1
        msa[seq1] += seq2
        msa.build()

        # No cache should exist before any slice is requested.
        assert not hasattr(db, '_cache'), 'should be no cache yet'

        seq1, seq2 = db['seq1'], db['seq2']  # re-retrieve

        # Fetching a slice forces entry of the sequence into the cache.
        ival = seq1[5:10]
        nlmsa_slice = msa[ival]
        assert len(db._cache.values()) != 0
        n_cached = len(db._cache)
        assert n_cached == 1, \
            "should be exactly one cache entry, not %d" % (n_cached, )

        # Trash the slice and make sure the cache is cleaned up.
        del nlmsa_slice
        gc.collect()
        assert len(db._cache.values()) == 0
        n_left = len(db._cache)
        assert n_left == 0, '%d objects remain; cache memory leak!' % n_left
        # FAIL because of __dealloc__ error in cnestedlist.NLMSASlice.

        # With every reference dropped the weak-value dict should empty.
        del msa, ival, seq1, seq2
        gc.collect()
        weak_count = len(db._weakValueDict)
        assert weak_count == 0, '_weakValueDict should be empty'
    finally:
        db.close()
def test_headerfile_create_conflict(self):
    """Test non-empty prefixDict with a passed-in PUD header file: conflict.

    The PrefixUnionDict constructor is expected to reject the combination
    with a TypeError.
    """
    header = testutil.datafile
    subdb = SequenceFileDB(self.dbfile)
    try:
        kwargs = dict(filename=header('prefixUnionDict-1.txt'),
                      prefixDict={'foo': subdb})
        try:
            db = PrefixUnionDict(**kwargs)
        except TypeError:
            pass  # the conflict was detected, as expected
        else:
            assert 0, "should not get here"
    finally:
        subdb.close()
def test_no_db_info(self):
    """Add a sequence whose database has neither persistent id nor filepath.

    With ``filepath`` deleted and no ``_persistent_id`` present, the
    sequence is expected to be registered under the name 'noname0.seq1'.
    """
    source_db = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
    try:
        seq = source_db['seq1']

        # Precondition: no persistent id to derive a name from.
        assert getattr(source_db, '_persistent_id', None) is None
        del source_db.filepath

        self.db += seq
        assigned_name = (~self.db)[seq]
        assert assigned_name == 'noname0.seq1'
    finally:
        source_db.close()
def test_iadd_db_twice(self):
    """Adding the same sequence to ``self.db`` twice must keep its name."""
    source_db = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
    try:
        seq = source_db['seq1']

        self.db += seq
        name_after_first_add = (~self.db)[seq]

        # The second add should do nothing...
        self.db += seq
        name_after_second_add = (~self.db)[seq]

        # ...leaving the sequence with the same name.
        assert name_after_first_add == name_after_second_add
    finally:
        source_db.close()
def test_inverse_noadd_behavior(self):
    """Inverse lookup on a union built with ``addAll=False`` must fail.

    Counterpart of test_inverse_add_behavior: looking up a sequence from a
    separate database is expected to raise KeyError.
    """
    union = SeqPrefixUnionDict(addAll=False)
    source_db = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
    try:
        seq = source_db['seq1']
        try:
            name = (~union)[seq]
        except KeyError:
            pass  # expected
        else:
            assert 0, "should not get here"
    finally:
        source_db.close()
def test_headerfile_write_fail(self):
    """Writing a header file must fail when a sub-database has no filepath.

    The ``filepath`` attribute is deleted from the sub-database before it
    is wrapped in a PrefixUnionDict; ``writeHeaderFile`` is then expected
    to raise AttributeError.  The test fails if the call succeeds, so it
    can no longer pass silently.
    """
    subdb = SequenceFileDB(self.dbfile)
    try:
        del subdb.filepath  # remove 'filepath' attribute for test
        db = PrefixUnionDict({'prefix': subdb})

        # Sanity-check the union before exercising the write.
        assert len(db) == 2
        assert 'prefix.seq1' in db

        output = testutil.tempdatafile('prefixUnionDict-write-fail.txt')
        try:
            db.writeHeaderFile(output)
        except AttributeError:
            pass  # expected: there is no filepath to record
        else:
            assert 0, "writeHeaderFile should fail without a filepath"
    finally:
        subdb.close()
def test_iadd_duplicate_seqdb(self):
    """Adding 'seq1' from a second database over the same file must fail.

    Two independent SequenceFileDB objects are opened on the same FASTA
    file; after the sequence from the first has been added to ``self.db``,
    adding the one from the second is expected to raise ValueError.
    """
    fasta_path = testutil.datafile('dnaseq.fasta')
    first_db = SequenceFileDB(fasta_path)
    try:
        second_db = SequenceFileDB(fasta_path)
        try:
            first_seq = first_db['seq1']
            duplicate_seq = second_db['seq1']

            self.db += first_seq
            try:
                self.db += duplicate_seq
            except ValueError:
                pass  # expected
            else:
                assert 0, "should never reach this point"
        finally:
            second_db.close()
    finally:
        first_db.close()
class SeqPrefixUnionDict_Test(unittest.TestCase):
    """
    Test SeqPrefixUnionDict.
    """

    def setUp(self):
        """Wrap the 'dnaseq.fasta' database in a union under 'prefix'."""
        fasta_path = testutil.datafile('dnaseq.fasta')
        self.seqdb = SequenceFileDB(fasta_path)  # contains 'seq1', 'seq2'
        self.db = SeqPrefixUnionDict({'prefix': self.seqdb})

    def tearDown(self):
        """Close the database opened in setUp."""
        self.seqdb.close()

    def test_basic_iadd(self):
        """Add sequences with ``+=`` and check the names they are given."""
        fasta_path = testutil.datafile('dnaseq.fasta')
        first_db = SequenceFileDB(fasta_path)
        try:
            first_seq = first_db['seq1']
            self.db += first_seq
            assert first_seq in self.db

            first_name = (~self.db)[first_seq]
            assert first_name == 'dnaseq.seq1', first_name

            # Second database over the same file, with its filepath
            # munged for testing.
            second_db = SequenceFileDB(fasta_path)
            try:
                second_db.filepath = 'foo'
                second_seq = second_db['seq1']
                self.db += second_seq

                second_name = (~self.db)[second_seq]
                assert second_name == 'foo.seq1', second_name
            finally:
                second_db.close()
        finally:
            first_db.close()

        # NOTE: the specific names (which are based on filepath) matter
        # less than the fact that the various sequences receive different
        # names when they are added.

    def test_iadd_db_twice(self):
        """Adding the same sequence twice must keep its name."""
        source_db = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
        try:
            seq = source_db['seq1']

            self.db += seq
            name_after_first_add = (~self.db)[seq]

            # The second add should do nothing...
            self.db += seq
            name_after_second_add = (~self.db)[seq]

            # ...leaving the sequence with the same name.
            assert name_after_first_add == name_after_second_add
        finally:
            source_db.close()

    def test_iadd_user_seq(self):
        """Add free-standing Sequence objects; they land under 'user'."""
        first = Sequence('ATGGCAGG', 'foo')
        self.db += first
        assert (~self.db)[first] == 'user.foo'  # created a new 'user' db.

        # A second user sequence must not wipe out the old 'user' db...
        second = Sequence('ATGGCAGG', 'foo2')
        self.db += second
        assert (~self.db)[second] == 'user.foo2'

        # ...so the first sequence is still known under its name.
        assert (~self.db)[first] == 'user.foo'

    def test_iadd_duplicate_seqdb(self):
        """Adding 'seq1' from a second db over the same file must fail."""
        fasta_path = testutil.datafile('dnaseq.fasta')
        first_db = SequenceFileDB(fasta_path)
        try:
            second_db = SequenceFileDB(fasta_path)
            try:
                first_seq = first_db['seq1']
                duplicate_seq = second_db['seq1']

                self.db += first_seq
                try:
                    self.db += duplicate_seq
                except ValueError:
                    pass  # expected
                else:
                    assert 0, "should never reach this point"
            finally:
                second_db.close()
        finally:
            first_db.close()

    def test_no_db_info(self):
        """Add a sequence whose db has neither persistent id nor filepath."""
        source_db = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
        try:
            seq = source_db['seq1']

            # Precondition: no persistent id to derive a name from.
            assert getattr(source_db, '_persistent_id', None) is None
            del source_db.filepath

            self.db += seq
            assigned_name = (~self.db)[seq]
            assert assigned_name == 'noname0.seq1'
        finally:
            source_db.close()

    def test_inverse_add_behavior(self):
        """Inverse lookup of a foreign sequence; it only must not raise."""
        source_db = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
        try:
            (~self.db)[source_db['seq1']]
        finally:
            # Runs on success and on failure alike.
            source_db.close()

    def test_inverse_noadd_behavior(self):
        """With ``addAll=False`` the same inverse lookup raises KeyError."""
        # compare with test_inverse_add_behavior...
        union = SeqPrefixUnionDict(addAll=False)
        source_db = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
        try:
            seq = source_db['seq1']
            try:
                name = (~union)[seq]
            except KeyError:
                pass  # expected
            else:
                assert 0, "should not get here"
        finally:
            source_db.close()
class SequenceFileDB_Test(unittest.TestCase): """ Test for all of the basic dictionary functions on 'SequenceFileDB', among other things. """ def setUp(self): "Test setup" dnaseq = testutil.datafile('dnaseq.fasta') self.db = SequenceFileDB(dnaseq) # contains 'seq1', 'seq2' self.db._weakValueDict.clear() # clear the cache def tearDown(self): self.db.close() # must close SequenceFileDB! def test_len(self): assert len(self.db) == 2 def test_seqInfoDict_len(self): assert len(self.db.seqInfoDict) == 2 def test_no_file_given(self): "Make sure that a TypeError is raised when no file is available" try: db = SequenceFileDB() assert 0, "should not reach this point" except TypeError: pass def test_seq_descriptor(self): "Check the '.seq' attribute (tied to a descriptor)" s = self.db['seq1'] assert str(s) == str(s.seq) def test_cache(self): "SequenceDB cache test" assert len(self.db._weakValueDict) == 0 seq1 = self.db['seq1'] # cache populated? assert len(self.db._weakValueDict) == 1 assert 'seq1' in self.db._weakValueDict # cache functions? seq1_try2 = self.db['seq1'] assert seq1 is seq1_try2 def test_clear_cache(self): "SequenceDB clear_cache test" assert len(self.db._weakValueDict) == 0 seq1 = self.db['seq1'] # cache populated? assert len(self.db._weakValueDict) == 1 assert 'seq1' in self.db._weakValueDict # clear_cache functions? 
self.db.clear_cache() seq1_try3 = self.db['seq1'] assert seq1 is not seq1_try3 def test_keys(self): "SequenceFileDB keys" k = self.db.keys() k.sort() assert k == ['seq1', 'seq2'] def test_contains(self): "SequenceFileDB contains" assert 'seq1' in self.db, self.db.keys() assert 'seq2' in self.db assert 'foo' not in self.db def test_invert_class(self): "SequenceFileDB __invert__" seq = self.db['seq1'] inversedb = ~self.db assert inversedb[seq] == 'seq1' assert seq in inversedb assert 'foo' not in inversedb def test_keys_info(self): "SequenceFileDB keys info" k = self.db.seqInfoDict.keys() k.sort() assert k == ['seq1', 'seq2'] def test_contains_info(self): "SequenceFileDB contains info" assert 'seq1' in self.db.seqInfoDict assert 'seq2' in self.db.seqInfoDict assert 'foo' not in self.db.seqInfoDict def test_has_key(self): "SequenceFileDB has key" assert 'seq1' in self.db assert 'seq2' in self.db assert 'foo' not in self.db def test_get(self): "SequenceFileDB get" assert self.db.get('foo') is None assert self.db.get('seq1') is not None assert str(self.db.get('seq1')).startswith('atggtgtca') assert self.db.get('seq2') is not None assert str(self.db.get('seq2')).startswith('GTGTTGAA') def test_items(self): "SequenceFileDB items" i = [k for (k, v) in self.db.items()] i.sort() assert i == ['seq1', 'seq2'] def test_iterkeys(self): "SequenceFileDB iterkeys" kk = self.db.keys() kk.sort() ik = list(self.db.iterkeys()) ik.sort() assert kk == ik def test_itervalues(self): "SequenceFileDB itervalues" kv = self.db.values() kv.sort() iv = list(self.db.itervalues()) iv.sort() assert kv == iv def test_iteritems(self): "SequenceFileDB iteritems" ki = self.db.items() ki.sort() ii = list(self.db.iteritems()) ii.sort() assert ki == ii def test_readonly(self): "SequenceFileDB readonly" try: self.db.copy() # what should 'copy' do on SequenceFileDB? 
assert 0, 'this method should raise NotImplementedError' except NotImplementedError: pass try: self.db.clear() assert 0, 'this method should raise NotImplementedError' except NotImplementedError: pass try: self.db.setdefault('foo') assert 0, 'this method should raise NotImplementedError' except NotImplementedError: pass try: self.db.pop() assert 0, 'this method should raise NotImplementedError' except NotImplementedError: pass try: self.db.popitem() assert 0, 'this method should raise NotImplementedError' except NotImplementedError: pass try: self.db.update({}) assert 0, 'this method should raise NotImplementedError' except NotImplementedError: pass # test some things other than dict behavior def test_keyerror(self): """SequenceFileDB keyerror. Make sure that the SequenceFileDB KeyError is informative.""" try: self.db['foo'] except KeyError, e: assert "no key 'foo' in database <SequenceFileDB" in str(e), str(e)
class Hgvs2:
    """Convert between genomic coordinates and HGVS names via pyhgvs.

    Holds the path of a reference genome (opened as a SequenceFileDB) and
    of a RefSeq transcript file (parsed by pyhgvs).  Neither file is read
    by the constructor; call load_resource() before converting and
    close_resource() when done.
    """

    def __init__(self, refseq_fn=fileconfig.FILECONFIG['REFGENE']):
        """Register hg19 reference genome sequence and NCBI RefSeq
        transcript coordinates.

        refseq_fn -- path of the RefSeq transcript file; the default is
                     read from fileconfig.FILECONFIG['REFGENE'] once, when
                     the class is defined.

        Raises IOError when the genome file or the transcript file does
        not exist.
        """
        self.genome_fn = fileconfig.FILECONFIG['REFGENOME_UCSC']
        self.genome = None
        if not os.path.exists(self.genome_fn):
            raise IOError(
                'Reference Genome Sequence (UCSC format) for %s is not found'
                % self.genome_fn)

        self.refseq_fn = refseq_fn
        self.refseq = None
        if not os.path.exists(self.refseq_fn):
            raise IOError('NCBI RefSeq transcript for %s is not found'
                          % self.refseq_fn)

    def get_transcript(self, name):
        """Return the transcript registered under name (via refseq.get).

        Raises RuntimeError when no transcripts are loaded.
        """
        if not self.refseq:
            raise RuntimeError(
                'first, load a transcript coordinate file![%s]'
                % self.refseq_fn)
        return self.refseq.get(name)

    def load_resource(self):
        """Open the genome database and parse the transcript file."""
        # The single-argument print(...) form behaves the same whether
        # print is a statement or a function.
        print('loading the genome sequence [%s] for HGVS...'
              % self.genome_fn)
        self.genome = SequenceFileDB(self.genome_fn)
        print('done.')

        print('loading the refseq transcript [%s] for HGVS...'
              % self.refseq_fn)
        # 'with' closes the file even when parsing raises.
        with open(self.refseq_fn, 'r') as fp:
            self.refseq = pyhgvs.utils.read_transcripts(fp)
        print('done.')

    def close_resource(self):
        """Close the genome database and forget the loaded transcripts.

        Safe to call when nothing was loaded, and safe to call twice.
        """
        if self.genome is not None:
            self.genome.close()
            # Do not keep a closed handle around.
            self.genome = None
        self.refseq = None

    def to_cDNA(self, chrom, offset, ref, alt, refseq_acc):
        """Convert a genomic variant to HGVS nomenclature.

        chrom      -- chromosome name, with or without the 'chr' prefix
        offset     -- position, passed through to pyhgvs
        ref, alt   -- reference and alternate alleles
        refseq_acc -- transcript accession looked up with get_transcript

        Returns the text after the first ':' of the name produced by
        pyhgvs.format_hgvs_name (the whole name if it contains no ':'),
        or None when the chromosome is not in CHROMOSOMES, is missing from
        the genome, or pyhgvs returns an empty name.

        Raises RuntimeError when no transcripts are loaded.
        """
        transcript = self.get_transcript(refseq_acc)

        if not chrom.startswith('chr'):
            chrom = 'chr%s' % chrom
        if chrom not in CHROMOSOMES:
            return None
        # Direct membership test; no need to build the key list per call.
        if chrom not in self.genome:
            return None

        cdna = pyhgvs.format_hgvs_name(chrom, offset, ref, alt,
                                       self.genome, transcript)
        if not cdna:
            return None

        parts = cdna.split(':')
        if len(parts) > 1:
            return parts[1]
        return cdna

    def gdna_to_vcf(self, gdna):
        """Return pyhgvs.gdna_to_vcf(gdna, genome) for the loaded genome."""
        return pyhgvs.gdna_to_vcf(gdna, self.genome)

    def to_chrom_coordinate(self, cDNA):
        """Convert an HGVS name to (chrom, offset, ref, alt).

        Returns (None, None, None, None), after printing a message, when
        the name cannot be parsed.
        """
        try:
            chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
                cDNA, self.genome, get_transcript=self.get_transcript)
            return chrom, offset, ref, alt
        except Exception:
            # Best effort by design, but KeyboardInterrupt and SystemExit
            # are no longer swallowed as they were by the bare 'except:'.
            print("[%s] cannot be coverted to chromosome coordinate" % cDNA)
            return None, None, None, None