def test_basic_construction(self):
    "Both sequences of a freshly built db start with the expected bases."
    expected_prefixes = (('seq1', 'atggtgtca'), ('seq2', 'GTGTTGAA'))
    seqdb = SequenceFileDB(self.dbfile)
    try:
        for seq_id, prefix in expected_prefixes:
            assert str(seqdb.get(seq_id)).startswith(prefix)
    finally:
        # close the db even when an assertion fails
        seqdb.close()
def test_build_seqLenDict_with_reader(self):
    "Test that building things works properly when specifying a reader."

    class InfoBag(object):
        "Plain attribute holder filled from keyword arguments."

        def __init__(self, **kw):
            vars(self).update(kw)

    # Pass 1: open the db normally and snapshot every sequence's info.
    seqdb = SequenceFileDB(self.dbfile)
    try:
        saved_info = [InfoBag(id=seq_id, length=len(seq), sequence=str(seq))
                      for seq_id, seq in seqdb.items()]
    finally:
        seqdb.close()
    # Erase the existing intermediate files so the db has to be recreated.
    self.trash_intermediate_files()

    def my_fake_reader(fp, filename, info_list=saved_info):
        "Fake reader: ignore its file arguments, return the saved info."
        return info_list

    # Pass 2: rebuild the db through the fake reader and check its contents.
    seqdb = SequenceFileDB(self.dbfile, reader=my_fake_reader)
    try:
        for seq_id, prefix in (('seq1', 'atggtgtca'), ('seq2', 'GTGTTGAA')):
            assert str(seqdb.get(seq_id)).startswith(prefix)
    finally:
        seqdb.close()
def test_inverse_add_behavior(self):
    "Look up a sequence from a freshly opened db in the inverse of self.db."
    fasta_path = testutil.datafile('dnaseq.fasta')
    other_db = SequenceFileDB(fasta_path)
    try:
        first_seq = other_db['seq1']
        # The lookup itself is the check: the test passes if nothing raises.
        found_name = (~self.db)[first_seq]
    finally:
        other_db.close() # only need to close if exception occurs
def test_funny_key2(self):
    "check handling of ID containing multiple separators"
    fasta_path = testutil.datafile('funnyseq.fasta')
    funny_db = SequenceFileDB(fasta_path) # contains 'seq1', 'seq2'
    try:
        union = PrefixUnionDict({'prefix': funny_db})
        # Retrieval succeeding (no exception) is the whole test.
        found = union['prefix.seq.2.even.longer']
    finally:
        funny_db.close()
def test_cache(self):
    "Sequence slice cache mechanics."
    # NOTE: this test depends on object lifetimes (del / gc.collect), so the
    # statement order below matters and should not be rearranged.

    dnaseq = testutil.datafile('dnaseq.fasta')
    db = SequenceFileDB(dnaseq)

    try:
        # create cache components
        cacheDict = {}
        cacheHint = db.cacheHint

        # get seq1
        seq1 = db['seq1']

        # _cache is only created on first cache attempt
        assert not hasattr(db, '_cache')

        # build an 'owner' object
        class AnonymousOwner(object):
            pass
        owner = AnonymousOwner()

        # save seq1 in cache
        cacheDict['seq1'] = (seq1.start, seq1.stop)
        cacheHint(cacheDict, owner)
        del cacheDict                   # 'owner' now holds reference

        # peek into _cache and assert that only the ival coordinates
        # are stored
        # (values() is indexed directly, so it must return a list here,
        # i.e. Python 2 dict-style semantics)
        v = db._cache.values()[0]
        assert len(v['seq1']) == 2
        del v

        # force a cache access & check that now we've stored actual string
        ival = str(seq1[5:10])
        v = db._cache.values()[0]
        # ...check that we've stored actual string
        assert len(v['seq1']) == 3

        # again force cache access, this time to the stored sequence string
        ival = str(seq1[5:10])

        # now, eliminate all references to the cache proxy dict
        del owner

        # trash unused objects - not strictly necessary, because there are
        # no islands of circular references & so all objects are already
        # deallocated, but that's implementation dependent.
        gc.collect()

        # ok, cached values should now be gone.
        v = db._cache.values()
        assert len(v) == 0
    finally:
        # close the db even when an assertion above fails
        db.close()
def test_basic_iadd(self):
    "Sequences added with += are named after their source db's filepath."
    fasta_path = testutil.datafile('dnaseq.fasta')
    first_db = SequenceFileDB(fasta_path)
    try:
        first_seq = first_db['seq1']
        self.db += first_seq

        assert first_seq in self.db
        first_name = (~self.db)[first_seq]
        assert first_name == 'dnaseq.seq1', first_name

        # Second db on the same file, with a different filepath.
        second_db = SequenceFileDB(fasta_path)
        try:
            # Munge the filepath for testing.
            second_db.filepath = 'foo'
            second_seq = second_db['seq1']
            self.db += second_seq

            second_name = (~self.db)[second_seq]
            assert second_name == 'foo.seq1', second_name
        finally:
            second_db.close()
    finally:
        first_db.close()
def test_build_seqLenDict_with_bad_reader(self):
    "Test that building things fails properly with a bad reader."

    class InfoBag(object):
        "Plain attribute holder filled from keyword arguments."

        def __init__(self, **kw):
            vars(self).update(kw)

    # Snapshot the sequence info, but record every length as 0 so that the
    # info handed back by the fake reader is bad.
    seqdb = SequenceFileDB(self.dbfile)
    try:
        bad_info = [InfoBag(id=seq_id, length=0, sequence=str(seq))
                    for seq_id, seq in seqdb.items()]
    finally:
        seqdb.close()
    # Erase the existing intermediate files so the db has to be recreated.
    self.trash_intermediate_files()

    def my_fake_reader(fp, filename, info_list=bad_info):
        "Fake reader: ignore its file arguments, return the saved info."
        return info_list

    # Construction through the fake reader must raise ValueError.
    try:
        seqdb = SequenceFileDB(self.dbfile, reader=my_fake_reader)
        try:
            assert 0, "should not reach here; db construction should fail!"
        finally:
            seqdb.close()
    except ValueError:
        pass # ValueError is expected
def test_nlmsaslice_cache(self): "NLMSASlice sequence caching & removal" # set up sequences dnaseq = testutil.datafile('dnaseq.fasta') db = SequenceFileDB(dnaseq, autoGC=-1) # use pure WeakValueDict... try: gc.collect() assert len( db._weakValueDict) == 0, '_weakValueDict should be empty' seq1, seq2 = db['seq1'], db['seq2'] assert len(db._weakValueDict)==2, \ '_weakValueDict should have 2 seqs' # build referencing NLMSA mymap = NLMSA('test', 'memory', db, pairwiseMode=True) mymap += seq1 mymap[seq1] += seq2 mymap.build() # check: no cache assert not hasattr(db, '_cache'), 'should be no cache yet' seq1, seq2 = db['seq1'], db['seq2'] # re-retrieve # now retrieve a NLMSASlice, forcing entry of seq into cache ival = seq1[5:10] x = mymap[ival] assert len(db._cache.values()) != 0 n1 = len(db._cache) assert n1 == 1, "should be exactly one cache entry, not %d" % \ (n1, ) # ok, now trash referencing arguments & make sure of cleanup del x gc.collect() assert len(db._cache.values()) == 0 n2 = len(db._cache) assert n2 == 0, '%d objects remain; cache memory leak!' % n2 # FAIL because of __dealloc__ error in cnestedlist.NLMSASlice. # Drop our references, the cache should empty. del mymap, ival, seq1, seq2 gc.collect() # check that db._weakValueDict cache is empty assert len( db._weakValueDict) == 0, '_weakValueDict should be empty' finally: db.close()
def test_headerfile_create_conflict(self):
    "test non-empty prefixDict with a passed in PUD header file: conflict"
    member_db = SequenceFileDB(self.dbfile)
    try:
        header_path = testutil.datafile('prefixUnionDict-1.txt')
        # Giving both a header file and a prefixDict must raise TypeError.
        try:
            union = PrefixUnionDict(filename=header_path,
                                    prefixDict={ 'foo' : member_db })
        except TypeError:
            pass
        else:
            assert 0, "should not get here"
    finally:
        member_db.close()
def load_resource(self):
    """Load the genome database and the RefSeq transcripts used for HGVS.

    Reads the paths ``self.genome_fn`` and ``self.refseq_fn`` and stores the
    results on ``self.genome`` and ``self.refseq``.  Progress messages are
    printed to stdout.
    """
    # load genome sequence
    # (single-argument print() calls produce the same output on Python 2)
    print('loading the genome sequence [%s] for HGVS...' % self.genome_fn)
    self.genome = SequenceFileDB(self.genome_fn)
    print('done.')

    # load refseq into dic
    print('loading the refseq transcript [%s] for HGVS...' % self.refseq_fn)
    # ``with`` closes the file even when read_transcripts raises; the
    # original open()/close() pair leaked the handle in that case.
    with open(self.refseq_fn, 'r') as fp:
        self.refseq = pyhgvs.utils.read_transcripts(fp)
    print('done.')
def test_nlmsaslice_cache(self): "NLMSASlice sequence caching & removal" # set up sequences dnaseq = testutil.datafile('dnaseq.fasta') db = SequenceFileDB(dnaseq, autoGC=-1) # use pure WeakValueDict... try: gc.collect() assert len(db._weakValueDict)==0, '_weakValueDict should be empty' seq1, seq2 = db['seq1'], db['seq2'] assert len(db._weakValueDict)==2, \ '_weakValueDict should have 2 seqs' # build referencing NLMSA mymap = NLMSA('test', 'memory', db, pairwiseMode=True) mymap += seq1 mymap[seq1] += seq2 mymap.build() # check: no cache assert not hasattr(db, '_cache'), 'should be no cache yet' seq1, seq2 = db['seq1'], db['seq2'] # re-retrieve # now retrieve a NLMSASlice, forcing entry of seq into cache ival = seq1[5:10] x = mymap[ival] assert len(db._cache.values()) != 0 n1 = len(db._cache) assert n1 == 1, "should be exactly one cache entry, not %d" % \ (n1, ) # ok, now trash referencing arguments & make sure of cleanup del x gc.collect() assert len(db._cache.values()) == 0 n2 = len(db._cache) assert n2 == 0, '%d objects remain; cache memory leak!' % n2 # FAIL because of __dealloc__ error in cnestedlist.NLMSASlice. # Drop our references, the cache should empty. del mymap, ival, seq1, seq2 gc.collect() # check that db._weakValueDict cache is empty assert len(db._weakValueDict)==0, '_weakValueDict should be empty' finally: db.close()
def test_headerfile_create_conflict(self):
    "test non-empty prefixDict with a passed in PUD header file: conflict"
    member_db = SequenceFileDB(self.dbfile)
    try:
        header_path = testutil.datafile('prefixUnionDict-1.txt')
        # Giving both a header file and a prefixDict must raise TypeError.
        try:
            union = PrefixUnionDict(filename=header_path,
                                    prefixDict={'foo': member_db})
        except TypeError:
            pass
        else:
            assert 0, "should not get here"
    finally:
        member_db.close()
def test_iadd_db_twice(self):
    "Adding the same sequence twice must leave its name unchanged."
    fasta_path = testutil.datafile('dnaseq.fasta')
    source_db = SequenceFileDB(fasta_path)
    try:
        added = source_db['seq1']

        self.db += added
        name_after_first = (~self.db)[added]

        self.db += added # should do nothing...
        name_after_second = (~self.db)[added]

        # ...leaving seq with same name.
        assert name_after_first == name_after_second
    finally:
        source_db.close()
def test_no_db_info(self):
    "A db with no persistent id and no filepath gets the 'noname0' prefix."
    fasta_path = testutil.datafile('dnaseq.fasta')
    anon_db = SequenceFileDB(fasta_path)
    try:
        added = anon_db['seq1']

        # Strip the db of identifying info: no _persistent_id, no filepath.
        assert getattr(anon_db, '_persistent_id', None) is None
        del anon_db.filepath

        self.db += added
        assigned_name = (~self.db)[added]
        assert assigned_name == 'noname0.seq1'
    finally:
        anon_db.close()
def test_inverse_noadd_behavior(self):
    "With addAll=False, inverse lookup of an unknown sequence is a KeyError."
    # compare with test_inverse_add_behavior...
    union = SeqPrefixUnionDict(addAll=False)
    fasta_path = testutil.datafile('dnaseq.fasta')
    other_db = SequenceFileDB(fasta_path)
    try:
        foreign_seq = other_db['seq1']
        try:
            found_name = (~union)[foreign_seq]
        except KeyError:
            pass
        else:
            assert 0, "should not get here"
    finally:
        other_db.close()
def test_no_file_given(self):
    "Make sure that a TypeError is raised when no file is available"
    try:
        unused_db = SequenceFileDB()
    except TypeError:
        return
    # Construction succeeded without a file: that is the failure case.
    assert 0, "should not reach this point"
def test_headerfile_write_fail(self):
    "Header writing for a member db that has no 'filepath' attribute."
    member_db = SequenceFileDB(self.dbfile)
    try:
        del member_db.filepath # remove 'filepath' attribute for test
        union = PrefixUnionDict({'prefix': member_db})

        assert len(union) == 2
        assert 'prefix.seq1' in union

        target = testutil.tempdatafile('prefixUnionDict-write-fail.txt')
        # An AttributeError from the write is tolerated; note that the test
        # also passes if the write does not raise at all.
        try:
            union.writeHeaderFile(target)
        except AttributeError:
            pass
    finally:
        member_db.close() # closes both db and subdb
def translate(variant, transcripts, get_transcript):
    """Translate an HGVS name into genomic coordinates using 'hg19.fa'.

    variant: HGVS name handed to ``hgvs.parse_hgvs_name``.
    transcripts: not used by this function; kept so existing callers that
        pass it keep working.
    get_transcript: callable forwarded to the parser as ``get_transcript``.

    Returns the tuple ``(chrom, offset, ref, alt)`` on success, or the
    integer ``1`` when parsing raises (the original error sentinel).
    """
    genome = SequenceFileDB('hg19.fa')  # pip install bsddb3 is required
    try:
        chrom, offset, ref, alt = hgvs.parse_hgvs_name(
            variant, genome, get_transcript=get_transcript)
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit.  Ordinary parse failures still map to the sentinel.
        return 1
    finally:
        # Release the database on every path; it was previously left open.
        # NOTE(review): assumes the parsed values do not lazily reference the
        # open db (pyhgvs is expected to return plain values) -- confirm.
        genome.close()
    return chrom, offset, ref, alt
def __init__(self, lookup=None, filename=None, db_filename=None,
             default_seq=None):
    """
    Build a mock genome object exposing a pygr compatible interface.

    lookup: ((chrom, start, end), seq) values that define a lookup table
        for genome sequence requests; an empty dict is used when None.
    filename: a stream or filename containing a lookup table; only read
        when db_filename is not given.
    db_filename: a fasta file to use for genome sequence requests. All
        requests are recorded and can be written to a lookup table file
        using the `write` method.
    default_seq: if given, this base will always be returned if
        region is unavailable.

    Raises ValueError when db_filename is given but pygr is unavailable.
    """
    self._default_seq = default_seq
    self._genome = None
    self._chroms = {}
    if lookup is None:
        lookup = {}
    self._lookup = lookup

    if db_filename:
        # A real genome database was requested; that needs pygr.
        if SequenceFileDB is None:
            raise ValueError('pygr is not available.')
        self._genome = SequenceFileDB(db_filename)
        return

    if filename:
        # Otherwise read genome sequence from the lookup table.
        self.read(filename)
def codetest():
    "Test the code here before adding to doctest @CTB"
    import pygr
    from pygr.seqdb import SequenceFileDB

    fasta_path = os.path.join('data', 'partial-yeast.fasta')
    yeast_db = SequenceFileDB(fasta_path)
    chromosome = yeast_db['chr02']

    # NOTE(review): the slice start is larger than its stop; presumably that
    # is the case being explored here -- confirm.
    begin, end = 87787, 86719
    piece = chromosome[begin:end]
def main():
    """Convert a tab-delimited variant file, adding coordinates and HGVS_p.

    Command-line options:
      -i  path of the input file (first row holds the column names)
      -o  path of the output file
      -g  path of the genome fasta opened with SequenceFileDB
      -r  refGene file path, stored in the module global REFGENE

    For every data row the date, OMIM id and HGVS fields are cleaned, the
    cDNA HGVS is converted with ``convert_HGVS`` and the columns listed in
    COLUMNS_TO_SAVE are written, with the genome coordinate inserted at
    index 1 and the protein HGVS appended.

    Raises whatever ``convert_HGVS`` raised when a row cannot be converted
    and its HGVS field contains no ";".
    """
    global REFGENE
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--readable_input',
                        help='readable input file for conversion.')
    parser.add_argument('-o', '--writable_output',
                        help='writable output file for conversion.')
    parser.add_argument('-g', '--genome_path', help='Link to hg38.fa.')
    parser.add_argument('-r', '--reference_genome',
                        default='./hg38.BRCA.refGene.txt',
                        help='Link to hg38.BRCA.refgene.txt.')
    args = parser.parse_args()

    GENOME = SequenceFileDB(args.genome_path)
    REFGENE = args.reference_genome

    # ``with`` closes both files even when a row raises.
    with open(args.readable_input, "r") as f_in, \
            open(args.writable_output, "w") as f_out:
        f_out.write("\t".join(OUTPUT_COLUMNS) + "\n")
        for index, line in enumerate(f_in):
            #
            # Clean the line by removing leading or trailing spaces adjacent
            # to tabs.
            #
            line = re.sub("( )*\t( )*", "\t", line)
            items = np.array(line.rstrip().split("\t"))
            if index == 0:
                # Handle column names
                columns = np.array([i.replace(" ", "_") for i in items])
                index_to_save = [np.where(columns == i)[0][0]
                                 for i in COLUMNS_TO_SAVE]
                column_idx = dict(zip(COLUMNS_TO_SAVE, index_to_save))
                continue
            #
            # In the date last evaluated field, delete the time last
            # evaluated if provided.
            #
            date_last_evaluated_idx = column_idx["Date_last_evaluated"]
            items[date_last_evaluated_idx] = \
                items[date_last_evaluated_idx].split(' ')[0]
            OMIM_id_index = column_idx["Condition_ID_value"]
            items[OMIM_id_index] = convert_OMIM_id(items[OMIM_id_index])
            # NOTE(review): HP and EVM are not defined in this function;
            # presumably module-level names -- confirm.
            items[column_idx["HGVS"]] = cleanup_HGVS(
                items[column_idx["Reference_sequence"]],
                items[column_idx["HGVS"]], HP, EVM)
            HGVS_cDNA = (items[column_idx["Reference_sequence"]] + ":" +
                         items[column_idx["HGVS"]])
            # Same output as the former ``print a, b, c`` statement.
            print("%s %s %s" % (items[column_idx["Reference_sequence"]],
                                items[column_idx["HGVS"]], HGVS_cDNA))
            try:
                genome_coor, HGVS_p = convert_HGVS(HGVS_cDNA, GENOME)
            except Exception:
                if ";" not in items[column_idx["HGVS"]]:
                    # Previously this case fell through and wrote the
                    # previous row's genome_coor/HGVS_p (or hit a NameError
                    # on the first row).  Surface the failure instead.
                    raise
                # HGVS containing ";" is the expected unconvertible case.
                genome_coor, HGVS_p = create_None_filler()
            aa_abrev_index = column_idx["Abbrev_AA_change"]
            if HGVS_p not in ["p.?", "p.(=)", "None"]:
                # Only fill in the abbreviation when the input left it blank.
                if items[aa_abrev_index] == '':
                    items[aa_abrev_index] = HGVS_p_to_AA_abrev(HGVS_p)
            final_items = list(items[index_to_save])
            final_items.insert(1, genome_coor)
            final_items.append(HGVS_p)
            f_out.write("\t".join(final_items) + "\n")
def initialize(hg_fasta, snpeff_predictor_bin):
    """Load the required databases into module-level globals.

    hg_fasta: path of the human genome reference, opened with
        SequenceFileDB and stored in __GENOME__.
    snpeff_predictor_bin: gzip-compressed file handed to
        read_snpeff_transcripts; the result is stored in __TRANSCRIPTS__.
    """
    global __GENOME__, __TRANSCRIPTS__

    __GENOME__ = SequenceFileDB(hg_fasta)

    with gzip.open(snpeff_predictor_bin) as predictor_file:
        __TRANSCRIPTS__ = read_snpeff_transcripts(predictor_file)
def test_invalid_coordinates():
    """
    Regression test for 17: parse an HGVS name whose start coordinate is
    larger than its end coordinate.
    """
    if not SequenceFileDB:
        # pygr is unavailable, so there is no genome to parse against.
        raise nose.SkipTest

    refseqs = SequenceFileDB('pyhgvs/tests/data/test_refseqs.fa')
    reversed_range_name = (
        'NC_000005.10:g.177421339_177421327delACTCGAGTGCTCC')
    parse_hgvs_name(reversed_range_name, refseqs,
                    get_transcript=get_transcript)
def main():
    """Convert a tab-delimited variant file, adding coordinates and HGVS_p.

    Command-line options:
      -i  input file (opened by argparse; first row holds the column names)
      -o  output file (opened by argparse)
      -g  path of the genome fasta opened with SequenceFileDB
      -r  refGene file path, stored in the module global REFGENE

    For every data row the OMIM id is converted, the cDNA HGVS is converted
    with ``convert_HGVS`` and the columns listed in COLUMNS_TO_SAVE are
    written, with the genome coordinate inserted at index 1 and the protein
    HGVS appended.

    Raises whatever ``convert_HGVS`` raised when a row cannot be converted
    and its HGVS field contains no ";".
    """
    global REFGENE
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--readable_input',
                        type=argparse.FileType('r'),
                        help='Opened readable input file for conversion.')
    parser.add_argument('-o', '--writable_output',
                        type=argparse.FileType('w'),
                        help='Opened writable output file for conversion.')
    parser.add_argument('-g', '--genome_path', help='Link to hg38.fa.')
    parser.add_argument('-r', '--reference_genome',
                        default='./hg38.BRCA.refGene.txt',
                        help='Link to hg38.BRCA.refgene.txt.')
    args = parser.parse_args()

    GENOME = SequenceFileDB(args.genome_path)
    REFGENE = args.reference_genome

    f_out = args.writable_output
    f_in = args.readable_input
    try:
        f_out.write("\t".join(OUTPUT_COLUMNS) + "\n")
        for index, line in enumerate(f_in):
            items = np.array(line.rstrip().split("\t"))
            if index == 0:
                # Handle column names
                columns = np.array([i.replace(" ", "_") for i in items])
                index_to_save = [
                    np.where(columns == i)[0][0] for i in COLUMNS_TO_SAVE
                ]
                column_idx = dict(zip(COLUMNS_TO_SAVE, index_to_save))
                continue
            OMIM_id_index = column_idx["Condition_ID_value"]
            items[OMIM_id_index] = convert_OMIM_id(items[OMIM_id_index])
            HGVS_cDNA = (items[column_idx["Reference_sequence"]] +
                         ":" + items[column_idx["HGVS"]])
            try:
                genome_coor, HGVS_p = convert_HGVS(HGVS_cDNA, GENOME)
            except Exception:
                if ";" not in items[column_idx["HGVS"]]:
                    # Previously this case fell through and wrote the
                    # previous row's genome_coor/HGVS_p (or hit a NameError
                    # on the first row).  Surface the failure instead.
                    raise
                # HGVS containing ";" is the expected unconvertible case.
                genome_coor, HGVS_p = create_None_filler()
            aa_abrev_index = column_idx["Abbrev_AA_change"]
            if HGVS_p not in ["p.?", "p.(=)", "None"]:
                # Only fill in the abbreviation when the input left it blank.
                if items[aa_abrev_index] == '':
                    items[aa_abrev_index] = HGVS_p_to_AA_abrev(HGVS_p)
            final_items = list(items[index_to_save])
            final_items.insert(1, genome_coor)
            final_items.append(HGVS_p)
            f_out.write("\t".join(final_items) + "\n")
    finally:
        # Close the argparse-opened files on every path, including errors.
        # Either is None when its option was omitted on the command line.
        for handle in (f_in, f_out):
            if handle is not None:
                handle.close()
def __init__(self, inFile, genome, vkey=False, verbose=False, log=sys.stderr):
    """Store the run options and open the genome database.

    inFile, vkey, verbose, log: stored unchanged on the instance.
    genome: passed to SequenceFileDB; the opened db is kept as self.genome.
    """
    self.inFile = inFile
    self.vkey = vkey
    self.verbose = verbose
    self.log = log
    self.genome = SequenceFileDB(genome)
    # Bracketed class name, e.g. "[ClassName]".
    self.infoHeader = "[%s]" % self.__class__.__name__
def get_genome_coor(
        hgvs_c,
        genome_path='data/hg19.fa',
        refgene_path=("/Users/Molly/Desktop/web-dev/hgvs_counsyl/hgvs/"
                      "pyhgvs/data/genes.refGene")):
    """Convert a cDNA HGVS name to a 'chrom:offset:ref>alt' string.

    hgvs_c: HGVS name handed to ``pyhgvs.parse_hgvs_name``.
    genome_path: fasta file opened with SequenceFileDB.  Defaults to the
        path that used to be hard-coded.
    refgene_path: refGene transcript file.  Defaults to the (user-specific)
        path that used to be hard-coded, so existing callers behave the
        same; pass a real location on other machines.

    Returns the genome coordinate string.
    """
    genome = SequenceFileDB(genome_path)
    try:
        with open(refgene_path) as infile:
            transcripts = pyhgvs_utils.read_transcripts(infile)

        def get_transcript(name):
            return transcripts.get(name)

        chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
            hgvs_c, genome, get_transcript=get_transcript)
    finally:
        # The database was previously left open on every call.
        # NOTE(review): assumes the parsed values do not lazily reference
        # the open db -- confirm.
        genome.close()
    return chrom + ":" + str(offset) + ":" + ref + ">" + alt
def HGVS_to_GenomeCoor(HGVS, genome_path='../data/hg19.fa',
                       refgene_path="../data/BRCA12.refGene.txt"):
    """use counsyl pyhgvs for this

    Convert an HGVS name to a 'chrom:offset:ref>alt' string.

    HGVS: name handed to ``pyhgvs.parse_hgvs_name``.
    genome_path: fasta file opened with SequenceFileDB.  Defaults to the
        path that used to be hard-coded.
    refgene_path: refGene transcript file.  Defaults to the path that used
        to be hard-coded.

    Returns the genome coordinate string.
    """
    genome = SequenceFileDB(genome_path)
    try:
        with open(refgene_path) as infile:
            transcripts = pyhgvs_utils.read_transcripts(infile)

        def get_transcript(name):
            return transcripts.get(name)

        chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
            HGVS, genome, get_transcript=get_transcript)
    finally:
        # The database was previously left open on every call.
        # NOTE(review): assumes the parsed values do not lazily reference
        # the open db -- confirm.
        genome.close()
    genome_coordinate = chrom + ":" + str(offset) + ":" + ref + ">" + alt
    return genome_coordinate
def __init__(self):
    """
    Initializes hg19 reference and reference transcripts.

    Both files are looked up in the 'resources' directory that sits next
    to this module.
    """
    resources_dir = os.path.join(os.path.dirname(__file__), 'resources')
    genome_path = os.path.join(resources_dir, 'hg19.fa')
    refseq_path = os.path.join(resources_dir, 'genes.refGene')

    # Genome sequence, read using pygr.
    self.genome = SequenceFileDB(genome_path)

    # RefSeq transcripts, read into a python dict.
    with open(refseq_path) as refseq_file:
        self.transcripts = hgvs.utils.read_transcripts(refseq_file)
def main(argv=None): parser = make_parser() args = parser.parse_args(argv) genome = SequenceFileDB(args.genome) pwm = [IUPAC_SCORES[l] for l in args.consensus] pwm.extend([REQUIRED_SCORES[l] for l in args.required_3p_seq]) pwm = motility.PWM(pwm) # find all matches with open(args.outfile, 'w') as outfile: for chrom in genome.keys(): chromseq = str(genome[chrom]) print "searching ", chrom, "of length", len(chromseq) if len(chromseq) < len(pwm): print 'chromosome/fragment', chrom, 'is too short' continue matches = pwm.find(chromseq, -args.mismatches) for start, stop, strand, seq in matches: score = pwm.calc_score(seq) outfile.write('\t'.join( [chrom, str(start), str(stop), seq, str(score), '+' if strand == 1 else '-']) + '\n')
def test_iadd_duplicate_seqdb(self):
    "Adding seqs from two dbs opened on the same file raises ValueError."
    fasta_path = testutil.datafile('dnaseq.fasta')
    first_db = SequenceFileDB(fasta_path)
    try:
        second_db = SequenceFileDB(fasta_path)
        try:
            first_seq = first_db['seq1']
            second_seq = second_db['seq1']

            self.db += first_seq
            try:
                self.db += second_seq
            except ValueError:
                pass
            else:
                assert 0, "should never reach this point"
        finally:
            second_db.close()
    finally:
        first_db.close()
def test_name_to_variant_refseqs():
    """
    Convert HGVS names to variant coordinates using refseqs directly.
    """
    if not SequenceFileDB:
        print('skip test_name_to_variant_refseqs')
        return

    refseqs = SequenceFileDB('pyhgvs/tests/data/test_refseqs.fa')

    for hgvs_name, variant, name_canonical, var_canonical in _name_variants:
        # Only test transcript HGVS names.
        if var_canonical and 'NM_' in hgvs_name:
            parsed = parse_hgvs_name(hgvs_name, refseqs,
                                     get_transcript=get_transcript)
            nose.tools.assert_equal(parsed, variant,
                                    repr([hgvs_name, variant, parsed]))
def test_pyhgvs_cdna_coordinate_correct(self):
    """Check each stored hg38 coordinate against a fresh pyhgvs conversion.

    For every entry of ``self.data`` the 'pyhgvs_cDNA' name is parsed again
    and the resulting 'chrom:g.offset:ref>alt' string must equal the
    entry's 'pyhgvs_Genomic_Coordinate_38'.
    """
    # Load the transcripts once.  They used to be re-read from disk on every
    # get_transcript call.
    REFGENE = "../refgene38_brca.txt"
    with open(REFGENE) as infile:
        TRANSCRIPTS = pyhgvs_utils.read_transcripts(infile)

    def get_transcript(name):
        return TRANSCRIPTS.get(name)

    # Open the genome once and close it afterwards.  A new database used to
    # be opened for every data item and never closed.
    genome = SequenceFileDB('../reference_genome/hg38/hg38.fa')
    try:
        for i in self.data:
            pyhgvs_coord = i['pyhgvs_Genomic_Coordinate_38']
            pyhgvs_cDNA = i['pyhgvs_cDNA']
            chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
                pyhgvs_cDNA, genome, get_transcript=get_transcript)
            test_coord = (chrom + ":" + "g." + str(offset) + ":" +
                          ref + ">" + alt)
            self.assertEqual(pyhgvs_coord, test_coord)
    finally:
        genome.close()
def __init__(self): """ Initializes hg19 reference and reference transcripts """ #genome_path = '/Users/charlesmarkello/leidenv1.0/resources/hg38.fa' #refseq_path = '/Users/charlesmarkello/leidenv1.0/resources/refGene.txt' #genome_path = '/Users/charlesmarkello/leidenv1.0/resources/hg19.fa' #refseq_path = '/Users/charlesmarkello/leidenv1.0/resources/genes.refGene' genome_path = os.path.join(os.path.dirname(__file__), 'resources', 'hg19.fa') refseq_path = os.path.join(os.path.dirname(__file__), 'resources', 'genes.refGene') print 'genome_path: ', genome_path # Read genome sequence using pygr. self.genome = SequenceFileDB(genome_path) # Read RefSeq transcripts into a python dict. with open(refseq_path) as infile: self.transcripts = pyhgvs.utils.read_transcripts(infile)
class SequenceFileDB_Test(unittest.TestCase):
    """
    Test for all of the basic dictionary functions on 'SequenceFileDB',
    among other things.

    Every test runs against a db freshly opened on 'dnaseq.fasta' (see
    setUp) and closed again in tearDown.
    """

    def setUp(self):
        "Test setup"
        dnaseq = testutil.datafile('dnaseq.fasta')
        self.db = SequenceFileDB(dnaseq) # contains 'seq1', 'seq2'

        self.db._weakValueDict.clear() # clear the cache

    def tearDown(self):
        self.db.close() # must close SequenceFileDB!

    def test_len(self):
        assert len(self.db) == 2

    def test_seqInfoDict_len(self):
        assert len(self.db.seqInfoDict) == 2

    def test_no_file_given(self):
        "Make sure that a TypeError is raised when no file is available"
        self.assertRaises(TypeError, SequenceFileDB)

    def test_seq_descriptor(self):
        "Check the '.seq' attribute (tied to a descriptor)"
        s = self.db['seq1']
        assert str(s) == str(s.seq)

    def test_cache(self):
        "SequenceDB cache test"
        assert len(self.db._weakValueDict) == 0
        seq1 = self.db['seq1']

        # cache populated?
        assert len(self.db._weakValueDict) == 1
        assert 'seq1' in self.db._weakValueDict

        # cache functions?
        seq1_try2 = self.db['seq1']
        assert seq1 is seq1_try2

    def test_clear_cache(self):
        "SequenceDB clear_cache test"
        assert len(self.db._weakValueDict) == 0
        seq1 = self.db['seq1']

        # cache populated?
        assert len(self.db._weakValueDict) == 1
        assert 'seq1' in self.db._weakValueDict

        # clear_cache functions?
        self.db.clear_cache()
        seq1_try3 = self.db['seq1']
        assert seq1 is not seq1_try3

    def test_keys(self):
        "SequenceFileDB keys"
        assert sorted(self.db.keys()) == ['seq1', 'seq2']

    def test_contains(self):
        "SequenceFileDB contains"
        assert 'seq1' in self.db, self.db.keys()
        assert 'seq2' in self.db
        assert 'foo' not in self.db

    def test_invert_class(self):
        "SequenceFileDB __invert__"
        seq = self.db['seq1']
        inversedb = ~self.db
        assert inversedb[seq] == 'seq1'
        assert seq in inversedb
        assert 'foo' not in inversedb

    def test_keys_info(self):
        "SequenceFileDB keys info"
        assert sorted(self.db.seqInfoDict.keys()) == ['seq1', 'seq2']

    def test_contains_info(self):
        "SequenceFileDB contains info"
        assert 'seq1' in self.db.seqInfoDict
        assert 'seq2' in self.db.seqInfoDict
        assert 'foo' not in self.db.seqInfoDict

    def test_has_key(self):
        "SequenceFileDB has key"
        assert 'seq1' in self.db
        assert 'seq2' in self.db
        assert 'foo' not in self.db

    def test_get(self):
        "SequenceFileDB get"
        assert self.db.get('foo') is None
        assert self.db.get('seq1') is not None
        assert str(self.db.get('seq1')).startswith('atggtgtca')
        assert self.db.get('seq2') is not None
        assert str(self.db.get('seq2')).startswith('GTGTTGAA')

    def test_items(self):
        "SequenceFileDB items"
        i = sorted([k for (k, v) in self.db.items()])
        assert i == ['seq1', 'seq2']

    def test_iterkeys(self):
        "SequenceFileDB iterkeys"
        kk = sorted(self.db.keys())
        ik = sorted(self.db.iterkeys())
        assert kk == ik

    def test_itervalues(self):
        "SequenceFileDB itervalues"
        kv = sorted(self.db.values())
        iv = sorted(self.db.itervalues())
        assert kv == iv

    def test_iteritems(self):
        "SequenceFileDB iteritems"
        ki = sorted(self.db.items())
        ii = sorted(self.db.iteritems())
        assert ki == ii

    def test_readonly(self):
        "SequenceFileDB readonly"
        # Every mutating dict method must raise NotImplementedError.
        # what should 'copy' do on SequenceFileDB?
        self.assertRaises(NotImplementedError, self.db.copy)
        self.assertRaises(NotImplementedError, self.db.clear)
        self.assertRaises(NotImplementedError, self.db.setdefault, 'foo')
        self.assertRaises(NotImplementedError, self.db.pop)
        self.assertRaises(NotImplementedError, self.db.popitem)
        self.assertRaises(NotImplementedError, self.db.update, {})

    # test some things other than dict behavior

    def test_keyerror(self):
        """SequenceFileDB keyerror.
        Make sure that the SequenceFileDB KeyError is informative."""
        try:
            self.db['foo']
        except KeyError as e:
            assert "no key 'foo' in database <SequenceFileDB" in str(e), str(e)
        else:
            # Without this branch the test passed when no error was raised.
            assert 0, "KeyError should have been raised for a missing key"
def setUp(self):
    "Open the 'dnaseq.fasta' db and wrap it in a SeqPrefixUnionDict."
    fasta_path = testutil.datafile('dnaseq.fasta')
    member_db = SequenceFileDB(fasta_path) # contains 'seq1', 'seq2'
    self.seqdb = member_db
    self.db = SeqPrefixUnionDict({'prefix': member_db})
class SeqPrefixUnionDict_Test(unittest.TestCase):
    """
    Test SeqPrefixUnionDict.

    self.db is a SeqPrefixUnionDict holding one SequenceFileDB (self.seqdb)
    under the prefix 'prefix'.
    """

    def setUp(self):
        fasta_path = testutil.datafile('dnaseq.fasta')
        member_db = SequenceFileDB(fasta_path) # contains 'seq1', 'seq2'
        self.seqdb = member_db
        self.db = SeqPrefixUnionDict({'prefix': member_db})

    def tearDown(self):
        self.seqdb.close()

    def test_basic_iadd(self):
        fasta_path = testutil.datafile('dnaseq.fasta')
        first_db = SequenceFileDB(fasta_path)
        try:
            first_seq = first_db['seq1']
            self.db += first_seq

            assert first_seq in self.db
            first_name = (~self.db)[first_seq]
            assert first_name == 'dnaseq.seq1', first_name

            # Second db on the same file, with a different filepath.
            second_db = SequenceFileDB(fasta_path)
            try:
                # Munge the filepath for testing.
                second_db.filepath = 'foo'
                second_seq = second_db['seq1']
                self.db += second_seq

                second_name = (~self.db)[second_seq]
                assert second_name == 'foo.seq1', second_name
            finally:
                second_db.close()
        finally:
            first_db.close()

        # NOTE, the important thing here is less the specific names that
        # are given (which are based on filepath) but that different names
        # are created for the various sequences when they are added.

    def test_iadd_db_twice(self):
        fasta_path = testutil.datafile('dnaseq.fasta')
        source_db = SequenceFileDB(fasta_path)
        try:
            added = source_db['seq1']

            self.db += added
            name_after_first = (~self.db)[added]

            self.db += added # should do nothing...
            name_after_second = (~self.db)[added]

            # ...leaving seq with same name.
            assert name_after_first == name_after_second
        finally:
            source_db.close()

    def test_iadd_user_seq(self):
        first_user_seq = Sequence('ATGGCAGG', 'foo')
        self.db += first_user_seq

        assigned = (~self.db)[first_user_seq]
        assert assigned == 'user.foo' # created a new 'user' db.

        # ok, make sure it doesn't wipe out the old 'user' db...
        second_user_seq = Sequence('ATGGCAGG', 'foo2')
        self.db += second_user_seq

        assigned = (~self.db)[second_user_seq]
        assert assigned == 'user.foo2'

        original_name = (~self.db)[first_user_seq]
        assert original_name == 'user.foo'

    def test_iadd_duplicate_seqdb(self):
        fasta_path = testutil.datafile('dnaseq.fasta')
        first_db = SequenceFileDB(fasta_path)
        try:
            second_db = SequenceFileDB(fasta_path)
            try:
                first_seq = first_db['seq1']
                second_seq = second_db['seq1']

                self.db += first_seq
                try:
                    self.db += second_seq
                except ValueError:
                    pass
                else:
                    assert 0, "should never reach this point"
            finally:
                second_db.close()
        finally:
            first_db.close()

    def test_no_db_info(self):
        fasta_path = testutil.datafile('dnaseq.fasta')
        anon_db = SequenceFileDB(fasta_path)
        try:
            added = anon_db['seq1']

            # Strip the db of identifying info: no _persistent_id, no
            # filepath.
            assert getattr(anon_db, '_persistent_id', None) is None
            del anon_db.filepath

            self.db += added
            assigned = (~self.db)[added]
            assert assigned == 'noname0.seq1'
        finally:
            anon_db.close()

    def test_inverse_add_behavior(self):
        fasta_path = testutil.datafile('dnaseq.fasta')
        other_db = SequenceFileDB(fasta_path)
        try:
            foreign_seq = other_db['seq1']
            # The lookup itself is the check: passes if nothing raises.
            found_name = (~self.db)[foreign_seq]
        finally:
            other_db.close() # only need to close if exception occurs

    def test_inverse_noadd_behavior(self):
        # compare with test_inverse_add_behavior...
        union = SeqPrefixUnionDict(addAll=False)
        fasta_path = testutil.datafile('dnaseq.fasta')
        other_db = SequenceFileDB(fasta_path)
        try:
            foreign_seq = other_db['seq1']
            try:
                found_name = (~union)[foreign_seq]
            except KeyError:
                pass
            else:
                assert 0, "should not get here"
        finally:
            other_db.close()
def setUp(self):
    "Test setup"
    fresh_db = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
    self.db = fresh_db # contains 'seq1', 'seq2'

    # Start every test from an empty sequence cache.
    fresh_db._weakValueDict.clear()
def setup(self):
    "Open the 'dnaseq' database used by the tests."
    dnaseq_db = SequenceFileDB('dnaseq')
    self.db = dnaseq_db # contains 'seq1', 'seq2'
class SequenceFileDB_Test(object):
    """
    Test for all of the basic dictionary functions on 'SequenceFileDB'.

    Every test runs against the db opened on 'dnaseq' in setup().
    """

    def setup(self):
        self.db = SequenceFileDB('dnaseq') # contains 'seq1', 'seq2'

    def keys_test(self):
        assert sorted(self.db.keys()) == ['seq1', 'seq2']

    def contains_test(self):
        assert 'seq1' in self.db, self.db.keys()
        assert 'seq2' in self.db
        assert 'foo' not in self.db

    def keys_info_test(self):
        assert sorted(self.db.seqInfoDict.keys()) == ['seq1', 'seq2']

    def contains_info_test(self):
        assert 'seq1' in self.db.seqInfoDict
        assert 'seq2' in self.db.seqInfoDict
        assert 'foo' not in self.db.seqInfoDict

    def has_key_test(self):
        # Exercises the has_key() method itself, so it is kept as a call.
        assert self.db.has_key('seq1')
        assert self.db.has_key('seq2')
        assert not self.db.has_key('foo')

    def get_test(self):
        assert self.db.get('foo') is None
        assert self.db.get('seq1') is not None
        assert str(self.db.get('seq1')).startswith('atggtgtca')
        assert self.db.get('seq2') is not None
        assert str(self.db.get('seq2')).startswith('GTGTTGAA')

    def items_test(self):
        i = sorted([k for (k, v) in self.db.items()])
        assert i == ['seq1', 'seq2']

    def iterkeys_test(self):
        kk = sorted(self.db.keys())
        ik = sorted(self.db.iterkeys())
        assert kk == ik

    def itervalues_test(self):
        kv = sorted(self.db.values())
        iv = sorted(self.db.itervalues())
        assert kv == iv

    def iteritems_test(self):
        ki = sorted(self.db.items())
        ii = sorted(self.db.iteritems())
        assert ki == ii

    def readonly_test(self):
        # Every mutating dict method must raise NotImplementedError.
        # what should 'copy' do on SequenceFileDB?
        mutating_calls = [
            ('copy', ()),
            ('clear', ()),
            ('setdefault', ('foo',)),
            ('pop', ()),
            ('popitem', ()),
            ('update', ({},)),
        ]
        for method_name, call_args in mutating_calls:
            try:
                getattr(self.db, method_name)(*call_args)
            except NotImplementedError:
                continue
            assert 0, 'this method should raise NotImplementedError'

    # test some things other than dict behavior

    def keyerror_test(self):
        "Make sure that the SequenceFileDB KeyError is informative."
        try:
            self.db['foo']
        except KeyError as e:
            assert "no key 'foo' in database <SequenceFileDB" in str(e), str(e)
        else:
            # Without this branch the test passed when no error was raised.
            assert 0, "KeyError should have been raised for a missing key"