def main(argv=None):
    """Scan every sequence of a genome for PWM matches and write them out.

    The weight matrix is built from ``args.consensus`` (one row per letter,
    looked up in ``IUPAC_SCORES``) followed by ``args.required_3p_seq`` (one
    row per letter, looked up in ``REQUIRED_SCORES``).  For every match one
    tab-separated line is written to ``args.outfile``: sequence name, start,
    stop, matched sequence, score and strand (``+`` when the reported strand
    equals 1, ``-`` otherwise).

    Args:
        argv: Argument list handed unchanged to ``parser.parse_args``.

    Raises:
        KeyError: If a consensus / required letter has no entry in the
            corresponding score table.
    """
    parser = make_parser()
    args = parser.parse_args(argv)

    genome = SequenceFileDB(args.genome)
    try:
        rows = [IUPAC_SCORES[letter] for letter in args.consensus]
        rows.extend(REQUIRED_SCORES[letter] for letter in args.required_3p_seq)
        pwm = motility.PWM(rows)

        # find all matches
        with open(args.outfile, 'w') as outfile:
            for chrom in genome.keys():
                chromseq = str(genome[chrom])
                # Single pre-formatted argument: prints the same text whether
                # ``print`` is a statement or a function.
                print("searching  %s of length %s" % (chrom, len(chromseq)))
                if len(chromseq) < len(pwm):
                    print('chromosome/fragment %s is too short' % (chrom,))
                    continue

                # NOTE(review): the threshold is the negated mismatch count;
                # presumably each mismatch costs one point in the score
                # tables -- confirm against IUPAC_SCORES / REQUIRED_SCORES.
                matches = pwm.find(chromseq, -args.mismatches)
                for start, stop, strand, seq in matches:
                    score = pwm.calc_score(seq)
                    fields = [
                        chrom,
                        str(start),
                        str(stop),
                        seq,
                        str(score),
                        '+' if strand == 1 else '-',
                    ]
                    outfile.write('\t'.join(fields) + '\n')
    finally:
        # The sequence database holds open resources and must be closed
        # explicitly, including when the scan aborts with an error.
        genome.close()
class SequenceFileDB_Test(unittest.TestCase):
    """
    Test for all of the basic dictionary functions on 'SequenceFileDB',
    among other things.
    """

    def setUp(self):
        "Test setup"
        dnaseq = testutil.datafile('dnaseq.fasta')
        self.db = SequenceFileDB(dnaseq)  # contains 'seq1', 'seq2'
        self.db._weakValueDict.clear()  # clear the cache

    def tearDown(self):
        self.db.close()  # must close SequenceFileDB!

    def test_len(self):
        assert len(self.db) == 2

    def test_seqInfoDict_len(self):
        assert len(self.db.seqInfoDict) == 2

    def test_no_file_given(self):
        "Make sure that a TypeError is raised when no file is available"
        self.assertRaises(TypeError, SequenceFileDB)

    def test_seq_descriptor(self):
        "Check the '.seq' attribute (tied to a descriptor)"
        s = self.db['seq1']
        assert str(s) == str(s.seq)

    def test_cache(self):
        "SequenceDB cache test"
        assert len(self.db._weakValueDict) == 0
        # Keep the result bound to a local for the whole test: the cache is
        # named _weakValueDict, so presumably an entry only survives while
        # something else still references the sequence object.
        seq1 = self.db['seq1']

        # cache populated?
        assert len(self.db._weakValueDict) == 1
        assert 'seq1' in self.db._weakValueDict

        # cache functions?
        seq1_try2 = self.db['seq1']
        assert seq1 is seq1_try2

    def test_clear_cache(self):
        "SequenceDB clear_cache test"
        assert len(self.db._weakValueDict) == 0
        seq1 = self.db['seq1']  # held on purpose, see test_cache

        # cache populated?
        assert len(self.db._weakValueDict) == 1
        assert 'seq1' in self.db._weakValueDict

        # clear_cache functions?
        self.db.clear_cache()
        seq1_try3 = self.db['seq1']
        assert seq1 is not seq1_try3

    def test_keys(self):
        "SequenceFileDB keys"
        assert sorted(self.db.keys()) == ['seq1', 'seq2']

    def test_contains(self):
        "SequenceFileDB contains"
        assert 'seq1' in self.db, self.db.keys()
        assert 'seq2' in self.db
        assert 'foo' not in self.db

    def test_invert_class(self):
        "SequenceFileDB __invert__"
        seq = self.db['seq1']
        inversedb = ~self.db
        assert inversedb[seq] == 'seq1'
        assert seq in inversedb
        assert 'foo' not in inversedb

    def test_keys_info(self):
        "SequenceFileDB keys info"
        assert sorted(self.db.seqInfoDict.keys()) == ['seq1', 'seq2']

    def test_contains_info(self):
        "SequenceFileDB contains info"
        assert 'seq1' in self.db.seqInfoDict
        assert 'seq2' in self.db.seqInfoDict
        assert 'foo' not in self.db.seqInfoDict

    def test_has_key(self):
        "SequenceFileDB has key"
        assert 'seq1' in self.db
        assert 'seq2' in self.db
        assert 'foo' not in self.db

    def test_get(self):
        "SequenceFileDB get"
        assert self.db.get('foo') is None
        assert self.db.get('seq1') is not None
        assert str(self.db.get('seq1')).startswith('atggtgtca')
        assert self.db.get('seq2') is not None
        assert str(self.db.get('seq2')).startswith('GTGTTGAA')

    def test_items(self):
        "SequenceFileDB items"
        names = sorted([k for (k, v) in self.db.items()])
        assert names == ['seq1', 'seq2']

    def test_iterkeys(self):
        "SequenceFileDB iterkeys"
        assert sorted(self.db.keys()) == sorted(self.db.iterkeys())

    def test_itervalues(self):
        "SequenceFileDB itervalues"
        assert sorted(self.db.values()) == sorted(self.db.itervalues())

    def test_iteritems(self):
        "SequenceFileDB iteritems"
        assert sorted(self.db.items()) == sorted(self.db.iteritems())

    def test_readonly(self):
        "SequenceFileDB readonly"
        # Every mutating dict method, paired with the arguments to call it
        # with.  (what should 'copy' do on SequenceFileDB?)
        mutators = (
            ('copy', ()),
            ('clear', ()),
            ('setdefault', ('foo',)),
            ('pop', ()),
            ('popitem', ()),
            ('update', ({},)),
        )
        for name, call_args in mutators:
            try:
                getattr(self.db, name)(*call_args)
            except NotImplementedError:
                pass
            else:
                self.fail('%s should raise NotImplementedError' % name)

    # test some things other than dict behavior
    def test_keyerror(self):
        """SequenceFileDB keyerror.
        Make sure that the SequenceFileDB KeyError is informative."""
        try:
            self.db['foo']
        except KeyError as e:
            assert "no key 'foo' in database <SequenceFileDB" in str(e), str(e)
        else:
            # Without this branch the test would pass silently whenever the
            # lookup stopped raising at all.
            self.fail("looking up a missing key should raise KeyError")
class Hgvs2:
    """Translate between chromosome coordinates and HGVS names with pyhgvs.

    Construction only checks that the configured files exist; call
    ``load_resource()`` before any conversion and ``close_resource()`` when
    finished.
    """

    def __init__(self, refseq_fn=fileconfig.FILECONFIG['REFGENE']):
        """Register hg19 reference genome sequence and NCBI RefSeq
        transcript coordinates.

        Args:
            refseq_fn: Path of the RefSeq transcript coordinate file.

        Raises:
            IOError: If the genome file or the transcript file is missing.
        """
        self.genome_fn = fileconfig.FILECONFIG['REFGENOME_UCSC']
        self.genome = None  # SequenceFileDB once load_resource() has run
        if not os.path.exists(self.genome_fn):
            raise IOError(
                'Reference Genome Sequence (UCSC format) for %s is not found'
                % self.genome_fn)

        self.refseq_fn = refseq_fn
        self.refseq = None  # transcript table once load_resource() has run
        if not os.path.exists(self.refseq_fn):
            raise IOError('NCBI RefSeq transcript for %s is not found'
                          % self.refseq_fn)

    def get_transcript(self, name):
        """Return the transcript registered under ``name`` (None if unknown).

        Raises:
            RuntimeError: If the transcript table has not been loaded.
        """
        # Test against None rather than truthiness so that a loaded but
        # empty table is not misreported as "not loaded".
        if self.refseq is None:
            raise RuntimeError(
                'first, load a transcript coordinate file![%s]'
                % self.refseq_fn)
        return self.refseq.get(name)

    def load_resource(self):
        """Open the genome database and read the transcript table."""
        # load genome sequence
        print('loading the genome sequence [%s] for HGVS...' % self.genome_fn)
        self.genome = SequenceFileDB(self.genome_fn)
        print('done.')

        # load refseq into dic
        print('loading the refseq transcript [%s] for HGVS...'
              % self.refseq_fn)
        # 'with' closes the file even when parsing fails.
        with open(self.refseq_fn, 'r') as fp:
            self.refseq = pyhgvs.utils.read_transcripts(fp)
        print('done.')

    def close_resource(self):
        """Close the genome database and drop the transcript table.

        Safe to call when nothing has been loaded, and safe to call twice.
        """
        if self.genome is not None:
            self.genome.close()
            # Do not keep a handle to a closed database around.
            self.genome = None
        self.refseq = None

    def to_cDNA(self, chrom, offset, ref, alt, refseq_acc):
        """Convert a genomic variant to HGVS nomenclature.

        Args:
            chrom: Chromosome name; a missing ``chr`` prefix is added.
            offset, ref, alt: Variant description, forwarded to
                ``pyhgvs.format_hgvs_name``.
            refseq_acc: Name of the transcript to express the variant on.

        Returns:
            The part of the formatted name after the first ``:`` (the whole
            name when it contains no ``:``), or None when the chromosome is
            not in ``CHROMOSOMES`` or in the loaded genome, or when pyhgvs
            produced an empty name.

        Raises:
            RuntimeError: If the transcript table has not been loaded.
        """
        transcript = self.get_transcript(refseq_acc)
        if not chrom.startswith('chr'):
            chrom = 'chr%s' % chrom
        if chrom not in CHROMOSOMES:
            return None
        if chrom not in self.genome.keys():
            return None

        cdna = pyhgvs.format_hgvs_name(chrom, offset, ref, alt,
                                       self.genome, transcript)
        if not cdna:
            return None
        itms = cdna.split(':')
        return itms[1] if len(itms) > 1 else cdna

    def gdna_to_vcf(self, gdna):
        """Forward ``gdna`` and the loaded genome to ``pyhgvs.gdna_to_vcf``."""
        return pyhgvs.gdna_to_vcf(gdna, self.genome)

    def to_chrom_coordinate(self, cDNA):
        """Parse an HGVS name into ``(chrom, offset, ref, alt)``.

        Best effort: on any parsing error a message is printed and a tuple
        of four Nones is returned instead of raising.
        """
        try:
            chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
                cDNA, self.genome, get_transcript=self.get_transcript)
        except Exception:
            # Deliberately broad, but no longer a bare 'except:' -- Ctrl-C
            # and interpreter shutdown must still get through.
            print("[%s] cannot be coverted to chromosome coordinate" % cDNA)
            return None, None, None, None
        return chrom, offset, ref, alt
class SequenceFileDB_Test(object):
    """
    Test for all of the basic dictionary functions on 'SequenceFileDB'.
    """

    def setup(self):
        self.db = SequenceFileDB('dnaseq')  # contains 'seq1', 'seq2'

    def keys_test(self):
        assert sorted(self.db.keys()) == ['seq1', 'seq2']

    def contains_test(self):
        assert 'seq1' in self.db, self.db.keys()
        assert 'seq2' in self.db
        assert 'foo' not in self.db

    def keys_info_test(self):
        assert sorted(self.db.seqInfoDict.keys()) == ['seq1', 'seq2']

    def contains_info_test(self):
        assert 'seq1' in self.db.seqInfoDict
        assert 'seq2' in self.db.seqInfoDict
        assert 'foo' not in self.db.seqInfoDict

    def has_key_test(self):
        assert self.db.has_key('seq1')
        assert self.db.has_key('seq2')
        assert not self.db.has_key('foo')

    def get_test(self):
        assert self.db.get('foo') is None
        assert self.db.get('seq1') is not None
        assert str(self.db.get('seq1')).startswith('atggtgtca')
        assert self.db.get('seq2') is not None
        assert str(self.db.get('seq2')).startswith('GTGTTGAA')

    def items_test(self):
        names = sorted([k for (k, v) in self.db.items()])
        assert names == ['seq1', 'seq2']

    def iterkeys_test(self):
        assert sorted(self.db.keys()) == sorted(self.db.iterkeys())

    def itervalues_test(self):
        assert sorted(self.db.values()) == sorted(self.db.itervalues())

    def iteritems_test(self):
        assert sorted(self.db.items()) == sorted(self.db.iteritems())

    def readonly_test(self):
        # Every mutating dict method, paired with the arguments to call it
        # with.  (what should 'copy' do on SequenceFileDB?)
        mutators = (
            ('copy', ()),
            ('clear', ()),
            ('setdefault', ('foo',)),
            ('pop', ()),
            ('popitem', ()),
            ('update', ({},)),
        )
        for name, call_args in mutators:
            try:
                getattr(self.db, name)(*call_args)
            except NotImplementedError:
                continue
            raise AssertionError(
                '%s should raise NotImplementedError' % name)

    # test some things other than dict behavior
    def keyerror_test(self):
        "Make sure that the SequenceFileDB KeyError is informative."
        try:
            self.db['foo']
        except KeyError as e:
            assert "no key 'foo' in database <SequenceFileDB" in str(e), str(e)
        else:
            # Without this branch the test would pass silently whenever the
            # lookup stopped raising at all.
            raise AssertionError(
                "looking up a missing key should raise KeyError")