def testCountGenomesEmpty(self):
    """
    An empty database must have zero genomes.
    """
    writer = SqliteIndexWriter(':memory:')
    db = SqliteIndex(writer._connection)
    try:
        self.assertEqual(0, db.genomeCount())
    finally:
        # Close both objects even when the assertion fails, so a failing
        # test does not leave the in-memory connection open. The order
        # (writer first, then reader) is the same as on the success path.
        writer.close()
        db.close()
def testCountProteinsOne(self):
    """
    A database with one protein must have a protein count of one.
    """
    writer = SqliteIndexWriter(':memory:')
    db = None
    try:
        writer.addProtein('NC222', 'NC23', 'AAA', 'offets', True, False, 3)
        db = SqliteIndex(writer._connection)
        self.assertEqual(1, db.proteinCount())
    finally:
        # Close whatever was opened even when a step above fails, so a
        # failing test does not leave the in-memory connection open. The
        # order (writer first, then reader) matches the success path.
        writer.close()
        if db is not None:
            db.close()
def main(args, parser):
    """
    Build the protein database.

    @param args: The namespace of command-line arguments returned by
        argparse.parse_args()
    @param parser: An C{argparse.ArgumentParser} instance.
    @raise ValueError: If both C{args.minGenomeLength} and
        C{args.maxGenomeLength} are given and the minimum is larger than
        the maximum.
    """
    if (args.minGenomeLength is not None and
            args.maxGenomeLength is not None and
            args.minGenomeLength > args.maxGenomeLength):
        raise ValueError(
            '--minGenomeLength cannot be larger than --maxGenomeLength')

    if args.excludeExclusiveHost:
        # The option value is an iterable of iterables; flatten it into a
        # single set of host names.
        excludeExclusiveHosts = set(
            chain.from_iterable(args.excludeExclusiveHost))
    else:
        excludeExclusiveHosts = None

    taxonomyDatabase = parseTaxonomyDatabaseCommandLineOptions(args, parser)

    progress = args.progress

    # These must be bound before the loop: if there are no input files the
    # loop body never runs, and the final summary would otherwise fail on
    # an unbound fileCount.
    fileCount = totalGenomeCount = totalProteinCount = 0

    if progress:
        overallStart = time()

    with SqliteIndexWriter(args.databaseFile) as db:
        for fileCount, (filename, addFunc, type_) in enumerate(
                filenamesAndAdders(args, db), start=1):

            if args.logFile:
                print("\n>>> Indexing '%s'." % filename, end='\n\n',
                      file=args.logFile)

            if progress:
                start = time()

            examinedGenomeCount, genomeCount, proteinCount = addFunc(
                filename,
                dnaOnly=args.dnaOnly,
                rnaOnly=args.rnaOnly,
                minGenomeLength=args.minGenomeLength,
                maxGenomeLength=args.maxGenomeLength,
                excludeExclusiveHosts=excludeExclusiveHosts,
                excludeFungusOnlyViruses=args.excludeFungusOnlyViruses,
                excludePlantOnlyViruses=args.excludePlantOnlyViruses,
                databaseName=args.databaseName,
                taxonomyDatabase=taxonomyDatabase,
                proteinSource=args.proteinSource,
                genomeSource=args.genomeSource,
                duplicationPolicy=args.duplicationPolicy,
                logfp=args.logFile)

            if examinedGenomeCount == 0:
                # Warn (but carry on) when an input file had no genomes.
                if type_ == 'gb':
                    print('WARNING: No genomes found in %r. Did the GenBank '
                          'download fail on that file?' % filename,
                          file=sys.stderr)
                else:
                    assert type_ == 'json'
                    print('WARNING: no genomes found in JSON file %r.' %
                          filename, file=sys.stderr)

            if progress:
                elapsed = time() - start
                totalGenomeCount += genomeCount
                totalProteinCount += proteinCount
                print('Processed %r: added %3d of %3d genome%s (%5d '
                      'protein%s) in %.2f seconds.' %
                      (filename, genomeCount, examinedGenomeCount,
                       ' ' if examinedGenomeCount == 1 else 's',
                       proteinCount, '' if proteinCount == 1 else 's',
                       elapsed), file=sys.stderr)

    if progress:
        elapsed = time() - overallStart
        print('%d files (containing %d genomes and %d proteins) '
              'indexed in %.2f seconds (%.2f mins).' %
              (fileCount, totalGenomeCount, totalProteinCount, elapsed,
               elapsed / 60), file=sys.stderr)
def testTibetanFrogHBV(self):
    """
    Test that Tibetan frogs can get HBV.

    A nucleotide query that encodes part of the protein is matched with
    DIAMOND, and the match is then converted to SAM against the genome
    the protein comes from.
    """
    proteinAccession = 'YP_009259545.1'
    proteinSequence = SAMPLE_DATA['proteins'][proteinAccession]['protein']
    proteinId = SAMPLE_DATA['proteins'][proteinAccession]['id']
    proteinRange = SAMPLE_DATA['proteins'][proteinAccession]['range']
    ranges = GenomeRanges(proteinRange)
    queryStartInProtein = 10  # This is a 0-based amino acid offset.
    queryLenInProtein = 40  # This is an amino acid length.

    genomeAccession = 'NC_030446.1'
    # NOTE(review): assumes this genome entry has an 'id' key, as the
    # entry used by testPolymerase does - confirm against SAMPLE_DATA.
    genomeId = SAMPLE_DATA['genomes'][genomeAccession]['id']
    genomeSequence = SAMPLE_DATA['genomes'][genomeAccession]['genome']
    genomeLen = len(genomeSequence)

    # The query sequence is nucleotides that match the amino acids in the
    # protein. Here we use the first ([0] in the below) codon for each
    # amino acid to make the nucleotide sequence.
    queryId = 'query'
    querySequence = ''.join(
        CODONS[aa][0]
        for aa in proteinSequence[queryStartInProtein:
                                  queryStartInProtein + queryLenInProtein])
    queryQuality = 'E' * len(querySequence)

    # Use the protein sequence to make a DIAMOND database and run DIAMOND
    # on the query. Yes, this really runs DIAMOND, so you need to have it
    # installed, with its executable somewhere in your shell's PATH.
    with DiamondExecutor() as de:
        de.addSubject(Read(proteinId, proteinSequence))
        queries = Reads([Read(queryId, querySequence, queryQuality)])
        (diamondResult,) = list(de.search(queries))

    # Make sure DIAMOND gives us back what we expected.
    self.assertEqual(
        {
            'bitscore': 83.6,
            'btop': str(queryLenInProtein),  # Exact match of all AAs.
            'qframe': 1,
            'qend': 3 * queryLenInProtein,
            'full_qqual': queryQuality,
            'qlen': len(querySequence),
            'full_qseq': querySequence,
            'qseqid': 'query',
            'qstart': 1,
            'slen': len(proteinSequence),
            'sstart': queryStartInProtein + 1,  # DIAMOND is 1-based.
            'stitle': proteinId,
        },
        diamondResult)

    # Make a genomes/proteins sqlite database and add information about
    # the protein and the nucleotide genome it comes from.
    db = SqliteIndexWriter(':memory:')
    db.addProtein(proteinAccession, genomeAccession, proteinSequence,
                  proteinRange, True, ranges.circular(genomeLen),
                  ranges.distinctRangeCount(genomeLen))

    # The genome used to be built with no arguments and added with a
    # positional (genome, 1, 'test-db') call. Both calls now use the same
    # form as testPolymerase.
    # NOTE(review): the source values and taxonomy id are fixture
    # placeholders; the protein count (1) and database name are carried
    # over from the previous call - confirm they suit this test.
    genome = _Genome({
        'id': genomeAccession,
        'name': genomeId,
        'sequence': genomeSequence,
        'features': [],
    })
    db.addGenome(genome, source={
        'host': 'Nanorana parkeri',
        'mol_type': 'DNA',
        'organism': 'Tibetan frog hepatitis B virus',
    }, taxonomyId=500, proteinCount=1, databaseName='test-db')

    # Make a DIAMOND-to-SAM writer and give it the DIAMOND output.
    writer = SimpleDiamondSAMWriter(SqliteIndex(db._connection))
    writer.addMatch('\t'.join(
        map(str, (diamondResult['bitscore'],
                  diamondResult['btop'],
                  diamondResult['qframe'],
                  diamondResult['qend'],
                  diamondResult['full_qqual'],
                  diamondResult['qlen'],
                  diamondResult['full_qseq'],
                  diamondResult['qseqid'],
                  diamondResult['qstart'],
                  diamondResult['slen'],
                  diamondResult['sstart'],
                  diamondResult['stitle']))))

    # Tell the writer to save the matches as SAM and check the result.
    fp = StringIO()
    writer.save(filename=fp)

    # In a SAM alignment line the second column is the flag and the fourth
    # is the 1-based position. The position is computed the same way as in
    # testPolymerase and evaluates to '31' here; the CIGAR evaluates to
    # '120M'.
    flag = '0'
    position = str(queryStartInProtein * 3 + 1)
    cigar = '%dM' % (3 * queryLenInProtein)  # (Exact) match of 40 AAs.

    self.assertEqual(
        '\n'.join([
            '@SQ\tSN:%s\tLN:%d' % (genomeAccession, len(genomeSequence)),
            '\t'.join([
                queryId,
                flag,
                genomeAccession,
                position,
                '255',
                cigar,
                '*',
                '0',
                '0',
                querySequence,
                queryQuality,
                'AS:i:%d' % int(diamondResult['bitscore']),
            ])
        ]) + '\n',
        fp.getvalue())
def testPolymerase(self):
    """
    Test the polymerase protein.

    A nucleotide query that encodes part of the protein is matched with
    DIAMOND, and the match is then converted to SAM against the genome
    the protein comes from.
    """
    self.maxDiff = None

    proteinAccession = 'CAK55121.1'
    proteinSequence = SAMPLE_DATA['proteins'][proteinAccession]['protein']
    proteinId = SAMPLE_DATA['proteins'][proteinAccession]['id']
    proteinRange = SAMPLE_DATA['proteins'][proteinAccession]['range']
    ranges = GenomeRanges(proteinRange)
    queryStartInProtein = 270  # This is a 0-based amino acid offset.
    queryLenInProtein = 470  # This is an amino acid length.

    genomeAccession = 'AM282986.1'
    genomeId = SAMPLE_DATA['genomes'][genomeAccession]['id']
    genomeSequence = SAMPLE_DATA['genomes'][genomeAccession]['genome']
    genomeLen = len(genomeSequence)

    # The query is the nucleotide sequence for a stretch of the protein,
    # made by taking the first codon ([0] below) of each amino acid.
    queryId = 'query'
    queryEndInProtein = queryStartInProtein + queryLenInProtein
    matchedAminoAcids = proteinSequence[queryStartInProtein:
                                        queryEndInProtein]
    querySequence = ''.join(CODONS[aa][0] for aa in matchedAminoAcids)
    queryQuality = 'E' * len(querySequence)

    # Build a DIAMOND database from the protein and search it with the
    # query. This really runs DIAMOND, so its executable must be
    # installed and be somewhere in your shell's PATH.
    with DiamondExecutor() as de:
        de.addSubject(Read(proteinId, proteinSequence))
        queries = Reads([Read(queryId, querySequence, queryQuality)])
        (diamondResult,) = list(de.search(queries))

    # Check that DIAMOND returned the expected match.
    expectedDiamondResult = {
        'bitscore': 974.0,
        'btop': str(queryLenInProtein),  # Exact match of all query AA.
        'qframe': 1,
        'qstart': 1,
        'qend': 3 * queryLenInProtein,
        'qlen': len(querySequence),
        'qseqid': 'query',
        'full_qseq': querySequence,
        'full_qqual': queryQuality,
        'slen': len(proteinSequence),
        'sstart': queryStartInProtein + 1,  # DIAMOND is 1-based.
        'stitle': proteinId,
    }
    self.assertEqual(expectedDiamondResult, diamondResult)

    # Create a genomes/proteins sqlite database holding the protein and
    # the nucleotide genome it comes from.
    db = SqliteIndexWriter(':memory:')
    db.addProtein(proteinAccession, genomeAccession, proteinSequence,
                  proteinRange, True, ranges.circular(genomeLen),
                  ranges.distinctRangeCount(genomeLen))

    genome = _Genome({
        'id': genomeAccession,
        'name': genomeId,
        'sequence': genomeSequence,
        'features': [],
    })
    # NOTE(review): the 'host' value looks garbled (perhaps it was meant
    # to be a species name) - confirm before changing it.
    genomeSource = {
        'host': 'H**o sapiens',
        'mol_type': 'DNA',
        'organism': 'Hepatitis B Virus',
    }
    db.addGenome(genome, source=genomeSource, taxonomyId=500,
                 proteinCount=len(SAMPLE_DATA['proteins']),
                 databaseName='test-db')

    # Give the DIAMOND output to a DIAMOND-to-SAM writer as one
    # TAB-separated line, with the fields in this order.
    fieldOrder = ('bitscore', 'btop', 'qframe', 'qend', 'full_qqual',
                  'qlen', 'full_qseq', 'qseqid', 'qstart', 'slen',
                  'sstart', 'stitle')
    writer = SimpleDiamondSAMWriter(SqliteIndex(db._connection))
    writer.addMatch(
        '\t'.join(str(diamondResult[field]) for field in fieldOrder))

    # Save the matches as SAM and check the result.
    fp = StringIO()
    writer.save(filename=fp)

    flags = 0
    headerLine = '@SQ\tSN:%s\tLN:%d' % (genomeAccession,
                                        len(genomeSequence))
    alignmentFields = (
        queryId,
        flags,
        genomeAccession,
        queryStartInProtein * 3 + 1,
        255,
        '1410M',
        '*',
        0,
        0,
        querySequence,
        queryQuality,
        'AS:i:%d' % int(diamondResult['bitscore']),
    )
    alignmentLine = '\t'.join(str(field) for field in alignmentFields)

    self.assertEqual(headerLine + '\n' + alignmentLine + '\n',
                     fp.getvalue())