Esempio n. 1
0
 def testCountGenomesEmpty(self):
     """
     A freshly created database, to which nothing has been added, must
     report a genome count of zero.
     """
     indexWriter = SqliteIndexWriter(':memory:')
     # The reader shares the writer's connection so that it sees the same
     # in-memory database.
     index = SqliteIndex(indexWriter._connection)
     genomeCount = index.genomeCount()
     self.assertEqual(0, genomeCount)
     indexWriter.close()
     index.close()
Esempio n. 2
0
 def testCountProteinsOne(self):
     """
     After a single protein has been added, the database must report a
     protein count of one.
     """
     indexWriter = SqliteIndexWriter(':memory:')
     proteinArgs = ('NC222', 'NC23', 'AAA', 'offets', True, False, 3)
     indexWriter.addProtein(*proteinArgs)
     # The reader shares the writer's connection so that it sees the same
     # in-memory database.
     index = SqliteIndex(indexWriter._connection)
     proteinCount = index.proteinCount()
     self.assertEqual(1, proteinCount)
     indexWriter.close()
     index.close()
Esempio n. 3
0
def main(args, parser):
    """
    Build the protein database.

    Every (filename, adder function, type) triple produced by
    C{filenamesAndAdders} is processed in turn, using a
    C{SqliteIndexWriter} opened on C{args.databaseFile}. A warning is
    printed to standard error for any file in which no genomes were
    examined. If C{args.progress} is true, per-file and overall timing and
    count information is also printed to standard error.

    @param args: The namespace of command-line arguments returned by
        argparse.parse_args()
    @param parser: An C{argparse.ArgumentParser} instance.
    @raise ValueError: If both C{args.minGenomeLength} and
        C{args.maxGenomeLength} are given and the minimum exceeds the
        maximum.
    """

    if (args.minGenomeLength is not None and args.maxGenomeLength is not None
            and args.minGenomeLength > args.maxGenomeLength):
        raise ValueError(
            '--minGenomeLength cannot be larger than --maxGenomeLength')

    if args.excludeExclusiveHost:
        # The option value is an iterable of iterables; flatten it into a
        # single set of host names.
        excludeExclusiveHosts = set(
            chain.from_iterable(args.excludeExclusiveHost))
    else:
        excludeExclusiveHosts = None

    taxonomyDatabase = parseTaxonomyDatabaseCommandLineOptions(args, parser)
    progress = args.progress

    if progress:
        overallStart = time()
        totalGenomeCount = totalProteinCount = 0

    # Give fileCount a value before the loop. It is otherwise only bound by
    # the enumerate() below, so the final summary would raise a NameError
    # when there are no input files to process.
    fileCount = 0

    with SqliteIndexWriter(args.databaseFile) as db:
        for fileCount, (filename, addFunc,
                        type_) in enumerate(filenamesAndAdders(args, db),
                                            start=1):

            if args.logFile:
                print("\n>>> Indexing '%s'." % filename,
                      end='\n\n',
                      file=args.logFile)

            if progress:
                start = time()

            examinedGenomeCount, genomeCount, proteinCount = addFunc(
                filename,
                dnaOnly=args.dnaOnly,
                rnaOnly=args.rnaOnly,
                minGenomeLength=args.minGenomeLength,
                maxGenomeLength=args.maxGenomeLength,
                excludeExclusiveHosts=excludeExclusiveHosts,
                excludeFungusOnlyViruses=args.excludeFungusOnlyViruses,
                excludePlantOnlyViruses=args.excludePlantOnlyViruses,
                databaseName=args.databaseName,
                taxonomyDatabase=taxonomyDatabase,
                proteinSource=args.proteinSource,
                genomeSource=args.genomeSource,
                duplicationPolicy=args.duplicationPolicy,
                logfp=args.logFile)

            if examinedGenomeCount == 0:
                if type_ == 'gb':
                    print('WARNING: No genomes found in %r. Did the GenBank '
                          'download fail on that file?' % filename,
                          file=sys.stderr)
                else:
                    # Only 'gb' and 'json' input types are expected here.
                    assert type_ == 'json'
                    print('WARNING: no genomes found in JSON file %r.' %
                          filename,
                          file=sys.stderr)

            if progress:
                elapsed = time() - start
                totalGenomeCount += genomeCount
                totalProteinCount += proteinCount
                # The singular 'genome' is followed by a space (instead of
                # an 's') so that the per-file output lines stay aligned.
                print('Processed %r: added %3d of %3d genome%s (%5d '
                      'protein%s) in %.2f seconds.' %
                      (filename, genomeCount, examinedGenomeCount,
                       ' ' if examinedGenomeCount == 1 else 's', proteinCount,
                       '' if proteinCount == 1 else 's', elapsed),
                      file=sys.stderr)

    if progress:
        elapsed = time() - overallStart
        print('%d files (containing %d genomes and %d proteins) '
              'indexed in %.2f seconds (%.2f mins).' %
              (fileCount, totalGenomeCount, totalProteinCount, elapsed,
               elapsed / 60),
              file=sys.stderr)
Esempio n. 4
0
    def testTibetanFrogHBV(self):
        """
        Test that Tibetan frogs can get HBV.

        A nucleotide query is built from part of a sample protein, DIAMOND
        is run to match it against that protein, and the DIAMOND result is
        converted to SAM and compared with the expected SAM output.
        """
        proteinAccession = 'YP_009259545.1'
        proteinSequence = SAMPLE_DATA['proteins'][proteinAccession]['protein']
        proteinId = SAMPLE_DATA['proteins'][proteinAccession]['id']
        proteinRange = SAMPLE_DATA['proteins'][proteinAccession]['range']
        ranges = GenomeRanges(proteinRange)
        queryStartInProtein = 10  # This is a 0-based amino acid offset.
        queryLenInProtein = 40  # This is an amino acid length.

        genomeAccession = 'NC_030446.1'
        genomeSequence = SAMPLE_DATA['genomes'][genomeAccession]['genome']
        genomeLen = len(genomeSequence)

        # The query sequence is nucleotides that match the amino acids in the
        # protein. Here we use the first ([0] in the below) codon for each
        # amino acid to make the nucleotide sequence.
        queryId = 'query'
        querySequence = ''.join(
            CODONS[aa][0]
            for aa in proteinSequence[queryStartInProtein:queryStartInProtein +
                                      queryLenInProtein])
        queryQuality = 'E' * len(querySequence)

        # Use the protein sequence to make a DIAMOND database and run DIAMOND
        # on the query. Yes, this really runs DIAMOND, so you need to have it
        # installed, with its executable somewhere in your shell's PATH.
        with DiamondExecutor() as de:
            de.addSubject(Read(proteinId, proteinSequence))
            queries = Reads([Read(queryId, querySequence, queryQuality)])
            (diamondResult, ) = list(de.search(queries))

        # Make sure DIAMOND gives us back what we expected.
        self.assertEqual(
            {
                'bitscore': 83.6,
                'btop': str(queryLenInProtein),  # Exact match of all AAs.
                'qframe': 1,
                'qend': 3 * queryLenInProtein,
                'full_qqual': queryQuality,
                'qlen': len(querySequence),
                'full_qseq': querySequence,
                'qseqid': 'query',
                'qstart': 1,
                'slen': len(proteinSequence),
                'sstart': queryStartInProtein + 1,  # DIAMOND is 1-based.
                'stitle': proteinId,
            },
            diamondResult)

        # Make a genomes/proteins sqlite database and add information about
        # the protein and the nucleotide genome it comes from.
        db = SqliteIndexWriter(':memory:')
        db.addProtein(proteinAccession, genomeAccession, proteinSequence,
                      proteinRange, True, ranges.circular(genomeLen),
                      ranges.distinctRangeCount(genomeLen))

        # NOTE(review): _Genome is instantiated with no arguments (the
        # intended ones appear to be those named in the comment inside the
        # call) and addGenome is called positionally. Confirm both against
        # the current _Genome and SqliteIndexWriter.addGenome signatures.
        genome = _Genome(
            # genomeAccession, genomeName, genomeSequence
        )
        db.addGenome(genome, 1, 'test-db')

        # Make a DIAMOND-to-SAM writer and give it the DIAMOND output.
        writer = SimpleDiamondSAMWriter(SqliteIndex(db._connection))
        writer.addMatch('\t'.join(
            map(str, (diamondResult['bitscore'], diamondResult['btop'],
                      diamondResult['qframe'], diamondResult['qend'],
                      diamondResult['full_qqual'], diamondResult['qlen'],
                      diamondResult['full_qseq'], diamondResult['qseqid'],
                      diamondResult['qstart'], diamondResult['slen'],
                      diamondResult['sstart'], diamondResult['stitle']))))

        # Tell the writer to save the matches as SAM and check the result.
        fp = StringIO()
        writer.save(filename=fp)

        # In a SAM alignment line the second field is FLAG and the fourth
        # is the 1-based leftmost position. The position is the nucleotide
        # offset corresponding to the amino acid offset of the query in the
        # protein (3 nucleotides per amino acid), plus one.
        flags = '0'
        position = str(queryStartInProtein * 3 + 1)
        self.assertEqual(
            '\n'.join([
                '@SQ\tSN:%s\tLN:%d' % (genomeAccession, len(genomeSequence)),
                '\t'.join([
                    queryId,
                    flags,
                    genomeAccession,
                    position,
                    '255',
                    '120M',  # (Exact) match of 40 AAs.
                    '*',
                    '0',
                    '0',
                    querySequence,
                    queryQuality,
                    'AS:i:%d' % int(diamondResult['bitscore']),
                ])
            ]) + '\n',
            fp.getvalue())
Esempio n. 5
0
    def testPolymerase(self):
        """
        Test the polymerase protein.

        A nucleotide query is built from part of a sample protein, DIAMOND
        is run to match it against that protein, and the DIAMOND result is
        converted to SAM and compared with the expected SAM output.
        """
        self.maxDiff = None
        proteinAccession = 'CAK55121.1'
        proteinSequence = SAMPLE_DATA['proteins'][proteinAccession]['protein']
        proteinId = SAMPLE_DATA['proteins'][proteinAccession]['id']
        proteinRange = SAMPLE_DATA['proteins'][proteinAccession]['range']
        ranges = GenomeRanges(proteinRange)
        queryStartInProtein = 270  # This is a 0-based amino acid offset.
        queryLenInProtein = 470  # This is an amino acid length.

        genomeAccession = 'AM282986.1'
        genomeId = SAMPLE_DATA['genomes'][genomeAccession]['id']
        genomeSequence = SAMPLE_DATA['genomes'][genomeAccession]['genome']
        genomeLen = len(genomeSequence)

        # The query sequence is nucleotides that match the amino acids in the
        # protein. Here we use the first ([0] in the below) codon for each
        # amino acid to make the nucleotide sequence.
        queryId = 'query'
        querySequence = ''.join(
            CODONS[aa][0]
            for aa in proteinSequence[queryStartInProtein:queryStartInProtein +
                                      queryLenInProtein])
        queryQuality = 'E' * len(querySequence)

        # Use the protein sequence to make a DIAMOND database and run DIAMOND
        # on the query. Yes, this really runs DIAMOND, so you need to have it
        # installed, with its executable somewhere in your shell's PATH.
        with DiamondExecutor() as de:
            de.addSubject(Read(proteinId, proteinSequence))
            queries = Reads([Read(queryId, querySequence, queryQuality)])
            (diamondResult, ) = list(de.search(queries))

        # Make sure DIAMOND gives us back what we expected.
        self.assertEqual(
            {
                'bitscore': 974.0,
                'btop': str(queryLenInProtein),  # Exact match of all query AA.
                'qframe': 1,
                'qstart': 1,
                'qend': 3 * queryLenInProtein,
                'qlen': len(querySequence),
                'qseqid': 'query',
                'full_qseq': querySequence,
                'full_qqual': queryQuality,
                'slen': len(proteinSequence),
                'sstart': queryStartInProtein + 1,  # DIAMOND is 1-based.
                'stitle': proteinId,
            },
            diamondResult)

        # Make a genomes/proteins sqlite database and add information about
        # the protein and the nucleotide genome it comes from.
        db = SqliteIndexWriter(':memory:')
        db.addProtein(proteinAccession, genomeAccession, proteinSequence,
                      proteinRange, True, ranges.circular(genomeLen),
                      ranges.distinctRangeCount(genomeLen))

        genome = _Genome({
            'id': genomeAccession,
            'name': genomeId,
            'sequence': genomeSequence,
            'features': [],
        })

        db.addGenome(genome,
                     source={
                         'host': 'Homo sapiens',
                         'mol_type': 'DNA',
                         'organism': 'Hepatitis B Virus',
                     },
                     taxonomyId=500,
                     proteinCount=len(SAMPLE_DATA['proteins']),
                     databaseName='test-db')

        # Make a DIAMOND-to-SAM writer and give it the DIAMOND output.
        writer = SimpleDiamondSAMWriter(SqliteIndex(db._connection))
        writer.addMatch('\t'.join(
            map(str, (diamondResult['bitscore'], diamondResult['btop'],
                      diamondResult['qframe'], diamondResult['qend'],
                      diamondResult['full_qqual'], diamondResult['qlen'],
                      diamondResult['full_qseq'], diamondResult['qseqid'],
                      diamondResult['qstart'], diamondResult['slen'],
                      diamondResult['sstart'], diamondResult['stitle']))))

        # Tell the writer to save the matches as SAM and check the result.
        fp = StringIO()
        writer.save(filename=fp)

        flags = 0
        # The expected 1-based position is the nucleotide offset
        # corresponding to the amino acid offset of the query in the
        # protein (3 nucleotides per amino acid), plus one.
        position = queryStartInProtein * 3 + 1
        self.assertEqual(
            '\n'.join(('@SQ\tSN:%s\tLN:%d' %
                       (genomeAccession, len(genomeSequence)), '\t'.join(
                           map(str, (
                               queryId,
                               flags,
                               genomeAccession,
                               position,
                               255,
                               '1410M',  # (Exact) match of 470 AAs.
                               '*',
                               0,
                               0,
                               querySequence,
                               queryQuality,
                               'AS:i:%d' % int(diamondResult['bitscore']),
                           ))))) + '\n', fp.getvalue())