def testNoFilesToStr(self):
     """
     A protein grouper that has not been given any files must produce the
     expected text summary from toStr.
     """
     grouper = ProteinGrouper()
     expected = '0 viruses found in 0 samples\n'
     self.assertEqual(expected, grouper.toStr())
Beispiel #2
0
 def testNoFilesToStr(self):
     """
     With no files added, the text string format of a protein grouper
     must be as expected.
     """
     grouper = ProteinGrouper()
     summary = grouper.toStr()
     self.assertEqual('0 viruses found in 0 samples\n', summary)
Beispiel #3
0
 def testOneLineInOneFileWithDifferentAssetDir(self):
     """
     When a protein grouper is created with a non-default assetDir, that
     name must appear as outDir and as the directory of the plot and reads
     filenames in its pathogenNames dict.
     """
     proteinName = 'gi|327|X|I44.6 ubiquitin'
     fp = StringIO(
         '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n')
     grouper = ProteinGrouper(assetDir='differentname')
     grouper.addFile('sample-filename', fp)

     proteinInfo = {
         'bestScore': 48.1,
         'bluePlotFilename': 'differentname/0.png',
         'coverage': 0.77,
         'readsFilename': 'differentname/0.fasta',
         'hspCount': 6,
         'index': 0,
         'medianScore': 46.6,
         'outDir': 'differentname',
         'proteinLength': 74,
         'proteinName': proteinName,
         'proteinURL': 'http://www.ncbi.nlm.nih.gov/nuccore/I44',
         'readCount': 5,
     }
     expected = {
         'Lausannevirus': {
             'sample-filename': {
                 'proteins': {proteinName: proteinInfo},
                 'uniqueReadCount': None,
             },
         },
     }
     self.assertEqual(expected, grouper.pathogenNames)
Beispiel #4
0
 def testOneLineInOneFileFASTQ(self):
     """
     A protein grouper created with format_='fastq' and given one file
     holding one line must have the expected pathogenNames dict, with the
     reads filename carrying a .fastq suffix.
     """
     proteinName = 'gi|327|X|I44.6 ubiquitin'
     fp = StringIO(
         '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n')
     grouper = ProteinGrouper(format_='fastq')
     grouper.addFile('sample-filename', fp)

     proteinInfo = {
         'bestScore': 48.1,
         'bluePlotFilename': 'out/0.png',
         'coverage': 0.77,
         'readsFilename': 'out/0.fastq',
         'hspCount': 6,
         'index': 0,
         'medianScore': 46.6,
         'outDir': 'out',
         'proteinLength': 74,
         'proteinName': proteinName,
         'proteinURL': 'http://www.ncbi.nlm.nih.gov/nuccore/I44',
         'readCount': 5,
     }
     expected = {
         'Lausannevirus': {
             'sample-filename': {
                 'proteins': {proteinName: proteinInfo},
                 'uniqueReadCount': None,
             },
         },
     }
     self.assertEqual(expected, grouper.pathogenNames)
 def testOneLineInOneFile(self):
     """
     After adding a single file containing a single line, the virusTitles
     dict must hold exactly one protein record for that virus and sample.
     """
     fp = StringIO(
         '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n')
     grouper = ProteinGrouper()
     grouper.addFile('sample-filename', fp)

     proteinRecord = {
         'bestScore': 48.1,
         'bluePlotFilename': 'out/0.png',
         'coverage': 0.77,
         'fastaFilename': 'out/0.fasta',
         'hspCount': 6,
         'index': 0,
         'medianScore': 46.6,
         'outDir': 'out',
         'proteinLength': 74,
         'proteinTitle': 'gi|327|X|I44.6 ubiquitin',
         'proteinURL': 'http://www.ncbi.nlm.nih.gov/nuccore/I44',
         'readCount': 5,
     }
     expected = {'Lausannevirus': {'sample-filename': [proteinRecord]}}
     self.assertEqual(expected, grouper.virusTitles)
Beispiel #6
0
 def testOneLineInEachOfTwoFilesDifferentPathogens(self):
     """
     Adding two files that live in different directories, each holding one
     line for a different pathogen, must give a pathogenNames dict with one
     entry per pathogen whose output paths are prefixed by the directory
     of the corresponding file.
     """
     lausanneFile = StringIO(
         '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n')
     hepatitisFile = StringIO(
         '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Hepatitis B virus]\n')
     grouper = ProteinGrouper()
     grouper.addFile('dir-1/sample-filename-1', lausanneFile)
     grouper.addFile('dir-2/sample-filename-2', hepatitisFile)

     lausanneProtein = {
         'bestScore': 44.2,
         'bluePlotFilename': 'dir-1/out/0.png',
         'coverage': 0.63,
         'readsFilename': 'dir-1/out/0.fasta',
         'hspCount': 9,
         'index': 0,
         'medianScore': 41.3,
         'outDir': 'dir-1/out',
         'proteinLength': 12,
         'proteinName': 'gi|327410| protein 77',
         'proteinURL': None,
         'readCount': 9,
     }
     hepatitisProtein = {
         'bestScore': 48.1,
         'bluePlotFilename': 'dir-2/out/0.png',
         'coverage': 0.77,
         'readsFilename': 'dir-2/out/0.fasta',
         'hspCount': 6,
         'index': 0,
         'medianScore': 46.6,
         'outDir': 'dir-2/out',
         'proteinLength': 74,
         'proteinName': 'gi|327409| ubiquitin',
         'proteinURL': None,
         'readCount': 5,
     }
     expected = {
         'Lausannevirus': {
             'dir-1/sample-filename-1': {
                 'proteins': {'gi|327410| protein 77': lausanneProtein},
                 'uniqueReadCount': None,
             },
         },
         'Hepatitis B virus': {
             'dir-2/sample-filename-2': {
                 'proteins': {'gi|327409| ubiquitin': hepatitisProtein},
                 'uniqueReadCount': None,
             },
         },
     }
     self.assertEqual(expected, grouper.pathogenNames)
Beispiel #7
0
 def testOneLineInEachOfTwoFilesSamePathogen(self):
     """
     Adding two files, each holding one line for the same pathogen, must
     give a pathogenNames dict with a single pathogen entry containing
     both samples.
     """
     firstFile = StringIO(
         '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n')
     secondFile = StringIO(
         '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n')
     grouper = ProteinGrouper()
     grouper.addFile('sample-filename-1', firstFile)
     grouper.addFile('sample-filename-2', secondFile)

     firstProtein = {
         'bestScore': 44.2,
         'bluePlotFilename': 'out/0.png',
         'coverage': 0.63,
         'readsFilename': 'out/0.fasta',
         'hspCount': 9,
         'index': 0,
         'medianScore': 41.3,
         'outDir': 'out',
         'proteinLength': 12,
         'proteinName': 'gi|327410| protein 77',
         'proteinURL': None,
         'readCount': 9,
     }
     secondProtein = {
         'bestScore': 48.1,
         'bluePlotFilename': 'out/0.png',
         'coverage': 0.77,
         'readsFilename': 'out/0.fasta',
         'hspCount': 6,
         'index': 0,
         'medianScore': 46.6,
         'outDir': 'out',
         'proteinLength': 74,
         'proteinName': 'gi|327409| ubiquitin',
         'proteinURL': None,
         'readCount': 5,
     }
     expected = {
         'Lausannevirus': {
             'sample-filename-1': {
                 'proteins': {'gi|327410| protein 77': firstProtein},
                 'uniqueReadCount': None,
             },
             'sample-filename-2': {
                 'proteins': {'gi|327409| ubiquitin': secondProtein},
                 'uniqueReadCount': None,
             },
         },
     }
     self.assertEqual(expected, grouper.pathogenNames)
Beispiel #8
0
 def testNoFilesToStr(self):
     """
     A protein grouper that has not been given any files must produce the
     expected text summary from toStr.
     """
     grouper = ProteinGrouper()
     expected = (
         'Overall, proteins from 0 pathogens were found in 0 samples.\n')
     self.assertEqual(expected, grouper.toStr())
Beispiel #9
0
 def testTwoLinesInOneFileDifferentPathogens(self):
     """
     Adding one file whose two lines belong to different pathogens must
     give a pathogenNames dict with one entry per pathogen, both under the
     same sample, with the second protein at index 1.
     """
     fp = StringIO(
         '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
         '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Hepatitis B virus]\n')
     grouper = ProteinGrouper()
     grouper.addFile('sample-filename', fp)

     lausanneProtein = {
         'bestScore': 44.2,
         'bluePlotFilename': 'out/0.png',
         'coverage': 0.63,
         'readsFilename': 'out/0.fasta',
         'hspCount': 9,
         'index': 0,
         'medianScore': 41.3,
         'outDir': 'out',
         'proteinLength': 12,
         'proteinName': 'gi|327410| protein 77',
         'proteinURL': None,
         'readCount': 9,
     }
     hepatitisProtein = {
         'bestScore': 48.1,
         'bluePlotFilename': 'out/1.png',
         'coverage': 0.77,
         'readsFilename': 'out/1.fasta',
         'hspCount': 6,
         'index': 1,
         'medianScore': 46.6,
         'outDir': 'out',
         'proteinLength': 74,
         'proteinName': 'gi|327409| ubiquitin',
         'proteinURL': None,
         'readCount': 5,
     }
     expected = {
         'Lausannevirus': {
             'sample-filename': {
                 'proteins': {'gi|327410| protein 77': lausanneProtein},
                 'uniqueReadCount': None,
             },
         },
         'Hepatitis B virus': {
             'sample-filename': {
                 'proteins': {'gi|327409| ubiquitin': hepatitisProtein},
                 'uniqueReadCount': None,
             },
         },
     }
     self.assertEqual(expected, grouper.pathogenNames)
Beispiel #10
0
 def testNoFilesToStr(self):
     """
     With no files added, the text string format of a protein grouper
     must be as expected.
     """
     grouper = ProteinGrouper()
     summary = grouper.toStr()
     self.assertEqual(
         'Overall, proteins from 0 pathogens were found in 0 samples.\n',
         summary)
 def testOneLineInOneFileTitle(self):
     """
     After adding one file containing one line, the _title method of a
     protein grouper must report one virus in one sample.
     """
     grouper = ProteinGrouper()
     grouper.addFile(
         'sample-filename',
         StringIO('0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin '
                  '[Lausannevirus]\n'))
     title = grouper._title()
     self.assertEqual('1 virus found in 1 sample', title)
Beispiel #12
0
 def testOneLineInOneFileTitle(self):
     """
     A protein grouper given a single one-line file must return the
     expected singular-form string from its _title method.
     """
     sampleData = StringIO(
         '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n')
     grouper = ProteinGrouper()
     grouper.addFile('sample-filename', sampleData)
     expected = '1 virus found in 1 sample'
     self.assertEqual(expected, grouper._title())
 def testNoFilesToHTML(self):
     """
     A protein grouper that has not been given any files must produce the
     expected HTML document from toHTML.
     """
     grouper = ProteinGrouper()
     # The expected output, one list element per line of HTML.
     expectedLines = [
         '<html>',
         '<head>',
         '<title>',
         '0 viruses found in 0 samples',
         '</title>',
         '</head>',
         '<body>',
         '<style>',
         '            body {',
         '                margin-left: 2%;',
         '                margin-right: 2%;',
         '            }',
         '            .sample {',
         '                margin-bottom: 2px;',
         '            }',
         '            .sample-name {',
         '                color: red;',
         '            }',
         '            .index {',
         '                font-size: small;',
         '            }',
         '            .protein-title {',
         '                font-family: "Courier New", Courier, '
         'monospace;',
         '            }',
         '            .stats {',
         '                font-family: "Courier New", Courier, '
         'monospace;',
         '                white-space: pre;',
         '            }',
         '            .protein-list {',
         '                margin-top: 2px;',
         '            }',
         '</style>',
         '</head>',
         '<body>',
         '<h1>0 viruses found in 0 samples</h1>',
         '<h2>Virus index</h2>',
         '</p>',
         '<h2>Sample index</h2>',
         '</p>',
         '<h1>Viruses by sample</h1>',
         '<h1>Samples by virus</h1>',
         '</body>',
         '</html>',
     ]
     expected = '\n'.join(expectedLines)
     self.assertEqual(expected, grouper.toHTML())
Beispiel #14
0
 def testTwoLinesInOneFileDifferentPathogens(self):
     """
     A protein grouper given one file with two lines, each from a
     different pathogen, must have a pathogenNames dict holding both
     pathogens under the same sample name.
     """
     sampleData = StringIO(
         '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
         '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Hepatitis B virus]\n')
     grouper = ProteinGrouper()
     grouper.addFile('sample-filename', sampleData)

     protein77 = {
         'bestScore': 44.2,
         'bluePlotFilename': 'out/0.png',
         'coverage': 0.63,
         'readsFilename': 'out/0.fasta',
         'hspCount': 9,
         'index': 0,
         'medianScore': 41.3,
         'outDir': 'out',
         'proteinLength': 12,
         'proteinName': 'gi|327410| protein 77',
         'proteinURL': None,
         'readCount': 9,
     }
     ubiquitin = {
         'bestScore': 48.1,
         'bluePlotFilename': 'out/1.png',
         'coverage': 0.77,
         'readsFilename': 'out/1.fasta',
         'hspCount': 6,
         'index': 1,
         'medianScore': 46.6,
         'outDir': 'out',
         'proteinLength': 74,
         'proteinName': 'gi|327409| ubiquitin',
         'proteinURL': None,
         'readCount': 5,
     }
     expected = {
         'Lausannevirus': {
             'sample-filename': {
                 'proteins': {'gi|327410| protein 77': protein77},
                 'uniqueReadCount': None,
             },
         },
         'Hepatitis B virus': {
             'sample-filename': {
                 'proteins': {'gi|327409| ubiquitin': ubiquitin},
                 'uniqueReadCount': None,
             },
         },
     }
     self.assertEqual(expected, grouper.pathogenNames)
 def testOneLineInEachOfTwoFilesDifferentViruses(self):
     """
     Adding two files that live in different directories, each holding one
     line for a different virus, must give a virusTitles dict with one
     entry per virus whose output paths are prefixed by the directory of
     the corresponding file.
     """
     lausanneFile = StringIO(
         '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n')
     hepatitisFile = StringIO(
         '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Hepatitis B virus]\n')
     grouper = ProteinGrouper()
     grouper.addFile('dir-1/sample-filename-1', lausanneFile)
     grouper.addFile('dir-2/sample-filename-2', hepatitisFile)

     lausanneRecord = {
         'bestScore': 44.2,
         'bluePlotFilename': 'dir-1/out/0.png',
         'coverage': 0.63,
         'fastaFilename': 'dir-1/out/0.fasta',
         'hspCount': 9,
         'index': 0,
         'medianScore': 41.3,
         'outDir': 'dir-1/out',
         'proteinLength': 12,
         'proteinTitle': 'gi|327410| protein 77',
         'proteinURL': None,
         'readCount': 9,
     }
     hepatitisRecord = {
         'bestScore': 48.1,
         'bluePlotFilename': 'dir-2/out/0.png',
         'coverage': 0.77,
         'fastaFilename': 'dir-2/out/0.fasta',
         'hspCount': 6,
         'index': 0,
         'medianScore': 46.6,
         'outDir': 'dir-2/out',
         'proteinLength': 74,
         'proteinTitle': 'gi|327409| ubiquitin',
         'proteinURL': None,
         'readCount': 5,
     }
     expected = {
         'Lausannevirus': {
             'dir-1/sample-filename-1': [lausanneRecord],
         },
         'Hepatitis B virus': {
             'dir-2/sample-filename-2': [hepatitisRecord],
         },
     }
     self.assertEqual(expected, grouper.virusTitles)
Beispiel #16
0
 def testNoFilesToHTML(self):
     """
     With no files added, the HTML string format of a protein grouper
     must be as expected.
     """
     htmlLines = [
         '<html>',
         '<head>',
         '<title>',
         '0 viruses found in 0 samples',
         '</title>',
         '</head>',
         '<body>',
         '<style>',
         '            body {',
         '                margin-left: 2%;',
         '                margin-right: 2%;',
         '            }',
         '            .sample {',
         '                margin-bottom: 2px;',
         '            }',
         '            .sample-name {',
         '                color: red;',
         '            }',
         '            .index {',
         '                font-size: small;',
         '            }',
         '            .protein-title {',
         '                font-family: "Courier New", Courier, '
         'monospace;',
         '            }',
         '            .stats {',
         '                font-family: "Courier New", Courier, '
         'monospace;',
         '                white-space: pre;',
         '            }',
         '            .protein-list {',
         '                margin-top: 2px;',
         '            }',
         '</style>',
         '</head>',
         '<body>',
         '<h1>0 viruses found in 0 samples</h1>',
         '<h2>Virus index</h2>',
         '</p>',
         '<h2>Sample index</h2>',
         '</p>',
         '<h1>Viruses by sample</h1>',
         '<h1>Samples by virus</h1>',
         '</body>',
         '</html>',
     ]
     grouper = ProteinGrouper()
     self.assertEqual('\n'.join(htmlLines), grouper.toHTML())
Beispiel #17
0
 def testTwoLinesInOneFileTitle(self):
     """
     After adding one file whose two protein lines come from different
     viruses, the _title method of a protein grouper must report two
     viruses in one sample.
     """
     sampleData = StringIO(
         '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n'
         '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [X Virus]\n')
     grouper = ProteinGrouper()
     grouper.addFile('sample-filename', sampleData)
     title = grouper._title()
     self.assertEqual('2 viruses found in 1 sample', title)
Beispiel #18
0
 def testOneLineInEachOfTwoFilesDifferentVirusesTitle(self):
     """
     After adding two files, each with one line from a different virus,
     the _title method of a protein grouper must report two viruses in
     two samples.
     """
     grouper = ProteinGrouper()
     # Pair each sample name with the content of its (in-memory) file.
     samples = (
         ('sample-filename-1',
          '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'),
         ('sample-filename-2',
          '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [HBV]\n'),
     )
     for sampleName, content in samples:
         grouper.addFile(sampleName, StringIO(content))
     self.assertEqual('2 viruses found in 2 samples', grouper._title())
Beispiel #19
0
    def testOpenNotCalledOnRepeatedCall(self):
        """
        A second call to pathogenSampleFiles.add with the same arguments
        must not open any file, since the result of the first call is
        expected to have been cached.
        """
        class Open(object):
            """
            Stand-in for the built-in open that tolerates exactly two calls:
            the first for 'out/0.fasta' (answered with canned FASTA lines)
            and the second for 'out/pathogen-0-sample-0.fasta' (answered
            with the given manager). Any later call fails the test.
            """
            def __init__(self, test, manager):
                self.test = test
                self.manager = manager
                self.count = 0

            def sideEffect(self, filename, *args, **kwargs):
                if self.count == 0:
                    # First call: the reads for the protein.
                    self.test.assertEqual('out/0.fasta', filename)
                    self.count += 1
                    return File(['>id1\n', 'ACTG\n'])

                if self.count == 1:
                    # Second call: the per-pathogen, per-sample output.
                    self.test.assertEqual('out/pathogen-0-sample-0.fasta',
                                          filename)
                    self.count += 1
                    return self.manager

                self.test.fail(
                    'We are only supposed to be called twice. '
                    'Filename: %r, Args: %r, Keyword args: %r.' %
                    (filename, args, kwargs))

        sampleData = StringIO(
            '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n')
        fastaIO = StringIO()

        @contextmanager
        def manager():
            yield fastaIO

        grouper = ProteinGrouper()
        grouper.addFile('filename-1', sampleData)
        pathogenSampleFiles = PathogenSampleFiles(grouper)

        opener = Open(self, manager())
        with patch.object(builtins, 'open') as mockOpen:
            mockOpen.side_effect = opener.sideEffect
            firstResult = pathogenSampleFiles.add(
                'Lausannevirus', 'filename-1')
            self.assertEqual('out/pathogen-0-sample-0.fasta', firstResult)
            self.assertEqual('>id1\nACTG\n', fastaIO.getvalue())

            # Same arguments again. The patched open fails the test if it
            # is called a third time.
            secondResult = pathogenSampleFiles.add(
                'Lausannevirus', 'filename-1')
            self.assertEqual('out/pathogen-0-sample-0.fasta', secondResult)
Beispiel #20
0
    def testIdenticalReadsRemoved(self):
        """
        When the same read matches two proteins of one pathogen, the FASTA
        written for that pathogen must contain the duplicated read only
        once.
        """
        class Open(object):
            """
            Stand-in for the built-in open that accepts each expected
            filename once and fails the test for any other filename.
            """
            def __init__(self, test, manager):
                self.test = test
                self.manager = manager
                self.expectedFilenames = {
                    'out/0.fasta',
                    'out/1.fasta',
                    'out/pathogen-0-sample-0.fasta',
                }

            def sideEffect(self, filename, *args, **kwargs):
                try:
                    self.expectedFilenames.remove(filename)
                except KeyError:
                    self.test.fail(
                        'Open called with unexpected filename: %r, Args: %r, '
                        'Keyword args: %r.' % (filename, args, kwargs))

                # Canned content for the two per-protein reads files. Read
                # id1 appears in both.
                cannedLines = {
                    'out/0.fasta': ['>id1\n', 'ACTG\n'],
                    'out/1.fasta': ['>id1\n', 'ACTG\n', '>id2\n', 'CAGT\n'],
                }
                if filename in cannedLines:
                    return File(cannedLines[filename])
                return self.manager

        sampleData = StringIO(
            '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
            '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n')
        fastaIO = StringIO()

        @contextmanager
        def manager():
            yield fastaIO

        grouper = ProteinGrouper()
        grouper.addFile('filename-1', sampleData)
        pathogenSampleFiles = PathogenSampleFiles(grouper)

        opener = Open(self, manager())
        with patch.object(builtins, 'open') as mockOpen:
            mockOpen.side_effect = opener.sideEffect
            result = pathogenSampleFiles.add('Lausannevirus', 'filename-1')

        self.assertEqual('out/pathogen-0-sample-0.fasta', result)
        self.assertEqual('>id1\nACTG\n>id2\nCAGT\n', fastaIO.getvalue())
        # Every expected filename must have been passed to the patched open.
        self.assertEqual(set(), opener.expectedFilenames)
Beispiel #21
0
 def testDuplicatePathogenProteinSample(self):
     """
     If a protein grouper is given duplicate information for a
     pathogen/protein/sample combination it must raise a ValueError.
     """
     fp = StringIO(
         '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n')
     pg = ProteinGrouper()
     pg.addFile('sample', fp)
     # Rewind so that the identical content is read again under the same
     # sample name.
     fp.seek(0)
     # Raw strings keep the regex backslashes from being treated as
     # (invalid) string escape sequences. The dot in 'I44.6' is escaped so
     # that it only matches a literal dot.
     error = (r"^Protein 'gi\|327\|X\|I44\.6 ubiquitin' already seen for "
              r"pathogen 'Lausannevirus' sample 'sample'\.$")
     assertRaisesRegex(self, ValueError, error, pg.addFile, 'sample', fp)
Beispiel #22
0
 def testDuplicatePathogenProteinSample(self):
     """
     If a protein grouper is given duplicate information for a
     pathogen/protein/sample combination it must raise a ValueError.
     """
     fp = StringIO(
         '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n')
     pg = ProteinGrouper()
     pg.addFile('sample', fp)
     # Rewind so that the identical content is read again under the same
     # sample name.
     fp.seek(0)
     # Raw strings avoid doubled backslashes in the regex. The dot in
     # 'I44.6' is escaped so that it only matches a literal dot rather
     # than any character.
     error = (r"^Protein 'gi\|327\|X\|I44\.6 ubiquitin' already seen for "
              r"pathogen 'Lausannevirus' sample 'sample'\.$")
     assertRaisesRegex(self, ValueError, error, pg.addFile, 'sample', fp)
 def testTwoLinesInOneFileTitle(self):
     """
     A protein grouper given one file with two protein lines, each from a
     different virus, must return the expected plural-virus string from
     its _title method.
     """
     lines = (
         '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n',
         '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [X Virus]\n',
     )
     grouper = ProteinGrouper()
     grouper.addFile('sample-filename', StringIO(''.join(lines)))
     expected = '2 viruses found in 1 sample'
     self.assertEqual(expected, grouper._title())
Beispiel #24
0
    def testOpenNotCalledOnRepeatedCall(self):
        """
        Calling pathogenSampleFiles.add twice with identical arguments must
        only open files during the first call. The second call is expected
        to be answered from a cached result.
        """
        class Open(object):
            """
            Replacement for the built-in open that allows two calls only:
            one for 'out/0.fasta' and then one for
            'out/pathogen-0-sample-0.fasta'. A third call fails the test.
            """
            def __init__(self, test, manager):
                self.test = test
                self.manager = manager
                self.count = 0

            def sideEffect(self, filename, *args, **kwargs):
                callNumber = self.count

                if callNumber == 0:
                    self.test.assertEqual('out/0.fasta', filename)
                    self.count += 1
                    return File(['>id1\n', 'ACTG\n'])

                if callNumber == 1:
                    self.test.assertEqual('out/pathogen-0-sample-0.fasta',
                                          filename)
                    self.count += 1
                    return self.manager

                self.test.fail(
                    'We are only supposed to be called twice. '
                    'Filename: %r, Args: %r, Keyword args: %r.' %
                    (filename, args, kwargs))

        proteinData = StringIO(
            '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n')
        fastaIO = StringIO()

        @contextmanager
        def manager():
            yield fastaIO

        grouper = ProteinGrouper()
        grouper.addFile('filename-1', proteinData)
        pathogenSampleFiles = PathogenSampleFiles(grouper)

        expectedFilename = 'out/pathogen-0-sample-0.fasta'
        fakeOpen = Open(self, manager())
        with patch.object(builtins, 'open') as mockOpen:
            mockOpen.side_effect = fakeOpen.sideEffect
            result = pathogenSampleFiles.add('Lausannevirus', 'filename-1')
            self.assertEqual(expectedFilename, result)
            self.assertEqual('>id1\nACTG\n', fastaIO.getvalue())

            # Ask again with the same arguments. If open is called now,
            # the side effect fails the test.
            result = pathogenSampleFiles.add('Lausannevirus', 'filename-1')
            self.assertEqual(expectedFilename, result)
Beispiel #25
0
    def testIdenticalReadsRemoved(self):
        """
        If one read matches two proteins of the same pathogen, the
        de-duplicated FASTA for that pathogen must hold a single copy of
        that read.
        """
        class Open(object):
            """
            Replacement for the built-in open. Each expected filename may be
            opened once. Any other filename fails the test.
            """
            def __init__(self, test, manager):
                self.test = test
                self.manager = manager
                self.expectedFilenames = set((
                    'out/0.fasta',
                    'out/1.fasta',
                    'out/pathogen-0-sample-0.fasta',
                ))

            def sideEffect(self, filename, *args, **kwargs):
                try:
                    self.expectedFilenames.remove(filename)
                except KeyError:
                    self.test.fail(
                        'Open called with unexpected filename: %r, Args: %r, '
                        'Keyword args: %r.' % (filename, args, kwargs))

                # The two per-protein reads files share read id1.
                readsByFilename = {
                    'out/0.fasta': ['>id1\n', 'ACTG\n'],
                    'out/1.fasta': ['>id1\n', 'ACTG\n', '>id2\n', 'CAGT\n'],
                }
                if filename not in readsByFilename:
                    return self.manager
                return File(readsByFilename[filename])

        proteinData = StringIO(
            '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
            '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n')
        fastaIO = StringIO()

        @contextmanager
        def manager():
            yield fastaIO

        grouper = ProteinGrouper()
        grouper.addFile('filename-1', proteinData)
        pathogenSampleFiles = PathogenSampleFiles(grouper)

        fakeOpen = Open(self, manager())
        with patch.object(builtins, 'open') as mockOpen:
            mockOpen.side_effect = fakeOpen.sideEffect
            writtenTo = pathogenSampleFiles.add('Lausannevirus', 'filename-1')

        self.assertEqual('out/pathogen-0-sample-0.fasta', writtenTo)
        self.assertEqual('>id1\nACTG\n>id2\nCAGT\n', fastaIO.getvalue())
        # No expected filename may be left unopened.
        self.assertEqual(set(), fakeOpen.expectedFilenames)
Beispiel #26
0
    def testReadLengthsAdded(self):
        """
        A ProteinGrouper created with saveReadLengths=True must record, for
        every protein, the lengths of the reads in that protein's reads file.
        """
        class Open(object):
            """
            A replacement for the builtin open that serves canned read files
            and fails the test if an unexpected filename is opened.
            """
            def __init__(self, test, manager):
                self.test = test
                self.manager = manager
                self.expectedFilenames = {
                    'out/0.fasta', 'out/1.fasta',
                    'out/pathogen-0-sample-0.fasta'
                }

            def sideEffect(self, filename, *args, **kwargs):
                if filename not in self.expectedFilenames:
                    self.test.fail(
                        'Open called with unexpected filename: %r, Args: %r, '
                        'Keyword args: %r.' % (filename, args, kwargs))
                elif filename == 'out/0.fasta':
                    return File(['>id1\n', 'ACTG\n'])
                elif filename == 'out/1.fasta':
                    return File(['>id2\n', 'AC\n', '>id3\n', 'CAGTTTT\n'])
                else:
                    # The combined pathogen/sample output file.
                    return self.manager

        matches = StringIO(
            '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
            '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n')
        written = StringIO()

        @contextmanager
        def manager():
            yield written

        opener = Open(self, manager())
        with patch.object(builtins, 'open') as mockOpen:
            mockOpen.side_effect = opener.sideEffect
            grouper = ProteinGrouper(saveReadLengths=True)
            grouper.addFile('filename-1', matches)
            sampleFiles = PathogenSampleFiles(grouper)
            sampleFiles.add('Lausannevirus', 'filename-1')

        # The lengths must correspond to the reads served for each protein.
        sample = grouper.pathogenNames['Lausannevirus']['filename-1']
        saved = sample['proteins']
        self.assertEqual((4, ),
                         saved['gi|327410| protein 77']['readLengths'])
        self.assertEqual((2, 7),
                         saved['gi|327409| ubiquitin']['readLengths'])
Beispiel #27
0
    def testReadLengthsAdded(self):
        """
        With saveReadLengths=True, a ProteinGrouper must store a readLengths
        value for each protein, matching the reads in its reads file.
        """
        class Open(object):
            """
            A stand-in for the builtin open: returns canned read files for
            the per-protein FASTA names and fails on unexpected filenames.
            """
            def __init__(self, test, manager):
                self.test = test
                self.manager = manager
                self.expectedFilenames = {'out/0.fasta', 'out/1.fasta',
                                          'out/pathogen-0-sample-0.fasta'}

            def sideEffect(self, filename, *args, **kwargs):
                # Canned contents for the per-protein read files.
                readFileLines = {
                    'out/0.fasta': ['>id1\n', 'ACTG\n'],
                    'out/1.fasta': ['>id2\n', 'AC\n', '>id3\n', 'CAGTTTT\n'],
                }
                if filename in self.expectedFilenames:
                    lines = readFileLines.get(filename)
                    if lines is None:
                        # Anything else expected is the combined output file.
                        return self.manager
                    return File(lines)

                self.test.fail(
                    'Open called with unexpected filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))

        inputLines = StringIO(
            '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
            '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n'
        )
        outputIO = StringIO()

        @contextmanager
        def manager():
            yield outputIO

        opener = Open(self, manager())
        with patch.object(builtins, 'open') as mockOpen:
            mockOpen.side_effect = opener.sideEffect
            grouper = ProteinGrouper(saveReadLengths=True)
            grouper.addFile('filename-1', inputLines)
            PathogenSampleFiles(grouper).add('Lausannevirus', 'filename-1')

        # Check the read lengths saved for each of the two proteins.
        byProtein = (
            grouper.pathogenNames['Lausannevirus']['filename-1']['proteins'])
        self.assertEqual((4,),
                         byProtein['gi|327410| protein 77']['readLengths'])
        self.assertEqual((2, 7),
                         byProtein['gi|327409| ubiquitin']['readLengths'])
Beispiel #28
0
    def testMaxProteinFraction(self):
        """
        maxProteinFraction must return the expected fraction for each
        pathogen, given a protein FASTA file and several sample files.
        """
        class SideEffect(object):
            """
            A replacement for the builtin open that may be called only once,
            to read the protein FASTA file.
            """
            def __init__(self, test):
                self.test = test
                self.count = 0

            def sideEffect(self, filename, **kwargs):
                if self.count:
                    self.test.fail('We are only supposed to be called once!')
                else:
                    self.test.assertEqual('proteins.fasta', filename)
                    self.count += 1
                    # Four proteins for pathogen 1, one for pathogen 2, and
                    # one title with no pathogen name.
                    return File([
                        '>protein 1 [pathogen 1]\n',
                        'ACTG\n',
                        '>protein 2 [pathogen 1]\n',
                        'AA\n',
                        '>protein 3 [pathogen 1]\n',
                        'AA\n',
                        '>protein 4 [pathogen 1]\n',
                        'AA\n',
                        '>no pathogen name here\n',
                        'AA\n',
                        '>protein 5 [pathogen 2]\n',
                        'AA\n',
                    ])

        # (sample name, file contents) pairs, added in this order. Note that
        # sample-1 is given two files.
        sampleData = (
            ('sample-1',
             '0.77 46.6 48.1 5 6 74 gi|32|X|I4 protein 1 [pathogen 1]\n'
             '0.77 46.6 48.1 5 6 74 gi|32|X|I4 protein 5 [pathogen 2]\n'),
            ('sample-1',
             '0.77 46.6 48.1 5 6 74 gi|32|X|I4 protein 2 [pathogen 1]\n'
             '0.77 46.6 48.1 5 6 74 gi|32|X|I4 protein 3 [pathogen 1]\n'),
            ('sample-2',
             '0.77 46.6 48.1 5 6 74 gi|32|X|I4 protein 1 [pathogen 1]\n'
             '0.77 46.6 48.1 5 6 74 gi|32|X|I4 protein 2 [pathogen 1]\n'),
        )

        opener = SideEffect(self)
        with patch.object(builtins, 'open') as mockOpen:
            mockOpen.side_effect = opener.sideEffect
            grouper = ProteinGrouper(proteinFastaFilenames=['proteins.fasta'])
            self.assertEqual(1, opener.count)

            for sampleName, contents in sampleData:
                grouper.addFile(sampleName, StringIO(contents))

            self.assertEqual(0.75, grouper.maxProteinFraction('pathogen 1'))
            self.assertEqual(1.0, grouper.maxProteinFraction('pathogen 2'))
 def testOneLineInEachOfTwoFilesDifferentVirusesTitle(self):
     """
     After adding two files, each holding one line for a different virus,
     the _title method must report two viruses found in two samples.
     """
     samples = (
         ('sample-filename-1',
          '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'),
         ('sample-filename-2',
          '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [HBV]\n'),
     )
     grouper = ProteinGrouper()
     for sampleName, line in samples:
         grouper.addFile(sampleName, StringIO(line))
     self.assertEqual('2 viruses found in 2 samples', grouper._title())
Beispiel #30
0
 def testOneLineInEachOfTwoFilesSamePathogenTitle(self):
     """
     After adding two files, each holding one line for the same pathogen,
     the _title method must report one pathogen found in two samples.
     """
     firstSample = StringIO(
         '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n')
     secondSample = StringIO(
         '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n')
     grouper = ProteinGrouper()
     grouper.addFile('sample-filename-1', firstSample)
     grouper.addFile('sample-filename-2', secondSample)
     expected = 'Overall, proteins from 1 pathogen were found in 2 samples.'
     self.assertEqual(expected, grouper._title())
Beispiel #31
0
 def testOneLineInOneFileToStr(self):
     """
     For a single file containing a single line, toStr must return the
     expected summary text.
     """
     expected = (
         '1 virus found in 1 sample\n'
         '\n'
         'HBV (in 1 sample)\n'
         '  sample-filename (1 protein, 5 reads)\n'
         '    0.77\t46.60\t48.10\t   5\t   6\t  0\tgi|32|X|I4 protein X\n'
     )
     grouper = ProteinGrouper()
     grouper.addFile(
         'sample-filename',
         StringIO('0.77 46.6 48.1 5 6 74 gi|32|X|I4 protein X [HBV]\n'))
     self.assertEqual(expected, grouper.toStr())
Beispiel #32
0
 def testAssetDir(self):
     """
     When an asset directory is passed to a protein grouper, its _assetDir
     attribute must hold that value.
     """
     assetDir = 'xxx'
     grouper = ProteinGrouper(assetDir=assetDir)
     self.assertEqual('xxx', grouper._assetDir)
 def testOneLineInOneFileToStr(self):
     """
     A protein grouper that was given one file with one line must produce
     the expected text from its toStr method.
     """
     sampleLines = StringIO(
         '0.77 46.6 48.1 5 6 74 gi|32|X|I4 protein X [HBV]\n')
     grouper = ProteinGrouper()
     grouper.addFile('sample-filename', sampleLines)
     expectedLines = [
         '1 virus found in 1 sample\n',
         '\n',
         'HBV (in 1 sample)\n',
         '  sample-filename (1 protein, 5 reads)\n',
         '    0.77\t46.60\t48.10\t   5\t   6\t  0\tgi|32|X|I4 protein X\n',
     ]
     self.assertEqual(''.join(expectedLines), grouper.toStr())
Beispiel #34
0
 def testNoAssetDir(self):
     """
     When no asset directory is passed to a protein grouper, its _assetDir
     attribute must be the default ('out').
     """
     grouper = ProteinGrouper()
     defaultAssetDir = 'out'
     self.assertEqual(defaultAssetDir, grouper._assetDir)
Beispiel #35
0
 def testNoRegex(self):
     """
     If no regex is given to a protein grouper, its _sampleNameRegex
     attribute must be None.
     """
     pg = ProteinGrouper()
     # Check identity rather than equality: assertEqual(None, x) would also
     # pass for any object whose __eq__ claims equality with None.
     self.assertIsNone(pg._sampleNameRegex)
 def testTwoLinesInOneFileSameVirus(self):
     """
     For one file holding two lines for the same virus, the virusTitles
     dict must list both proteins, in order, under that virus and sample.
     """
     firstProtein = {
         'bestScore': 44.2,
         'bluePlotFilename': 'out/0.png',
         'coverage': 0.63,
         'fastaFilename': 'out/0.fasta',
         'hspCount': 9,
         'index': 0,
         'medianScore': 41.3,
         'outDir': 'out',
         'proteinLength': 12,
         'proteinTitle': 'gi|327410| protein 77',
         'proteinURL': None,
         'readCount': 9,
     }
     secondProtein = {
         'bestScore': 48.1,
         'bluePlotFilename': 'out/1.png',
         'coverage': 0.77,
         'fastaFilename': 'out/1.fasta',
         'hspCount': 6,
         'index': 1,
         'medianScore': 46.6,
         'outDir': 'out',
         'proteinLength': 74,
         'proteinTitle': 'gi|327409| ubiquitin',
         'proteinURL': None,
         'readCount': 5,
     }
     expected = {
         'Lausannevirus': {
             'sample-filename': [firstProtein, secondProtein],
         },
     }

     sampleLines = StringIO(
         '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
         '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n'
     )
     grouper = ProteinGrouper()
     grouper.addFile('sample-filename', sampleLines)
     self.assertEqual(expected, grouper.virusTitles)
Beispiel #37
0
 def testOneLineInEachOfTwoFilesSameVirus(self):
     """
     For two files, each holding one line for the same virus, the
     virusTitles dict must hold one protein under each sample name.
     """
     grouper = ProteinGrouper()
     grouper.addFile(
         'sample-filename-1',
         StringIO(
             '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'))
     grouper.addFile(
         'sample-filename-2',
         StringIO(
             '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n'))

     # Both proteins have index 0 (and so the same file names).
     expectedFirst = {
         'bestScore': 44.2,
         'bluePlotFilename': 'out/0.png',
         'coverage': 0.63,
         'fastaFilename': 'out/0.fasta',
         'hspCount': 9,
         'index': 0,
         'medianScore': 41.3,
         'proteinLength': 12,
         'proteinTitle': 'gi|327410| protein 77',
         'proteinURL': None,
         'readCount': 9,
     }
     expectedSecond = {
         'bestScore': 48.1,
         'bluePlotFilename': 'out/0.png',
         'coverage': 0.77,
         'fastaFilename': 'out/0.fasta',
         'hspCount': 6,
         'index': 0,
         'medianScore': 46.6,
         'proteinLength': 74,
         'proteinTitle': 'gi|327409| ubiquitin',
         'proteinURL': None,
         'readCount': 5,
     }
     self.assertEqual(
         {
             'Lausannevirus': {
                 'sample-filename-1': [expectedFirst],
                 'sample-filename-2': [expectedSecond],
             },
         },
         grouper.virusTitles)
Beispiel #38
0
 def testOneLineInEachOfTwoFilesSamePathogenTitle(self):
     """
     Given two files that each contain one line for the same pathogen, the
     _title method must say one pathogen was found in two samples.
     """
     samples = (
         ('sample-filename-1',
          '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'),
         ('sample-filename-2',
          '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n'),
     )
     grouper = ProteinGrouper()
     for sampleName, line in samples:
         grouper.addFile(sampleName, StringIO(line))
     self.assertEqual(
         'Overall, proteins from 1 pathogen were found in 2 samples.',
         grouper._title())
Beispiel #39
0
 def testNoFiles(self):
     """
     A protein grouper that has been given no files must have empty
     virusTitles and sampleNames attributes.
     """
     grouper = ProteinGrouper()
     for attribute in (grouper.virusTitles, grouper.sampleNames):
         self.assertEqual({}, attribute)
Beispiel #40
0
 def testOneLineInOneFileWithDifferentAssetDir(self):
     """
     When a protein grouper is created with a non-default assetDir, the
     outDir and the file names stored for a protein must use that name.
     """
     expectedProtein = {
         'bestScore': 48.1,
         'bluePlotFilename': 'differentname/0.png',
         'coverage': 0.77,
         'readsFilename': 'differentname/0.fasta',
         'hspCount': 6,
         'index': 0,
         'medianScore': 46.6,
         'outDir': 'differentname',
         'proteinLength': 74,
         'proteinName': 'gi|327|X|I44.6 ubiquitin',
         'proteinURL': 'http://www.ncbi.nlm.nih.gov/nuccore/I44',
         'readCount': 5,
     }
     expected = {
         'Lausannevirus': {
             'sample-filename': {
                 'proteins': {
                     'gi|327|X|I44.6 ubiquitin': expectedProtein,
                 },
                 'uniqueReadCount': None,
             },
         },
     }

     sampleLines = StringIO(
         '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n')
     grouper = ProteinGrouper(assetDir='differentname')
     grouper.addFile('sample-filename', sampleLines)
     self.assertEqual(expected, grouper.pathogenNames)
Beispiel #41
0
 def testOneLineInOneFileFASTQ(self):
     """
     When a protein grouper is created with format_='fastq' and given one
     file with one line, its pathogenNames dict must be as expected, with
     the reads file name ending in .fastq.
     """
     proteinName = 'gi|327|X|I44.6 ubiquitin'
     expectedProtein = {
         'bestScore': 48.1,
         'bluePlotFilename': 'out/0.png',
         'coverage': 0.77,
         'readsFilename': 'out/0.fastq',
         'hspCount': 6,
         'index': 0,
         'medianScore': 46.6,
         'outDir': 'out',
         'proteinLength': 74,
         'proteinName': proteinName,
         'proteinURL': 'http://www.ncbi.nlm.nih.gov/nuccore/I44',
         'readCount': 5,
     }

     grouper = ProteinGrouper(format_='fastq')
     grouper.addFile(
         'sample-filename',
         StringIO(
             '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin '
             '[Lausannevirus]\n'))
     self.assertEqual(
         {
             'Lausannevirus': {
                 'sample-filename': {
                     'proteins': {proteinName: expectedProtein},
                     'uniqueReadCount': None,
                 },
             },
         },
         grouper.pathogenNames)
Beispiel #42
0
 def testUnknownPathogenType(self):
     """
     Calling toHTML on a protein grouper with an unrecognized pathogenType
     must raise a ValueError with the expected message.
     """
     grouper = ProteinGrouper()
     expectedError = (
         "^Unrecognized pathogenType argument: 'x'\\. Value must be "
         "either 'bacterial' or 'viral'\\.$")
     assertRaisesRegex(self, ValueError, expectedError, grouper.toHTML,
                       pathogenType='x')
Beispiel #43
0
 def testOneLineInOneFile(self):
     """
     For one file containing one line, the virusTitles dict must hold a
     single protein under the expected virus and sample name.
     """
     expectedProtein = {
         'bestScore': 48.1,
         'bluePlotFilename': 'out/0.png',
         'coverage': 0.77,
         'fastaFilename': 'out/0.fasta',
         'hspCount': 6,
         'index': 0,
         'medianScore': 46.6,
         'proteinLength': 74,
         'proteinTitle': 'gi|327|X|I44.6 ubiquitin',
         'proteinURL': 'http://www.ncbi.nlm.nih.gov/nuccore/I44',
         'readCount': 5,
     }
     expected = {
         'Lausannevirus': {
             'sample-filename': [expectedProtein],
         },
     }

     sampleLines = StringIO(
         '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n')
     grouper = ProteinGrouper()
     grouper.addFile('sample-filename', sampleLines)
     self.assertEqual(expected, grouper.virusTitles)
Beispiel #44
0
 def testUnknownFormat(self):
     """
     Creating a PathogenSampleFiles with an unrecognized format_ argument
     must raise a ValueError with the expected message.
     """
     grouper = ProteinGrouper()
     expectedError = "^format_ must be either 'fasta' or 'fastq'\\.$"
     assertRaisesRegex(self, ValueError, expectedError,
                       PathogenSampleFiles, grouper, format_='unknown')
    # Positional sample file names. If none are given, the names are read
    # from standard input (see below).
    parser.add_argument(
        'filenames', nargs='*', help='Sample file names to read input from.')

    parser.add_argument(
        '--sampleNameRegex', default=None,
        help=('An (optional) regular expression that can be used to extract a '
              'short sample name from full sample file name.  The regular '
              'expression must have a matching group (delimited by '
              'parentheses) to capture the part of the file name that should '
              'be used as the sample name.'))

    parser.add_argument(
        '--html', default=False, action='store_true',
        help='If specified, output HTML instead of plain text.')

    args = parser.parse_args()

    grouper = ProteinGrouper(sampleNameRegex=args.sampleNameRegex)

    if args.filenames:
        filenames = args.filenames
    else:
        # Read one file name per line from standard input. Strip only the
        # line terminator: unconditionally dropping the final character
        # would truncate the last file name when the input does not end
        # with a newline.
        filenames = (line.rstrip('\n') for line in sys.stdin)

    # Add the contents of every sample file to the grouper.
    for filename in filenames:
        with open(filename) as fp:
            grouper.addFile(filename, fp)

    print(grouper.toHTML() if args.html else grouper.toStr())
    parser.add_argument(
        '--sampleNameRegex',
        default=None,
        help=('An (optional) regular expression that can be used to extract a '
              'short sample name from full sample file name.  The regular '
              'expression must have a matching group (delimited by '
              'parentheses) to capture the part of the file name that should '
              'be used as the sample name.'))

    parser.add_argument(
        '--html',
        default=False,
        action='store_true',
        help='If specified, output HTML instead of plain text.')

    args = parser.parse_args()

    grouper = ProteinGrouper(sampleNameRegex=args.sampleNameRegex)

    if args.filenames:
        filenames = args.filenames
    else:
        # No file names were given on the command line, so read one per
        # line from standard input. Only the trailing newline is removed:
        # slicing off the last character unconditionally would corrupt the
        # final file name if the input lacks a terminating newline.
        filenames = (line.rstrip('\n') for line in sys.stdin)

    # Add the contents of every sample file to the grouper.
    for filename in filenames:
        with open(filename) as fp:
            grouper.addFile(filename, fp)

    print(grouper.toHTML() if args.html else grouper.toStr())
Beispiel #47
0
        # action='append'. We use both because it allows people to use
        # (e.g.)  --pff on the command line either via "--pff file1 --pff
        # file2" or "--pff file1 file2", or a combination of these. That
        # way it's not necessary to remember which way you're supposed to
        # use it and you also can't be hit by the subtle problem
        # encountered in https://github.com/acorg/dark-matter/issues/453
        proteinFastaFilenames = list(
            chain.from_iterable(args.proteinFastaFilename))
    else:
        proteinFastaFilenames = None

    grouper = ProteinGrouper(assetDir=args.assetDir,
                             sampleName=args.sampleName,
                             sampleNameRegex=args.sampleNameRegex,
                             format_=args.format,
                             proteinFastaFilenames=proteinFastaFilenames,
                             saveReadLengths=args.showReadLengths,
                             titleRegex=args.titleRegex,
                             negativeTitleRegex=args.negativeTitleRegex,
                             pathogenDataDir=args.pathogenDataDir)

    if args.filenames:
        filenames = args.filenames
    else:
        filenames = (line[:-1] for line in sys.stdin)

    for filename in filenames:
        with open(filename) as fp:
            grouper.addFile(filename, fp)

    if args.html:
Beispiel #48
0
    def testProteinsSavedCorrectly(self):
        """
        The ProteinGrouper must hold the correct protein information for a
        pathogen/sample combination after that combination has been added to
        a PathogenSampleFiles instance.
        """
        class Open(object):
            """
            A replacement for the builtin open that serves canned read files
            and fails the test if an unexpected filename is opened.
            """
            def __init__(self, test, manager):
                self.test = test
                self.manager = manager
                self.expectedFilenames = {
                    'out/0.fasta', 'out/1.fasta',
                    'out/pathogen-0-sample-0.fasta'
                }

            def sideEffect(self, filename, *args, **kwargs):
                if filename not in self.expectedFilenames:
                    self.test.fail(
                        'Open called with unexpected filename: %r, Args: %r, '
                        'Keyword args: %r.' % (filename, args, kwargs))
                elif filename == 'out/0.fasta':
                    return File(['>id1\n', 'ACTG\n'])
                elif filename == 'out/1.fasta':
                    return File(['>id2\n', 'AC\n', '>id3\n', 'CAGTTTT\n'])
                else:
                    # The combined pathogen/sample output file.
                    return self.manager

        matches = StringIO(
            '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
            '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n')
        written = StringIO()

        @contextmanager
        def manager():
            yield written

        opener = Open(self, manager())
        with patch.object(builtins, 'open') as mockOpen:
            mockOpen.side_effect = opener.sideEffect
            grouper = ProteinGrouper()
            grouper.addFile('filename-1', matches)
            sampleFiles = PathogenSampleFiles(grouper)
            sampleFiles.add('Lausannevirus', 'filename-1')

        expected = {
            'proteins': {
                'gi|327409| ubiquitin': {
                    'bestScore': 48.1,
                    'bluePlotFilename': 'out/1.png',
                    'coverage': 0.77,
                    'hspCount': 6,
                    'index': 1,
                    'medianScore': 46.6,
                    'outDir': 'out',
                    'proteinLength': 74,
                    'proteinName': 'gi|327409| ubiquitin',
                    'proteinURL': None,
                    'readCount': 5,
                    'readsFilename': 'out/1.fasta',
                },
                'gi|327410| protein 77': {
                    'bestScore': 44.2,
                    'bluePlotFilename': 'out/0.png',
                    'coverage': 0.63,
                    'hspCount': 9,
                    'index': 0,
                    'medianScore': 41.3,
                    'outDir': 'out',
                    'proteinLength': 12,
                    'proteinName': 'gi|327410| protein 77',
                    'proteinURL': None,
                    'readCount': 9,
                    'readsFilename': 'out/0.fasta',
                },
            },
            # The canned read files contain three distinct reads in total.
            'uniqueReadCount': 3,
        }
        self.assertEqual(
            expected, grouper.pathogenNames['Lausannevirus']['filename-1'])
    if args.proteinFastaFilename:
        # Flatten lists of lists that we get from using both nargs='+' and
        # action='append'. We use both because it allows people to use
        # (e.g.)  --pff on the command line either via "--pff file1 --pff
        # file2" or "--pff file1 file2", or a combination of these. That
        # way it's not necessary to remember which way you're supposed to
        # use it and you also can't be hit by the subtle problem
        # encountered in https://github.com/acorg/dark-matter/issues/453
        proteinFastaFilenames = list(chain.from_iterable(
            args.proteinFastaFilename))
    else:
        proteinFastaFilenames = None

    grouper = ProteinGrouper(assetDir=args.assetDir,
                             sampleNameRegex=args.sampleNameRegex,
                             format_=args.format,
                             proteinFastaFilenames=proteinFastaFilenames,
                             saveReadLengths=args.showReadLengths)

    if args.filenames:
        filenames = args.filenames
    else:
        filenames = (line[:-1] for line in sys.stdin)

    for filename in filenames:
        with open(filename) as fp:
            grouper.addFile(filename, fp)

    if args.html:
        print(grouper.toHTML(args.pathogenPanelFilename,
                             minProteinFraction=args.minProteinFraction,
    if args.proteinFastaFilename:
        # Flatten lists of lists that we get from using both nargs='+' and
        # action='append'. We use both because it allows people to use
        # (e.g.)  --pff on the command line either via "--pff file1 --pff
        # file2" or "--pff file1 file2", or a combination of these. That
        # way it's not necessary to remember which way you're supposed to
        # use it and you also can't be hit by the subtle problem
        # encountered in https://github.com/acorg/dark-matter/issues/453
        proteinFastaFilenames = list(chain.from_iterable(
            args.proteinFastaFilename))
    else:
        proteinFastaFilenames = None

    grouper = ProteinGrouper(assetDir=args.assetDir,
                             sampleName=args.sampleName,
                             sampleNameRegex=args.sampleNameRegex,
                             format_=args.format,
                             proteinFastaFilenames=proteinFastaFilenames,
                             saveReadLengths=args.showReadLengths)

    if args.filenames:
        filenames = args.filenames
    else:
        filenames = (line[:-1] for line in sys.stdin)

    for filename in filenames:
        with open(filename) as fp:
            grouper.addFile(filename, fp)

    if args.html:
        print(grouper.toHTML(args.pathogenPanelFilename,
                             minProteinFraction=args.minProteinFraction,
Beispiel #51
0
    def testProteinsSavedCorrectly(self):
        """
        After a pathogen/sample combination is added to a
        PathogenSampleFiles instance, the ProteinGrouper must hold the
        expected details for each protein and the unique read count.
        """
        class Open(object):
            """
            A stand-in for the builtin open: returns canned read files for
            the per-protein FASTA names and fails on unexpected filenames.
            """
            def __init__(self, test, manager):
                self.test = test
                self.manager = manager
                self.expectedFilenames = {'out/0.fasta', 'out/1.fasta',
                                          'out/pathogen-0-sample-0.fasta'}

            def sideEffect(self, filename, *args, **kwargs):
                # Canned contents for the per-protein read files.
                readFileLines = {
                    'out/0.fasta': ['>id1\n', 'ACTG\n'],
                    'out/1.fasta': ['>id2\n', 'AC\n', '>id3\n', 'CAGTTTT\n'],
                }
                if filename in self.expectedFilenames:
                    lines = readFileLines.get(filename)
                    if lines is None:
                        # Anything else expected is the combined output file.
                        return self.manager
                    return File(lines)

                self.test.fail(
                    'Open called with unexpected filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))

        inputLines = StringIO(
            '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
            '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n'
        )
        outputIO = StringIO()

        @contextmanager
        def manager():
            yield outputIO

        opener = Open(self, manager())
        with patch.object(builtins, 'open') as mockOpen:
            mockOpen.side_effect = opener.sideEffect
            grouper = ProteinGrouper()
            grouper.addFile('filename-1', inputLines)
            PathogenSampleFiles(grouper).add('Lausannevirus', 'filename-1')

        ubiquitin = {
            'bestScore': 48.1,
            'bluePlotFilename': 'out/1.png',
            'coverage': 0.77,
            'hspCount': 6,
            'index': 1,
            'medianScore': 46.6,
            'outDir': 'out',
            'proteinLength': 74,
            'proteinName': 'gi|327409| ubiquitin',
            'proteinURL': None,
            'readCount': 5,
            'readsFilename': 'out/1.fasta',
        }
        protein77 = {
            'bestScore': 44.2,
            'bluePlotFilename': 'out/0.png',
            'coverage': 0.63,
            'hspCount': 9,
            'index': 0,
            'medianScore': 41.3,
            'outDir': 'out',
            'proteinLength': 12,
            'proteinName': 'gi|327410| protein 77',
            'proteinURL': None,
            'readCount': 9,
            'readsFilename': 'out/0.fasta',
        }
        self.assertEqual(
            {
                'proteins': {
                    'gi|327409| ubiquitin': ubiquitin,
                    'gi|327410| protein 77': protein77,
                },
                'uniqueReadCount': 3,
            },
            grouper.pathogenNames['Lausannevirus']['filename-1'])