Ejemplo n.º 1
0
    def testTwoFiles(self):
        """
        It must be possible to read from two FASTQ files.
        """
        class SideEffect(object):
            """
            Replacement for C{open} that checks the order in which the two
            expected filenames are opened and hands back canned FASTQ lines
            for each of them.

            @param test: The test case instance, used to make assertions
                and to report failure.
            """
            def __init__(self, test):
                self.test = test
                # The number of times sideEffect has been called so far.
                self.count = 0

            def sideEffect(self, filename):
                """
                Return the fake file for the given call.

                @param filename: The C{str} filename passed to C{open}.
                @return: A C{File} instance holding the lines of one read.
                """
                if self.count == 0:
                    self.test.assertEqual('file1.fastq', filename)
                    self.count += 1
                    return File(['@id1\n', 'ACTG\n', '+\n', '!!!!\n'])
                elif self.count == 1:
                    self.test.assertEqual('file2.fastq', filename)
                    self.count += 1
                    return File(['@id2\n', 'CAGT\n', '+\n', '!!!!\n'])
                else:
                    # Fail via the test case. This helper class has no
                    # fail method of its own, so calling self.fail here
                    # would raise an AttributeError instead.
                    self.test.fail(
                        'We are only supposed to be called twice!')

        sideEffect = SideEffect(self)
        with patch.object(builtins, 'open') as mockMethod:
            mockMethod.side_effect = sideEffect.sideEffect
            reads = FastqReads(['file1.fastq', 'file2.fastq'])
            self.assertEqual([
                DNARead('id1', 'ACTG', '!!!!'),
                DNARead('id2', 'CAGT', '!!!!'),
            ], list(reads))
Ejemplo n.º 2
0
def parseColors(colors, args):
    """
    Build a mapping from color name to the read ids that get that color.

    @param colors: A C{list} of C{str}s. Each item is of the form, e.g.,
        'green X Y Z...', where each of X, Y, Z, ... etc. is either a read
        id or the name of a FASTA or FASTQ file containing reads whose ids
        should be displayed with the corresponding color. Because each item
        is split on whitespace, read ids that contain spaces can only be
        given via a FASTA/Q file.
    @param args: The argparse C{Namespace} instance holding the other parsed
        command line arguments. Its C{fasta} attribute is consulted (only
        when a file name is encountered) to decide whether files are read
        as FASTA or as FASTQ.
    @return: A C{defaultdict} whose keys are colors and whose values are
        C{set}s of read ids.
    """
    idsByColor = defaultdict(set)

    def idsFor(token):
        """
        Yield the read ids a single specification token stands for.

        @param token: A C{str} that is either a read id or the name of an
            existing file of reads.
        """
        if not os.path.isfile(token):
            # Not a file on disk, so the token is itself a read id.
            yield token
            return
        readsClass = FastaReads if args.fasta else FastqReads
        for read in readsClass(token):
            yield read.id

    for specification in colors:
        fields = specification.split()
        color = fields[0]
        for token in fields[1:]:
            for readId in idsFor(token):
                # Only touch the color's entry when there is an id to add,
                # so colors without any ids never appear in the result.
                idsByColor[color].add(readId)

    return idsByColor
Ejemplo n.º 3
0
 def testEmpty(self):
     """
     Reading a FASTQ file that has no content must produce no reads.
     """
     emptyOpener = mock_open()
     with patch.object(builtins, 'open', emptyOpener):
         result = list(FastqReads('filename.fastq'))
         self.assertEqual([], result)
Ejemplo n.º 4
0
 def testOneRead(self):
     """
     A FASTQ file holding a single record must yield exactly that read.
     """
     lines = ['@id1', 'ACGT', '+', '!!!!']
     opener = mock_open(read_data='\n'.join(lines))
     with patch.object(builtins, 'open', opener):
         reads = FastqReads('filename.fastq')
         self.assertEqual([DNARead('id1', 'ACGT', '!!!!')], list(reads))
Ejemplo n.º 5
0
 def testTypeRNA(self):
     """
     A FASTQ file whose read class is RNARead must result in reads that
     are instances of RNARead.
     """
     data = '\n'.join(['@id1', 'ACGT', '+', '!!!!'])
     with patch.object(builtins, 'open', mock_open(read_data=data)):
         reads = list(FastqReads('filename.fastq', RNARead))
         # assertIsInstance reports the offending object and the expected
         # class on failure, which assertTrue(isinstance(...)) does not.
         self.assertIsInstance(reads[0], RNARead)
Ejemplo n.º 6
0
 def testTypeDefaultsToDNA(self):
     """
     A FASTQ file whose type is not specified must result in reads that
     are instances of DNARead.
     """
     data = '\n'.join(['@id1', 'ACGT', '+', '!!!!'])
     with patch.object(builtins, 'open', mock_open(read_data=data)):
         reads = list(FastqReads('filename.fastq'))
         # assertIsInstance reports the offending object and the expected
         # class on failure, which assertTrue(isinstance(...)) does not.
         self.assertIsInstance(reads[0], DNARead)
Ejemplo n.º 7
0
 def testTwoReads(self):
     """
     Two records in one FASTQ file must both be read, and must come back
     in the order in which they appear in the file.
     """
     lines = ['@id1', 'ACGT', '+', '!!!!',
              '@id2', 'TGCA', '+', '????']
     opener = mockOpen(read_data='\n'.join(lines))
     with patch.object(builtins, 'open', opener):
         reads = list(FastqReads('filename.fastq'))
         self.assertEqual(2, len(reads))
         expected = [
             DNARead('id1', 'ACGT', '!!!!'),
             DNARead('id2', 'TGCA', '????'),
         ]
         self.assertEqual(expected, reads)
    # "--json file1 file2", or a combination of these. That way it's not
    # necessary to remember which way you're supposed to use it and you also
    # can't be hit by the subtle problem encountered in
    # https://github.com/acorg/dark-matter/issues/453
    jsonFiles = list(chain.from_iterable(args.json))
    whitelist = (
        set(chain.from_iterable(args.whitelist)) if args.whitelist else None)
    blacklist = (
        set(chain.from_iterable(args.blacklist)) if args.blacklist else None)

    # TODO: Add a --readClass command-line option in case we want to
    # process FASTA containing AA sequences.
    if args.fasta:
        reads = FastaReads(list(chain.from_iterable(args.fasta)))
    else:
        reads = FastqReads(list(chain.from_iterable(args.fastq)))

    if args.matcher == 'blast':
        from dark.blast.alignments import BlastReadsAlignments
        readsAlignments = BlastReadsAlignments(reads, jsonFiles)
    else:
        # Must be 'diamond' (due to parser.add_argument 'choices' argument).
        if (args.diamondDatabaseFastaFilename is None and
                args.diamondSqliteDatabaseFilename is None):
            print('Either --diamondDatabaseFastaFilename or '
                  '--diamondSqliteDatabaseFilename must be used with '
                  '--matcher diamond.', file=sys.stderr)
            sys.exit(1)
        elif not (args.diamondDatabaseFastaFilename is None or
                  args.diamondSqliteDatabaseFilename is None):
            print('--diamondDatabaseFastaFilename and '
Ejemplo n.º 9
0
#!/usr/bin/env python

from __future__ import print_function

import sys
from os.path import basename

from dark.fastq import FastqReads

# This script works purely as a filter: FASTQ is read from standard input
# and each read is written to standard output in FASTA format. Any
# command-line argument is taken as a sign that the caller tried to pass a
# filename, so a usage message reminds them to use '<' to supply our stdin
# and '>' to store our output, and the script exits with status 1.

if not sys.argv[1:]:
    write = sys.stdout.write
    for read in FastqReads(sys.stdin):
        write(read.toString('fasta'))
else:
    print('Usage: %s < input.fastq [> output.fasta]' % basename(sys.argv[0]),
          file=sys.stderr)
    sys.exit(1)
Ejemplo n.º 10
0
        set(chain.from_iterable(args.whitelist)) if args.whitelist else None)
    blacklist = (
        set(chain.from_iterable(args.blacklist)) if args.blacklist else None)

    if args.fasta:
        if args.sortFilenames:
            files = numericallySortFilenames(chain.from_iterable(args.fasta))
        else:
            files = list(chain.from_iterable(args.fasta))
        reads = FastaReads(files)
    else:
        if args.sortFilenames:
            files = numericallySortFilenames(chain.from_iterable(args.fastq))
        else:
            files = list(chain.from_iterable(args.fastq))
        reads = FastqReads(files)

    if args.matcher == 'blast':
        from dark.blast.alignments import BlastReadsAlignments
        readsAlignments = BlastReadsAlignments(
            reads, jsonFiles, databaseFilename=args.databaseFastaFilename,
            databaseDirectory=args.databaseFastaDirectory,
            sqliteDatabaseFilename=args.sqliteDatabaseFilename,
            sortBlastFilenames=args.sortFilenames)
    else:
        # Must be 'diamond' (due to parser.add_argument 'choices' argument).
        if (args.databaseFastaFilename is None and
                args.sqliteDatabaseFilename is None):
            print('Either --databaseFastaFilename or --sqliteDatabaseFilename '
                  'must be used with --matcher diamond.', file=sys.stderr)
            sys.exit(1)
Ejemplo n.º 11
0
        help=('A [0.0, 1.0] C{float} indicating a fraction of the reads that '
              'should be allowed to pass through the filter. The sample size '
              'will only be approximately the product of the sample fraction '
              'and the number of reads. The sample is taken at random.'))

    parser.add_argument(
        '--sequenceNumbersFile', default=None,
        help=('A file of (1-based) sequence numbers to retain. Numbers must '
              'be one per line.'))

    args = parser.parse_args()

    if args.readClass == 'fastq':
        # TODO: FastqReads should take a checkAlphabet argument, in the way
        # that FastaReads does.
        reads = FastqReads(sys.stdin)
    elif args.readClass == 'fasta':
        reads = FastaReads(sys.stdin, checkAlphabet=False)
    else:
        # args.readClass must be 'fasta-ss' here, because the 'choices'
        # argument passed to parser.add_argument above allows no other value.
        assert args.readClass == 'fasta-ss'
        reads = SSFastaReads(sys.stdin, checkAlphabet=False)

    saveAs = args.saveAs or args.readClass

    # Check for incompatible read/write formats. We can't write FASTQ
    # unless we have FASTQ on input (else we won't have quality
    # information), and we can't write PDB FASTA with secondary structure
    # information unless we have that on input.
    if saveAs == 'fastq' and args.readClass != 'fastq':
Ejemplo n.º 12
0
# The coverage depth input and the output filename are both mandatory.
parser.add_argument(
    '--coverageDepthFile', metavar='FILENAME', required=True,
    help=('The filename of the output of sam-coverage-depth.py, inputting the '
          'sorted and indexed bam file given by bwa mapping against rRNA.'))

parser.add_argument(
    '--outFile', metavar='FILENAME', required=True,
    help='The filename the output will be written to.')

args = parser.parse_args()

# Plain filenames, taken unchanged from the command line.
coverageDepthFile = args.coverageDepthFile
outfile = args.outFile

# The mapped and unmapped read sets are both read as FASTQ.
mappedReads = FastqReads(args.mappedFile)
unmappedReads = FastqReads(args.unmappedFile)

# Region lengths in bp, taken from the NR_146117 GenBank annotation. These
# are currently commented out and kept for reference.
# sExonLength45 = 13502
# sRrnaLength18 = 1872
# sRrnaLength5 = 156
# sRrnaLength28 = 5195
# sDnaLength45 = sExonLength45 - sRrnaLength18 - sRrnaLength5 - sRrnaLength28

# 0-based (start, end) positions of the rRNA regions, taken from the
# NR_146117 GenBank annotation. For each pair, end - start equals the
# corresponding rRNA length listed above.
sRrna18 = (3659, 5531)
sRrna5 = (6615, 6771)
sRrna28 = (7942, 13137)