Esempio n. 1
0
def queryDatabase(subjects, queries, database, findParams=None):
    """
    Populate a database with subjects and then look up every query in it.

    @param subjects: Either an instance of C{dark.reads.Reads} or a C{str}
        filename with sequences that should be turned into a database.
    @param queries: Either an instance of C{dark.reads.Reads} or a C{str}
        filename with sequences that should be looked up in the database.
    @param database: A C{light.database.Database} instance.
    @param findParams: An instance of C{light.parameters.FindParameters} or
        C{None} to use default find parameters.
    @return: A C{dict} whose keys are query ids and whose values are C{dict}s
        that map subject ids to scores. Only queries with at least one
        significant subject get an entry.
    """
    # A str argument is a FASTA file name and is wrapped in a reader.
    if isinstance(queries, str):
        queries = FastaReads(queries, readClass=AAReadWithX, upperCase=True)

    if isinstance(subjects, str):
        subjects = FastaReads(subjects, readClass=AAReadWithX, upperCase=True)

    for subjectRead in subjects:
        database.addSubject(subjectRead)

    scoresByQuery = defaultdict(dict)

    for query in queries:
        found = database.find(query, findParams)
        for index in found.significantSubjects():
            matched = database.getSubjectByIndex(index)
            bestScore = found.analysis[index]['bestBinScore']
            scoresByQuery[query.id][matched.read.id] = bestScore

    return scoresByQuery
Esempio n. 2
0
 def testFilterRandomSubsetOfZeroFromZeroReads(self):
     """
     Asking C{filter} for a random subset of size zero must work when the
     input holds zero reads and that count is passed via the C{trueLength}
     argument.
     """
     opener = mock_open(read_data='')
     with patch.object(builtins, 'open', opener):
         subset = FastaReads('filename.fasta').filter(
             randomSubset=0, trueLength=0)
         selected = list(subset)
         self.assertEqual([], selected)
Esempio n. 3
0
 def testFilterRandomSubsetOfOneFromTenReads(self):
     """
     Asking C{filter} for a random subset of size one must yield exactly
     one read when the input holds ten reads and that count is passed via
     the C{trueLength} argument.
     """
     fastaLines = ['>id', 'ACGT'] * 10
     opener = mock_open(read_data='\n'.join(fastaLines))
     with patch.object(builtins, 'open', opener):
         subset = FastaReads('filename.fasta').filter(
             randomSubset=1, trueLength=10)
         selected = list(subset)
         self.assertEqual(1, len(selected))
Esempio n. 4
0
 def testFilterRandomSubsetOfZeroFromZeroReads(self):
     """
     Asking C{filter} for a random subset of size zero must work when the
     input holds zero reads and that count is passed via the C{trueLength}
     argument.
     """
     with patch.object(builtins, 'open', mockOpen(read_data='')):
         fasta = FastaReads('filename.fasta')
         chosen = fasta.filter(randomSubset=0, trueLength=0)
         self.assertEqual([], list(chosen))
Esempio n. 5
0
 def testFilterRandomSubsetOfOneFromTenReads(self):
     """
     Asking C{filter} for a random subset of size one must yield exactly
     one read when the input holds ten reads and that count is passed via
     the C{trueLength} argument.
     """
     tenReads = '\n'.join(['>id', 'ACGT'] * 10)
     with patch.object(builtins, 'open', mockOpen(read_data=tenReads)):
         fasta = FastaReads('filename.fasta')
         chosen = list(fasta.filter(randomSubset=1, trueLength=10))
         self.assertEqual(1, len(chosen))
Esempio n. 6
0
 def testFilterRandomSubsetOfTwoFromTwoReads(self):
     """
     Asking C{filter} for a random subset of size two from an input of two
     reads (count passed via the C{trueLength} argument) must return both
     reads, in their original order.
     """
     opener = mock_open(read_data='>id1\nACGT\n>id2\nTGCA')
     with patch.object(builtins, 'open', opener):
         subset = FastaReads('filename.fasta').filter(
             randomSubset=2, trueLength=2)
         selected = list(subset)
         expected = [Read('id1', 'ACGT'), Read('id2', 'TGCA')]
         self.assertEqual(expected, selected)
Esempio n. 7
0
 def testFilterRandomSubsetOfTwoFromTwoReads(self):
     """
     Asking C{filter} for a random subset of size two from an input of two
     reads (count passed via the C{trueLength} argument) must return both
     reads, in their original order.
     """
     twoReads = '>id1\nACGT\n>id2\nTGCA'
     with patch.object(builtins, 'open', mockOpen(read_data=twoReads)):
         fasta = FastaReads('filename.fasta')
         chosen = list(fasta.filter(randomSubset=2, trueLength=2))
         expected = [Read('id1', 'ACGT'), Read('id2', 'TGCA')]
         self.assertEqual(expected, chosen)
Esempio n. 8
0
    def getSubjectSequence(self, title):
        """
        Look up a subject sequence by title, building the lookup on first use.

        The lookup is cached in C{self._subjectTitleToSubject}. It is either
        an C{SqliteIndex} (when no database FASTA file name is set) or a
        C{dict} built by reading the database FASTA file.

        @param title: A C{str} sequence title from a DIAMOND hit.
        @raise KeyError: If the C{title} is not present in the DIAMOND
            database.
        @return: An C{AAReadWithX} instance.
        """
        # Fast path: the lookup was already built by an earlier call.
        if self._subjectTitleToSubject is not None:
            return self._subjectTitleToSubject[title]

        if self._databaseFilename is None:
            # An Sqlite3 database is used to look up subjects.
            lookup = SqliteIndex(
                self._sqliteDatabaseFilename,
                fastaDirectory=self._databaseDirectory,
                readClass=AAReadWithX)
        else:
            # Read the whole FASTA file into a dict keyed by read id.
            lookup = {
                read.id: read
                for read in FastaReads(self._databaseFilename,
                                       readClass=AAReadWithX)
            }

        self._subjectTitleToSubject = lookup
        return lookup[title]
Esempio n. 9
0
 def testEmpty(self):
     """
     Iterating over an empty FASTA file must produce no reads.
     """
     opener = mock_open()
     with patch.object(builtins, 'open', opener):
         found = list(FastaReads('filename.fasta'))
         self.assertEqual([], found)
Esempio n. 10
0
    def fromSequences(cls, labels, sequences, findParams=None, **kwargs):
        """
        Build an NJTree instance from a collection of sequences.

        @param cls: Our class.
        @param labels: An iterable producing C{str} labels for the sequences.
        @param sequences: Either A C{str} filename of sequences to consider or
            a C{light.reads.Reads} instance.
        @param findParams: An instance of C{FindParameters}. A falsy value
            results in a default C{FindParameters()} being used.
        @param kwargs: See
            C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
            additional keywords, all of which are optional.
        @return: An C{NJTree} instance.
        """
        if isinstance(sequences, str):
            sequences = FastaReads(
                sequences, readClass=AAReadWithX, upperCase=True)

        instance = cls()
        instance.sequences = list(sequences)
        instance.labels = labels

        if not findParams:
            findParams = FindParameters()

        similarity = np.array(affinityMatrix(
            instance.sequences, findParams=findParams, **kwargs))
        # Distance is one minus affinity, element-wise.
        instance.distance = np.ones(similarity.shape) - similarity
        instance.tree = nj(DistanceMatrix(instance.distance, labels))
        return instance
Esempio n. 11
0
    def testTwoFiles(self):
        """
        It must be possible to read from two FASTA files.
        """
        class SideEffect(object):
            """
            Stand-in for C{open} that serves a different in-memory FASTA
            file on each of its first two calls and fails the test on any
            further call.

            @param test: The running test case, used to make assertions.
            """
            def __init__(self, test):
                self.test = test
                self.count = 0

            def sideEffect(self, filename, **kwargs):
                if self.count == 0:
                    self.test.assertEqual('file1.fasta', filename)
                    self.count += 1
                    return File(['>id1\n', 'ACTG\n'])
                elif self.count == 1:
                    self.test.assertEqual('file2.fasta', filename)
                    self.count += 1
                    return File(['>id2\n', 'CAGT\n'])
                else:
                    # 'fail' lives on the test case, not on this helper, so
                    # it must be reached via self.test.
                    self.test.fail(
                        'We are only supposed to be called twice!')

        sideEffect = SideEffect(self)
        with patch.object(builtins, 'open') as mockMethod:
            mockMethod.side_effect = sideEffect.sideEffect
            reads = FastaReads(['file1.fasta', 'file2.fasta'])
            self.assertEqual([
                DNARead('id1', 'ACTG'),
                DNARead('id2', 'CAGT'),
            ], list(reads))
def needle(reads):
    """
    Run a Needleman-Wunsch alignment and return the two sequences.

    The reads are written to a temporary directory, which is removed
    before returning, whether or not the alignment succeeds.

    @param reads: An indexable collection of two reads (C{reads[0]} and
        C{reads[1]} are used).
    @return: A C{Reads} instance with the two aligned sequences.
    """
    from tempfile import mkdtemp
    from shutil import rmtree

    tempdir = mkdtemp()

    try:
        file1 = join(tempdir, 'file1.fasta')
        with open(file1, 'w') as fp:
            print(reads[0].toString('fasta'), end='', file=fp)

        file2 = join(tempdir, 'file2.fasta')
        with open(file2, 'w') as fp:
            print(reads[1].toString('fasta'), end='', file=fp)

        out = join(tempdir, 'result.fasta')

        Executor().execute("needle -asequence '%s' -bsequence '%s' -auto "
                           "-outfile '%s' -aformat fasta" %
                           (file1, file2, out))

        # Use 'list' in the following to force reading the FASTA from disk
        # before the temporary directory is removed.
        return Reads(list(FastaReads(out)))
    finally:
        # Always clean up, even if writing, running needle or parsing fails.
        rmtree(tempdir)
Esempio n. 13
0
def main(recordFilenames, fastaFilename, title, xRange, bitRange):
    """
    Print the HSPs of one subject title that fall in an X-axis range and a
    bit score range.

    If the title is not present in the BLAST output, a message is printed
    and the process exits with status 3.

    @param recordFilenames: A C{list} of C{str} file names contain results of a
        BLAST run, in JSON format.
    @param fastaFilename: The C{str} name of the FASTA file that was originally
        BLASTed.
    @param title: The C{str} title of the subject sequence, as output by BLAST.
    @param xRange: A (start, end) list of C{int}s, giving an X-axis range or
        C{None} if the entire X axis range should be printed.
    @param bitRange: A (start, end) list of C{int}s, giving a bit score range
        or C{None} if the entire bit score range should be printed.
    """
    reads = FastaReads(fastaFilename)
    blastReadsAlignments = BlastReadsAlignments(reads, recordFilenames)
    filtered = blastReadsAlignments.filter(
        whitelist={title}, negativeTitleRegex='.')
    titlesAlignments = TitlesAlignments(filtered)

    if title not in titlesAlignments:
        print('%s: Title %r not found in BLAST output' % (sys.argv[0], title))
        sys.exit(3)

    for titleAlignment in titlesAlignments[title]:
        for hsp in titleAlignment.hsps:
            # Skip HSPs that do not overlap the wanted X-axis range.
            if xRange is not None and not (
                    xRange[0] <= hsp.subjectEnd and
                    xRange[1] >= hsp.subjectStart):
                continue

            # Skip HSPs whose score is outside the wanted bit score range.
            if bitRange is not None and not (
                    bitRange[0] <= hsp.score.score <= bitRange[1]):
                continue

            details = (titleAlignment.read.id, hsp.subjectStart,
                       hsp.subjectEnd, hsp.score.score)
            print('query: %s, start: %d, end: %d, score: %d' % details)
Esempio n. 14
0
def parseColors(colors, args):
    """
    Turn read id color specifications into a mapping from color to read ids.

    @param colors: A C{list} of C{str}s. Each item is of the form, e.g.,
        'green X Y Z...', where each of X, Y, Z, ... etc. is either a read
        id or the name of a FASTA or FASTQ file containing reads whose ids
        should be displayed with the corresponding color. Note that if read
        ids contain spaces you will need to use the latter (i.e. FASTA/Q file
        name) approach because each item is split on whitespace.
    @param args: The argparse C{Namespace} instance holding the other parsed
        command line arguments. Its C{fasta} attribute selects whether files
        are read as FASTA (true) or FASTQ (false).
    @return: A C{dict} whose keys are colors and whose values are sets of
        read ids.
    """
    idsByColor = defaultdict(set)
    for spec in colors:
        tokens = spec.split()
        color = tokens.pop(0)
        for token in tokens:
            if not os.path.isfile(token):
                # Not an existing file, so the token itself is a read id.
                idsByColor[color].add(token)
                continue

            readsClass = FastaReads if args.fasta else FastqReads
            for read in readsClass(token):
                idsByColor[color].add(read.id)
    return idsByColor
Esempio n. 15
0
    def _readReferenceGenomes(self, referenceGenomeFiles):
        """
        Load reference genomes from FASTA files, insisting that a genome id
        seen more than once always has the same sequence.

        @param referenceGenomeFiles: A C{list} of C{str} names of FASTA files
            containing reference genomes.
        @raise ValueError: If a reference genome is found in more than one file
            and the sequences are not identical.
        @return: A C{dict} keyed by C{str} sequence id with C{dark.Read}
            values holding reference genomes.
        """
        genomes = {}
        # Maps each genome id to the file it was first read from.
        firstFile = {}
        for filename in referenceGenomeFiles:
            for read in FastaReads(filename):
                id_ = read.id
                if id_ not in firstFile:
                    firstFile[id_] = filename
                    genomes[id_] = read
                elif genomes[id_].sequence != read.sequence:
                    raise ValueError(
                        'Reference genome id %r was found in two files '
                        '(%r and %r) but with different sequences.' %
                        (id_, firstFile[id_], filename))

        count = len(genomes)
        plural = s(count)
        listing = '\n'.join('  %s' % id_ for id_ in sorted(genomes))
        self.report('Read %d reference genome%s:\n%s' %
                    (count, plural, listing),
                    requiredVerbosityLevel=2)

        return genomes
Esempio n. 16
0
def mafft(reads, verbose=False, options=None, threads=None):
    """
    Run a MAFFT alignment and return the sequences.

    The reads are written to a temporary directory, which is removed
    before returning, whether or not the alignment succeeds.

    @param reads: An iterable of multiple reads.
    @param verbose: If C{True} print progress info to sys.stderr.
    @param options: A C{str} of options to pass to mafft.
    @param threads: If not C{None}, an C{int} number of threads, passed to
        mafft via its C{--thread} option.
    @return: A C{Reads} instance with the aligned sequences.
    """
    tempdir = mkdtemp()

    try:
        infile = join(tempdir, 'input.fasta')
        out = join(tempdir, 'result.fasta')

        Reads(reads).save(infile)

        if verbose:
            print('Running mafft.', file=sys.stderr)

        Executor().execute(
            "mafft %s %s '%s' > '%s'" %
            (('' if threads is None else '--thread %d' % threads),
             options or '', infile, out))

        # Use 'list' in the following to force reading the FASTA from disk
        # before the temporary directory is removed.
        return Reads(list(FastaReads(out)))
    finally:
        # Always clean up, even if saving, running mafft or parsing fails.
        rmtree(tempdir)
Esempio n. 17
0
    def getFractionOfStructuresCovered(self):
        """
        Compute the fraction of reads in C{self.structureFile} in which a
        scan finds at least one landmark.

        @return: The fraction of reads with one or more landmarks, or C{0.0}
            if the structure file contains no reads.
        """
        db = DatabaseSpecifier().getDatabaseFromKeywords(
            trigPoints=[],
            landmarks=['AC ' + self.structureType],
            acAlphaHelixFilename=self.acAlphaHelixFilename,
            acAlphaHelix310Filename=self.acAlphaHelix310Filename,
            acAlphaHelixCombinedFilename=self.acAlphaHelixCombinedFilename,
            acAlphaHelixPiFilename=self.acAlphaHelixPiFilename,
            acExtendedStrandFilename=self.acExtendedStrandFilename)

        backend = Backend()
        backend.configure(db.dbParams)

        structures = FastaReads(self.structureFile,
                                readClass=AAReadWithX,
                                checkAlphabet=0)

        covered = 0
        seen = 0
        for read in structures:
            seen += 1
            if len(backend.scan(read).landmarks) > 0:
                covered += 1

        # Avoid dividing by zero when there were no structures at all.
        if not seen:
            return 0.0
        return covered / seen
Esempio n. 18
0
 def testNoQuality(self):
     """
     Reads that come from a FASTA file must have C{None} as their quality.
     """
     opener = mock_open(read_data='>id1\nACGT')
     with patch.object(builtins, 'open', opener):
         firstRead = list(FastaReads('filename.fasta'))[0]
         self.assertEqual(None, firstRead.quality)
Esempio n. 19
0
 def testConvertLowerToUpperCaseIfSpecifiedDNARead(self):
     """
     Passing C{upperCase=True} must result in a lower case sequence being
     converted to upper case.
     """
     opener = mock_open(read_data='>id1\nactg')
     with patch.object(builtins, 'open', opener):
         fasta = FastaReads('filename.fasta', upperCase=True)
         found = list(fasta)
         self.assertEqual([AARead('id1', 'ACTG')], found)
Esempio n. 20
0
 def testDontConvertLowerToUpperCaseIfNotSpecified(self):
     """
     Without an C{upperCase} argument, a lower case sequence must be left
     in lower case.
     """
     opener = mock_open(read_data='>id1\nactgs')
     with patch.object(builtins, 'open', opener):
         fasta = FastaReads('filename.fasta', readClass=AARead)
         found = list(fasta)
         self.assertEqual([AARead('id1', 'actgs')], found)
Esempio n. 21
0
 def testOneRead(self):
     """
     Reading a FASTA file that holds a single read must produce that read.
     """
     opener = mock_open(read_data='>id1\nACGT')
     with patch.object(builtins, 'open', opener):
         found = list(FastaReads('filename.fasta'))
         expected = [Read('id1', 'ACGT')]
         self.assertEqual(expected, found)
Esempio n. 22
0
 def testTypeRNA(self):
     """
     When C{RNARead} is given as the read class, the reads produced from a
     FASTA file must be C{RNARead} instances.
     """
     opener = mock_open(read_data='>id1\nACGT')
     with patch.object(builtins, 'open', opener):
         firstRead = list(FastaReads('filename.fasta', RNARead))[0]
         self.assertTrue(isinstance(firstRead, RNARead))
Esempio n. 23
0
 def testTypeDefaultsToDNA(self):
     """
     When no read class is given, the reads produced from a FASTA file
     must be C{DNARead} instances.
     """
     opener = mock_open(read_data='>id1\nACGT')
     with patch.object(builtins, 'open', opener):
         firstRead = list(FastaReads('filename.fasta'))[0]
         self.assertTrue(isinstance(firstRead, DNARead))
Esempio n. 24
0
 def testTwoReads(self):
     """
     Reading a FASTA file that holds two reads must produce both reads,
     in the order they appear in the file.
     """
     opener = mock_open(read_data='>id1\nACGT\n>id2\nTGCA')
     with patch.object(builtins, 'open', opener):
         found = list(FastaReads('filename.fasta'))
         self.assertEqual(2, len(found))
         expected = [Read('id1', 'ACGT'), Read('id2', 'TGCA')]
         self.assertEqual(expected, found)
Esempio n. 25
0
def getSequence(filename, id_=None):
    """
    Load a sequence from a FASTA file.

    @param filename: The C{str} FASTA file name.
    @param id_: The C{str} id of the sequence wanted, or C{None} if to retrieve
        the first sequence. The id is compared against the first
        whitespace-separated field of each read's id.
    @raise ValueError: If no sequence with the wanted id is found or, when
        C{id_} is C{None}, if the file contains no sequences.
    @return: The first matching read from the file.
    """
    for read in FastaReads(filename):
        if id_ is None:
            return read
        # A blank read id has no first field: treat it as a non-match
        # instead of failing with an IndexError.
        fields = read.id.split()
        if fields and fields[0] == id_:
            return read

    if id_ is None:
        # Nothing to report an id for: the file simply had no sequences.
        raise ValueError(f'No sequences found in {filename!r}.')

    raise ValueError(f'Sequence {id_} not found in {filename!r}.')
Esempio n. 26
0
 def testDisableAlphabetChecking(self):
     """
     Passing checkAlphabet=0 to FastaReads must turn alphabet checking
     off, so that a read with a non-alphabet character is still returned.
     """
     data = '>one\nat-at'
     with patch.object(builtins, 'open', mockOpen(read_data=data)):
         found = list(FastaReads(data, AARead, checkAlphabet=0))
         self.assertEqual(1, len(found))
Esempio n. 27
0
def needle(reads, verbose=False, options=None):
    """
    Run a Needleman-Wunsch alignment and return the two sequences.

    If needle reports that the sequences are too big, the alignment is
    retried with stretcher. The reads are written to a temporary directory,
    which is removed before returning, whether or not alignment succeeds.

    @param reads: An indexable collection of two reads (C{reads[0]} and
        C{reads[1]} are used).
    @param verbose: If C{True} print progress info to sys.stderr.
    @param options: Additional options to pass to needle.
    @raise CalledProcessError: If needle fails for a reason other than the
        sequences being too big.
    @return: A C{Reads} instance with the two aligned sequences.
    """
    tempdir = mkdtemp()

    try:
        file1 = join(tempdir, 'file1.fasta')
        with open(file1, 'w') as fp:
            print(reads[0].toString('fasta'), end='', file=fp)

        file2 = join(tempdir, 'file2.fasta')
        with open(file2, 'w') as fp:
            print(reads[1].toString('fasta'), end='', file=fp)

        out = join(tempdir, 'result.fasta')

        def useStderr(e):
            # True for any failure other than needle's "too big" complaint,
            # which is handled below by falling back to stretcher.
            return "Sequences too big. Try 'stretcher'" not in e.stderr

        if verbose:
            print('Running needle.', file=sys.stderr)
        try:
            Executor().execute("needle -asequence '%s' -bsequence '%s' %s "
                               "-outfile '%s' -aformat fasta" %
                               (file1, file2, options or '', out),
                               useStderr=useStderr)
        except CalledProcessError as e:
            if useStderr(e):
                raise
            else:
                if verbose:
                    print(
                        'Sequences too long for needle. Falling back to '
                        'stretcher. Be patient!',
                        file=sys.stderr)
                Executor().execute(
                    "stretcher -asequence '%s' -bsequence '%s' "
                    "-auto "
                    "-outfile '%s' -aformat fasta" %
                    (file1, file2, out))

        # Use 'list' in the following to force reading the FASTA from disk
        # before the temporary directory is removed.
        return Reads(list(FastaReads(out)))
    finally:
        # Always clean up, even when an alignment error propagates.
        rmtree(tempdir)
Esempio n. 28
0
    def getSubjectSequence(self, title):
        """
        Look up a subject sequence by title.

        The lookup is cached in self._subjectTitleToSubject. It can come
        from either a) the FASTA that was originally given to BLAST (via the
        databaseFilename argument to __init__), or b) an sqlite database
        (given via the sqliteDatabaseFilename argument). If neither is
        available, the BLAST database is queried via blastdbcmd for each
        call, with nothing cached (this can be unreliable - occasionally
        failing to find subjects that are in its database).

        @param title: A C{str} sequence title from a BLAST hit. Of the form
            'gi|63148399|gb|DQ011818.1| Description...'.
        @return: An C{AARead} or C{DNARead} instance, depending on the type of
            BLAST database in use.

        """
        isProtein = self.params.application in {'blastp', 'blastx'}
        readClass = AARead if isProtein else DNARead

        # Fast path: the lookup was already built by an earlier call.
        if self._subjectTitleToSubject is not None:
            return self._subjectTitleToSubject[title]

        if self._databaseFilename is not None:
            # Build an in-memory dict to look up subjects. This only
            # works for small databases, obviously.
            self._subjectTitleToSubject = {
                read.id: read
                for read in FastaReads(self._databaseFilename,
                                       readClass=readClass)
            }
        elif self._sqliteDatabaseFilename is not None:
            # An Sqlite3 database is used to look up subjects.
            self._subjectTitleToSubject = SqliteIndex(
                self._sqliteDatabaseFilename,
                fastaDirectory=self._databaseDirectory,
                readClass=readClass)
        else:
            # Fall back to blastdbcmd.  ncbidb has to be imported
            # as below so ncbidb.getSequence can be patched by our
            # test suite.
            from dark import ncbidb
            seq = ncbidb.getSequence(
                title, self.params.applicationParams['database'])
            return readClass(seq.description, str(seq.seq))

        return self._subjectTitleToSubject[title]
Esempio n. 29
0
    def getSubjectSequence(self, title):
        """
        Look up a subject sequence by title, reading the database FASTA file
        into a cached C{dict} on first use.

        @param title: A C{str} sequence title from a DIAMOND hit.
        @raise KeyError: If the C{title} is not present in the DIAMOND
            database.
        @return: An C{AAReadWithX} instance.
        """
        # Fast path: the lookup was already built by an earlier call.
        if self._subjectTitleToSubject is not None:
            return self._subjectTitleToSubject[title]

        subjects = FastaReads(self._databaseFilename,
                              readClass=AAReadWithX, checkAlphabet=0)
        lookup = {read.id: read for read in subjects}
        self._subjectTitleToSubject = lookup

        return lookup[title]
Esempio n. 30
0
 def testAlphabetIsCheckedAndRaisesValueErrorOnFirstRead(self):
     """
     The default behavior of a FastaReads instance is to check to ensure
     its sequences have the correct alphabet and to raise ValueError if not.
     A non-alphabetic character in the first read must be detected.
     """
     data = '\n'.join([
         '>one',
         'at-at',
     ])
     # Raw strings keep the regex backslashes literal without relying on
     # Python passing unknown escape sequences through unchanged.
     error = (r"^Read alphabet \('-AT'\) is not a subset of expected "
              r"alphabet \('ACDEFGHIKLMNPQRSTVWY'\) for read class "
              r"AARead\.$")
     mockOpener = mockOpen(read_data=data)
     with patch.object(builtins, 'open', mockOpener):
         six.assertRaisesRegex(self, ValueError, error, list,
                               FastaReads(data, AARead))
Esempio n. 31
0
 def testOnlyCheckSomeAlphabets(self):
     """
     With checkAlphabet=1 only the first read has its alphabet checked, so
     a non-alphabetic character in the second read must not prevent that
     read from being returned.
     """
     data = '>one\natat\n>two\nat-at'
     with patch.object(builtins, 'open', mockOpen(read_data=data)):
         found = list(FastaReads(data, AARead, checkAlphabet=1))
         self.assertEqual(2, len(found))
         secondRead = found[1]
         self.assertEqual('at-at', secondRead.sequence)
def check(fastaFile, jsonFiles):
    """
    Check for simple consistency between the FASTA file and the JSON files.

    Note that some checking is already performed by the BlastReadsAlignments
    class. That includes checking the number of reads matches the number of
    BLAST records and that read ids and BLAST record read ids match.

    @param fastaFile: The C{str} name of a FASTA-containing file.
    @param jsonFiles: A C{list} of names of our BLAST JSON. These
        may be compressed (as bz2).
    @raise AssertionError: If an HSP query (minus gaps) is longer than its
        FASTA sequence, or if the HSP read offsets exceed the FASTA length
        or are not non-decreasing.
    """
    reads = FastaReads(fastaFile)
    readsAlignments = BlastReadsAlignments(reads, jsonFiles)
    for index, readAlignments in enumerate(readsAlignments):

        # Check that all the alignments in the BLAST JSON do not have query
        # sequences or query offsets that are greater than the length of
        # the sequence given in the FASTA file.
        #
        # AssertionError is raised explicitly (rather than via 'assert') so
        # that the checks are still performed under 'python -O'.
        fastaLen = len(readAlignments.read)
        for readAlignment in readAlignments:
            for hsp in readAlignment.hsps:
                # The FASTA sequence should be at least as long as the
                # query in the JSON BLAST record (minus any gaps).
                if not (fastaLen >=
                        len(hsp.query) - hsp.query.count('-')):
                    raise AssertionError(
                        'record %d: FASTA len %d < HSP query len %d.\n'
                        'FASTA: %s\nQuery match: %s' % (
                            index, fastaLen, len(hsp.query),
                            readAlignments.read.sequence, hsp.query))
                # The FASTA sequence length should be larger than either of
                # the query offsets mentioned in the JSON BLAST
                # record. That's because readStart and readEnd are offsets
                # into the read - so they can't be bigger than the read
                # length.
                #
                # TODO: These messages should be more informative.
                if not (fastaLen >= hsp.readEnd >= hsp.readStart):
                    raise AssertionError(
                        'record %d: FASTA len %d not greater than both read '
                        'offsets (%d - %d), or read offsets are '
                        'non-increasing. '
                        'FASTA: %s\nQuery match: %s' % (
                            index, fastaLen, hsp.readStart, hsp.readEnd,
                            readAlignments.read.sequence, hsp.query))
Esempio n. 33
0
    def saveReducedFasta(self, significantOffsets, outputDir):
        """
        Write out FASTA that contains reads with bases just at the
        significant offsets.

        Currently this only reports that saving is not implemented and
        returns; the code after the C{return} is never executed.

        @param significantOffsets: A C{set} of significant offsets.
        @param outputDir: A C{str} directory path.
        """
        self.report('    Saving reduced FASTA')
        print('    Saving reduced FASTA not implemented yet')
        return

        # Unreachable: kept as the intended implementation.
        onlyGaps = '-' * len(significantOffsets)

        def dropAllGapReads(read):
            if read.sequence == onlyGaps:
                return None
            return read

        reduced = FastaReads(self.fastaFile).filter(
            keepSites=significantOffsets)
        reduced.filter(modifier=dropAllGapReads).save(
            join(outputDir, 'reduced.fasta'))
Esempio n. 34
0
#!/usr/bin/env python

from __future__ import print_function

import argparse

from dark.fasta import FastaReads


if __name__ == "__main__":
    # Command line: a FASTA file and a one-based position to summarize.
    parser = argparse.ArgumentParser(
        description=(
            "For a fasta file with sequences, summarize what is "
            "happening at a specific position."
        )
    )
    parser.add_argument(
        "--fastaFile",
        required=True,
        help="The name of the FASTA file to read.",
    )
    parser.add_argument(
        "--position",
        required=True,
        type=int,
        help="The (one-based) position to examine.",
    )
    args = parser.parse_args()

    reads = FastaReads(args.fastaFile)
    # summarizePosition is given a zero-based offset.
    summary = reads.summarizePosition(args.position - 1)
    excluded = summary["excludedCount"]

    print("%d of %d sequences were excluded due to length." % (excluded, len(reads)))

    # Dividing a count by this value gives a percentage of included reads.
    denominator = (len(reads) - excluded) / 100.0
    for base, count in summary["countAtPosition"].items():
        percent = count / denominator
        print("%s: Total: %s (%.2f%%)" % (base, count, percent))
Esempio n. 35
0
        help='sequences that are matched with a median score that is '
        'worse will be elided.')

    parser.add_argument(
        '--withScoreBetterThan', type=float, default=None,
        help='sequences that are matched without at least one score '
        'at least this good will be elided.')

    parser.add_argument(
        '--minNewReads', type=float, default=None,
        help='The fraction of its reads by which a new read set must differ '
        'from all previously seen read sets in order to be considered '
        'acceptably different.')

    args = parser.parse_args()
    reads = FastaReads(args.fasta)
    readsAlignments = BlastReadsAlignments(reads, args.json)

    # Convert white/blacklists lists to sets.
    if args.whitelist is not None:
        args.whitelist = set(args.whitelist)
    if args.blacklist is not None:
        args.blacklist = set(args.blacklist)

    readsAlignments.filter(
        minSequenceLen=args.minSequenceLen,
        maxSequenceLen=args.maxSequenceLen,
        minStart=args.minStart,
        maxStop=args.maxStop,
        oneAlignmentPerRead=args.oneAlignmentPerRead,
        maxHspsPerHit=args.maxHspsPerHit,