Ejemplo n.º 1
0
    def add(self, virusTitle, sampleName):
        """
        Register a (virus title, sample name) pair and return the name of its
        FASTA file, writing that file first if it has not been written yet.

        @param virusTitle: A C{str} virus title.
        @param sampleName: A C{str} sample name.
        @return: A C{str} FASTA file name holding all the reads (without
            duplicates) from the sample that matched the proteins in the given
            virus.
        """
        # Titles and names are numbered in order of first appearance; the
        # pair of numbers identifies the output file.
        key = (
            self._viruses.setdefault(virusTitle, len(self._viruses)),
            self._samples.setdefault(sampleName, len(self._samples)),
        )

        try:
            return self._fastaFilenames[key]
        except KeyError:
            # Not written yet: fall through and build the file.
            pass

        collected = Reads()
        matches = self._proteinGrouper.virusTitles[virusTitle][sampleName]
        for proteinMatch in matches:
            matchReads = FastaReads(proteinMatch['fastaFilename'],
                                    checkAlphabet=0)
            for read in matchReads:
                collected.add(read)

        # The output directory is taken from the last protein match seen.
        outDir = proteinMatch['outDir']
        saveFilename = join(outDir, 'virus-%d-sample-%d.fasta' % key)
        collected.filter(removeDuplicates=True).save(saveFilename)
        self._fastaFilenames[key] = saveFilename
        return saveFilename
Ejemplo n.º 2
0
    def add(self, pathogenName, sampleName):
        """
        Add a (pathogen name, sample name) combination and get its FASTA/FASTQ
        file name. Write the FASTA/FASTQ file if it does not already exist
        and, when it is written, store the value returned by the save call
        under the C{'uniqueReadCount'} key in C{self._proteinGrouper}.

        @param pathogenName: A C{str} pathogen name.
        @param sampleName: A C{str} sample name.
        @return: A C{str} giving the FASTA/FASTQ file name holding all the
            reads (without duplicates, by id) from the sample that matched the
            proteins in the given pathogen.
        """
        # Pathogens and samples are numbered in order of first appearance.
        pathogenIndex = self._pathogens.setdefault(pathogenName,
                                                   len(self._pathogens))
        sampleIndex = self._samples.setdefault(sampleName, len(self._samples))

        try:
            # Already written for this combination: reuse the file name.
            return self._readsFilenames[(pathogenIndex, sampleIndex)]
        except KeyError:
            reads = Reads()
            for proteinMatch in self._proteinGrouper.pathogenNames[
                    pathogenName][sampleName]['proteins'].values():
                for read in self._readsClass(proteinMatch['readsFilename']):
                    reads.add(read)
            # The output directory is taken from the last protein match seen.
            # NOTE(review): this assumes there is at least one protein match
            # for the combination - confirm against callers.
            saveFilename = join(
                proteinMatch['outDir'], 'pathogen-%d-sample-%d.%s' %
                (pathogenIndex, sampleIndex, self._format))
            # Save the object returned by filter() instead of discarding it,
            # so duplicates are removed whether filter() modifies the
            # instance in place or returns a new filtered instance.
            nReads = reads.filter(removeDuplicatesById=True).save(
                saveFilename, format_=self._format)
            # Save the unique read count into self._proteinGrouper
            self._proteinGrouper.pathogenNames[pathogenName][sampleName][
                'uniqueReadCount'] = nReads
            self._readsFilenames[(pathogenIndex, sampleIndex)] = saveFilename
            return saveFilename
Ejemplo n.º 3
0
 def testFilterOnMaxLength(self):
     """
     Only reads that are no longer than the given maximum length must be
     returned by the filter.
     """
     longRead = Read('id1', 'ATCG')
     shortRead = Read('id2', 'ACG')
     reads = Reads()
     for read in (longRead, shortRead):
         reads.add(read)
     filtered = list(reads.filter(maxLength=3))
     self.assertEqual([shortRead], filtered)
Ejemplo n.º 4
0
 def testFilterWithMinLengthEqualToMaxLength(self):
     """
     If the minimum and maximum lengths passed to the filter are equal, a
     read of exactly that length must be returned.
     """
     fourLong = Read('id1', 'ATCG')
     threeLong = Read('id2', 'ACG')
     reads = Reads()
     for read in (fourLong, threeLong):
         reads.add(read)
     filtered = list(reads.filter(minLength=4, maxLength=4))
     self.assertEqual([fourLong], filtered)
Ejemplo n.º 5
0
 def testFilterOnLengthEverythingMatches(self):
     """
     If every read satisfies the length requirements, the filter must
     return all of them, in the order they were added.
     """
     first = Read('id1', 'ATCG')
     second = Read('id2', 'ACG')
     reads = Reads()
     for read in (first, second):
         reads.add(read)
     filtered = list(reads.filter(minLength=2, maxLength=5))
     self.assertEqual([first, second], filtered)
Ejemplo n.º 6
0
 def testFilterOnLengthNothingMatches(self):
     """
     If no read satisfies the length requirements, the filter must return
     nothing.
     """
     reads = Reads()
     for read in (Read('id1', 'ATCG'), Read('id2', 'ACG')):
         reads.add(read)
     filtered = list(reads.filter(minLength=10, maxLength=15))
     self.assertEqual([], filtered)
Ejemplo n.º 7
0
    def add(self, pathogenName, sampleName):
        """
        Add a (pathogen name, sample name) combination and get its FASTA/FASTQ
        file name. Write the FASTA/FASTQ file if it does not already exist
        and, when it is written, store the value returned by the save call
        under the C{'uniqueReadCount'} key in C{self._proteinGrouper}.

        @param pathogenName: A C{str} pathogen name.
        @param sampleName: A C{str} sample name.
        @return: A C{str} giving the FASTA/FASTQ file name holding all the
            reads (without duplicates, by id) from the sample that matched the
            proteins in the given pathogen.
        """
        # Pathogens and samples are numbered in order of first appearance.
        pathogenIndex = self._pathogens.setdefault(pathogenName,
                                                   len(self._pathogens))
        sampleIndex = self._samples.setdefault(sampleName, len(self._samples))

        try:
            # Already written for this combination: reuse the file name.
            return self._readsFilenames[(pathogenIndex, sampleIndex)]
        except KeyError:
            reads = Reads()
            for proteinMatch in self._proteinGrouper.pathogenNames[
                    pathogenName][sampleName]['proteins'].values():
                for read in self._readsClass(proteinMatch['readsFilename']):
                    reads.add(read)
            # The output directory is taken from the last protein match seen.
            # NOTE(review): this assumes there is at least one protein match
            # for the combination - confirm against callers.
            saveFilename = join(
                proteinMatch['outDir'],
                'pathogen-%d-sample-%d.%s' % (pathogenIndex, sampleIndex,
                                              self._format))
            # Save the object returned by filter() instead of discarding it,
            # so duplicates are removed whether filter() modifies the
            # instance in place or returns a new filtered instance.
            nReads = reads.filter(removeDuplicatesById=True).save(
                saveFilename, format_=self._format)
            # Save the unique read count into self._proteinGrouper
            self._proteinGrouper.pathogenNames[
                pathogenName][sampleName]['uniqueReadCount'] = nReads
            self._readsFilenames[(pathogenIndex, sampleIndex)] = saveFilename
            return saveFilename
Ejemplo n.º 8
0
        sys.exit(2)

    # Optionally report each variable site on standard error, then either
    # print the site numbers or save the reads restricted to those sites.
    if variableSites:
        toDelete = set()
        if args.printSites:
            for site, counts in variableSites.items():
                if site >= baseOffset:
                    # Sites are printed 1-based, relative to baseOffset, with
                    # the reference value at the site when a reference is
                    # available.
                    ref = ((' (ref %s)' %
                            reference.sequence[site]) if reference else '')
                    print('%d: %s%s' % (site + 1 - baseOffset, counts, ref),
                          file=sys.stderr)
                else:
                    # Deletion is deferred so that the dict is not modified
                    # while it is being iterated.
                    toDelete.add(site)

        # NOTE(review): sites below baseOffset are only collected (and so
        # only removed here) when args.printSites is set - confirm that this
        # is intended.
        for site in toDelete:
            del variableSites[site]

    # Tested again because the deletions above may have emptied the dict.
    if variableSites:
        if args.sitesOnly:
            # Comma-separated, sorted, 1-based site numbers relative to
            # baseOffset.
            print(','.join(
                map(lambda site: str(site + 1 - baseOffset),
                    sorted(variableSites))))
        else:
            saveAs = 'fasta' if args.fasta else 'fastq'
            reads.filter(keepSites=set(variableSites)).save(sys.stdout, saveAs)
            printHeader(variableSites, args, baseOffset)
    else:
        print('No sites were %svariable (threshold for homogeneity: %.3f).' %
              ('confirmed ' if args.confirm else '', args.homogeneous),
              file=sys.stderr)