def add(self, virusTitle, sampleName):
    """
    Register a (virus title, sample name) pair and return the name of its
    FASTA file, writing that file first if the pair has not been seen
    before.

    @param virusTitle: A C{str} virus title.
    @param sampleName: A C{str} sample name.
    @return: A C{str} FASTA file name holding all the reads (without
        duplicates) from the sample that matched the proteins in the
        given virus.
    """
    virusIndex = self._viruses.setdefault(virusTitle, len(self._viruses))
    sampleIndex = self._samples.setdefault(sampleName, len(self._samples))
    key = (virusIndex, sampleIndex)

    try:
        return self._fastaFilenames[key]
    except KeyError:
        # First request for this combination: fall through and build the
        # file below.
        pass

    matches = self._proteinGrouper.virusTitles[virusTitle][sampleName]
    collected = Reads()
    for proteinMatch in matches:
        fastaReads = FastaReads(proteinMatch['fastaFilename'],
                                checkAlphabet=0)
        for read in fastaReads:
            collected.add(read)

    # The output directory is taken from the last protein match iterated
    # over above.
    basename = 'virus-%d-sample-%d.fasta' % key
    saveFilename = join(proteinMatch['outDir'], basename)

    deduplicated = collected.filter(removeDuplicates=True)
    deduplicated.save(saveFilename)

    self._fastaFilenames[key] = saveFilename
    return saveFilename
def add(self, pathogenName, sampleName):
    """
    Add a (pathogen name, sample name) combination and get the name of its
    FASTA/FASTQ file.

    Write the FASTA/FASTQ file if it has not already been written by an
    earlier call. When the file is written, the value returned by saving
    the de-duplicated reads is also stored into C{self._proteinGrouper}
    under the C{'uniqueReadCount'} key for this pathogen and sample.

    @param pathogenName: A C{str} pathogen name.
    @param sampleName: A C{str} sample name.
    @return: A C{str} giving the FASTA/FASTQ file name holding all the
        reads (without duplicates, by id) from the sample that matched the
        proteins in the given pathogen.
    """
    pathogenIndex = self._pathogens.setdefault(pathogenName,
                                               len(self._pathogens))
    sampleIndex = self._samples.setdefault(sampleName, len(self._samples))
    key = (pathogenIndex, sampleIndex)

    try:
        return self._readsFilenames[key]
    except KeyError:
        sampleInfo = self._proteinGrouper.pathogenNames[
            pathogenName][sampleName]

        reads = Reads()
        for proteinMatch in sampleInfo['proteins'].values():
            for read in self._readsClass(proteinMatch['readsFilename']):
                reads.add(read)

        # The output directory is taken from the last protein match
        # iterated over above, so at least one protein match is required.
        saveFilename = join(
            proteinMatch['outDir'],
            'pathogen-%d-sample-%d.%s' % (pathogenIndex, sampleIndex,
                                          self._format))

        # Save the collection returned by filter() rather than the
        # original one, so duplicates are removed regardless of whether
        # filter() works in place or returns a new collection.
        uniqueReads = reads.filter(removeDuplicatesById=True)
        nReads = uniqueReads.save(saveFilename, format_=self._format)

        # Save the unique read count into self._proteinGrouper
        sampleInfo['uniqueReadCount'] = nReads
        self._readsFilenames[key] = saveFilename
        return saveFilename
def testFilterOnMaxLength(self):
    """
    Filtering with a maximum length must only return reads that are no
    longer than that length.
    """
    longRead = Read('id1', 'ATCG')
    shortRead = Read('id2', 'ACG')
    reads = Reads()
    for read in (longRead, shortRead):
        reads.add(read)
    filtered = reads.filter(maxLength=3)
    self.assertEqual([shortRead], list(filtered))
def testFilterWithMinLengthEqualToMaxLength(self):
    """
    A read whose length is exactly equal to both the minimum and the
    maximum length passed to filter must be returned.
    """
    fourLong = Read('id1', 'ATCG')
    threeLong = Read('id2', 'ACG')
    reads = Reads()
    for read in (fourLong, threeLong):
        reads.add(read)
    filtered = reads.filter(minLength=4, maxLength=4)
    self.assertEqual([fourLong], list(filtered))
def testFilterOnLengthEverythingMatches(self):
    """
    If every read satisfies the length requirements, filtering on length
    must return all of them, in their original order.
    """
    first = Read('id1', 'ATCG')
    second = Read('id2', 'ACG')
    reads = Reads()
    for read in (first, second):
        reads.add(read)
    filtered = reads.filter(minLength=2, maxLength=5)
    self.assertEqual([first, second], list(filtered))
def testFilterOnLengthNothingMatches(self):
    """
    If no read satisfies the length requirements, filtering on length
    must return nothing.
    """
    reads = Reads()
    for read in (Read('id1', 'ATCG'), Read('id2', 'ACG')):
        reads.add(read)
    filtered = reads.filter(minLength=10, maxLength=15)
    self.assertEqual([], list(filtered))
def add(self, pathogenName, sampleName):
    """
    Add a (pathogen name, sample name) combination and get the name of its
    FASTA/FASTQ file.

    Write the FASTA/FASTQ file if it has not already been written by an
    earlier call. When the file is written, the value returned by saving
    the de-duplicated reads is also stored into C{self._proteinGrouper}
    under the C{'uniqueReadCount'} key for this pathogen and sample.

    @param pathogenName: A C{str} pathogen name.
    @param sampleName: A C{str} sample name.
    @return: A C{str} giving the FASTA/FASTQ file name holding all the
        reads (without duplicates, by id) from the sample that matched the
        proteins in the given pathogen.
    """
    pathogenIndex = self._pathogens.setdefault(pathogenName,
                                               len(self._pathogens))
    sampleIndex = self._samples.setdefault(sampleName, len(self._samples))
    key = (pathogenIndex, sampleIndex)

    try:
        return self._readsFilenames[key]
    except KeyError:
        sampleInfo = self._proteinGrouper.pathogenNames[
            pathogenName][sampleName]

        reads = Reads()
        for proteinMatch in sampleInfo['proteins'].values():
            for read in self._readsClass(proteinMatch['readsFilename']):
                reads.add(read)

        # The output directory is taken from the last protein match
        # iterated over above, so at least one protein match is required.
        saveFilename = join(
            proteinMatch['outDir'],
            'pathogen-%d-sample-%d.%s' % (pathogenIndex, sampleIndex,
                                          self._format))

        # Save the collection returned by filter() rather than the
        # original one, so duplicates are removed regardless of whether
        # filter() works in place or returns a new collection.
        uniqueReads = reads.filter(removeDuplicatesById=True)
        nReads = uniqueReads.save(saveFilename, format_=self._format)

        # Save the unique read count into self._proteinGrouper
        sampleInfo['uniqueReadCount'] = nReads
        self._readsFilenames[key] = saveFilename
        return saveFilename
sys.exit(2) if variableSites: toDelete = set() if args.printSites: for site, counts in variableSites.items(): if site >= baseOffset: ref = ((' (ref %s)' % reference.sequence[site]) if reference else '') print('%d: %s%s' % (site + 1 - baseOffset, counts, ref), file=sys.stderr) else: toDelete.add(site) for site in toDelete: del variableSites[site] if variableSites: if args.sitesOnly: print(','.join( map(lambda site: str(site + 1 - baseOffset), sorted(variableSites)))) else: saveAs = 'fasta' if args.fasta else 'fastq' reads.filter(keepSites=set(variableSites)).save(sys.stdout, saveAs) printHeader(variableSites, args, baseOffset) else: print('No sites were %svariable (threshold for homogeneity: %.3f).' % ('confirmed ' if args.confirm else '', args.homogeneous), file=sys.stderr)