def __init__(self, reads, blastFilenames, databaseFilename=None,
             databaseDirectory=None, sqliteDatabaseFilename=None,
             scoreClass=HigherIsBetterScore, sortBlastFilenames=True,
             randomizeZeroEValues=True):
    """
    Initialize from one or more BLAST output files.

    @param reads: The reads, passed unchanged to
        C{ReadsAlignments.__init__}.
    @param blastFilenames: Either a single C{str} filename or a non-empty
        sequence of filenames. The first filename (after any sorting) is
        opened to obtain the BLAST application name and parameters.
    @param databaseFilename: Stored on the instance as
        C{_databaseFilename}; not otherwise used here.
    @param databaseDirectory: Stored on the instance as
        C{_databaseDirectory}; not otherwise used here.
    @param sqliteDatabaseFilename: Stored on the instance as
        C{_sqliteDatabaseFilename}; not otherwise used here.
    @param scoreClass: The score class, passed to C{self._getReader} and
        to C{ReadsAlignments.__init__}. It also selects the score title.
    @param sortBlastFilenames: If true, C{blastFilenames} is passed
        through C{numericallySortFilenames} before being stored.
    @param randomizeZeroEValues: Stored on the instance as
        C{randomizeZeroEValues}; not otherwise used here.
    """
    # Accept a lone filename for convenience. isinstance (rather than an
    # exact type comparison) also recognizes str subclasses, which would
    # otherwise be treated as an iterable of single characters.
    if isinstance(blastFilenames, str):
        blastFilenames = [blastFilenames]
    if sortBlastFilenames:
        self.blastFilenames = numericallySortFilenames(blastFilenames)
    else:
        self.blastFilenames = blastFilenames
    self._databaseFilename = databaseFilename
    self._sqliteDatabaseFilename = sqliteDatabaseFilename
    self._databaseDirectory = databaseDirectory
    self._subjectTitleToSubject = None
    self.randomizeZeroEValues = randomizeZeroEValues

    # Prepare application parameters in order to initialize self. The
    # parameters are taken from the first file only.
    self._reader = self._getReader(self.blastFilenames[0], scoreClass)
    application = self._reader.application
    # Deep copy so that changes to our parameters cannot affect the
    # reader's own copy.
    blastParams = copy.deepcopy(self._reader.params)
    # Every application other than blastx is treated as having
    # nucleotide subjects.
    subjectIsNucleotides = application != 'blastx'
    scoreTitle = ('Bit score' if scoreClass is HigherIsBetterScore
                  else '$- log_{10}(e)$')

    applicationParams = ReadsAlignmentsParams(
        application, blastParams,
        subjectIsNucleotides=subjectIsNucleotides,
        scoreTitle=scoreTitle)

    ReadsAlignments.__init__(self, reads, applicationParams,
                             scoreClass=scoreClass)
def __init__(self, reads, blastFilenames, scoreClass=HigherIsBetterScore,
             sortBlastFilenames=True, randomizeZeroEValues=True):
    """
    Initialize from one or more BLAST output files.

    @param reads: The reads, passed unchanged to
        C{ReadsAlignments.__init__}.
    @param blastFilenames: Either a single C{str} filename or a non-empty
        sequence of filenames. The first filename (after any sorting) is
        opened to obtain the BLAST application name and parameters.
    @param scoreClass: The score class, passed to C{self._getReader} and
        to C{ReadsAlignments.__init__}. It also selects the score title.
    @param sortBlastFilenames: If true, C{blastFilenames} is passed
        through C{numericallySortFilenames} before being stored.
    @param randomizeZeroEValues: Stored on the instance as
        C{randomizeZeroEValues}; not otherwise used here.
    """
    # Accept a lone filename for convenience. isinstance (rather than an
    # exact type comparison) also recognizes str subclasses, which would
    # otherwise be treated as an iterable of single characters.
    if isinstance(blastFilenames, str):
        blastFilenames = [blastFilenames]
    if sortBlastFilenames:
        self.blastFilenames = numericallySortFilenames(blastFilenames)
    else:
        self.blastFilenames = blastFilenames
    self.randomizeZeroEValues = randomizeZeroEValues

    # Prepare application parameters in order to initialize self. The
    # parameters are taken from the first file only.
    self._reader = self._getReader(self.blastFilenames[0], scoreClass)
    application = self._reader.application
    # Deep copy so that changes to our parameters cannot affect the
    # reader's own copy.
    blastParams = copy.deepcopy(self._reader.params)
    # Every application other than blastx is treated as having
    # nucleotide subjects.
    subjectIsNucleotides = application != 'blastx'
    scoreTitle = ('Bit score' if scoreClass is HigherIsBetterScore
                  else '$- log_{10}(e)$')

    applicationParams = ReadsAlignmentsParams(
        application, blastParams,
        subjectIsNucleotides=subjectIsNucleotides,
        scoreTitle=scoreTitle)

    ReadsAlignments.__init__(self, reads, applicationParams,
                             scoreClass=scoreClass)
def __init__(self, resultFilenames, database, sortFilenames=True):
    """
    Initialize from one or more result files and a database.

    @param resultFilenames: Either a single C{str} filename or a non-empty
        sequence of filenames. The first filename (after any sorting) is
        opened to obtain the parameters.
    @param database: An object whose C{getSubjects} method yields subjects
        that each have a C{read} with C{id} and C{sequence} attributes.
    @param sortFilenames: If true, C{resultFilenames} is passed through
        C{numericallySortFilenames} before being stored.
    """
    # A lone filename is treated as a one-element list.
    names = ([resultFilenames] if isinstance(resultFilenames, str)
             else resultFilenames)
    self.resultFilenames = (numericallySortFilenames(names)
                            if sortFilenames else names)
    self._database = database

    # Lookup table from each database subject's read id to its sequence.
    self._subjects = {
        subject.read.id: subject.read.sequence
        for subject in database.getSubjects()
    }

    # The first result file supplies the parameters; they are deep-copied
    # before being handed to ReadsAlignmentsParams.
    self._reader = self._getReader(self.resultFilenames[0])
    applicationParams = ReadsAlignmentsParams(
        'light', applicationParams=copy.deepcopy(self._reader.params),
        subjectIsNucleotides=False)

    # self._reads starts out empty and is added to as the results (which
    # contain the read ids and sequences) are processed. The reads are
    # therefore not available to the ReadsAlignments instance until all
    # results have been read.
    self._reads = Reads()
    ReadsAlignments.__init__(self, self._reads, applicationParams)
def __init__(self, reads, filenames, databaseFilename=None,
             databaseDirectory=None, sqliteDatabaseFilename=None,
             scoreClass=HigherIsBetterScore, sortFilenames=False,
             randomizeZeroEValues=True):
    """
    Initialize from one or more DIAMOND output files.

    @param reads: The reads, passed unchanged to
        C{ReadsAlignments.__init__}.
    @param filenames: Either a single C{str} filename or a non-empty
        sequence of filenames. The first filename (after any sorting) is
        opened to obtain the DIAMOND task and parameters.
    @param databaseFilename: Stored on the instance as
        C{_databaseFilename}; not otherwise used here.
    @param databaseDirectory: Stored on the instance as
        C{_databaseDirectory}; not otherwise used here.
    @param sqliteDatabaseFilename: Stored on the instance as
        C{_sqliteDatabaseFilename}; not otherwise used here.
    @param scoreClass: The score class, passed to C{self._getReader} and
        to C{ReadsAlignments.__init__}. It also selects the score title.
    @param sortFilenames: If true, C{filenames} is passed through
        C{numericallySortFilenames} before being stored. Off by default.
    @param randomizeZeroEValues: Stored on the instance as
        C{randomizeZeroEValues}; not otherwise used here.
    """
    # Accept a lone filename for convenience. isinstance (rather than an
    # exact type comparison) also recognizes str subclasses, which would
    # otherwise be treated as an iterable of single characters.
    if isinstance(filenames, str):
        filenames = [filenames]
    if sortFilenames:
        self.filenames = numericallySortFilenames(filenames)
    else:
        self.filenames = filenames
    self._databaseFilename = databaseFilename
    self._sqliteDatabaseFilename = sqliteDatabaseFilename
    self._databaseDirectory = databaseDirectory
    self._subjectTitleToSubject = None
    self.randomizeZeroEValues = randomizeZeroEValues

    # Prepare diamondTask parameters in order to initialize self. The
    # parameters are taken from the first file only.
    self._reader = self._getReader(self.filenames[0], scoreClass)
    diamondTask = self._reader.diamondTask
    # Deep copy so that changes to our parameters cannot affect the
    # reader's own copy.
    diamondParams = copy.deepcopy(self._reader.params)
    scoreTitle = ('Bit score' if scoreClass is HigherIsBetterScore
                  else '$- log_{10}(e)$')

    diamondTaskParams = ReadsAlignmentsParams(
        diamondTask, diamondParams,
        subjectIsNucleotides=False,  # DIAMOND dbs are always protein.
        scoreTitle=scoreTitle)

    ReadsAlignments.__init__(self, reads, diamondTaskParams,
                             scoreClass=scoreClass)
def testSeveralNames(self):
    """
    Several numeric names given out of order must come back as a
    correctly sorted list.
    """
    unsorted = ['3.json', '1.json', '2.json']
    expected = ['1.json', '2.json', '3.json']
    self.assertEqual(expected, numericallySortFilenames(unsorted))
def testSeveralNames(self):
    """
    Passing several numeric names in a shuffled order must produce the
    names in correctly sorted order.
    """
    result = numericallySortFilenames(['3.json', '1.json', '2.json'])
    self.assertEqual(['1.json', '2.json', '3.json'], result)
def testSeveralNamesWithUnequalPrefixLengths(self):
    """
    Names whose numeric prefixes have different numbers of digits must
    still come back as a correctly sorted list.
    """
    unsorted = ['3.json', '21.json', '35.json', '250.json', '2.json']
    expected = ['2.json', '3.json', '21.json', '35.json', '250.json']
    self.assertEqual(expected, numericallySortFilenames(unsorted))
def testBasename(self):
    """
    The file basename, not the leading directory part, must determine
    the sort order.
    """
    unsorted = [
        '../output/3.json',
        '../output/21.json',
        '../output/35.json',
        '../output/250.json',
        '../output/2.json',
    ]
    expected = [
        '../output/2.json',
        '../output/3.json',
        '../output/21.json',
        '../output/35.json',
        '../output/250.json',
    ]
    self.assertEqual(expected, numericallySortFilenames(unsorted))
def testOneNumericName(self):
    """
    Sorting a list holding just one numeric name must give back a list
    holding that same name.
    """
    result = numericallySortFilenames(['3.json'])
    self.assertEqual(['3.json'], result)
def testNoNames(self):
    """
    Sorting an empty list must give back an empty list.
    """
    result = numericallySortFilenames([])
    self.assertEqual([], result)
# Flatten lists of lists that we get from using both nargs='+' and # action='append'. We use both because it allows people to use (e.g.) # --json on the command line either via "--json file1 --json file2" or # "--json file1 file2", or a combination of these. That way it's not # necessary to remember which way you're supposed to use it and you also # can't be hit by the subtle problem encountered in # https://github.com/acorg/dark-matter/issues/453 jsonFiles = list(chain.from_iterable(args.json)) whitelist = ( set(chain.from_iterable(args.whitelist)) if args.whitelist else None) blacklist = ( set(chain.from_iterable(args.blacklist)) if args.blacklist else None) if args.fasta: if args.sortFilenames: files = numericallySortFilenames(chain.from_iterable(args.fasta)) else: files = list(chain.from_iterable(args.fasta)) reads = FastaReads(files) else: if args.sortFilenames: files = numericallySortFilenames(chain.from_iterable(args.fastq)) else: files = list(chain.from_iterable(args.fastq)) reads = FastqReads(files) if args.matcher == 'blast': from dark.blast.alignments import BlastReadsAlignments readsAlignments = BlastReadsAlignments( reads, jsonFiles, databaseFilename=args.databaseFastaFilename, databaseDirectory=args.databaseFastaDirectory,
# --json on the command line either via "--json file1 --json file2" or # "--json file1 file2", or a combination of these. That way it's not # necessary to remember which way you're supposed to use it and you also # can't be hit by the subtle problem encountered in # https://github.com/acorg/dark-matter/issues/453 jsonFiles = list(chain.from_iterable(args.json)) whitelist = ( set(chain.from_iterable(args.whitelist)) if args.whitelist else None) blacklist = ( set(chain.from_iterable(args.blacklist)) if args.blacklist else None) # TODO: Add a --readClass command-line option in case we want to # process FASTA containing AA sequences. if args.fasta: if args.sortFilenames: files = numericallySortFilenames(chain.from_iterable(args.fasta)) else: files = list(chain.from_iterable(args.fasta)) reads = FastaReads(files) else: if args.sortFilenames: files = numericallySortFilenames(chain.from_iterable(args.fastq)) else: files = list(chain.from_iterable(args.fastq)) reads = FastqReads(files) if args.matcher == 'blast': from dark.blast.alignments import BlastReadsAlignments readsAlignments = BlastReadsAlignments( reads, jsonFiles, databaseFilename=args.databaseFastaFilename, databaseDirectory=args.databaseFastaDirectory,