def armi():
    """
    Performs the necessary analyses on strains using armi targets

    Reads the module-level targetpath, seqdict and reportfolder variables. Every entry of seqdict is updated
    in place with the bait file, the list of individual target files, and the concatenated target file for
    this analysis type. Nothing is returned.

    :raises IndexError: if the bait folder contains neither a precomputed hash (*.gz) nor a bait file (*.fa*)
    """
    # Set the analysis type
    analysistype = "armi"
    # Set the path of the target files. NOTE(review): the two strings are joined without a separator, so
    # targetpath presumably ends with a path separator - confirm against where targetpath is set
    currenttargetpath = "%s%s" % (targetpath, analysistype)
    # Create the bait files as necessary
    baittargets(currenttargetpath, analysistype)
    # The cat file is all the target files concatenated together
    catfile = "%s/%sConcatenated.fasta" % (currenttargetpath, analysistype)
    # If the cat file doesn't exist, create it from the bait file generated in baittargets()
    if not os.path.isfile(catfile):
        shutil.copyfile("%s/bait/%sBait.fa" % (currenttargetpath, analysistype), catfile)
    # In order to save time, a precomputed hash file (.gz) is preferred; otherwise use the .fasta bait file
    baitfiles = glob("%s/bait/*.gz" % currenttargetpath) or glob("%s/bait/*.fa*" % currenttargetpath)
    if not baitfiles:
        # Same exception type as indexing an empty glob result, but with a message that names the folder
        raise IndexError("No bait files (*.gz or *.fa*) found in %s/bait" % currenttargetpath)
    baitfile = baitfiles[0]
    # Get the full target database into a variable, filtering out faidx processed targets and the cat file.
    # Only the file name is tested: a directory called e.g. "ConcatenatedTargets" must not empty the list
    armidatabase = []
    for target in glob("%s/*.fa*" % currenttargetpath):
        filename = os.path.basename(target)
        if ".fai" not in filename and "Concatenated" not in filename:
            armidatabase.append(target)
    # Add necessary variables to seqdict
    for folderName in seqdict:
        seqdict[folderName]["bait"]["fastqFiles"][analysistype] = baitfile
        seqdict[folderName]["targets"][analysistype] = armidatabase
        seqdict[folderName]["concatenatedTargets"][analysistype] = catfile
    print("Filtering .fastq files with %s targets" % analysistype)
    # Run the baiting process
    baitrprocesses(analysistype)
    print("\nIndexing %s targets" % analysistype)
    # Index the combined target file
    SMALTcombined.SMALTindexTargets(catfile, currenttargetpath)
    print('\nPerforming %s reference mapping' % analysistype)
    # Use SMALT to perform reference mapping of the combined target file
    SMALTcombined.SMALTmappingProcesses(seqdict, analysistype, "SMALT")
    print("\nSorting mapped %s files" % analysistype)
    # Use samtools to sort the bam file
    bamProcessorCombined.sortingprocesses(seqdict, analysistype)
    print('\nIndexing sorted %s files' % analysistype)
    # Use samtools to index the sorted bam file
    bamProcessorCombined.bamindexingprocesses(seqdict, analysistype)
    print('\nParsing %s results' % analysistype)
    # Use pysamstats to parse the bam files. Mike's armi module is called from within bamPysamStatsCombined
    bamPysamStatsCombined.bamParseProcesses(seqdict, analysistype, reportfolder)
def pathotyper(organismdict, organismlist, analysistype):
    """
    Performs the necessary analyses on strains using genus-specific pathotype/serotype targets

    Reads the module-level targetpath, seqdict and reportfolder variables. For every strain whose genus has a
    target folder for this analysis type, the strain's seqdict entry is updated in place with the bait file,
    the list of individual target files, and the concatenated target file. Strains without such a folder are
    skipped silently.

    :param organismdict: dictionary of the 16S results; iterated by strain, and the first key of each value is
    used as the genus
    :param organismlist: list of all the genera in the current analysis; only passed on to pathoreportr
    :param analysistype: string of the current analysis type
    """
    # Targets are stored in targetpath/Organism/<genus>/<analysistype>/
    currenttargetpath = "%sOrganism" % targetpath
    # Print this prior to the loop
    print('\nIndexing %s target files' % analysistype)
    # As strains will be processed differently based on their genus, they must be processed separately
    for strain in organismdict:
        # Strains without a genus call cannot be matched to a pathotyping scheme
        genera = list(organismdict[strain].keys())
        if not genera:
            continue
        genus = genera[0]
        # Genera without pathotyping schemes have no target folder - skip them explicitly rather than relying
        # on a blanket try/except IndexError, which also hid unrelated failures in the steps below
        pathopaths = glob("%s/%s/%s" % (currenttargetpath, genus, analysistype))
        if not pathopaths:
            continue
        # Set the target path
        pathopath = pathopaths[0]
        # Create the bait targets if necessary
        baittargets(pathopath, analysistype)
        # The cat file will be used in the "combined" reference mapping
        catfile = "%s/%sConcatenated.fasta" % (pathopath, analysistype)
        # If the cat file does not exist, copy the bait file from the bait folder to the target folder
        if not os.path.isfile(catfile):
            shutil.copyfile("%s/bait/%sBait.fa" % (pathopath, analysistype), catfile)
        # Find the files to be used in baiting - if a precomputed hash file is present, use it
        baitfiles = glob("%s/bait/*.gz" % pathopath) or glob("%s/bait/*.fa*" % pathopath)
        if not baitfiles:
            # Previously this case was swallowed without a trace; the strain is still skipped, but visibly
            print("No bait files found in %s/bait - skipping %s analysis of %s" % (pathopath, analysistype, strain))
            continue
        # Set the baittype variable as the genus, and the analysis type
        baittype = "%s_%s" % (genus, analysistype)
        # Get all the targets into a list
        # Even though the cat file will be used for the reference mapping rather than the individual target files,
        # the target names (taken from the name of the target files) are still used in the parsing of results
        # Remove faidx processed files and the cat file from the list. Only the file name is tested, so that a
        # directory containing ".fai" or "Concatenated" in its name does not empty the list
        targets = []
        for target in glob("%s/*.fa*" % pathopath):
            filename = os.path.basename(target)
            if ".fai" not in filename and "Concatenated" not in filename:
                targets.append(target)
        # Add the bait file, the cat file, and the list of targets to seqdict
        seqdict[strain]["bait"]["fastqFiles"][baittype] = baitfiles[0]
        seqdict[strain]["targets"][analysistype] = targets
        seqdict[strain]["concatenatedTargets"][analysistype] = catfile
        # Index the SMALT targets
        SMALTcombined.SMALTindexTargets(catfile, pathopath)
    # Bait!
    print("Filtering .fastq files with %s targets" % analysistype)
    baitrprocesses(analysistype)
    print('\nPerforming reference mapping')
    # Use SMALT to perform reference mapping
    SMALTcombined.SMALTmappingProcesses(seqdict, analysistype, "SMALT")
    print('\nSorting mapped %s files' % analysistype)
    # Use samtools to sort bam files
    bamProcessorCombined.sortingprocesses(seqdict, analysistype)
    print('\nIndexing sorted %s files' % analysistype)
    # Use samtools to index sorted bam files
    bamProcessorCombined.bamindexingprocesses(seqdict, analysistype)
    print('\nParsing %s results' % analysistype)
    # Use pysamstats to parse results
    pathomatches = bamPysamStatsCombined.bamParseProcesses(seqdict, analysistype, reportfolder)
    # Create a report
    pathoreportr(pathomatches, analysistype, organismdict, organismlist)