def rmlst():
    """
    Run the rMLST typing workflow on every strain in seqdict.

    The profile, bait and allele target files are located beneath "<targetpath>rMLST" and recorded in seqdict for
    each strain. The targets are then indexed, the .fastq files are baited, and the reads are reference mapped,
    sorted, indexed and parsed before sequence types are determined and the rMLST report is written.
    Relies on the module-level targetpath, seqdict, reportfolder and path variables.
    :raises IndexError: if no .json/.txt profile file, or no precomputed .gz bait file, can be found
    """
    global targetpath, seqdict
    # Analysis name, and the seqdict key under which the profile file is stored
    analysistype = "rMLST"
    profiletype = "rMLSTprofile"
    # targetpath is joined directly to the analysis name; alleles and profile are sub-folders of the result
    rmlstroot = "%s%s" % (targetpath, analysistype)
    rmlstpath = rmlstroot + "/alleles"
    rmlstprofilepath = rmlstroot + "/profile"
    # Prefer the .json version of the profile; otherwise fall back to the .txt profile. Note that the .txt file
    # must be edited (by removing any columns after the last gene)
    jsonprofiles = glob("%s/*.json" % rmlstprofilepath)
    profilefile = jsonprofiles[0] if jsonprofiles else glob("%s/*.txt" % rmlstprofilepath)[0]
    # The combined allele database is large (~143 MB), so it is filtered with uSearch prior to baiting. This is
    # not done automatically, which is why a precomputed hash file is picked up here before baittargets is called
    baitfile = glob("%s/bait/*.gz" % rmlstpath)[0]
    baittargets(rmlstpath, analysistype)
    # Collect the allele files, skipping faidx index files and the concatenated target file
    targets = [allelefile for allelefile in glob("%s/*.*fa" % rmlstpath)
               if not (".fai" in allelefile or "Concatenated" in allelefile)]
    # Record the bait file, the target list and the profile file for every strain
    for strain in seqdict:
        straindata = seqdict[strain]
        straindata["bait"]["fastqFiles"][analysistype] = baitfile
        straindata["targets"][analysistype] = targets
        straindata[profiletype] = profilefile
    # Index the targets
    print("\nIndexing %s targets" % analysistype)
    SMALT.smaltindextargetsprocesses(targets, rmlstpath)
    # Bait the .fastq files with the targets
    print("Filtering .fastq files with %s targets" % analysistype)
    baitrprocesses(analysistype)
    # Load the profiles into dictionaries
    print("\nLoading %s profiles" % analysistype)
    profiledict, genedict = rawMLST.profilR(seqdict, profiletype)
    # Reference mapping with SMALT
    print('\nPerforming %s reference mapping' % analysistype)
    SMALT.smaltmappingprocesses(seqdict, analysistype, "SMALT")
    # Sort the mapped bam files
    print("\nSorting mapped %s files" % analysistype)
    bamProcessor.sortingprocesses(seqdict, analysistype)
    # Index the sorted bam files
    print('\nIndexing sorted %s files' % analysistype)
    bamProcessor.bamindexingprocesses(seqdict, analysistype)
    # Parse the sorted, indexed bam files
    print('\nParsing %s results' % analysistype)
    mlstmatches = bamPysamStats.bamparseprocesses(seqdict, analysistype)
    # Work out the sequence type of each strain from the parsed matches and the profiles
    print('\nFinding multilocus sequence types')
    sequencetypes = rawMLST.sequenceTyper(mlstmatches, profiledict, genedict)
    # Write the report
    rawMLST.rMLSTreportMaker(seqdict, sequencetypes, analysistype, reportfolder, path)
def sixteens():
    """
    Run the 16S workflow on every strain in seqdict and return the computed genera.

    The bait file and the 16S target database are located beneath "<targetpath>16S" and recorded in seqdict for
    each strain. The .fastq files are then baited, the targets are indexed, and the reads are reference mapped,
    sorted, indexed and parsed before the 16S report is created.
    Relies on the module-level targetpath and seqdict variables.
    :return: the (generadict, generalist) pair produced by bamPysamStats.bamparseprocesses
    :raises IndexError: if neither a precomputed .gz hash file nor a .fa* bait file is present in the bait folder
    """
    global targetpath, seqdict
    # The analysis type is the key used to store and retrieve 16S-specific data in seqdict
    analysistype = "16S"
    # targetpath is joined directly to the analysis name
    currenttargetpath = "%s%s" % (targetpath, analysistype)
    # Create the bait target file (if necessary)
    baittargets(currenttargetpath, analysistype)
    # A precomputed hash file saves time during baiting, so it is preferred over the concatenated bait file
    hashfiles = glob("%s/bait/*.gz" % currenttargetpath)
    baitfile = hashfiles[0] if hashfiles else glob("%s/bait/*.fa*" % currenttargetpath)[0]
    print("Filtering .fastq files with %s targets" % analysistype)
    # Gather the target database, discarding faidx index (.fai) files matched by the same pattern
    sixteensdatabase = [dbfile for dbfile in glob("%s/*.fa*" % currenttargetpath) if ".fai" not in dbfile]
    # Record the bait file and the target database for every strain
    for foldername in seqdict:
        straindata = seqdict[foldername]
        straindata["bait"]["fastqFiles"][analysistype] = baitfile
        straindata["targets"][analysistype] = sixteensdatabase
    # Run the baiting process
    baitrprocesses(analysistype)
    # Index the targets with SMALT
    print("\nIndexing %s targets" % analysistype)
    SMALT.smaltindextargetsprocesses(sixteensdatabase, currenttargetpath)
    # Reference mapping with SMALT
    print('\nPerforming %s reference mapping' % analysistype)
    SMALT.smaltmappingprocesses(seqdict, analysistype, "SMALT")
    # Sort the mapped bam files
    print("\nSorting mapped %s files" % analysistype)
    bamProcessor.sortingprocesses(seqdict, analysistype)
    # Index the sorted bam files
    print('\nIndexing sorted %s files' % analysistype)
    bamProcessor.bamindexingprocesses(seqdict, analysistype)
    # Parse the sorted, indexed bam files
    print('\nParsing %s results' % analysistype)
    generadict, generalist = bamPysamStats.bamparseprocesses(seqdict, analysistype)
    # Create reports of the results
    sixteensreportmaker(generadict, analysistype)
    # Hand the computed genera back to the caller
    return generadict, generalist
def mlst(organismdict, organismlist):
    """
    Run genus-specific MLST typing on the strains in organismdict.

    For each strain, the MLST scheme folder of its genus is located beneath "<targetpath>Organism/<genus>", and the
    bait file, allele targets and profile file are recorded in seqdict before the targets are indexed. Strains for
    which a required file cannot be found are skipped. Afterwards the .fastq files are baited, and the reads are
    reference mapped, sorted, indexed and parsed before sequence types are determined and the MLST report is written.
    Relies on the module-level targetpath, seqdict, reportfolder and path variables.
    :param organismdict: dictionary of the 16S results; the first key of each strain's entry is used as the genus
    :param organismlist: list of all the genera in the current analysis (passed on to the report maker)
    """
    global targetpath, seqdict
    # Analysis name, and the seqdict key under which the profile file is stored
    analysistype = "MLST"
    profiletype = "MLSTprofile"
    # MLST targets are stored in targetpath/Organism/<genus>/MLST/alleles
    currenttargetpath = "%sOrganism" % targetpath
    # Print this out before the loop
    print('\nIndexing %s target files' % analysistype)
    for strain in organismdict:
        # If there is no MLST scheme installed for a particular organism, the file look-ups below index into an
        # empty list. Such index errors are deliberately allowed to pass, so the strain is simply skipped
        try:
            # Genus from the 16S analysis; list(...)[0] raises IndexError on an empty entry, like .keys()[0]
            genus = list(organismdict[strain])[0]
            # Locate the scheme folder once; the allele and profile folders follow a strict hierarchy beneath it
            schemepath = glob("%s/%s/*MLST*" % (currenttargetpath, genus))[0]
            mlstpath = schemepath + "/alleles"
            profilepath = schemepath + "/profile"
            # Prefer the precomputed .json profile file; otherwise use the .txt profile file. The .txt extension
            # is required in the hope that it ensures the necessary manual formatting has been performed
            profilefiles = glob("%s/*.json" % profilepath)
            profilefile = profilefiles[0] if profilefiles else glob("%s/*.txt" % profilepath)[0]
            # Create the bait target file (if necessary)
            # NOTE(review): the bait files are looked up beneath mlstpath below, while baittargets is given the
            # Organism root folder - confirm against baittargets that this is the intended path
            baittargets(currenttargetpath, analysistype)
            # A precomputed hash file in the bait folder saves processing time; otherwise use the .fa* bait file
            hashfiles = glob("%s/bait/*.gz" % mlstpath)
            baitfile = hashfiles[0] if hashfiles else glob("%s/bait/*.fa*" % mlstpath)[0]
            # The bait file is stored under a genus-specific key
            baittype = "%s_MLST" % genus
            seqdict[strain]["bait"]["fastqFiles"][baittype] = baitfile
            # Get the targets into a list, leaving out faidx processed files
            targets = [target for target in glob("%s/*.*fa" % mlstpath) if ".fai" not in target]
            # Store the target list and the profile file in seqdict
            seqdict[strain]["targets"][analysistype] = targets
            seqdict[strain][profiletype] = profilefile
            # Index the SMALT targets
            SMALT.smaltindextargetsprocesses(targets, mlstpath)
        except IndexError:
            # Best effort: nothing further is set up for this strain
            continue
    # Bait the .fastq files with the targets
    print("Filtering .fastq files with %s targets" % analysistype)
    baitrprocesses(analysistype)
    # Load the profiles into dictionaries
    print("\nLoading %s profiles" % analysistype)
    profiledict, genedict = rawMLST.profilR(seqdict, profiletype)
    # Reference mapping with SMALT
    print('\nPerforming %s reference mapping' % analysistype)
    SMALT.smaltmappingprocesses(seqdict, analysistype, "SMALT")
    # Sort the reference mapped bam files
    print("\nSorting mapped %s files" % analysistype)
    bamProcessor.sortingprocesses(seqdict, analysistype)
    # Index the sorted bam files
    print('\nIndexing sorted %s files' % analysistype)
    bamProcessor.bamindexingprocesses(seqdict, analysistype)
    # Parse the sorted, indexed bam files
    print('\nParsing %s results' % analysistype)
    mlstmatches = bamPysamStats.bamparseprocesses(seqdict, analysistype)
    # Work out the sequence type of each strain from the parsed matches and the profiles
    print('\nFinding multilocus sequence types')
    sequencetypes = rawMLST.sequenceTyper(mlstmatches, profiledict, genedict)
    # Write the report
    rawMLST.MLSTreportMaker(seqdict, sequencetypes, analysistype, reportfolder, organismdict, organismlist, path)