Esempio n. 1
0
def rmlst():
    """
    Performs the necessary analyses on strains using rMLST targets

    Locates the profile file, the precomputed bait file, and the allele files below targetpath + "rMLST", stores them
    in seqdict for every strain, and then runs the baiting, reference mapping, bam sorting/indexing/parsing, sequence
    typing, and reporting steps in that order.
    Reads the module-level names targetpath, seqdict, reportfolder, and path; seqdict is modified in place.
    :raises IndexError: if neither a .json nor a .txt profile file is present, or if there is no precomputed .gz
    bait file in the bait folder
    """
    # Set the analysis type, and the name of the seqdict entry that stores the profile file
    analysistype = "rMLST"
    profiletype = "rMLSTprofile"
    # Set the path of the analysistype data, as well as the allele and profile paths below it
    currenttargetpath = "%s%s" % (targetpath, analysistype)
    rmlstpath = currenttargetpath + "/alleles"
    rmlstprofilepath = currenttargetpath + "/profile"
    # Prefer the .json version of the profile file. If there is no .json file, use the .txt profile file instead.
    # Note that the .txt file must be edited (by removing any columns after the last gene)
    profilefiles = glob("%s/*.json" % rmlstprofilepath) or glob("%s/*.txt" % rmlstprofilepath)
    if not profilefiles:
        # Same exception type as the bare [0] lookup would give, but with a message that names the missing file
        raise IndexError("No .json or .txt %s profile file found in %s" % (analysistype, rmlstprofilepath))
    profilefile = profilefiles[0]
    # Because the combined database of alleles is so large (~143 MB), it is filtered with uSearch prior to baiting
    # this is not done automatically, so the set-up of the bait files is slightly different here
    # In order to save time, a precomputed hash file is used
    baitfiles = glob("%s/bait/*.gz" % rmlstpath)
    if not baitfiles:
        raise IndexError("No precomputed .gz %s bait file found in %s/bait" % (analysistype, rmlstpath))
    baitfile = baitfiles[0]
    baittargets(rmlstpath, analysistype)
    # Get the target files into a list, ignoring faidx index files and the concatenated allele file
    targets = [target for target in glob("%s/*.*fa" % rmlstpath)
               if ".fai" not in target and "Concatenated" not in target]
    # Add the bait file, the profile file, and a list of the targets to seqdict
    for strain in seqdict:
        seqdict[strain]["bait"]["fastqFiles"][analysistype] = baitfile
        seqdict[strain]["targets"][analysistype] = targets
        seqdict[strain][profiletype] = profilefile
    # Index the targets
    # NB: print is called with a single parenthesised argument, so the output is identical under Python 2
    print("\nIndexing %s targets" % analysistype)
    SMALT.smaltindextargetsprocesses(targets, rmlstpath)
    # Bait!
    print("Filtering .fastq files with %s targets" % analysistype)
    baitrprocesses(analysistype)
    # Get the MLST profile into a dictionary
    print("\nLoading %s profiles" % analysistype)
    profiledict, genedict = rawMLST.profilR(seqdict, profiletype)
    # Use SMALT to perform reference mapping
    print('\nPerforming %s reference mapping' % analysistype)
    SMALT.smaltmappingprocesses(seqdict, analysistype, "SMALT")
    # Use samtools to sort bam files
    print("\nSorting mapped %s files" % analysistype)
    bamProcessor.sortingprocesses(seqdict, analysistype)
    # Use samtools to index bam files
    print('\nIndexing sorted %s files' % analysistype)
    bamProcessor.bamindexingprocesses(seqdict, analysistype)
    # Use pysamstats to parse bam files
    print('\nParsing %s results' % analysistype)
    mlstmatches = bamPysamStats.bamparseprocesses(seqdict, analysistype)
    # Determine sequence types
    print('\nFinding multilocus sequence types')
    sequencetypes = rawMLST.sequenceTyper(mlstmatches, profiledict, genedict)
    # Create a report
    rawMLST.rMLSTreportMaker(seqdict, sequencetypes, analysistype, reportfolder, path)
Esempio n. 2
0
def mlst(organismdict, organismlist):
    """
    Performs the necessary analyses on strains using genus-specific MLST targets

    For every strain in organismdict, the allele, profile, and bait files of the matching scheme are located and
    stored in seqdict, and the targets are indexed. Strains for which these files cannot be found are reported and
    skipped. Baiting, reference mapping, bam sorting/indexing/parsing, sequence typing, and reporting are then run.
    Reads the module-level names targetpath, seqdict, reportfolder, and path; seqdict is modified in place.
    :param organismdict: dictionary of the 16S results, keyed by strain; the first key of each entry is used as the
    genus of that strain
    :param organismlist: list of all the genera in the current analysis; only passed on to the report maker
    """
    # Set the analysis type, and the name of the seqdict entry that stores the profile file
    analysistype = "MLST"
    profiletype = "MLSTprofile"
    # MLST targets are stored in targetpath/Organism/<genus>/MLST/alleles
    currenttargetpath = "%sOrganism" % targetpath
    # Print this out before the loop
    # NB: print is called with a single parenthesised argument, so the output is identical under Python 2
    print('\nIndexing %s target files' % analysistype)
    for strain in organismdict:
        # If there is not an MLST scheme installed for a particular organism, the [0] lookups below raise an
        # IndexError, as the necessary files are not present. These strains are skipped rather than crashing the
        # script. The extent of the try block is deliberately unchanged, so no new error can escape the loop
        try:
            # Using the organismdict entry (genus) generated in the 16S analysis, find the scheme folder once, and
            # derive the allele and profile paths from it
            # NB: This will come up multiple times with this script, but I only allowed a small amount of freedom in
            # the placement of folders. Usually, there are strict folder hierarchies, which must be followed
            genus = list(organismdict[strain].keys())[0]
            schemepath = glob("%s/%s/*MLST*" % (currenttargetpath, genus))[0]
            mlstpath = schemepath + "/alleles"
            profilepath = schemepath + "/profile"
            # Try to find the precomputed .json profile file. If it does not exist, find the .txt profile file. As
            # the script requires a small amount of formatting on the profile file prior to analysis, a change in
            # file extension was forced to hopefully ensure that this formatting has been performed
            profilefile = (glob("%s/*.json" % profilepath) or glob("%s/*.txt" % profilepath))[0]
            # Create the bait target file (if necessary)
            # NOTE(review): rmlst() passes its alleles folder to baittargets, while the Organism root folder is
            # passed here - confirm that this is intended
            baittargets(currenttargetpath, analysistype)
            # If a precomputed hash file is present in the bait folder, use it to save on processing time.
            # Otherwise, use the .fasta bait file created above
            baitfile = (glob("%s/bait/*.gz" % mlstpath) or glob("%s/bait/*.fa*" % mlstpath))[0]
            # Set the bait type variable using the genus of the strain and the analysis type
            baittype = "%s_MLST" % genus
            # Store the bait file in seqdict
            # NOTE(review): the bait is stored under "<genus>_MLST", but baitrprocesses below is called with "MLST" -
            # presumably baitrprocesses resolves the genus-specific key; verify
            seqdict[strain]["bait"]["fastqFiles"][baittype] = baitfile
            # Get the targets into a list, ignoring faidx index files
            targets = [target for target in glob("%s/*.*fa" % mlstpath) if ".fai" not in target]
            # Store the target list, and the profile file in seqdict
            seqdict[strain]["targets"][analysistype] = targets
            seqdict[strain][profiletype] = profilefile
            # Index the SMALT targets
            SMALT.smaltindextargetsprocesses(targets, mlstpath)
        except IndexError:
            # Report the skipped strain instead of silently ignoring it
            print("Skipping %s target set-up for %s: missing genus, scheme, profile, or bait files"
                  % (analysistype, strain))
    # Bait!
    print("Filtering .fastq files with %s targets" % analysistype)
    baitrprocesses(analysistype)
    # Get the MLST profile into a dictionary
    print("\nLoading %s profiles" % analysistype)
    profiledict, genedict = rawMLST.profilR(seqdict, profiletype)
    # Perform SMALT reference mapping
    print('\nPerforming %s reference mapping' % analysistype)
    SMALT.smaltmappingprocesses(seqdict, analysistype, "SMALT")
    # Use samtools to sort reference mapped bam files
    print("\nSorting mapped %s files" % analysistype)
    bamProcessor.sortingprocesses(seqdict, analysistype)
    # Use samtools to index sorted reference mapped bam files
    print('\nIndexing sorted %s files' % analysistype)
    bamProcessor.bamindexingprocesses(seqdict, analysistype)
    # Use pysamstats to parse indexed sorted reference mapped bam files
    print('\nParsing %s results' % analysistype)
    mlstmatches = bamPysamStats.bamparseprocesses(seqdict, analysistype)
    # Determine sequence types
    print('\nFinding multilocus sequence types')
    sequencetypes = rawMLST.sequenceTyper(mlstmatches, profiledict, genedict)
    # Create a report
    rawMLST.MLSTreportMaker(seqdict, sequencetypes, analysistype, reportfolder, organismdict, organismlist, path)