예제 #1
0
def armi():
    """
    Performs the necessary analyses on strains using armi targets.

    Prepares the bait and concatenated target files in the armi target folder, records the bait file,
    the individual target files and the concatenated target file for every entry in the global seqdict,
    and then runs, in order: baiting, SMALT target indexing, SMALT reference mapping, bam sorting,
    bam indexing and result parsing.

    Reads the module-level variables targetpath, seqdict and reportfolder; the entries of seqdict are
    updated in place.
    :raises IndexError: if the bait folder contains neither a .gz nor a .fa* bait file
    """
    global targetpath, seqdict
    # Set the analysis type
    analysistype = "armi"
    # Set the path of the target files. The two pieces are joined without a separator, so targetpath
    # presumably already ends with one - verify against the caller
    currenttargetpath = "%s%s" % (targetpath, analysistype)
    # Create the bait files as necessary
    baittargets(currenttargetpath, analysistype)
    # The cat file is all the target files concatenated together
    catfile = "%s/%sConcatenated.fasta" % (currenttargetpath, analysistype)
    # If the cat file doesn't exist, create it from the bait file generated in baittargets()
    if not os.path.isfile(catfile):
        shutil.copyfile("%s/bait/%sBait.fa" % (currenttargetpath, analysistype), catfile)
    # In order to save time, a precomputed hash file (.gz) is used if it exists; the .fasta bait file is
    # only searched for when no hash file is present
    baitcandidates = glob("%s/bait/*.gz" % currenttargetpath) or glob("%s/bait/*.fa*" % currenttargetpath)
    if not baitcandidates:
        # Same exception type as indexing an empty list, but with a message that explains what is missing
        raise IndexError("No %s bait files found in %s/bait" % (analysistype, currenttargetpath))
    baitfile = baitcandidates[0]
    # Get the full target database into a variable, filtering out faidx processed targets and the cat file.
    # Only the file name is inspected, so that a folder name containing ".fai" or "Concatenated" cannot
    # cause every target to be discarded
    excluded = (".fai", "Concatenated")
    armidatabase = [target for target in glob("%s/*.fa*" % currenttargetpath)
                    if not any(tag in os.path.basename(target) for tag in excluded)]
    # Add necessary variables to seqdict
    for folderName in seqdict:
        seqdict[folderName]["bait"]["fastqFiles"][analysistype] = baitfile
        seqdict[folderName]["targets"][analysistype] = armidatabase
        seqdict[folderName]["concatenatedTargets"][analysistype] = catfile
    # Single-argument print calls produce identical output whether print is a statement or a function
    print("Filtering .fastq files with %s targets" % analysistype)
    # Run the baiting process
    baitrprocesses(analysistype)
    print("\nIndexing %s targets" % analysistype)
    # Index the combined target file
    SMALTcombined.SMALTindexTargets(catfile, currenttargetpath)
    print('\nPerforming %s reference mapping' % analysistype)
    # Use SMALT to perform reference mapping of the combined target file
    SMALTcombined.SMALTmappingProcesses(seqdict, analysistype, "SMALT")
    print("\nSorting mapped %s files" % analysistype)
    # Use samtools to sort the bam file
    bamProcessorCombined.sortingprocesses(seqdict, analysistype)
    print('\nIndexing sorted %s files' % analysistype)
    # Use samtools to index the sorted bam file
    bamProcessorCombined.bamindexingprocesses(seqdict, analysistype)
    print('\nParsing %s results' % analysistype)
    # Use pysamstats to parse the bam files. Mike's armi module is called from within bamPysamStatsCombined
    bamPysamStatsCombined.bamParseProcesses(seqdict, analysistype, reportfolder)
예제 #2
0
def pathotyper(organismdict, organismlist, analysistype):
    """
    Performs the necessary analyses on strains using genus-specific pathotype/serotype targets.

    For every strain, the target folder of its genus is located, the bait and concatenated target files
    are prepared, the files are recorded in the global seqdict and the concatenated targets are indexed.
    Strains whose genus has no target folder for this analysis type, or whose bait folder holds no bait
    file, are skipped. Afterwards baiting, reference mapping, bam sorting/indexing and result parsing are
    run once, and a report is created from the parsed results.

    Reads the module-level variables targetpath, seqdict and reportfolder; the entries of seqdict are
    updated in place.
    :param organismdict: dictionary of the 16S results; the first key of each strain's entry is used as
    the genus
    :param organismlist: list of all the genera in the current analysis (only passed on to pathoreportr)
    :param analysistype: string of the current analysis type
    """
    global targetpath, seqdict
    # Targets are stored in targetpath/Organism/<genus>/<analysistype>/
    currenttargetpath = "%sOrganism" % targetpath
    # File name fragments that mark faidx processed files and the concatenated target file
    excluded = (".fai", "Concatenated")
    # Print this prior to the loop
    print('\nIndexing %s target files' % analysistype)
    # As strains will be processed differently based on their genus, they must be processed separately
    for strain in organismdict:
        # The genus is the first key of the strain's 16S results. list() keeps the lookup valid when
        # keys() returns a view rather than a list
        genera = list(organismdict[strain].keys())
        if not genera:
            continue
        genus = genera[0]
        # Set the target path - genera without pathotyping schemes have no such folder and are skipped.
        # Explicit checks are used instead of a blanket try/except IndexError, so that an IndexError
        # raised inside one of the helper functions is no longer silently discarded
        pathopaths = glob("%s/%s/%s" % (currenttargetpath, genus, analysistype))
        if not pathopaths:
            continue
        pathopath = pathopaths[0]
        # Create the bait targets if necessary
        baittargets(pathopath, analysistype)
        # The cat file will be used in the "combined" reference mapping
        catfile = "%s/%sConcatenated.fasta" % (pathopath, analysistype)
        # If the cat file does not exist, copy the bait file from the bait folder to the target folder
        if not os.path.isfile(catfile):
            shutil.copyfile("%s/bait/%sBait.fa" % (pathopath, analysistype), catfile)
        # Find the files to be used in baiting - if a precomputed hash file is present, use it; the .fasta
        # bait file is only searched for when no hash file is present
        baitcandidates = glob("%s/bait/*.gz" % pathopath) or glob("%s/bait/*.fa*" % pathopath)
        if not baitcandidates:
            # Without a bait file the strain cannot be processed for this analysis - skip it
            continue
        baitfile = baitcandidates[0]
        # Set the baittype variable as the genus, and the analysis type
        baittype = "%s_%s" % (genus, analysistype)
        # Get all the targets into a list
        # Even though the cat file will be used for the reference mapping rather than the individual target files,
        # the target names (taken from the name of the target files) are still used in the parsing of results.
        # Only the file name is inspected when removing faidx processed files and the cat file, so that a
        # folder name containing ".fai" or "Concatenated" cannot cause every target to be discarded
        targets = [target for target in glob("%s/*.fa*" % pathopath)
                   if not any(tag in os.path.basename(target) for tag in excluded)]
        # Add the bait file, the cat file, and the list of targets to seqdict
        seqdict[strain]["bait"]["fastqFiles"][baittype] = baitfile
        seqdict[strain]["targets"][analysistype] = targets
        seqdict[strain]["concatenatedTargets"][analysistype] = catfile
        # Index the SMALT targets
        SMALTcombined.SMALTindexTargets(catfile, pathopath)
    # Bait!
    # Single-argument print calls produce identical output whether print is a statement or a function
    print("Filtering .fastq files with %s targets" % analysistype)
    baitrprocesses(analysistype)
    print('\nPerforming reference mapping')
    # Use SMALT to perform reference mapping
    SMALTcombined.SMALTmappingProcesses(seqdict, analysistype, "SMALT")
    print('\nSorting mapped %s files' % analysistype)
    # Use samtools to sort bam files
    bamProcessorCombined.sortingprocesses(seqdict, analysistype)
    print('\nIndexing sorted %s files' % analysistype)
    # Use samtools to index sorted bam files
    bamProcessorCombined.bamindexingprocesses(seqdict, analysistype)
    print('\nParsing %s results' % analysistype)
    # Use pysamstats to parse results
    pathomatches = bamPysamStatsCombined.bamParseProcesses(seqdict, analysistype, reportfolder)
    # Create a report
    pathoreportr(pathomatches, analysistype, organismdict, organismlist)