# Module-level imports assumed by these examples (not shown in the original
# excerpt); bamPysamStats is a local helper module assumed to be in scope
import json
import os
from collections import defaultdict
from multiprocessing import Pool


def bamparse(args):
    """Parses sorted bam files using pysamstats"""
    import pysamstats
    # Pool.map supplies the arguments as a single tuple; unpack it here, as
    # Python 3 no longer allows tuple unpacking in a function signature
    strain, target, bamfile, targetname = args
    parsedict = defaultdict(bamPysamStats.make_dict)
    # Use the stat_baseq_ext (extended base quality statistics) function of
    # pysamstats to return records parsed from sorted bam files
    for rec in pysamstats.stat_baseq_ext(alignmentfile=bamfile, fafile=target):
        # Values of interest can be retrieved using the appropriate keys.
        # Simple filter: if the number of matches at a position in the
        # reference sequence exceeds the number of mismatches, and the total
        # depth is 5 or more, record the depth and RMS base quality at that
        # position
        if rec['matches'] > rec['mismatches'] and rec['reads_all'] > 4:
            parsedict[strain][target][rec['chrom']][float(rec['pos'])][rec['reads_all']] = rec['rms_baseq']
    bamPysamStats.dotter()
    return parsedict
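
A note on the helpers: bamPysamStats is referenced throughout these examples but not shown. The sketch below is a plausible, hypothetical shape for the two helpers bamparse relies on, assuming make_dict builds a recursively autovivifying defaultdict and dotter prints a progress dot; the real implementations may differ.

# Hypothetical sketch of the bamPysamStats helpers assumed above
import sys
from collections import defaultdict

def make_dict():
    """Return an arbitrarily deep autovivifying dictionary, allowing
    parsedict[strain][target][chrom][pos][depth] to be assigned directly"""
    return defaultdict(make_dict)

def dotter():
    """Print a dot without a newline as a simple progress indicator"""
    sys.stdout.write('.')
    sys.stdout.flush()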
Example 3
    def parsearmi(self):
        """
        Multiprocessing for parsing bam files. Relies on self.seqdict, a
        dictionary of file and folder path/name information, and
        self.analysistype, a string naming the analysis type
        """
        from glob import glob
        # Initialise the dictionary, argument list, and multiprocessing Pool
        loadedresultsdict = defaultdict(bamPysamStats.make_dict)
        bamparseprocessesargs = []
        bamparseprocessespool = Pool()
        # Iterate through the strains
        for strain in self.seqdict:
            # Store the identity cutoff in seqdict
            self.seqdict[strain]["cutoff"][self.analysistype] = self.identitycutoff
            # Retrieve the bait type and determine the directory of the fastq
            # files from seqdict (dict views are not indexable in Python 3,
            # hence the conversion to a list)
            baittype = list(self.seqdict[strain]["bait"]["fastqFiles"])[0]
            fastqdir = os.path.split(self.seqdict[strain]["bait"]["fastqFiles"][baittype][0])[0]
            # Set the name of the JSON file used to store the results
            jsonprofile = "{}/{}_matchDict.json".format(fastqdir, self.analysistype)
            # If the JSON file hasn't been created yet, parse the bam files
            if not os.path.isfile(jsonprofile):
                bamfile = glob('{}/ARMIConcatenated/*_sorted.bam'.format(fastqdir))[0]
                for target in self.seqdict[strain]["targets"][self.analysistype]:
                    # Get the name of the target from the target file path
                    targetname = os.path.basename(target).split(".")[0]
                    # Append a tuple of the required arguments to the argument list
                    bamparseprocessesargs.append((strain, target, bamfile, targetname))
            # If the JSON file exists, read the results from it rather than
            # performing the parsing again
            else:
                with open(jsonprofile, "r") as jsonreport:
                    # Load the results from the JSON file into a dictionary
                    loadedresultsdict[strain].update(json.load(jsonreport))
                    bamPysamStats.dotter()
        # Run the multiprocessed bam parsing
        parselist = bamparseprocessespool.map(bamparse, bamparseprocessesargs)
        # Change the returned list of dictionaries into a nested dictionary
        self.parseddict = bamPysamStats.filler(parselist)
        # Merge in any results loaded from pre-existing JSON files, so that
        # strains parsed in a previous run are not discarded
        self.parseddict.update(loadedresultsdict)
        # Load the length of the targets using the .fai files generated during
        # bam parsing
        self.seqdict = bamPysamStats.targetlength(self.seqdict, self.analysistype)
        # Iterate through the strains in order to write the results to JSON files
        for strain in self.seqdict:
            # Recompute the bait type and fastq directory as above
            baittype = list(self.seqdict[strain]["bait"]["fastqFiles"])[0]
            fastqdir = os.path.split(self.seqdict[strain]["bait"]["fastqFiles"][baittype][0])[0]
            # Define the JSON profile file
            jsonprofile = "{}/{}_matchDict.json".format(fastqdir, self.analysistype)
            # If the file doesn't exist, create it and fill it with the results
            if not os.path.isfile(jsonprofile):
                with open(jsonprofile, "w") as jsonreport:
                    jsonreport.write(json.dumps(self.parseddict[strain],
                                                sort_keys=True,
                                                indent=4,
                                                separators=(',', ': ')))
        self.armiparser()
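
The filler helper used above is likewise not shown. A minimal, hypothetical sketch, assuming each element of the list returned by Pool.map is a {strain: {target: ...}} dictionary as produced by bamparse:

# Hypothetical sketch of bamPysamStats.filler; assumes one strain/target
# combination per dictionary, as produced by bamparse above
def filler(parselist):
    """Merge a list of per-process result dictionaries into one nested
    dictionary keyed by strain"""
    filled = defaultdict(make_dict)
    for parsedict in parselist:
        for strain, targets in parsedict.items():
            filled[strain].update(targets)
    return filled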