def bamparse(args):
    """
    Parse a sorted bam file using pysamstats and collect the base quality of well-supported positions

    The function takes a single tuple rather than separate arguments so that it can be handed directly to
    Pool.map(), which passes exactly one item from the argument list to each call.

    :param args: tuple of (strain, target, bamfile, targetname). bamfile is passed to pysamstats as the
        alignment file and target as the reference FASTA (fafile). targetname is accepted so existing callers
        keep working, but it is not used here.
    :return: dictionary laid out as parsedict[strain][target][chrom][float(pos)][reads_all] = rms_baseq
    """
    import pysamstats
    # Tuple parameters in a function signature were removed in Python 3 (PEP 3113), so unpack explicitly
    strain, target, bamfile, _targetname = args
    parsedict = defaultdict(bamPysamStats.make_dict)
    # Use the stat_baseq_ext (extended base quality statistics) function of pysamstats to return records parsed
    # from the sorted bam file
    for rec in pysamstats.stat_baseq_ext(alignmentfile=bamfile, fafile=target):
        # Simple filter: keep a position only if matches outnumber mismatches and the total depth is 5 or more
        if rec['matches'] > rec['mismatches'] and rec['reads_all'] > 4:
            # Positions are stored as floats, keyed further by read depth, with the RMS base quality as value
            parsedict[strain][target][rec['chrom']][float(rec['pos'])][rec['reads_all']] = rec['rms_baseq']
    # NOTE(review): progress indicator is emitted once per parsed file - confirm this matches the intended
    # placement relative to the loop above
    bamPysamStats.dotter()
    return parsedict
def bamparse(args):
    """
    Parse a sorted bam file using pysamstats and collect the base quality of well-supported positions

    The function takes a single tuple rather than separate arguments so that it can be handed directly to
    Pool.map(), which passes exactly one item from the argument list to each call.

    :param args: tuple of (strain, target, bamfile, targetname). bamfile is passed to pysamstats as the
        alignment file and target as the reference FASTA (fafile). targetname is accepted so existing callers
        keep working, but it is not used here.
    :return: dictionary laid out as parsedict[strain][target][chrom][float(pos)][reads_all] = rms_baseq
    """
    import pysamstats
    # Tuple parameters in a function signature were removed in Python 3 (PEP 3113), so unpack explicitly
    strain, target, bamfile, _targetname = args
    parsedict = defaultdict(bamPysamStats.make_dict)
    # Use the stat_baseq_ext (extended base quality statistics) function of pysamstats to return records parsed
    # from the sorted bam file
    for rec in pysamstats.stat_baseq_ext(alignmentfile=bamfile, fafile=target):
        # Simple filter: keep a position only if matches outnumber mismatches and the total depth is 5 or more
        if rec['matches'] > rec['mismatches'] and rec['reads_all'] > 4:
            # Positions are stored as floats, keyed further by read depth, with the RMS base quality as value
            parsedict[strain][target][rec['chrom']][float(rec['pos'])][rec['reads_all']] = rec['rms_baseq']
    # NOTE(review): progress indicator is emitted once per parsed file - confirm this matches the intended
    # placement relative to the loop above
    bamPysamStats.dotter()
    return parsedict
def parsearmi(self):
    """
    Parse the sorted bam files of every strain in parallel and cache the results as JSON

    For each strain in self.seqdict, the identity cutoff is recorded, and either the arguments for bamparse are
    queued (no cached <analysistype>_matchDict.json next to the fastq files) or the cached JSON is read. The
    queued jobs are run through a multiprocessing Pool, the results are stored in self.parseddict, target
    lengths are added to self.seqdict, a JSON cache is written for every strain that lacks one, and finally
    self.armiparser() is called.

    Reads self.seqdict, self.analysistype and self.identitycutoff; sets self.parseddict and self.seqdict.

    :raises IndexError: if a strain has no fastq files recorded, or no *_sorted.bam file is found in its
        ARMIConcatenated folder
    """
    from glob import glob

    def jsonpaths(strain):
        """Return the fastq directory of a strain and the path of its JSON results file"""
        fastqfiles = self.seqdict[strain]["bait"]["fastqFiles"]
        # list(...)[0] selects the first key on both Python 2 and 3 (dict.keys() is not indexable on Python 3)
        baittype = list(fastqfiles)[0]
        fastqdir = os.path.split(fastqfiles[baittype][0])[0]
        return fastqdir, "%s/%s_matchDict.json" % (fastqdir, self.analysistype)

    # Initialise the dictionary of cached results and the argument list
    loadedresultsdict = defaultdict(bamPysamStats.make_dict)
    bamparseprocessesargs = []
    # Iterate through the strains
    for strain in self.seqdict:
        # Store the identity cutoff in seqdict
        self.seqdict[strain]["cutoff"][self.analysistype] = self.identitycutoff
        fastqdir, jsonprofile = jsonpaths(strain)
        # If the JSON file hasn't been created, queue the bam file for parsing
        if not os.path.isfile(jsonprofile):
            bamfile = glob('{}/ARMIConcatenated/*_sorted.bam'.format(fastqdir))[0]
            for target in self.seqdict[strain]["targets"][self.analysistype]:
                # Get the name of the target from the target variable
                targetname = os.path.basename(target).split(".")[0]
                # Append a tuple of the required arguments to the argument list
                bamparseprocessesargs.append((strain, target, bamfile, targetname))
        # If the JSON file exists, read the results from it rather than performing the parsing again
        else:
            with open(jsonprofile, "r") as jsonreport:
                loadedresultsdict[strain].update(json.load(jsonreport))
    # NOTE(review): loadedresultsdict is populated above but never merged into self.parseddict, so strains with
    # a cached JSON file are absent from the parsed results - confirm whether that is intended
    # NOTE(review): progress indicator assumed to be emitted once, after all strains are processed - confirm
    bamPysamStats.dotter()
    # Run the multiprocessed bam parsing; always release the worker processes, even if a job fails
    bamparseprocessespool = Pool()
    try:
        parselist = bamparseprocessespool.map(bamparse, bamparseprocessesargs)
    finally:
        bamparseprocessespool.close()
        bamparseprocessespool.join()
    # Change the returned list of dictionaries into a nested dictionary
    self.parseddict = bamPysamStats.filler(parselist)
    # Load the length of the targets
    self.seqdict = bamPysamStats.targetlength(self.seqdict, self.analysistype)
    # Iterate through the strains in order to write the results to a JSON file
    for strain in self.seqdict:
        _, jsonprofile = jsonpaths(strain)
        # If the file doesn't exist, create it, and fill it with results
        if not os.path.isfile(jsonprofile):
            # Serialise before opening the file so a failure cannot leave an empty cache file behind
            output = json.dumps(self.parseddict[strain], sort_keys=True, indent=4, separators=(',', ': '))
            # Text mode: json.dumps returns str, which cannot be written to a binary handle on Python 3
            with open(jsonprofile, "w") as jsonreport:
                jsonreport.write(output)
    self.armiparser()
def parsearmi(self):
    """
    Parse the sorted bam files of every strain in parallel and cache the results as JSON

    For each strain in self.seqdict, the identity cutoff is recorded, and either the arguments for bamparse are
    queued (no cached <analysistype>_matchDict.json next to the fastq files) or the cached JSON is read. The
    queued jobs are run through a multiprocessing Pool, the results are stored in self.parseddict, target
    lengths are added to self.seqdict, a JSON cache is written for every strain that lacks one, and finally
    self.armiparser() is called.

    Reads self.seqdict, self.analysistype and self.identitycutoff; sets self.parseddict and self.seqdict.

    :raises IndexError: if a strain has no fastq files recorded, or no *_sorted.bam file is found in its
        ARMIConcatenated folder
    """
    from glob import glob

    def jsonpaths(strain):
        """Return the fastq directory of a strain and the path of its JSON results file"""
        fastqfiles = self.seqdict[strain]["bait"]["fastqFiles"]
        # list(...)[0] selects the first key on both Python 2 and 3 (dict.keys() is not indexable on Python 3)
        baittype = list(fastqfiles)[0]
        fastqdir = os.path.split(fastqfiles[baittype][0])[0]
        return fastqdir, "%s/%s_matchDict.json" % (fastqdir, self.analysistype)

    # Initialise the dictionary of cached results and the argument list
    loadedresultsdict = defaultdict(bamPysamStats.make_dict)
    bamparseprocessesargs = []
    # Iterate through the strains
    for strain in self.seqdict:
        # Store the identity cutoff in seqdict
        self.seqdict[strain]["cutoff"][self.analysistype] = self.identitycutoff
        fastqdir, jsonprofile = jsonpaths(strain)
        # If the JSON file hasn't been created, queue the bam file for parsing
        if not os.path.isfile(jsonprofile):
            bamfile = glob('{}/ARMIConcatenated/*_sorted.bam'.format(fastqdir))[0]
            for target in self.seqdict[strain]["targets"][self.analysistype]:
                # Get the name of the target from the target variable
                targetname = os.path.basename(target).split(".")[0]
                # Append a tuple of the required arguments to the argument list
                bamparseprocessesargs.append((strain, target, bamfile, targetname))
        # If the JSON file exists, read the results from it rather than performing the parsing again
        else:
            with open(jsonprofile, "r") as jsonreport:
                loadedresultsdict[strain].update(json.load(jsonreport))
    # NOTE(review): loadedresultsdict is populated above but never merged into self.parseddict, so strains with
    # a cached JSON file are absent from the parsed results - confirm whether that is intended
    # NOTE(review): progress indicator assumed to be emitted once, after all strains are processed - confirm
    bamPysamStats.dotter()
    # Run the multiprocessed bam parsing; always release the worker processes, even if a job fails
    bamparseprocessespool = Pool()
    try:
        parselist = bamparseprocessespool.map(bamparse, bamparseprocessesargs)
    finally:
        bamparseprocessespool.close()
        bamparseprocessespool.join()
    # Change the returned list of dictionaries into a nested dictionary
    self.parseddict = bamPysamStats.filler(parselist)
    # Load the length of the targets
    self.seqdict = bamPysamStats.targetlength(self.seqdict, self.analysistype)
    # Iterate through the strains in order to write the results to a JSON file
    for strain in self.seqdict:
        _, jsonprofile = jsonpaths(strain)
        # If the file doesn't exist, create it, and fill it with results
        if not os.path.isfile(jsonprofile):
            # Serialise before opening the file so a failure cannot leave an empty cache file behind
            output = json.dumps(self.parseddict[strain], sort_keys=True, indent=4, separators=(',', ': '))
            # Text mode: json.dumps returns str, which cannot be written to a binary handle on Python 3
            with open(jsonprofile, "w") as jsonreport:
                jsonreport.write(output)
    self.armiparser()