def _mainLoop(self):
    """Copy the alignment summary GFF, adding modification counts to regions.

    Reads every ``modified_base`` record from ``self.args.modifications``,
    then streams ``self.args.alignmentSummary`` into ``self.args.outfile``:

    * lines starting with ``#`` are written through (stripped), and
      ``sequence-header`` lines are also recorded in ``self.seqMap`` as
      ``internal tag -> external tag``;
    * the tool-specific headers are written once, immediately before the
      first non-header line;
    * each ``region`` record gets ``modsfwd`` / ``modsrev`` attributes holding
      the number of hits whose position falls inside the region on the
      ``+`` / ``-`` strand, and is then written out.

    All file handles are closed when the method finishes, including on error.
    """
    # Additional headers to be injected after the existing header block.
    headers = [
        ('source', 'kineticModificationCaller 1.3.1'),
        ('source-commandline', " ".join(sys.argv)),
        ('attribute-description',
         'modsfwd - count of detected DNA modifications on forward strand'),
        ('attribute-description',
         'modsrev - count of detected DNA modifications on reverse strand'),
    ]

    # Collect the modification calls up front, then release the reader.
    modReader = GffReader(self.args.modifications)
    try:
        hits = [{"pos": x.start, "strand": x.strand}
                for x in modReader if x.type == 'modified_base']
    finally:
        modReader.close()

    self.seqMap = {}
    inHeader = True

    # open() replaces the Python 2-only file() builtin; the with-block
    # guarantees the output is flushed and both handles are closed.
    with open(self.args.alignmentSummary) as summaryFile, \
            open(self.args.outfile, "w") as summaryWriter:
        for line in summaryFile:
            # Pass any metadata line straight through.
            if line.startswith("#"):
                tag, _, rest = line.replace('#', '').partition(' ')
                if tag == 'sequence-header':
                    internalTag, _, externalTag = rest.strip().partition(' ')
                    self.seqMap[internalTag] = externalTag
                summaryWriter.write(line.strip() + "\n")
                continue

            if inHeader:
                # End of the header -- write the tool-specific headers.
                for header in headers:
                    summaryWriter.write(("##%s %s" % header) + "\n")
                inHeader = False

            rec = Gff3Record.fromString(line)

            if rec.type == 'region':
                # NOTE(review): hits are matched to regions by position only,
                # so a hit is counted for every region whose coordinates
                # contain it, whatever sequence it came from -- confirm
                # whether seqid should be compared as well.
                intervalHits = [h for h in hits
                                if rec.start <= h['pos'] <= rec.end]
                rec.modsfwd = sum(1 for h in intervalHits
                                  if h['strand'] == '+')
                rec.modsrev = sum(1 for h in intervalHits
                                  if h['strand'] == '-')

                summaryWriter.write(str(rec) + "\n")
def test_fromString(self):
    """A record rebuilt from its own string form must print identically."""
    roundTripped = Gff3Record.fromString(str(self.RECORD))
    expected, actual = str(self.RECORD), str(roundTripped)
    assert expected == actual
def test_fromString(self):
    """A record rebuilt from its own string form must print identically."""
    roundTripped = Gff3Record.fromString(str(self.record))
    expected, actual = str(self.record), str(roundTripped)
    assert_equal(expected, actual)
def main():
    """Augment an alignment_summary.gff with consensus and variant counts.

    Command line: ``--variantsGff FILE --output FILE inputAlignmentSummaryGff``.

    Every record of the input alignment summary gets a summary entry keyed by
    ``Region(seqid, start, end)``. Each variant adds
    ``max(len(reference), len(variantSeq))`` to the ``ins`` / ``del`` / ``sub``
    counter of every region on the same sequence whose span contains the
    variant start. The input summary is then copied to the output: ``#`` lines
    pass through, the tool headers are written before the first record line,
    and ``region`` records are written with ``cQv`` and the counters appended.

    Blank lines in the input are skipped rather than raising ``IndexError``,
    and all readers and files are closed on exit.
    """
    headers = [
        ("source", "GenomicConsensus %s" % __VERSION__),
        ("pacbio-alignment-summary-version", "0.6"),
        ("source-commandline", " ".join(sys.argv)),
    ]

    desc = "Augment the alignment_summary.gff file with consensus and variants information."
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("--variantsGff",
                        type=str,
                        help="Input variants.gff or variants.gff.gz filename",
                        required=True)
    # The output path is opened unconditionally below, so omitting it could
    # only ever fail; let argparse report that cleanly instead.
    parser.add_argument("--output", "-o",
                        type=str,
                        help="Output alignment_summary.gff filename",
                        required=True)
    parser.add_argument("inputAlignmentSummaryGff",
                        type=str,
                        help="Input alignment_summary.gff filename")
    options = parser.parse_args()

    counterNames = {
        "insertion": "ins",
        "deletion": "del",
        "substitution": "sub",
    }

    summaries = {}
    inputVariantsGff = GffReader(options.variantsGff)
    try:
        inputAlignmentSummaryGff = GffReader(options.inputAlignmentSummaryGff)
        try:
            for gffRecord in inputAlignmentSummaryGff:
                region = Region(gffRecord.seqid, gffRecord.start, gffRecord.end)
                summaries[region] = {"ins": 0, "del": 0, "sub": 0,
                                     "cQv": (0, 0, 0)}
        finally:
            inputAlignmentSummaryGff.close()

        for variantGffRecord in inputVariantsGff:
            for region, summary in summaries.items():
                if (region.seqid == variantGffRecord.seqid and
                        region.start <= variantGffRecord.start <= region.end):
                    counterName = counterNames[variantGffRecord.type]
                    variantLength = max(len(variantGffRecord.reference),
                                        len(variantGffRecord.variantSeq))
                    summary[counterName] += variantLength
                # TODO: base consensusQV on effective coverage
                summary["cQv"] = (20, 20, 20)
    finally:
        inputVariantsGff.close()

    inHeader = True
    with open(options.inputAlignmentSummaryGff) as summaryIn, \
            open(options.output, "w") as summaryOut:
        for line in summaryIn:
            line = line.rstrip()

            # A blank line has no first character to test and is not a record.
            if not line:
                continue

            # Pass any metadata line straight through.
            if line.startswith("#"):
                summaryOut.write(line.strip() + "\n")
                continue

            if inHeader:
                # End of the header -- write the tool-specific headers.
                for k, v in headers:
                    summaryOut.write(("##%s %s" % (k, v)) + "\n")
                inHeader = False

            rec = Gff3Record.fromString(line)

            if rec.type == "region":
                # Look up with the same key type used when filling summaries.
                summary = summaries[Region(rec.seqid, rec.start, rec.end)]
                if "cQv" in summary:
                    line += ";%s=%s" % (
                        "cQv", ",".join(str(int(f)) for f in summary["cQv"]))
                for counterName in counterNames.values():
                    if counterName in summary:
                        line += ";%s=%d" % (counterName, summary[counterName])
                summaryOut.write(line + "\n")
def _mainLoop(self):
    """Copy the alignment summary GFF, adding per-event modification counts.

    Reads from ``self.modifications`` every record whose type is listed in
    ``self.knownModificationEvents``, then streams ``self.alignmentSummary``
    into ``self.outfile``:

    * lines starting with ``#`` are written through (stripped), and
      ``sequence-header`` lines are also recorded in ``self.seqMap`` as
      ``internal tag -> external tag``;
    * the tool-specific headers are written once, immediately before the
      first non-header line;
    * each ``region`` record gets ``modsfwd`` / ``modsrev`` attributes: the
      comma-joined counts, in ``self.knownModificationEvents`` order, returned
      by ``self.countModificationTypes`` for the hits on the same seqid that
      fall inside the region on the ``+`` / ``-`` strand.

    Returns:
        0 on completion.
    """
    headerString = ",".join(['"' + x + '"' for x in self.knownModificationEvents])

    # Additional headers to be injected after the existing header block.
    headers = [
        ("source", "kineticModificationCaller 1.3.3"),
        ("source-commandline", " ".join(sys.argv)),
        (
            "attribute-description",
            "modsfwd - count of detected DNA modifications on forward strand by modification event type",
        ),
        (
            "attribute-description",
            "modsrev - count of detected DNA modifications on reverse strand by modification event type",
        ),
        ("region-modsfwd", headerString),
        # Previously a second "region-modsfwd"; the reverse-strand attribute
        # needs its own column description.
        ("region-modsrev", headerString),
    ]

    # Collect the modification calls up front, then release the reader.
    modReader = GffReader(self.modifications)
    try:
        hits = [
            {"pos": x.start, "strand": x.strand, "seqid": x.seqid, "type": x.type}
            for x in modReader
            if x.type in self.knownModificationEvents
        ]
    finally:
        modReader.close()

    self.seqMap = {}
    inHeader = True

    # open() replaces the Python 2-only file() builtin; the with-block
    # guarantees the output is flushed and both handles are closed.
    with open(self.alignmentSummary) as summaryFile, \
            open(self.outfile, "w") as summaryWriter:
        for line in summaryFile:
            # Pass any metadata line straight through.
            if line.startswith("#"):
                tag, _, rest = line.replace("#", "").partition(" ")
                if tag == "sequence-header":
                    internalTag, _, externalTag = rest.strip().partition(" ")
                    self.seqMap[internalTag] = externalTag
                summaryWriter.write(line.strip() + "\n")
                continue

            if inHeader:
                # End of the header -- write the tool-specific headers.
                for header in headers:
                    summaryWriter.write(("##%s %s" % header) + "\n")
                inHeader = False

            rec = Gff3Record.fromString(line)

            if rec.type == "region":
                # Hits inside this interval on the same sequence.
                intervalHits = [
                    h for h in hits
                    if rec.start <= h["pos"] <= rec.end and rec.seqid == h["seqid"]
                ]
                cFwd = self.countModificationTypes(
                    [h for h in intervalHits if h["strand"] == "+"])
                cRev = self.countModificationTypes(
                    [h for h in intervalHits if h["strand"] == "-"])

                rec.modsfwd = ",".join([str(cFwd[x]) for x in self.knownModificationEvents])
                rec.modsrev = ",".join([str(cRev[x]) for x in self.knownModificationEvents])

                summaryWriter.write(str(rec) + "\n")

    return 0
def _mainLoop(self):
    """Copy the alignment summary GFF, adding per-event modification counts.

    Reads from ``self.modifications`` every record whose type is listed in
    ``self.knownModificationEvents``, then streams ``self.alignmentSummary``
    into ``self.outfile``:

    * lines starting with ``#`` are written through (stripped), and
      ``sequence-header`` lines are also recorded in ``self.seqMap`` as
      ``internal tag -> external tag``;
    * the tool-specific headers are written once, immediately before the
      first non-header line;
    * each ``region`` record gets ``modsfwd`` / ``modsrev`` attributes: the
      comma-joined counts, in ``self.knownModificationEvents`` order, returned
      by ``self.countModificationTypes`` for the hits on the same seqid that
      fall inside the region on the ``+`` / ``-`` strand.

    Returns:
        0 on completion.
    """
    headerString = ",".join(
        ['"' + x + '"' for x in self.knownModificationEvents])

    # Additional headers to be injected after the existing header block.
    headers = [
        ('source', 'kineticModificationCaller 1.3.3'),
        ('source-commandline', " ".join(sys.argv)),
        ('attribute-description',
         'modsfwd - count of detected DNA modifications on forward strand by modification event type'),
        ('attribute-description',
         'modsrev - count of detected DNA modifications on reverse strand by modification event type'),
        ('region-modsfwd', headerString),
        # Previously a second 'region-modsfwd'; the reverse-strand attribute
        # needs its own column description.
        ('region-modsrev', headerString),
    ]

    # Collect the modification calls up front, then release the reader.
    modReader = GffReader(self.modifications)
    try:
        hits = [{
            "pos": x.start,
            "strand": x.strand,
            "seqid": x.seqid,
            "type": x.type
        } for x in modReader if x.type in self.knownModificationEvents]
    finally:
        modReader.close()

    self.seqMap = {}
    inHeader = True

    # open() replaces the Python 2-only file() builtin; the with-block
    # guarantees the output is flushed and both handles are closed.
    with open(self.alignmentSummary) as summaryFile, \
            open(self.outfile, "w") as summaryWriter:
        for line in summaryFile:
            # Pass any metadata line straight through.
            if line.startswith("#"):
                tag, _, rest = line.replace('#', '').partition(' ')
                if tag == 'sequence-header':
                    internalTag, _, externalTag = rest.strip().partition(' ')
                    self.seqMap[internalTag] = externalTag
                print(line.strip(), file=summaryWriter)
                continue

            if inHeader:
                # End of the header -- write the tool-specific headers.
                for header in headers:
                    print(("##%s %s" % header), file=summaryWriter)
                inHeader = False

            rec = Gff3Record.fromString(line)

            if rec.type == 'region':
                # Hits inside this interval on the same sequence.
                intervalHits = [
                    h for h in hits
                    if rec.start <= h['pos'] <= rec.end
                    and rec.seqid == h['seqid']
                ]
                cFwd = self.countModificationTypes(
                    [h for h in intervalHits if h['strand'] == '+'])
                cRev = self.countModificationTypes(
                    [h for h in intervalHits if h['strand'] == '-'])

                rec.modsfwd = ",".join(
                    [str(cFwd[x]) for x in self.knownModificationEvents])
                rec.modsrev = ",".join(
                    [str(cRev[x]) for x in self.knownModificationEvents])

                print(str(rec), file=summaryWriter)

    return 0