# NOTE(review): whitespace-collapsed fragment. `chrName`, `start`, `end` and `countBetterHits`
# are bound above this view, and the `else:` presumably pairs with an `if` above it -- confirm.
# Before the `else:`: counts stored calls with a different ID whose score exceeds
# record.INFO['PE'] (ties go to the smaller ID); foundBetterHit is set when the count exceeds 2.
# After the `else:`: foundBetterHit is set at the first such call that also passes overlapValid.
# A record without a better hit gets RDRATIO (from rdRat) and SOMATIC = True and is written out.
for cStart, cEnd in sv[chrName].overlap((start, end)): for cSvID, cScore in svDups[(chrName, cStart, cEnd)] + [ sv[chrName][(cStart, cEnd)] ]: if record.ID != cSvID: if (cScore > record.INFO['PE']) or ( (cScore == record.INFO['PE']) and (cSvID < record.ID)): countBetterHits += 1 if countBetterHits > 2: foundBetterHit = True else: for cStart, cEnd in sv[record.CHROM].overlap( (record.POS, record.INFO['END'])): for cSvID, cScore in svDups[(record.CHROM, cStart, cEnd)] + [ sv[record.CHROM][(cStart, cEnd)] ]: if (record.ID != cSvID) and (overlapValid( (record.POS, record.INFO['END']), (cStart, cEnd))): if (cScore > record.INFO['PE']) or ( (cScore == record.INFO['PE']) and (cSvID < record.ID)): foundBetterHit = True break if foundBetterHit: break if not foundBetterHit: record.INFO['RDRATIO'] = rdRat[record.ID] record.INFO['SOMATIC'] = True vcf_writer.write_record(record)
# Write every record that is present in the `sv` interval index, unless another
# overlapping call outranks it. The ranking check is skipped when args.keepOverlap is set.
# NOTE(review): the handle opened for args.outFile is not closed in the visible code.
vcf_writer = vcf.Writer(open(args.outFile, 'w'), vcf_reader, lineterminator='\n')
for record in vcf_reader:
    # Only records that were indexed in `sv` are candidates for output. The
    # chromosome is tested first so INFO['END'] is read only for known chromosomes.
    if record.CHROM not in sv:
        continue
    svInterval = (record.POS, record.INFO['END'])
    if svInterval not in sv[record.CHROM]:
        continue

    # Query the interval index once and reuse the result for the ranking check.
    overlapList = sv[record.CHROM].overlap(svInterval)
    foundBetterHit = False
    if not args.keepOverlap:
        for cStart, cEnd in overlapList:
            # Calls stored for this interval: entries from svDups plus the entry in sv.
            candidates = svDups[(record.CHROM, cStart, cEnd)] + [sv[record.CHROM][(cStart, cEnd)]]
            for cSvID, cScore, cCt in candidates:
                if record.ID == cSvID:
                    continue
                # A call outranks the record with a higher score, or an equal score and smaller ID.
                if (cScore > record.INFO['PE']) or ((cScore == record.INFO['PE']) and (cSvID < record.ID)):
                    if overlapValid(svInterval, (cStart, cEnd), 0.1, 10000000):
                        foundBetterHit = True
                        break
            if foundBetterHit:
                break

    # Output VCF record
    if not foundBetterHit:
        vcf_writer.write_record(record)
# NOTE(review): whitespace-collapsed fragment; it starts inside an enclosing loop/branch that
# is not visible here, so the original nesting cannot be confirmed from this view.
# Visible steps: rdRatio2 defaults to 1.0 and is recomputed with altRefReadDepthRatio when the
# control ID is a key of `control`; the candidate's values are printed; if args.readDepth is
# unset or both ratios are < 0.8, score = min(peCount, inv3to3['pe']) * cc and a higher score
# replaces invInfo. When invInfo['score'] >= 0 the pair (record.ID, invInfo['id']) becomes a
# node in G, is linked to pairs already stored in invRegion that pass
# overlapValid(..., 0.1, 10000), and its interval is stored. idPairs then receives the
# highest-'Score' node of each connected component together with its score.
# NOTE(review): `has_key` is Python 2 only; `connected_component_subgraphs`, `nodes_iter` and
# `G.node` are networkx 1.x names -- confirm the intended runtime. `bestSVs` is only assigned
# inside the score comparison.
rdRatio2 = 1.0 if int(svControlID2) in control.keys(): rdRatio2 = altRefReadDepthRatio(inv3to3['rc'], control[int(svControlID2)], inv3to3['hap']) print(record.CHROM, record.POS, record.INFO['END'], record.ID, record.CHROM, s2, e2, inv3to3['id'], spacer, delLength, cc, rdRatio1, rdRatio2) if (not args.readDepth) or ((rdRatio1<0.8) and (rdRatio2<0.8)): score = float(min(peCount, inv3to3['pe'])) * float(cc) if score > invInfo['score']: invInfo = {'id': inv3to3['id'], 'start': min(s1, s2), 'end': max(e1, e2), 'score': score} if invInfo['score'] >= 0: if not invRegion.has_key(record.CHROM): invRegion[record.CHROM] = banyan.SortedDict(key_type=(int, int), alg=banyan.RED_BLACK_TREE, updator=banyan.OverlappingIntervalsUpdator) G.add_node((record.ID, invInfo['id'])) G.node[(record.ID, invInfo['id'])]['Score'] = invInfo['score'] for invIStart, invIEnd in invRegion[record.CHROM].overlap((invInfo['start'], invInfo['end'])): (id1, id2) = invRegion[record.CHROM][(invIStart, invIEnd)] if overlapValid((invInfo['start'], invInfo['end']), (invIStart, invIEnd), 0.1, 10000): G.add_edge((record.ID, invInfo['id']), (id1, id2)) invRegion[record.CHROM][(invInfo['start'], invInfo['end'])] = (record.ID, invInfo['id']) # Pick best pair of inversions out of all overlapping calls idPairs = dict() for H in networkx.connected_component_subgraphs(G): bestScore = -1.0 for n, d in H.nodes_iter(data=True): if d['Score'] > bestScore: bestScore = d['Score'] bestSVs = n idPairs[bestSVs] = bestScore # Extract selected calls selectedSVs = dict()
# NOTE(review): whitespace-collapsed fragment. The `else:` presumably belongs to an `if` that
# starts above this view (as written, both visible assignments set validRdRatio = True) -- confirm.
# Visible steps: rdRatio = median(hetRC) / median(refRC). A record with validRdRatio set,
# qIndex > quality and the 99th percentile of ratioRef <= args.maxRefRatio is added to G with
# Score = support and linked to calls already stored in cnvRegion (always for SVTYPE "INS",
# otherwise when overlapValid(..., 0.1, 10000) passes). Its interval is stored widened by 15 on
# each side, while the overlap query uses the unpadded (svStart, svEnd). The highest-'Score'
# node of each connected component is then added to selectedSVs.
# NOTE(review): `has_key` is Python 2 only; `connected_component_subgraphs`, `nodes_iter` and
# `G.node` are networkx 1.x names -- confirm the intended runtime. No `bestSV = None`
# initialisation is visible, so the `is not None` test relies on code outside this view.
rdRatio = numpy.median(numpy.array(hetRC))/numpy.median(numpy.array(refRC)) if ((record.INFO['SVTYPE'] == "DEL") and (rdRatio < 0.8)) or ((record.INFO['SVTYPE'] == "DUP") and (rdRatio >= 1.3) and (rdRatio <= 1.75)): validRdRatio = True else: validRdRatio = True # Check quality #print(record.CHROM, svStart, svEnd, record.ID, qIndex, numpy.percentile(ratioRef, 99), altgq, refgq, altratio, sep="\t") if (validRdRatio) and (qIndex > quality) and (numpy.percentile(ratioRef, 99) <= args.maxRefRatio): if not cnvRegion.has_key(record.CHROM): cnvRegion[record.CHROM] = banyan.SortedDict(key_type=(int, int), alg=banyan.RED_BLACK_TREE, updator=banyan.OverlappingIntervalsUpdator) G.add_node(record.ID) G.node[record.ID]['Score'] = support for cnvIStart, cnvIEnd in cnvRegion[record.CHROM].overlap((svStart, svEnd)): otherID = cnvRegion[record.CHROM][(cnvIStart, cnvIEnd)] if (record.INFO['SVTYPE'] == "INS") or (overlapValid((svStart, svEnd), (cnvIStart, cnvIEnd), 0.1, 10000)): G.add_edge(record.ID, otherID) cnvRegion[record.CHROM][(svStart - 15, svEnd + 15)] = record.ID # padding for PRECISE insertion # Pick best deletion/duplication for all overlapping calls selectedSVs = set() for H in networkx.connected_component_subgraphs(G): bestScore = -1.0 for n, d in H.nodes_iter(data=True): if d['Score'] > bestScore: bestScore = d['Score'] bestSV = n if bestSV is not None: selectedSVs.add(bestSV) # Extract selected calls
# NOTE(review): whitespace-collapsed fragment; the first statements use `record`, which is
# bound above this view (presumably by a per-record loop -- confirm).
# Visible steps: an interval not yet present in sv[record.CHROM] is stored there as
# (ID, PE, CT); an interval that is already present has its call appended to svDups instead.
# If args.vcfFile is set, the VCF is read again and each record found in `sv` is written
# unless, with args.keepOverlap unset, an overlapping call with a higher PE (or equal PE and
# smaller ID) passes overlapValid(..., 0.1, 10000000).
# NOTE(review): `overlapList` is assigned but not used in the visible code, and the same
# overlap query is issued again by the loop. `has_key` is Python 2 only.
if not sv.has_key(record.CHROM): sv[record.CHROM] = banyan.SortedDict(key_type=(int, int), alg=banyan.RED_BLACK_TREE, updator=banyan.OverlappingIntervalsUpdator) if (record.POS, record.INFO['END']) not in sv[record.CHROM]: sv[record.CHROM][(record.POS, record.INFO['END'])] = (record.ID, record.INFO['PE'], record.INFO['CT']) else: svDups[(record.CHROM, record.POS, record.INFO['END'])].append((record.ID, record.INFO['PE'], record.INFO['CT'])) # Output vcf records if args.vcfFile: vcf_reader = vcf.Reader(open(args.vcfFile), 'r', compressed=True) if args.vcfFile.endswith('.gz') else vcf.Reader(open(args.vcfFile), 'r', compressed=False) vcf_writer = vcf.Writer(open(args.outFile, 'w'), vcf_reader, lineterminator='\n') for record in vcf_reader: if (record.CHROM not in sv.keys()) or ((record.POS, record.INFO['END']) not in sv[record.CHROM].keys()): continue overlapList = sv[record.CHROM].overlap((record.POS, record.INFO['END'])) foundBetterHit = False if not args.keepOverlap: for cStart, cEnd in sv[record.CHROM].overlap((record.POS, record.INFO['END'])): if foundBetterHit: break for cSvID, cScore, cCt in svDups[(record.CHROM, cStart, cEnd)] + [sv[record.CHROM][(cStart, cEnd)]]: if record.ID != cSvID: if (cScore > record.INFO['PE']) or ((cScore == record.INFO['PE']) and (cSvID < record.ID)): if overlapValid((record.POS, record.INFO['END']), (cStart, cEnd), 0.1, 10000000): foundBetterHit = True break # Output VCF record if not foundBetterHit: vcf_writer.write_record(record)
# NOTE(review): whitespace-collapsed fragment; it opens inside a dict literal begun above this view.
# Visible steps: when dupInfo['score'] >= 0, the pair (record.ID, dupInfo['id']) is added to G
# with that score, linked to pairs already stored in dupRegion whose interval passes
# overlapValid(..., 0.1, 10000), and its own interval is stored in dupRegion[record.CHROM].
# idPairs then receives the highest-'Score' node of each connected component with its score.
# NOTE(review): `has_key` is Python 2 only; `connected_component_subgraphs`, `nodes_iter` and
# `G.node` are networkx 1.x names -- confirm the intended runtime. `bestSVs` is only assigned
# inside the score comparison.
'score': score } if dupInfo['score'] >= 0: if not dupRegion.has_key(record.CHROM): dupRegion[record.CHROM] = banyan.SortedDict( key_type=(int, int), alg=banyan.RED_BLACK_TREE, updator=banyan.OverlappingIntervalsUpdator) G.add_node((record.ID, dupInfo['id'])) G.node[(record.ID, dupInfo['id'])]['Score'] = dupInfo['score'] for dupIStart, dupIEnd in dupRegion[record.CHROM].overlap( (dupInfo['start'], dupInfo['end'])): (id1, id2) = dupRegion[record.CHROM][(dupIStart, dupIEnd)] if overlapValid((dupInfo['start'], dupInfo['end']), (dupIStart, dupIEnd), 0.1, 10000): G.add_edge((record.ID, dupInfo['id']), (id1, id2)) dupRegion[record.CHROM][(dupInfo['start'], dupInfo['end'])] = (record.ID, dupInfo['id']) # Pick best pair idPairs = dict() for H in networkx.connected_component_subgraphs(G): bestScore = -1.0 for n, d in H.nodes_iter(data=True): if d['Score'] > bestScore: bestScore = d['Score'] bestSVs = n idPairs[bestSVs] = bestScore
# NOTE(review): whitespace-collapsed fragment; `record`, `hap`, `rc` and `peCount` are bound
# above this view, so the original nesting cannot be confirmed from here.
# Visible steps: for a non-empty hap, the control ID is record.ID with its leading run of
# capital letters and '0' characters removed, and
# rdRatio = altRefReadDepthRatio(rc, sv[int(svControlID)], hap). When rdRatio is not None, a
# DEL with rdRatio < 0.8 or a DUP with 1.3 <= rdRatio <= 1.75 is added to G (Score = peCount),
# linked to calls already stored in cnvRegion that pass overlapValid(..., 0.1, 10000), and
# stored under (svStart, svEnd). The highest-'Score' node of each connected component is then
# added to selectedSVs.
# NOTE(review): `has_key` is Python 2 only; `connected_component_subgraphs`, `nodes_iter` and
# `G.node` are networkx 1.x names -- confirm the intended runtime. No `bestSV = None`
# initialisation is visible, so the `is not None` test relies on code outside this view.
if len(hap): svStart = record.POS svEnd = record.INFO['END'] svControlID = re.sub(r"^[A-Z0]*","", record.ID) rdRatio = altRefReadDepthRatio(rc, sv[int(svControlID)], hap) #print(record.CHROM, svStart, svEnd, record.ID, rdRatio, sep="\t") if rdRatio is not None: if ((record.INFO['SVTYPE'] == "DEL") and (rdRatio < 0.8)) or ((record.INFO['SVTYPE'] == "DUP") and (rdRatio >= 1.3) and (rdRatio <= 1.75)): # Valid Call if not cnvRegion.has_key(record.CHROM): cnvRegion[record.CHROM] = banyan.SortedDict(key_type=(int, int), alg=banyan.RED_BLACK_TREE, updator=banyan.OverlappingIntervalsUpdator) G.add_node(record.ID) G.node[record.ID]['Score'] = peCount for cnvIStart, cnvIEnd in cnvRegion[record.CHROM].overlap((svStart, svEnd)): otherID = cnvRegion[record.CHROM][(cnvIStart, cnvIEnd)] if overlapValid((svStart, svEnd), (cnvIStart, cnvIEnd), 0.1, 10000): G.add_edge(record.ID, otherID) cnvRegion[record.CHROM][(svStart, svEnd)] = record.ID # Pick best deletion/duplication for all overlapping calls selectedSVs = set() for H in networkx.connected_component_subgraphs(G): bestScore = -1.0 for n, d in H.nodes_iter(data=True): if d['Score'] > bestScore: bestScore = d['Score'] bestSV = n if bestSV is not None: selectedSVs.add(bestSV) # Extract selected calls
# NOTE(review): whitespace-collapsed fragment; it opens in the middle of a condition begun
# above this view (the visible part tests rdRatio < 0.8, or SVTYPE "DUP" with
# 1.3 <= rdRatio <= 1.75).
# Visible steps: a record passing that condition is added to G (Score = peCount), linked to
# calls already stored in cnvRegion that pass overlapValid(..., 0.1, 10000), and stored under
# (svStart, svEnd). The highest-'Score' node of each connected component is then added to
# selectedSVs.
# NOTE(review): `has_key` is Python 2 only; `connected_component_subgraphs`, `nodes_iter` and
# `G.node` are networkx 1.x names -- confirm the intended runtime. No `bestSV = None`
# initialisation is visible, so the `is not None` test relies on code outside this view.
(rdRatio < 0.8)) or ((record.INFO['SVTYPE'] == "DUP") and (rdRatio >= 1.3) and (rdRatio <= 1.75)): # Valid Call if not cnvRegion.has_key(record.CHROM): cnvRegion[record.CHROM] = banyan.SortedDict( key_type=(int, int), alg=banyan.RED_BLACK_TREE, updator=banyan.OverlappingIntervalsUpdator) G.add_node(record.ID) G.node[record.ID]['Score'] = peCount for cnvIStart, cnvIEnd in cnvRegion[ record.CHROM].overlap((svStart, svEnd)): otherID = cnvRegion[record.CHROM][(cnvIStart, cnvIEnd)] if overlapValid((svStart, svEnd), (cnvIStart, cnvIEnd), 0.1, 10000): G.add_edge(record.ID, otherID) cnvRegion[record.CHROM][(svStart, svEnd)] = record.ID # Pick best deletion/duplication for all overlapping calls selectedSVs = set() for H in networkx.connected_component_subgraphs(G): bestScore = -1.0 for n, d in H.nodes_iter(data=True): if d['Score'] > bestScore: bestScore = d['Score'] bestSV = n if bestSV is not None: selectedSVs.add(bestSV) # Extract selected calls
if record.ID not in validRecordID: continue # Judge wether overlapping calls are better foundBetterHit = False if args.svType == "TRA": countBetterHits = 0 for (chrName, start, end) in [ (record.CHROM, record.POS - traWindow, record.POS + traWindow), (record.INFO["CHR2"], record.INFO["END"] - traWindow, record.INFO["END"] + traWindow), ]: for cStart, cEnd in sv[chrName].overlap((start, end)): for cSvID, cScore in svDups[(chrName, cStart, cEnd)] + [sv[chrName][(cStart, cEnd)]]: if record.ID != cSvID: if (cScore > record.INFO["PE"]) or ((cScore == record.INFO["PE"]) and (cSvID < record.ID)): countBetterHits += 1 if countBetterHits > 2: foundBetterHit = True else: for cStart, cEnd in sv[record.CHROM].overlap((record.POS, record.INFO["END"])): for cSvID, cScore in svDups[(record.CHROM, cStart, cEnd)] + [sv[record.CHROM][(cStart, cEnd)]]: if (record.ID != cSvID) and (overlapValid((record.POS, record.INFO["END"]), (cStart, cEnd))): if (cScore > record.INFO["PE"]) or ((cScore == record.INFO["PE"]) and (cSvID < record.ID)): foundBetterHit = True break if foundBetterHit: break if not foundBetterHit: record.INFO["RDRATIO"] = rdRat[record.ID] record.INFO["SOMATIC"] = True vcf_writer.write_record(record)
# NOTE(review): whitespace-collapsed fragment; it ends on an `if bestSV is not None:` whose
# body lies below this view, and `validRdRatio`, `qIndex`, `support` are bound above it.
# Visible steps: a record with validRdRatio set, qIndex > quality and the 99th percentile of
# ratioRef <= args.maxRefRatio is added to G with Score = support and linked to calls already
# stored in cnvRegion (always for SVTYPE "INS", otherwise when overlapValid(..., 0.1, 10000)
# passes). Its interval is stored widened by 15 on each side, while the overlap query uses the
# unpadded (svStart, svEnd). The highest-'Score' node of each connected component is then
# selected as bestSV.
# NOTE(review): `has_key` is Python 2 only; `connected_component_subgraphs`, `nodes_iter` and
# `G.node` are networkx 1.x names -- confirm the intended runtime. No `bestSV = None`
# initialisation is visible here.
# Check quality #print(record.CHROM, svStart, svEnd, record.ID, qIndex, numpy.percentile(ratioRef, 99), altgq, refgq, altratio, sep="\t") if (validRdRatio) and (qIndex > quality) and (numpy.percentile( ratioRef, 99) <= args.maxRefRatio): if not cnvRegion.has_key(record.CHROM): cnvRegion[record.CHROM] = banyan.SortedDict( key_type=(int, int), alg=banyan.RED_BLACK_TREE, updator=banyan.OverlappingIntervalsUpdator) G.add_node(record.ID) G.node[record.ID]['Score'] = support for cnvIStart, cnvIEnd in cnvRegion[record.CHROM].overlap( (svStart, svEnd)): otherID = cnvRegion[record.CHROM][(cnvIStart, cnvIEnd)] if (record.INFO['SVTYPE'] == "INS") or (overlapValid( (svStart, svEnd), (cnvIStart, cnvIEnd), 0.1, 10000)): G.add_edge(record.ID, otherID) cnvRegion[record.CHROM][( svStart - 15, svEnd + 15)] = record.ID # padding for PRECISE insertion # Pick best deletion/duplication for all overlapping calls selectedSVs = set() for H in networkx.connected_component_subgraphs(G): bestScore = -1.0 for n, d in H.nodes_iter(data=True): if d['Score'] > bestScore: bestScore = d['Score'] bestSV = n if bestSV is not None:
# NOTE(review): whitespace-collapsed fragment; it starts inside an enclosing loop/branch that
# is not visible here (`s1`, `e1`, `s2`, `e2`, `nestedO`, `recO`, `minBpOffset` are bound above).
# Visible steps: maxBpOffset is the larger of the start and end offsets; cc comes from
# carrierConcordance. A candidate is scored only if nestedO >= minNestedOverlap,
# minBpOffset < maxInsertionOffset, maxBpOffset > minDuplicationLength,
# cc >= minCarrierConcordance, and the first element returned by validRdRatio(...) is truthy.
# score = min(peCount, dup5to3['pe']) * cc, and a higher score replaces dupInfo. When
# dupInfo['score'] >= 0 the pair (record.ID, dupInfo['id']) is added to G, linked to pairs
# already stored in dupRegion that pass overlapValid(..., 0.1, 10000), and stored. idPairs
# then receives the highest-'Score' node of each connected component with its score.
# NOTE(review): `has_key` is Python 2 only; `connected_component_subgraphs`, `nodes_iter` and
# `G.node` are networkx 1.x names -- confirm the intended runtime. `bestSVs` is only assigned
# inside the score comparison.
maxBpOffset = max(abs(s2-s1), abs(e2-e1)) cc = carrierConcordance(nonRefHap, dup5to3['hap']) if (nestedO >= minNestedOverlap) and (minBpOffset < maxInsertionOffset) and (maxBpOffset > minDuplicationLength) and (cc >= minCarrierConcordance): rdRatio = rdAltRefRatio(((s1, e1), (s2, e2)), (nonRefHap, dup5to3['hap']), (rc, dup5to3['rc'])) if validRdRatio(recO/nestedO, rdRatio, args.readDepth)[0]: score = float(min(peCount, dup5to3['pe'])) * float(cc) if score > dupInfo['score']: dupInfo = {'id': dup5to3['id'], 'start': min(s1, s2), 'end': max(e1, e2), 'score': score} if dupInfo['score'] >= 0: if not dupRegion.has_key(record.CHROM): dupRegion[record.CHROM] = banyan.SortedDict(key_type=(int, int), alg=banyan.RED_BLACK_TREE, updator=banyan.OverlappingIntervalsUpdator) G.add_node((record.ID, dupInfo['id'])) G.node[(record.ID, dupInfo['id'])]['Score'] = dupInfo['score'] for dupIStart, dupIEnd in dupRegion[record.CHROM].overlap((dupInfo['start'], dupInfo['end'])): (id1, id2) = dupRegion[record.CHROM][(dupIStart, dupIEnd)] if overlapValid((dupInfo['start'], dupInfo['end']), (dupIStart, dupIEnd), 0.1, 10000): G.add_edge((record.ID, dupInfo['id']), (id1, id2)) dupRegion[record.CHROM][(dupInfo['start'], dupInfo['end'])] = (record.ID, dupInfo['id']) # Pick best pair idPairs = dict() for H in networkx.connected_component_subgraphs(G): bestScore = -1.0 for n, d in H.nodes_iter(data=True): if d['Score'] > bestScore: bestScore = d['Score'] bestSVs = n idPairs[bestSVs] = bestScore # Extract selected calls selectedSVs = dict()
# NOTE(review): whitespace-collapsed fragment; the opening assignment uses `inv3to3`, `s1`,
# `s2`, `e1`, `e2` and `score`, which are bound above this view (presumably inside a
# candidate loop -- confirm).
# Visible steps: invInfo is set to the candidate's id, the merged span
# (min(s1, s2), max(e1, e2)) and score. When invInfo["score"] >= 0 the pair
# (record.ID, invInfo["id"]) is added to G with that score, linked to pairs already stored in
# invRegion that pass overlapValid(..., 0.1, 10000), and its interval is stored. idPairs then
# receives the highest-"Score" node of each connected component with its score.
# NOTE(review): `has_key` is Python 2 only; `connected_component_subgraphs`, `nodes_iter` and
# `G.node` are networkx 1.x names -- confirm the intended runtime. `bestSVs` is only assigned
# inside the score comparison.
invInfo = { "id": inv3to3["id"], "start": min(s1, s2), "end": max(e1, e2), "score": score, } if invInfo["score"] >= 0: if not invRegion.has_key(record.CHROM): invRegion[record.CHROM] = banyan.SortedDict( key_type=(int, int), alg=banyan.RED_BLACK_TREE, updator=banyan.OverlappingIntervalsUpdator ) G.add_node((record.ID, invInfo["id"])) G.node[(record.ID, invInfo["id"])]["Score"] = invInfo["score"] for invIStart, invIEnd in invRegion[record.CHROM].overlap((invInfo["start"], invInfo["end"])): (id1, id2) = invRegion[record.CHROM][(invIStart, invIEnd)] if overlapValid((invInfo["start"], invInfo["end"]), (invIStart, invIEnd), 0.1, 10000): G.add_edge((record.ID, invInfo["id"]), (id1, id2)) invRegion[record.CHROM][(invInfo["start"], invInfo["end"])] = (record.ID, invInfo["id"]) # Pick best pair of inversions out of all overlapping calls idPairs = dict() for H in networkx.connected_component_subgraphs(G): bestScore = -1.0 for n, d in H.nodes_iter(data=True): if d["Score"] > bestScore: bestScore = d["Score"] bestSVs = n idPairs[bestSVs] = bestScore # Extract selected calls selectedSVs = dict()