def main():
    """Write one tab-separated row of positional counts per alignment.

    Alignments are read from stdin.  Each output row holds the alignment's
    line number, followed by the "m" value of every position, followed by
    the "x" value of every position.

    Command-line options:
      --head=<number>                  stop once this many alignments have
                                       been processed
      --noendmark, --noeof, --nomark   (unadvertised) pass requireEof=False
                                       to the alignment reader

    Raises:
      ValueError: if any position's stats lack an "m" or an "x" entry.
    """

    # parse the command line

    headLimit = None
    requireEof = True

    for arg in argv[1:]:
        (_, _, argVal) = arg.partition("=")

        if arg.startswith("--head="):
            headLimit = int_with_unit(argVal)
        elif arg in ("--noendmark", "--noeof", "--nomark"):  # (unadvertised)
            requireEof = False
        else:
            usage("unrecognized option: %s" % arg)

    # process the alignments

    for (alignmentNum, a) in enumerate(alignments(stdin, requireEof), 1):
        if headLimit is not None and alignmentNum > headLimit:
            stderr.write("limit of %d alignments reached\n" % headLimit)
            break

        mColumn = []
        xColumn = []
        for stats in a.positional_stats():
            if "m" not in stats:
                raise ValueError(
                    "\"m\" missing from positional information for alignment at line %d"
                    % a.lineNumber)
            if "x" not in stats:
                raise ValueError(
                    "\"x\" missing from positional information for alignment at line %d"
                    % a.lineNumber)
            mColumn.append(stats["m"])
            xColumn.append(stats["x"])

        # row layout: line number, all the m values, then all the x values
        row = [a.lineNumber] + mColumn + xColumn
        print("\t".join(map(str, row)))
def main():
    """Report, per alignment, the relative positions of its error columns.

    Alignments are read from stdin.  After a header row, one tab-separated
    row is written per alignment: line number, sequence name, strand, start,
    end, motif base count, mRatio as a percentage, the number of "x" columns
    in the alignment's error text, and then one value per "x" column giving
    its position scaled to the range 0..1 (first column 0, last column 1).

    Command-line options:
      --minmratio=<ratio>              skip alignments whose mRatio is lower
      --maxnoise=<ratio>               same, expressed as 1-mRatio
      --noendmark, --noeof, --nomark   (unadvertised) pass requireEof=False
                                       to the alignment reader
    """

    # parse the command line

    minMRatio = None
    requireEof = True

    for arg in argv[1:]:
        if "=" in arg:
            argVal = arg.split("=", 1)[1]

        if arg.startswith("--minmratio="):
            minMRatio = parse_noise_rate(argVal)
            if not (0.0 <= minMRatio <= 1.0):
                exit("%s: mratio has to be between 0 and 1 (e.g. 0.85 or 85%%)\n%s"
                     % (os_path.basename(argv[0]), arg))
        elif arg.startswith("--maxnoise="):
            minMRatio = 1 - parse_noise_rate(argVal)
            if not (0.0 <= minMRatio <= 1.0):
                exit("%s: noise has to be between 0 and 1 (e.g. 0.15 or 15%%)\n%s"
                     % (os_path.basename(argv[0]), arg))
        elif arg in ("--noendmark", "--noeof", "--nomark"):  # (unadvertised)
            requireEof = False
        else:
            usage("unrecognized option: %s" % arg)

    # process the alignments

    print("\t".join(["#line", "seq", "strand", "start", "end",
                     "querybp", "mRatio", "nErrors", "errors"]))

    for a in alignments(stdin, requireEof):
        if minMRatio is not None and a.mRatio < minMRatio:
            continue

        # positions are scaled by the index of the last column;  an error
        # text with a single column has no span to scale by, so its only
        # possible position is reported as 0.0 instead of dividing by zero
        span = len(a.errorText) - 1
        errorPositions = [(float(ix) / span) if span > 0 else 0.0
                          for (ix, ch) in enumerate(a.errorText)
                          if ch == "x"]

        fields = [str(a.lineNumber),
                  a.seqName, a.strand, str(a.start), str(a.end),
                  str(a.motifBaseCount),
                  "%.1f%%" % (100 * a.mRatio),
                  str(len(errorPositions))]
        fields += ["%.3f" % pos for pos in errorPositions]
        print("\t".join(fields))
def collect_alignments(f, testWhich, headLimit=None, subsampleK=None,
                       subsampleN=None, requireEof=True):
    """Read alignments from f and collect their positional error vectors.

    Parameters:
      f:           input passed to the alignment reader
      testWhich:   when equal to "matches-insertions", vectors are requested
                   with modified="m-i";  otherwise the default vector is used
      headLimit:   if not None, reading stops once more than this many
                   alignments have been seen
      subsampleK, subsampleN:  if subsampleN is not None, only alignments
                   whose zero-based index is congruent to subsampleK-1 modulo
                   subsampleN are kept
      requireEof:  passed through to the alignment reader

    Returns:
      (unitLength, alignmentList, mxMatrix), where unitLength is half the
      length of the first kept vector (None if nothing was kept) and
      mxMatrix[i] is the vector for alignmentList[i].

    Raises:
      ValueError: if an alignment yields no vector, or if a vector's length
      differs from twice unitLength.
    """
    useModifiedVector = (testWhich == "matches-insertions")  # note [1]

    keptAlignments = []
    keptRows = []
    unitLength = None

    alignmentNum = 0
    for a in alignments(f, requireEof):
        alignmentNum += 1

        if reportProgress is not None:
            if alignmentNum == 1 or alignmentNum % reportProgress == 0:
                stderr.write("progress: reading alignment %s\n"
                             % (commatize(alignmentNum)))

        if headLimit is not None and alignmentNum > headLimit:
            stderr.write("limit of %d alignments reached\n" % headLimit)
            break

        if subsampleN is not None \
           and (alignmentNum - 1) % subsampleN != (subsampleK - 1):
            continue

        if useModifiedVector:
            mxRow = positional_error_vector(a, modified="m-i")
        else:
            mxRow = positional_error_vector(a)

        if mxRow is None:
            raise ValueError(
                "alignment at line %d does not contain positional information"
                % a.lineNumber)

        # the first kept vector fixes the expected length for all the others
        if unitLength is None:
            unitLength = len(mxRow) / 2
        elif len(mxRow) != 2 * unitLength:
            raise ValueError(
                "alignments have different motif lengths, %d and %d (detected at line %d)"
                % (unitLength, len(mxRow) / 2, a.lineNumber))

        keptAlignments.append(a)
        keptRows.append(mxRow)

    return (unitLength, keptAlignments, keptRows)
def main():
    """Write one six-field, tab-separated interval row per alignment.

    Alignments are read from stdin.  Each row holds the sequence name, start,
    end, a literal ".", the mRatio multiplied by 1000 (formatted with %d),
    and the strand.

    Command-line options:
      --minmratio=<ratio>              skip alignments whose mRatio is lower
      --maxnoise=<ratio>               same, expressed as 1-mRatio
      --noendmark, --noeof, --nomark   (unadvertised) pass requireEof=False
                                       to the alignment reader
    """

    # parse the command line

    minMRatio = None
    requireEof = True
    progName = os_path.basename(argv[0])

    for arg in argv[1:]:
        (_, _, argVal) = arg.partition("=")

        if arg.startswith("--minmratio="):
            minMRatio = parse_noise_rate(argVal)
            if not (0.0 <= minMRatio <= 1.0):
                exit("%s: mratio has to be between 0 and 1 (e.g. 0.85 or 85%%)\n%s"
                     % (progName, arg))
        elif arg.startswith("--maxnoise="):
            minMRatio = 1 - parse_noise_rate(argVal)
            if not (0.0 <= minMRatio <= 1.0):
                exit("%s: noise has to be between 0 and 1 (e.g. 0.15 or 15%%)\n%s"
                     % (progName, arg))
        elif arg in ("--noendmark", "--noeof", "--nomark"):  # (unadvertised)
            requireEof = False
        else:
            usage("unrecognized option: %s" % arg)

    # process the alignments

    for a in alignments(stdin, requireEof):
        tooNoisy = (minMRatio is not None) and (a.mRatio < minMRatio)
        if tooNoisy:
            continue

        scoreField = "%d" % (1000 * a.mRatio)
        fields = [a.seqName, str(a.start), str(a.end), ".", scoreField, a.strand]
        print("\t".join(fields))
def main():
    """Copy alignments from stdin to stdout, adding a line of word counts.

    After each alignment's own lines, a "# aligned words" line lists the
    words the sequence was chunked into, with their counts.  Words other than
    the motif itself are shown in lower case, and only words occurring at
    least countRatio times as often as the motif are listed.

    Command-line options:
      --minwordratio=<r>, --ratio=<r>, R=<r>   set countRatio (default 1)
      --head=<number>                  stop once this many alignments have
                                       been processed
      --noendmark, --noeof, --nomark   (unadvertised) pass requireEof=False
                                       to the alignment reader, and suppress
                                       the closing end-of-file line
      --debug, --debug=<a,b,...>       add entries to the global debug list
    """
    global debug

    # parse the command line

    countRatio = 1
    headLimit = None
    requireEof = True
    debug = []

    for arg in argv[1:]:
        (_, _, argVal) = arg.partition("=")

        if arg.startswith(("--minwordratio=", "--ratio=", "R=")):
            countRatio = float_or_fraction(argVal)
        elif arg.startswith("--head="):
            headLimit = int_with_unit(argVal)
        elif arg in ("--noendmark", "--noeof", "--nomark"):  # (unadvertised)
            requireEof = False
        elif arg == "--debug":
            debug.append("debug")
        elif arg.startswith("--debug="):
            debug.extend(argVal.split(","))
        else:
            usage("unrecognized option: %s" % arg)

    # process the alignments

    for (alignmentNum, a) in enumerate(alignments(stdin, requireEof), 1):
        if headLimit is not None and alignmentNum > headLimit:
            stderr.write("limit of %d alignments reached\n" % headLimit)
            break

        # a blank line separates consecutive alignments
        if alignmentNum > 1:
            print("")
        print("\n".join(a.lines))

        motifText = a.motifText
        seqText = a.seqText
        if "noflip" not in debug and a.strand == "-" and a.start < a.end:
            # alignment was reported in reverse complement of motif, so flip it
            motifText = reverse_complement(motifText)
            seqText = reverse_complement(seqText)

        (motifChunks, seqChunks) = chunkify(a.motif, motifText, seqText)

        tally = Counter()
        for chunk in seqChunks:
            word = chunk.replace("-", "")
            if word != a.motif:
                word = word.lower()
            tally[word] += 1

        # looking up an absent key in a Counter yields zero
        motifCount = tally[a.motif]
        threshold = motifCount * countRatio

        ranked = sorted(
            [(count, abs(len(word) - len(a.motif)), word)
             for (word, count) in tally.items()
             if count >= threshold],
            reverse=True)

        print("# aligned words %s"
              % " ".join(["%s:%d" % (word, count) for (count, _, word) in ranked]))

        if "chunks" in debug:
            if "noflip" in debug:
                seqChunks = [reverse_complement(word) for word in seqChunks[::-1]]
                motifChunks = [reverse_complement(word) for word in motifChunks[::-1]]
            print("# words: %s" % " ".join(seqChunks))
            print("# motif: %s" % " ".join(motifChunks))

    if requireEof:
        print("# ncrf end-of-file")
def main():
    """Write a tab-separated table of alignments and their consensus lines.

    Alignments are read from stdin.  After a header row, one row is written
    per alignment: line number, motif, sequence name, start, end, strand,
    sequence length ("NA" when unknown), motif base count, and the
    comma-joined contents of the alignment's "# consensus" lines
    ("(missing)" when it has none).  The first time an alignment without a
    consensus line is seen, a warning is written to stderr.

    Command-line options:
      --minmratio=<ratio>              skip alignments whose mRatio is lower
                                       (applies only to alignments that have
                                       an mRatio attribute)
      --maxnoise=<ratio>               same, expressed as 1-mRatio
      --noendmark, --noeof, --nomark   (unadvertised) pass requireEof=False
                                       to the alignment reader
    """

    # parse the command line

    minMRatio = None
    requireEof = True
    progName = os_path.basename(argv[0])

    for arg in argv[1:]:
        (_, _, argVal) = arg.partition("=")

        if arg.startswith("--minmratio="):
            minMRatio = parse_noise_rate(argVal)
            if not (0.0 <= minMRatio <= 1.0):
                exit("%s: mratio has to be between 0 and 1 (e.g. 0.85 or 85%%)\n%s"
                     % (progName, arg))
        elif arg.startswith("--maxnoise="):
            minMRatio = 1 - parse_noise_rate(argVal)
            if not (0.0 <= minMRatio <= 1.0):
                exit("%s: noise has to be between 0 and 1 (e.g. 0.15 or 15%%)\n%s"
                     % (progName, arg))
        elif arg in ("--noendmark", "--noeof", "--nomark"):  # (unadvertised)
            requireEof = False
        else:
            usage("unrecognized option: %s" % arg)

    # process the alignments

    print("\t".join(["#line", "motif",
                     "seq", "start", "end", "strand",
                     "seqLen", "querybp",
                     "consensus"]))

    warningPending = True
    for a in alignments(stdin, requireEof):
        seqLenStr = str(a.seqLen) if a.seqLen is not None else "NA"

        if hasattr(a, "mRatio") and minMRatio is not None and a.mRatio < minMRatio:
            continue

        # the consensus is whatever follows the first two whitespace-separated
        # tokens of each "# consensus" line
        consensuses = [line.split(None, 2)[2]
                       for line in a.lines
                       if line.startswith("# consensus")]

        if consensuses:
            consensus = ",".join(consensuses)
        else:
            if warningPending:
                stderr.write("WARNING: input alignments did not contain a consensus line"
                             + "\n (ncrf_consensus_filter would create that line)"
                             + "\n")
                warningPending = False
            consensus = "(missing)"

        print("\t".join([str(a.lineNumber),
                         a.motif,
                         a.seqName, str(a.start), str(a.end), a.strand,
                         seqLenStr, str(a.motifBaseCount),
                         consensus]))
def main():
    """Report alignment event counts, per alignment or summed over all.

    Alignments are read from stdin and their six event counts (m, mm, io, ix,
    do, dx) are obtained from extract_events().  By default one tab-separated
    row is written per alignment;  with --sumonly a single "all" row is
    written instead, carrying the summed counts, the ratio of summed matches
    to summed events, and the motif of the last alignment read.  A warning is
    written to stderr if the input contained no alignments.

    Command-line options:
      --withheader   (also --with=header, --with:header)  write a header row
                     before the first alignment's output
      --sumonly      (also --sum=only, --sum:only)  write only the summed row
      --head=<number>                  stop once this many alignments have
                                       been processed
      --noendmark, --noeof, --nomark   (unadvertised) pass requireEof=False
                                       to the alignment reader
    """

    # parse the command line

    writeHeader = False
    writeWhat = "per alignment"
    headLimit = None
    requireEof = True

    for arg in argv[1:]:
        (_, _, argVal) = arg.partition("=")

        if arg in ("--withheader", "--with=header", "--with:header"):
            writeHeader = True
        elif arg in ("--sumonly", "--sum=only", "--sum:only"):
            writeWhat = "sum only"
        elif arg.startswith("--head="):
            headLimit = int_with_unit(argVal)
        elif arg in ("--noendmark", "--noeof", "--nomark"):  # (unadvertised)
            requireEof = False
        else:
            usage("unrecognized option: %s" % arg)

    # process the alignments

    eventKeys = ("m", "mm", "io", "ix", "do", "dx")
    totals = dict((key, 0) for key in eventKeys)

    alignmentNum = 0
    for a in alignments(stdin, requireEof):
        alignmentNum += 1
        if headLimit is not None and alignmentNum > headLimit:
            stderr.write("limit of %d alignments reached\n" % headLimit)
            break

        (nMatch, nMismatch, nInsO, nInsX, nDelO, nDelX) = extract_events(a)
        counts = (nMatch, nMismatch, nInsO, nInsX, nDelO, nDelX)

        # the header is written at most once, just before the first alignment
        if writeHeader:
            print("\t".join(["line", "motif", "mRatio", "m", "mm", "io", "ix", "do", "dx"]))
            writeHeader = False

        if writeWhat == "per alignment":
            row = [a.lineNumber, a.motif, a.mRatio] + list(counts)
            print("\t".join(map(str, row)))

        for (key, count) in zip(eventKeys, counts):
            totals[key] += count

    totals["events"] = (totals["m"] + totals["mm"]
                        + totals["io"] + totals["ix"]
                        + totals["do"] + totals["dx"])

    if alignmentNum == 0:
        stderr.write("WARNING: input contained no alignments\n")
    elif writeWhat == "sum only":
        mRatio = "%.3f" % (float(totals["m"]) / totals["events"])
        row = ["all", a.motif, mRatio] + [totals[key] for key in eventKeys]
        print("\t".join(map(str, row)))
def main():
    """Write a tab-separated summary table of the alignments on stdin.

    After a header row, one row is written per alignment: line number, motif,
    sequence name, start, end, strand, sequence length, motif base count,
    mRatio as a percentage, and the counts of matches, mismatches, insertions
    and deletions.  Any of the sequence length, mRatio and the four counts
    that the alignment does not carry is shown as "NA".  The first time an
    alignment without an mRatio is seen, a warning is written to stderr.

    Command-line options:
      --minmratio=<ratio>              skip alignments whose mRatio is lower
                                       (applies only to alignments that have
                                       an mRatio attribute)
      --maxnoise=<ratio>               same, expressed as 1-mRatio
      --noendmark, --noeof, --nomark   (unadvertised) pass requireEof=False
                                       to the alignment reader
    """

    # parse the command line

    minMRatio = None
    requireEof = True
    progName = os_path.basename(argv[0])

    for arg in argv[1:]:
        (_, _, argVal) = arg.partition("=")

        if arg.startswith("--minmratio="):
            minMRatio = parse_noise_rate(argVal)
            if not (0.0 <= minMRatio <= 1.0):
                exit("%s: mratio has to be between 0 and 1 (e.g. 0.85 or 85%%)\n%s"
                     % (progName, arg))
        elif arg.startswith("--maxnoise="):
            minMRatio = 1 - parse_noise_rate(argVal)
            if not (0.0 <= minMRatio <= 1.0):
                exit("%s: noise has to be between 0 and 1 (e.g. 0.15 or 15%%)\n%s"
                     % (progName, arg))
        elif arg in ("--noendmark", "--noeof", "--nomark"):  # (unadvertised)
            requireEof = False
        else:
            usage("unrecognized option: %s" % arg)

    # process the alignments

    print("\t".join(["#line", "motif",
                     "seq", "start", "end", "strand",
                     "seqLen", "querybp",
                     "mRatio", "m", "mm", "i", "d"]))

    warningPending = True
    for a in alignments(stdin, requireEof):
        hasMRatio = hasattr(a, "mRatio")
        if hasMRatio and minMRatio is not None and a.mRatio < minMRatio:
            continue

        seqLenStr = str(a.seqLen) if a.seqLen is not None else "NA"
        mRatioStr = ("%.1f%%" % (100 * a.mRatio)) if hasMRatio else "NA"
        nMatchStr = str(a.nMatch) if hasattr(a, "nMatch") else "NA"
        nMismatchStr = str(a.nMismatch) if hasattr(a, "nMismatch") else "NA"
        nInsertionsStr = str(a.nInsertions) if hasattr(a, "nInsertions") else "NA"
        nDeletionsStr = str(a.nDeletions) if hasattr(a, "nDeletions") else "NA"

        # a missing mRatio is taken to mean the event stats line was absent
        if not hasMRatio and warningPending:
            stderr.write("WARNING: input alignments did not contain an event stats line"
                         + "\n (NCRF --stats=events would create that line)"
                         + "\n")
            warningPending = False

        print("\t".join([str(a.lineNumber),
                         a.motif,
                         a.seqName, str(a.start), str(a.end), a.strand,
                         seqLenStr, str(a.motifBaseCount),
                         mRatioStr,
                         nMatchStr, nMismatchStr, nInsertionsStr, nDeletionsStr]))
def main():
    """Copy alignments from stdin to stdout, marking their noise clumps.

    For each alignment, find_clumps() is run over the alignment's error text
    ('x' as the positive character, '=' as the negative one) with a rate of
    1-maxMRatio and a minimum of minColumns columns.  A "# noise clumps" line
    is inserted into the alignment at index 3 of its lines, showing "*" for
    columns inside a clump and "-" elsewhere, and the alignment is printed.

    Command-line options:
      --maxMRatio=<ratio>              set maxMRatio (default 0.85)
      --minnoise=<ratio>               same, expressed as 1-maxMRatio
      --mincolumns=<n>, --mindenom=<n> set minColumns (default 10, at least 2)
      --head=<number>                  stop once this many alignments have
                                       been processed
      --report:clumps, --report=clumps write each clump's interval, match and
                                       error counts, and mRatio to stderr
      --noendmark, --noeof, --nomark   (unadvertised) pass requireEof=False
                                       to the alignment reader, and suppress
                                       the closing end-of-file line
      --debug, --debug=<a,b,...>       add entries to the global debug list

    Exits with a message if an alignment has no error text.
    """
    global debug

    # parse the command line

    maxMRatio = 0.85
    minColumns = 10
    headLimit = None
    reportClumps = False
    requireEof = True
    debug = []

    for arg in argv[1:]:
        if "=" in arg:
            argVal = arg.split("=", 1)[1]

        if arg.startswith("--maxMRatio="):
            maxMRatio = parse_noise_rate(argVal)
            # validate the value just parsed (the check previously referred to
            # a name that is never assigned in this function)
            if not (0.0 <= maxMRatio <= 1.0):
                exit("%s: mratio has to be between 0 and 1 (e.g. 0.85 or 85%%)\n%s"
                     % (os_path.basename(argv[0]), arg))
        elif arg.startswith("--minnoise="):
            maxMRatio = 1 - parse_noise_rate(argVal)
            if not (0.0 <= maxMRatio <= 1.0):
                exit("%s: noise has to be between 0 and 1 (e.g. 0.15 or 15%%)\n%s"
                     % (os_path.basename(argv[0]), arg))
        elif arg.startswith("--mincolumns=") or arg.startswith("--mindenom="):
            minColumns = int(argVal)
            if minColumns < 2:
                usage("minimum length has to be at least two columns\n%s" % arg)
        elif arg.startswith("--head="):
            headLimit = int_with_unit(argVal)
        elif arg == "--report:clumps" or arg == "--report=clumps":
            reportClumps = True
        elif arg in ("--noendmark", "--noeof", "--nomark"):  # (unadvertised)
            requireEof = False
        elif arg == "--debug":
            debug += ["debug"]
        elif arg.startswith("--debug="):
            debug += argVal.split(",")
        else:
            usage("unrecognized option: %s" % arg)

    # process the alignments

    alignmentNum = 0
    for a in alignments(stdin, requireEof):
        alignmentNum += 1
        if headLimit is not None and alignmentNum > headLimit:
            stderr.write("limit of %d alignments reached\n" % headLimit)
            break

        if a.errorText is None:
            exit("%s: alignment at line %d doesn't include error text"
                 % (os_path.basename(argv[0]), a.lineNumber))

        if "detail" in debug:
            stderr.write("\nlooking for clumps in %s %c %u-%u\n"
                         % (a.seqName, a.strand, a.start, a.end))

        clumps = find_clumps(a.errorText, 1 - maxMRatio, minColumns,
                             positiveCh='x', negativeCh='=')

        clumpText = ["-"] * len(a.errorText)
        for (start, end) in clumps:
            for ix in range(start, end):
                clumpText[ix] = "*"
        clumpText = "".join(clumpText)

        # locate the column at which the error text starts on the first line;
        # str.find() returns -1 when " =" is absent, which makes prefixLen 0
        # (never negative), so the fallback to " x" has to test for that
        prefixLen = 1 + a.lines[0].find(" =")
        if prefixLen < 1:
            prefixLen = 1 + a.lines[0].find(" x")

        # a blank line separates consecutive alignments
        if alignmentNum > 1:
            print("")
        a.lines.insert(3, "# %-*s%s" % (prefixLen - 2, "noise clumps", clumpText))
        print(a)

        if reportClumps:
            for (start, end) in clumps:
                errorCount = matchCount = 0
                for ch in a.errorText[start:end]:
                    if ch == 'x':
                        errorCount += 1
                    elif ch == '=':
                        matchCount += 1
                stderr.write("line %d (%d,%d) m=%s x=%s mRatio: %.2f%%\n"
                             % (a.lineNumber,
                                start, end, matchCount, errorCount,
                                (100.0 * matchCount) / (matchCount + errorCount)))

    if requireEof:
        print("# ncrf end-of-file")
def sliced_consensus_filter(f, motifsOfInterest, nameToMotif, sliceWidth, sliceStep):
    """Filter alignments by the consensuses derived from slices of each one.

    Alignments are read from f.  Each is cut into slices of up to sliceWidth
    columns starting every sliceStep columns;  slicing stops at the first
    slice shorter than ten times the motif length.  Consensuses are derived
    separately for each slice and pooled, with None recorded for a slice
    that yields none.  Alignments surviving the filterToKeep setting are
    printed, optionally followed by a "# consensus" line.

    Parameters:
      f:                 input passed to the alignment reader
      motifsOfInterest:  if not None, alignments whose motif is not in this
                         collection are skipped
      nameToMotif:       mapping used to replace an alignment's motif when
                         the motif appears in it as a key
      sliceWidth:        maximum number of alignment columns per slice
      sliceStep:         distance, in columns, between slice starts

    Settings such as requireEof, headLimit, reportProgress, debug,
    winnerThreshold, filterToKeep, reportConsensus, reportMsa and
    canonicalizeConsensuses are read from module globals.
    """
    global userHasBeenWarned

    if reportMsa and not userHasBeenWarned:
        stderr.write("WARNING: sliced consensus doesn't report MSA, ignoring that request\n")
        userHasBeenWarned = True

    debugConsensus = ("consensus" in debug)

    alignmentNum = 0
    alignmentsWritten = 0
    for a in alignments(f, requireEof):
        alignmentNum += 1

        if reportProgress is not None \
           and (alignmentNum == 1 or alignmentNum % reportProgress == 0):
            stderr.write("progress: testing alignment %s\n" % commatize(alignmentNum))

        if headLimit is not None and alignmentNum > headLimit:
            stderr.write("limit of %d alignments reached\n" % headLimit)
            break

        if a.motif in nameToMotif:
            a.motif = nameToMotif[a.motif]

        if motifsOfInterest is not None and a.motif not in motifsOfInterest:
            continue

        if any(ch not in "ACGT" for ch in a.motif):
            abort_warn_about_named_motifs(a)

        motifText = a.motifText
        seqText = a.seqText
        if "noflip" not in debug and a.strand == "-" and a.start < a.end:
            # alignment was reported in reverse complement of motif, so flip it
            motifText = reverse_complement(motifText)
            seqText = reverse_complement(seqText)

        # look for consensus over each slice, separately

        consensuses = set()
        numSlices = (len(motifText) + sliceStep - 1) / sliceStep  # (an overestimate)
        minSlice = 10 * len(a.motif)
        for sliceNum in xrange(numSlices):
            sliceStart = sliceNum * sliceStep
            sliceEnd = min(sliceStart + sliceWidth, len(motifText))
            if sliceEnd - sliceStart < minSlice:
                break

            # derive consensus(es)

            seqChunks = chunkify(a.motif,
                                 motifText[sliceStart:sliceEnd],
                                 seqText[sliceStart:sliceEnd])

            if debugConsensus:
                stderr.write("\n")
                stderr.write("%d score=%d slice.start=%d slice.end=%d\n"
                             % (a.lineNumber, a.score, sliceStart, sliceEnd))

            sliceConsensuses = list(
                derive_consensuses(seqChunks, winnerThreshold=winnerThreshold))

            if sliceConsensuses:
                consensuses.update(sliceConsensuses)
            else:
                # remember that at least one slice had no consensus
                consensuses.add(None)

            if debugConsensus:
                for word in sliceConsensuses:
                    stderr.write("consensus %s\n" % word)

        consensuses = list(consensuses)

        # discard the alignment if it meets the filtering criterion (if there
        # is any such criterion);  any other filterToKeep value keeps it

        motifIsConsensus = (a.motif in consensuses)
        if filterToKeep == "consensus" and not motifIsConsensus:
            continue  # (discard it)
        if filterToKeep == "non consensus" and motifIsConsensus:
            continue  # (discard it)

        # copy the (unfiltered) alignment to the output

        if alignmentsWritten > 0:
            print("")
        alignmentsWritten += 1
        print("\n".join(a.lines))

        # report the consensus, if we're supposed to

        if reportConsensus:
            if not consensuses:
                print("# consensus (none)")
            else:
                canonicalized = []
                for motif in consensuses:
                    if motif is None:
                        continue
                    if motif != a.motif and canonicalizeConsensuses:
                        (motif, _) = canonical_motif(motif)
                    canonicalized.append(motif)
                if None in consensuses:
                    canonicalized.append("(none)")
                print("# consensus %s" % ",".join(canonicalized))

    if requireEof:
        print("# ncrf end-of-file")
def simple_consensus_filter(f, motifsOfInterest, nameToMotif):
    """Filter alignments by the consensus derived from each whole alignment.

    Alignments are read from f.  For each, the sequence is chunked against
    the motif and consensuses are derived from the chunks.  Alignments
    surviving the filterToKeep setting are printed, optionally followed by a
    "# consensus" line and by "# msa.query" / "# msa.seq" lines showing the
    chunks column by column.

    Parameters:
      f:                 input passed to the alignment reader
      motifsOfInterest:  if not None, alignments whose motif is not in this
                         collection are skipped
      nameToMotif:       mapping used to replace an alignment's motif when
                         the motif appears in it as a key

    Settings such as requireEof, headLimit, reportProgress, debug,
    winnerThreshold, filterToKeep, reportConsensus, reportMsa and
    canonicalizeConsensuses are read from module globals.
    """
    alignmentNum = 0
    alignmentsWritten = 0
    for a in alignments(f, requireEof):
        alignmentNum += 1

        if reportProgress is not None \
           and (alignmentNum == 1 or alignmentNum % reportProgress == 0):
            stderr.write("progress: testing alignment %s\n" % commatize(alignmentNum))

        if headLimit is not None and alignmentNum > headLimit:
            stderr.write("limit of %d alignments reached\n" % headLimit)
            break

        if a.motif in nameToMotif:
            a.motif = nameToMotif[a.motif]

        if motifsOfInterest is not None and a.motif not in motifsOfInterest:
            continue

        if any(ch not in "ACGT" for ch in a.motif):
            abort_warn_about_named_motifs(a)

        motifText = a.motifText
        seqText = a.seqText
        if "noflip" not in debug and a.strand == "-" and a.start < a.end:
            # alignment was reported in reverse complement of motif, so flip it
            motifText = reverse_complement(motifText)
            seqText = reverse_complement(seqText)

        # derive consensus(es)

        seqChunks = chunkify(a.motif, motifText, seqText)

        if "consensus" in debug:
            stderr.write("\n")
            stderr.write("%d score=%d\n" % (a.lineNumber, a.score))

        consensuses = list(derive_consensuses(seqChunks, winnerThreshold=winnerThreshold))

        # discard the alignment if it meets the filtering criterion (if there
        # is any such criterion);  any other filterToKeep value keeps it

        motifIsConsensus = (a.motif in consensuses)
        if filterToKeep == "consensus" and not motifIsConsensus:
            continue  # (discard it)
        if filterToKeep == "non consensus" and motifIsConsensus:
            continue  # (discard it)

        # copy the (unfiltered) alignment to the output

        if alignmentsWritten > 0:
            print("")
        alignmentsWritten += 1
        print("\n".join(a.lines))

        # report the consensus, if we're supposed to

        if reportConsensus:
            if not consensuses:
                print("# consensus (none)")
            else:
                canonicalized = []
                for motif in consensuses:
                    if motif != a.motif and canonicalizeConsensuses:
                        (motif, _) = canonical_motif(motif)
                    canonicalized.append(motif)
                print("# consensus %s" % ",".join(canonicalized))

        # report the MSA from which the consensus was derived, if we're
        # supposed to

        if reportMsa:
            # each motif position gets a column as wide as the longest entry
            # any chunk has at that position (and at least one character)
            columnWidth = [1] * len(a.motif)
            for chunk in seqChunks:
                for (motifIx, seqNucs) in enumerate(chunk):
                    if seqNucs is not None:
                        columnWidth[motifIx] = max(columnWidth[motifIx], len(seqNucs))

            queryCells = [motifNuc.ljust(width, ".")
                          for (motifNuc, width) in zip(a.motif, columnWidth)]
            print("# msa.query %s" % "".join(queryCells))

            for chunk in seqChunks:
                cells = []
                for (motifIx, seqNucs) in enumerate(chunk):
                    width = columnWidth[motifIx]
                    if seqNucs is None:
                        cells.append("." * width)
                    elif seqNucs == a.motif[motifIx]:
                        cells.append("=" * width)
                    else:
                        cells.append(seqNucs.ljust(width, "."))
                print("# msa.seq %s" % "".join(cells))

    if requireEof:
        print("# ncrf end-of-file")