def main(): # parse the command line reportTotals = False for arg in argv[1:]: if ("=" in arg): argVal = arg.split("=", 1)[1] if (arg in ["--report:totals", "--report:total"]): reportTotals = True elif (arg.startswith("--")): usage("unrecognized option: %s" % arg) else: usage("unrecognized option: %s" % arg) # accumulate the length distributions lengthToCount = {} for (length, count) in read_length_counts(stdin): if (length not in lengthToCount): lengthToCount[length] = count else: lengthToCount[length] += count # report the total distribution lengths = [length for length in lengthToCount] lengths.sort() print "\n".join( ["%d\t%d" % (length, lengthToCount[length]) for length in lengths]) if (reportTotals): numSequences = sum([lengthToCount[length] for length in lengths]) if (numSequences == 0): print >> stderr, "0 sequences / 0 bp total" else: totalBp = sum( [lengthToCount[length] * length for length in lengths]) avgSequence = int(round(float(totalBp) / numSequences)) print >>stderr, "%s sequences / %s bp total / %s bp average" \ % (commatize(numSequences),commatize(totalBp),commatize(avgSequence))
def main(): # parse the command line reportProgress = None for arg in argv[1:]: if ("=" in arg): argVal = arg.split("=", 1)[1] if (arg.startswith("--progress=")): reportProgress = int_with_unit(argVal) elif (arg.startswith("--")): usage("unrecognized option: %s" % arg) else: usage("unrecognized option: %s" % arg) # process the fasta sequences lengthToCount = {} inputCount = inputBp = 0 for (seqLen) in read_fasta_lengths(stdin): inputCount += 1 inputBp += seqLen if (reportProgress != None): if (inputCount % reportProgress == 0): print >>stderr, "%s sequences read (%s nts, avg=%s)" \ % (commatize(inputCount),commatize(inputBp), commatize(int(round(float(inputBp)/inputCount)))) if (seqLen not in lengthToCount): lengthToCount[seqLen] = 1 else: lengthToCount[seqLen] += 1 # report the distribution lengths = [length for length in lengthToCount] lengths.sort() print "\n".join( ["%d\t%d" % (length, lengthToCount[length]) for length in lengths])
def collect_alignments(f, testWhich, headLimit=None, subsampleK=None,
                       subsampleN=None, requireEof=True):
    """Read alignments from f and gather their positional error vectors.

    f, requireEof:
        passed straight to alignments().
    testWhich:
        "matches-insertions" requests the "m-i" modified vector from
        positional_error_vector(); any other value requests the default.
    headLimit:
        if not None, reading stops once more than this many alignments
        have been seen.
    subsampleK, subsampleN:
        if subsampleN is not None, only the K-th alignment (1-based) of
        every N is kept.

    Returns (unitLength, alignmentList, mxMatrix): unitLength is half the
    length of the vectors (None if no alignment was kept), and mxMatrix[i]
    is the vector for alignmentList[i].

    Raises ValueError if an alignment has no positional vector, or if
    vectors of differing lengths are encountered.
    """
    keptAlignments = []
    keptRows = []
    unitLength = None

    numRead = 0
    for a in alignments(f, requireEof):
        numRead += 1

        if (reportProgress != None):
            if (numRead == 1) or (numRead % reportProgress == 0):
                print >>stderr, "progress: reading alignment %s" \
                              % (commatize(numRead))

        if (headLimit != None) and (numRead > headLimit):
            print >>stderr, "limit of %d alignments reached" % headLimit
            break

        if (subsampleN != None) \
           and ((numRead - 1) % subsampleN != (subsampleK - 1)):
            continue

        if (testWhich == "matches-insertions"):  # note [1]
            row = positional_error_vector(a, modified="m-i")
        else:
            row = positional_error_vector(a)

        if (row is None):
            raise ValueError(
                "alignment at line %d does not contain positional information"
                % a.lineNumber)

        rowLen = len(row)
        if (unitLength == None):
            unitLength = rowLen // 2
        elif (rowLen != 2 * unitLength):
            raise ValueError(
                "alignments have different motif lengths, %d and %d (detected at line %d)"
                % (unitLength, rowLen // 2, a.lineNumber))

        keptAlignments.append(a)
        keptRows.append(row)

    return (unitLength, keptAlignments, keptRows)
def main(): global nameFieldW,lengthFieldW,countFieldW,rangeFieldW global debug # parse the command line genomeFilename = None readsFilename = None cigarFilename = None intervalsFilename = None intervalsAreCatalog = False motifs = None chromsOfInterest = None minLength = None noiselessGenome = True reportProgress = None nameFieldW = 1 lengthFieldW = 1 countFieldW = 1 rangeFieldW = 1 debug = [] for arg in argv[1:]: if ("=" in arg): argVal = arg.split("=",1)[1] if (arg.startswith("--genome=")): genomeFilename = argVal elif (arg.startswith("--reads=")) or (arg.startswith("--read=")): readsFilename = argVal elif (arg.startswith("--cigars=")) or (arg.startswith("--cigar=")): cigarFilename = argVal elif (arg.startswith("--intervals=")) or (arg.startswith("--interval=")): if (intervalsFilename != None): usage("--intervals and --catalog are mutually exclusive") intervalsFilename = argVal intervalsAreCatalog = False elif (arg.startswith("--catalog=")): if (intervalsFilename != None): usage("--intervals and --catalog are mutually exclusive") intervalsFilename = argVal intervalsAreCatalog = True elif (arg.startswith("--motif=")): if (motifs == None): motifs = set() motifs.add(argVal) elif (arg.startswith("--chromosome=")) or (arg.startswith("--chromosomes=")) \ or (arg.startswith("--chrom=")) or (arg.startswith("--chroms=")): if (chromsOfInterest == None): chromsOfInterest = set() for chrom in argVal.split(","): chromsOfInterest.add(chrom) elif (arg.startswith("--minlength=")) or (arg.startswith("--minlen=")): try: minLength = int(argVal) if (minLength < 0): raise ValueError if (minLength == 0): minLength = None except ValueError: usage("bad length in \"%s\"" % arg) elif (arg == "--noisygenome"): noiselessGenome = False elif (arg.startswith("--progress=")): reportProgress = int_with_unit(argVal) elif (arg.startswith("--fields=")) or (arg.startswith("F=")): (nameFieldW,lengthFieldW,countFieldW,rangeFieldW) = argVal.split(",",4) nameFieldW = max(int(nameFieldW),1) lengthFieldW 
= max(int(lengthFieldW),1) countFieldW = max(int(countFieldW),1) rangeFieldW = max(int(rangeFieldW),1) elif (arg.startswith("--namefield=")) or (arg.startswith("F1=")): nameFieldW = max(int(argVal),1) elif (arg.startswith("--lengthfield=")) or (arg.startswith("F2=")): lengthFieldW = max(int(argVal),1) elif (arg.startswith("--countfield=")) or (arg.startswith("F3=")): countFieldW = max(int(argVal),1) elif (arg.startswith("--intervalfield=")) or (arg.startswith("F4=")): rangeFieldW = max(int(argVal),1) elif (arg == "--debug"): debug += ["debug"] elif (arg.startswith("--debug=")): debug += argVal.split(",") elif (arg.startswith("--")): usage("unrecognized option: %s" % arg) else: usage("unrecognized option: %s" % arg) if (genomeFilename == None): usage("you need to give me a genome file") if (readsFilename == None): usage("you need to give me a reads file") if (cigarFilename == None): usage("you need to give me a cigar strings file") if (motifs != None) and (not intervalsAreCatalog): usage("--motifs requires --catalog") # read the intervals # # nota bene: this can modify chromsOfInterest, restricting it to the # chromosomes in the intervals list chromToIntervals = None motifsSeen = set() if (intervalsFilename != None): chromToIntervals = {} if (intervalsFilename.endswith(".gz")) or (intervalsFilename.endswith(".gzip")): intervalsF = gzip_open(intervalsFilename,"rt") else: intervalsF = file(intervalsFilename,"rt") for (lineNumber,chrom,gStart,gEnd,tags) in read_intervals(intervalsF): if (chromsOfInterest != None) and (chrom not in chromsOfInterest): continue if (chrom not in chromToIntervals): chromToIntervals[chrom] = [] if (intervalsAreCatalog): if (tags == None): exit("%s: not enough fields at line %d (%d, expected at least %d)" % (os_path.basename(argv[0]),lineNumber,len(fields),4)) (motif,strand) = (tags[0][:-1],tags[0][-1:]) if ("." 
in motif): motif = motif[:motif.find(".")] if (strand not in ["+","-"]) or (not is_nucleotide_string(motif)): exit("%s: bad motif at line %d: \"%s\"" % (os_path.basename(argv[0]),lineNumber,tags[0])) if (motifs != None): if (motif not in motifs): continue motifsSeen.add(motif) else: motif = strand = None chromToIntervals[chrom] += [(gStart,gEnd,motif,strand)] intervalsF.close() for chrom in chromToIntervals: chromToIntervals[chrom].sort() if (chromsOfInterest == None): chromsOfInterest = set(chromToIntervals) else: for chrom in chromsOfInterest: if (chrom not in chromToIntervals): chromsOfInterest.remove(chrom) if (motifs != None): for motif in motifs: if (motif not in motifsSeen): print >>stderr, "WARNING \"%s\" was not seen in %s" \ % (motif,intervalsFilename) # read the genome chromToSequence = {} if (genomeFilename.endswith(".gz")) or (genomeFilename.endswith(".gzip")): genomeF = gzip_open(genomeFilename,"rt") else: genomeF = file(genomeFilename,"rt") for (chrom,seq) in read_fasta_sequences(genomeF,chromsOfInterest): if (chrom in chromToSequence): exit("%s: \"%s\" appears more than once in \"%s\"" % (os_path.basename(argv[0]),chrom,genomeFilename)) chromToSequence[chrom] = seq genomeF.close() if (chromsOfInterest != None): for chrom in chromsOfInterest: if (chrom not in chromToSequence): exit("%s: \"%s\" doesn't appear in \"%s\"" % (os_path.basename(argv[0]),chrom,genomeFilename)) # read the cigar strings if (cigarFilename.endswith(".gz")) or (cigarFilename.endswith(".gzip")): cigarF = gzip_open(cigarFilename,"rt") else: cigarF = file(cigarFilename,"rt") readNameToCigar = {} for (lineNumber,line,readName,chrom,strand,gStart,gEnd,cigar) in read_cigars(cigarF): if (chromsOfInterest != None) and (chrom not in chromsOfInterest): continue (rLength,gLength) = cigar_lengths(cigar) readNameToCigar[readName] = (chrom,gStart,gEnd,gLength,strand,rLength,cigar) if (gLength != gEnd-gStart): exit("%s: bad cigar line (at line %d); cigar doesn't match interval length (%d vs 
%d)\n%s" % (os_path.basename(argv[0]),lineNumber,gLength,gEnd-gStart,line)) cigarF.close() # process the reads if (readsFilename.endswith(".gz")) or (readsFilename.endswith(".gzip")): readsF = gzip_open(readsFilename,"rt") else: readsF = file(readsFilename,"rt") readNum = alignmentsReported = 0 for (readName,rNucs) in read_fasta_sequences(readsF): readNum += 1 if (reportProgress != None) \ and ((readNum == 1) or (readNum % reportProgress == 0)): print >>stderr, "progress: processing read #%s %s (%s alignments reported so far)" \ % (commatize(readNum),readName,commatize(alignmentsReported)) if (readName not in readNameToCigar): exit("%s: \"%s\" doesn't appear in \"%s\"" % (os_path.basename(argv[0]),readNameToCigar,cigarFilename)) (chrom,gStart,gEnd,gLength,strand,rLength,cigar) = readNameToCigar[readName] gNucs = chromToSequence[chrom][gStart:gEnd] if (strand == "-"): gNucs = reverse_complement(gNucs) a = Alignment() a.readName = readName a.rStart = 0 a.rEnd = rLength a.rLength = rLength a.rNucs = rNucs a.chrom = chrom a.strand = strand a.gStart = gStart a.gEnd = gEnd a.gNucs = gNucs a.score = 0 a.motif = "%s:%d-%d%s" % (chrom,a.gStart,a.gEnd,strand) (a.rText,a.gText) = reconstruct_alignment(rNucs,gNucs,cigar) if (chromToIntervals == None): if (minLength != None) and (a.gEnd-a.gStart < minLength): continue print_alignment(a) alignmentsReported += 1 else: intervals = chromToIntervals[chrom] for (s,e,motif,mStrand) in intersecting_intervals(intervals,gStart,gEnd): aSliced = slice_alignment(a,s,e) if (minLength != None) and (aSliced.gEnd-aSliced.gStart < minLength): continue print_alignment(aSliced) alignmentsReported += 1 if ("intervalsanity" in debug): rText = remove_gaps(aSliced.rText) realText = rNucs[aSliced.rStart:aSliced.rEnd] if (realText != rText): exit("%s: sanity check failed for read:\n\"%s\"\n\"%s\"" % (os_path.basename(argv[0]),rText,realText)) gText = remove_gaps(aSliced.gText).upper() realText = chromToSequence[chrom][aSliced.gStart:aSliced.gEnd] if 
(strand == "-"): realText = reverse_complement(realText) if (realText != gText): exit("%s: sanity check failed for genome:\n\"%s\"\n\"%s\"" % (os_path.basename(argv[0]),gText,realText)) print >>stderr, "%s: sanity check passed for read %s" \ % (os_path.basename(argv[0]),readName) if (motif != None): positionalStats = positonal_stats(aSliced,motif,mStrand, noiselessGenome=noiselessGenome) print_positonal_stats(positionalStats) readsF.close() print "# ncrf end-of-file" if (reportProgress != None): print >>stderr, "progress: %s reads processed (%s alignments reported)" \ % (commatize(readNum),commatize(alignmentsReported))
def main(): global debug # parse the command line distributionFilename = None remainderFilename = None wrapLength = 100 reportProgress = None debug = [] for arg in argv[1:]: if ("=" in arg): argVal = arg.split("=",1)[1] if (arg.startswith("--remainder=")): remainderFilename = argVal elif (arg.startswith("--wrap=")): wrapLength = int(argVal) if (wrapLength <= 0): wrapLength = None elif (arg.startswith("--seed=")): random_seed(argVal) elif (arg.startswith("--progress=")): reportProgress = int_with_unit(argVal) elif (arg == "--debug"): debug += ["debug"] elif (arg.startswith("--debug=")): debug += argVal.split(",") elif (arg.startswith("--")): usage("unrecognized option: %s" % arg) elif (distributionFilename == None): distributionFilename = arg else: usage("unrecognized option: %s" % arg) if (distributionFilename == None): usage("you must provide a length-distribution filename") # read the distribution intervals = IntervalDict() distribF = file(distributionFilename,"rt") for spec in read_distribution_spec(distribF,distributionFilename): (lineNumber,minLength,maxLength,outCount,inCount) = spec interval = intervals.add(minLength,maxLength) if (interval == None): # interval overlaps an existing interval interval = Interval(minLength,maxLength) previous = intervals.overlapper(minLength,maxLength) assert (False), \ "%s (line %d) overlaps %s (line %d)" \ % (interval,lineNumber,previous,previous.lineNumber) interval.lineNumber = lineNumber interval.outCount = outCount interval.inCount = inCount distribF.close () if ("distribution" in debug): for interval in intervals: print >>stderr, "%s %d %d" \ % (interval,interval.outCount,interval.inCount) # process the reads # # this filters reads based on the length (on the interval containing the # length); if we expect to see E more sequences of this length (including # this one), and we are to output N of those, we output this sequence with # probability N/E; and we adjust N and E for this length accordingly inputCount = outputCount 
= inputBp = outputBp = 0 for (name,seq) in read_fasta_sequences(stdin): seqLen = len(seq) inputCount += 1 inputBp += seqLen if (reportProgress != None): if (inputCount % reportProgress == 0): print >>stderr, "%s sequences read, %s written (%.1f%%); %s nts read, %s written" \ % (commatize(inputCount),commatize(outputCount), 100.0*outputCount/inputCount, commatize(inputBp),commatize(outputBp)) try: interval = intervals[seqLen] except KeyError: continue if (interval.inCount <= 0): print >>stderr, "ERROR: for length %d (%s), actual input exceeded expected input count" \ % (seqLen,interval) if (remainderFilename != None): print >>stderr, " (writing remainders to %s)" % remainderFilename remainderF = file(remainderFilename,"wt") write_remainders(remainderF,intervals) remainderF.close () assert (False) if (interval.outCount == 0): keepSeq = False else: keepSeq = (randint(1,interval.inCount) <= interval.outCount) interval.inCount -= 1 if (not keepSeq): continue interval.outCount -= 1 outputCount += 1 outputBp += seqLen print ">%s" % name if (wrapLength == None): print seq else: for i in range(0,seqLen,wrapLength): print seq[i:i+wrapLength] # write the remainders if (remainderFilename != None): remainderF = file(remainderFilename,"wt") write_remainders(remainderF,intervals) remainderF.close ()
def main(): global warnOnError # parse the command line minMapQ = None writeHeader = False writeWhat = "per alignment" warnOnError = False headLimit = None reportProgress = None for arg in argv[1:]: if ("=" in arg): argVal = arg.split("=", 1)[1] if (arg.startswith("--mapq=")) or (arg.startswith("--MAPQ=")) or ( arg.startswith("MAPQ=")): minMapQ = int(argVal) elif (arg in ["--withheader", "--with=header", "--with:header"]): writeHeader = True elif (arg in ["--sumonly", "--sum=only", "--sum:only"]): writeWhat = "sum only" elif (arg == "--warnandcontinue"): warnOnError = True elif (arg.startswith("--head=")): headLimit = int_with_unit(argVal) elif (arg.startswith("--progress=")): reportProgress = int_with_unit(argVal) elif (arg.startswith("--")): usage("unrecognized option: %s" % arg) else: usage("unrecognized option: %s" % arg) # process the sam records sum = {"m": 0, "mm": 0, "io": 0, "ix": 0, "do": 0, "dx": 0} recordNum = alignmentNum = 0 for a in read_sam_plain(stdin, minMapQ=minMapQ): recordNum += 1 if (reportProgress != None) and (recordNum % reportProgress == 0): sum["events"] = (sum["m"] + sum["mm"] + sum["io"] + sum["ix"] + sum["do"] + sum["dx"]) mRatio = float(sum["m"]) / sum["events"] vec = [ mRatio, sum["m"], sum["mm"], sum["io"], sum["ix"], sum["do"], sum["dx"] ] print >>stderr, "progress: processing sam record %s (mRatio=%.3f m=%d mm=%d io=%d ix=%d do=%d dx=%d)" \ % (commatize(recordNum), mRatio,sum["m"],sum["mm"],sum["io"],sum["ix"],sum["do"],sum["dx"]) if (headLimit != None) and (recordNum > headLimit): print >> stderr, "limit of %s sam records reached" % commatize( headLimit) break if (a.rName == "*"): continue # read did not align if (minMapQ != None) and (a.mapQ < minMapQ): continue alignmentNum += 1 events = sam_to_events(a) if (type(events) == str): print >> stderr, events continue (nMatch, nMismatch, nInsO, nInsX, nDelO, nDelX) = events if (writeHeader): print "\t".join( ["line", "read", "mRatio", "m", "mm", "io", "ix", "do", "dx"]) writeHeader = 
False if (writeWhat == "per alignment"): mRatio = float(nMatch) / (nMatch + nMismatch + nInsO + nInsX + nDelO + nDelX) mRatio = "%.3f" % mRatio vec = [ a.lineNumber, a.qName, mRatio, nMatch, nMismatch, nInsO, nInsX, nDelO, nDelX ] print "\t".join(map(str, vec)) sum["m"] += nMatch sum["mm"] += nMismatch sum["io"] += nInsO sum["ix"] += nInsX sum["do"] += nDelO sum["dx"] += nDelX sum["events"] = (sum["m"] + sum["mm"] + sum["io"] + sum["ix"] + sum["do"] + sum["dx"]) if (alignmentNum == 0): print >> stderr, "WARNING: input contained no alignments" elif (writeWhat == "sum only"): alignmentNumStr = "(%d)" % alignmentNum mRatio = float(sum["m"]) / sum["events"] mRatio = "%.3f" % mRatio vec = [ "all", alignmentNumStr, mRatio, sum["m"], sum["mm"], sum["io"], sum["ix"], sum["do"], sum["dx"] ] print "\t".join(map(str, vec))
def sliced_consensus_filter(f, motifsOfInterest, nameToMotif, sliceWidth, sliceStep): global userHasBeenWarned if (reportMsa) and (not userHasBeenWarned): print >> stderr, "WARNING: sliced consensus doesn't report MSA, ignoring that request" userHasBeenWarned = True alignmentNum = 0 alignmentsWritten = 0 for a in alignments(f, requireEof): alignmentNum += 1 if (reportProgress != None): if (alignmentNum == 1) or (alignmentNum % reportProgress == 0): print >>stderr, "progress: testing alignment %s" \ % commatize(alignmentNum) if (headLimit != None) and (alignmentNum > headLimit): print >> stderr, "limit of %d alignments reached" % headLimit break if (a.motif in nameToMotif): a.motif = nameToMotif[a.motif] if (motifsOfInterest != None) and (a.motif not in motifsOfInterest): continue if ([ch for ch in a.motif if (ch not in "ACGT")] != []): abort_warn_about_named_motifs(a) motifText = a.motifText seqText = a.seqText if ("noflip" in debug): pass elif (a.strand == "-") and (a.start < a.end): # alignment was reported in reverse complement of motif, so flip it motifText = reverse_complement(motifText) seqText = reverse_complement(seqText) # look for consensus over each slice, separately consensuses = set() numSlices = (len(motifText) + sliceStep - 1) / sliceStep # (an overestimate) minSlice = 10 * len(a.motif) for sliceNum in xrange(numSlices): sliceStart = sliceNum * sliceStep sliceEnd = min(sliceStart + sliceWidth, len(motifText)) if (sliceEnd - sliceStart < minSlice): break motifTextSlice = motifText[sliceStart:sliceEnd] seqTextSlice = seqText[sliceStart:sliceEnd] # derive consensus(es) seqChunks = chunkify(a.motif, motifTextSlice, seqTextSlice) if ("consensus" in debug): print >> stderr print >>stderr, "%d score=%d slice.start=%d slice.end=%d" \ % (a.lineNumber,a.score,sliceStart,sliceEnd) sliceConsensuses = derive_consensuses( seqChunks, winnerThreshold=winnerThreshold) sliceConsensuses = list(sliceConsensuses) if (sliceConsensuses == []): consensuses.add(None) else: 
for word in sliceConsensuses: consensuses.add(word) if ("consensus" in debug): for word in sliceConsensuses: print >> stderr, "consensus %s" % word consensuses = list(consensuses) # discard the alignment if it meets the filtering criterion (if there # is any such criterion) if (filterToKeep == "consensus"): if (a.motif not in consensuses): continue # (discard it) elif (filterToKeep == "non consensus"): if (a.motif in consensuses): continue # (discard it) else: # if (filterToKeep == "no filter"): pass # copy the (unfiltered) alignment to the output if (alignmentsWritten > 0): print alignmentsWritten += 1 print "\n".join(a.lines) # report the consensus, if we're supposed to if (reportConsensus): if (consensuses == []): print "# consensus (none)" else: canonicalized = [] for motif in consensuses: if (motif == None): continue if (motif != a.motif) and (canonicalizeConsensuses): (motif, strand) = canonical_motif(motif) canonicalized += [motif] if (None in consensuses): canonicalized += ["(none)"] print "# consensus %s" % ",".join(canonicalized) if (requireEof): print "# ncrf end-of-file"
def simple_consensus_filter(f, motifsOfInterest, nameToMotif): alignmentNum = 0 alignmentsWritten = 0 for a in alignments(f, requireEof): alignmentNum += 1 if (reportProgress != None): if (alignmentNum == 1) or (alignmentNum % reportProgress == 0): print >>stderr, "progress: testing alignment %s" \ % commatize(alignmentNum) if (headLimit != None) and (alignmentNum > headLimit): print >> stderr, "limit of %d alignments reached" % headLimit break if (a.motif in nameToMotif): a.motif = nameToMotif[a.motif] if (motifsOfInterest != None) and (a.motif not in motifsOfInterest): continue if ([ch for ch in a.motif if (ch not in "ACGT")] != []): abort_warn_about_named_motifs(a) motifText = a.motifText seqText = a.seqText if ("noflip" in debug): pass elif (a.strand == "-") and (a.start < a.end): # alignment was reported in reverse complement of motif, so flip it motifText = reverse_complement(motifText) seqText = reverse_complement(seqText) # derive consensus(es) seqChunks = chunkify(a.motif, motifText, seqText) if ("consensus" in debug): print >> stderr print >> stderr, "%d score=%d" % (a.lineNumber, a.score) consensuses = derive_consensuses(seqChunks, winnerThreshold=winnerThreshold) consensuses = list(consensuses) # discard the alignment if it meets the filtering criterion (if there # is any such criterion) if (filterToKeep == "consensus"): if (a.motif not in consensuses): continue # (discard it) elif (filterToKeep == "non consensus"): if (a.motif in consensuses): continue # (discard it) else: # if (filterToKeep == "no filter"): pass # copy the (unfiltered) alignment to the output if (alignmentsWritten > 0): print alignmentsWritten += 1 print "\n".join(a.lines) # report the consensus, if we're supposed to if (reportConsensus): if (consensuses == []): print "# consensus (none)" else: canonicalized = [] for motif in consensuses: if (motif != a.motif) and (canonicalizeConsensuses): (motif, strand) = canonical_motif(motif) canonicalized += [motif] print "# consensus %s" % 
",".join(canonicalized) # report the MSA from which the consensus was derived, if we're # supposed to if (reportMsa): motifLen = len(a.motif) positionLength = [1] * motifLen for chunk in seqChunks: for (motifIx, seqNucs) in enumerate(chunk): if (seqNucs == None): continue positionLength[motifIx] = max(positionLength[motifIx], len(seqNucs)) line = [] for (motifIx, motifNuc) in enumerate(a.motif): line += [motifNuc.ljust(positionLength[motifIx], ".")] print "# msa.query %s" % "".join(line) for chunk in seqChunks: line = [] for (motifIx, seqNucs) in enumerate(chunk): if (seqNucs == None): line += ["." * positionLength[motifIx]] elif (seqNucs == a.motif[motifIx]): line += ["=" * positionLength[motifIx]] else: line += [seqNucs.ljust(positionLength[motifIx], ".")] print "# msa.seq %s" % "".join(line) if (requireEof): print "# ncrf end-of-file"
def main(): global reportProgress, batchSize global debug # parse the command line testMethod = "min-max" numTrials = 10 * 1000 # (only used for testMethod == "min-max") numNeededToPass = 1 # (only used for testMethod == "min-max") effectSize = 0.3 # (only used for testMethod == "chi-square") power = 0.8 # (only used for testMethod == "chi-square") discardWhich = "bad" testWhich = "matches-insertions" warnOnUntested = False subsampleK = None subsampleN = None headLimit = None batchSize = None # (will be replace by method-specific result) reportAs = "ncrf" requireEof = True prngSeed = defaultPrngSeed reportProgress = None debug = [] for arg in argv[1:]: if ("=" in arg): argVal = arg.split("=", 1)[1] if (arg == "--method=min-max"): testMethod = "min-max" elif (arg.startswith("--trials=")): if ("/" in argVal): (numNeededToPass, numTrials) = map(int_with_unit, argVal.split("/", 1)) if (numTrials < 1): usage("bad value in: %s (trials must be at least 1)" % arg) if (not 1 <= numNeededToPass <= numTrials): usage( "bad value in: %s (num-in-bounds must be in range 1..trials)" % arg) else: (numNeededToPass, numTrials) = (1, int_with_unit(argVal)) if (numTrials < 1): usage("bad value in: %s (trials must be at least 1)" % arg) elif (arg in ["--method=chi-squared", "--method=chi-square"]): # (unadvertised, see [4]) testMethod = "chi-squared" elif (arg.startswith("--effectsize=")): # (unadvertised, see [4]) effectSize = parse_probability(argVal) elif (arg.startswith("--power=")): # (unadvertised, see [4]) power = parse_probability(argVal) elif (arg in ["--discard:bad", "--discard=bad"]): discardWhich = "bad" elif (arg in ["--discard:good", "--discard=good"]): discardWhich = "good" elif (arg in ["--discard:none", "--discard=none"]): discardWhich = "none" elif (arg in [ "--test:matches-insertions", "--test=matches-insertions", "--test:m-i", "--test=m-i" ]): testWhich = "matches-insertions" elif (arg in ["--test:matches", "--test=matches"]): testWhich = "matches" elif (arg in 
["--test:errors", "--test=errors"]): testWhich = "errors" elif (arg == "--warn:untested") or (arg == "--warn=matrix"): warnOnUntested = True elif (arg.startswith("--head=")): headLimit = int_with_unit(argVal) elif (arg.startswith("--subsample=")): (subsampleK, subsampleN) = map(int, argVal.split("/", 2)) if (not 0 < subsampleK <= subsampleN): usage("bad subsample description in %s" % arg) elif (arg.startswith("--progress=")): reportProgress = int_with_unit(argVal) elif (arg.startswith("--batch=") ): # (no longer advertised, since it only applies to R) batchSize = int(argVal) elif (arg == "--report:matrix") or ( arg == "--report=matrix"): # (unadvertised) reportAs = "matrix" elif (arg == "--report:silent") or ( arg == "--report=silent"): # (unadvertised) reportAs = "silent" elif (arg in ["--noendmark", "--noeof", "--nomark"]): # (unadvertised) requireEof = False elif (arg.startswith("--seed=")): seed = argVal if (seed in ["none", "None", "NONE"]): prngSeed = None elif (seed in ["default", "Default", "DEFAULT"]): prngSeed = defaultPrngSeed else: # nota bene: if the seed is a number, use it as a number, since # string seeds can produce different sequences on # different versions/builds of python try: seed = int(seed) except ValueError: try: seed = float(seed) except ValueError: pass prngSeed = seed elif (arg == "--debug"): debug += ["debug"] elif (arg.startswith("--debug=")): debug += argVal.split(",") elif (arg.startswith("--")): usage("unrecognized option: %s" % arg) else: usage("unrecognized option: %s" % arg) if (reportAs in ["matrix", "silent"]): discardWhich = "none" if (testMethod == "chi-squared"): testDescription = "positional chi-squared" if (batchSize == None): batchSize = 30 elif (testMethod == "min-max"): testDescription = "positional min-max" if (batchSize == None): batchSize = 1 else: exit("%s: internal error: unrecognized test method: \"%s\"" % (os_path.basename(argv[0]), testMethod)) # initialize the PRNG, if needed if (testMethod == "min-max"): if 
(prngSeed != None): random_seed(prngSeed) else: if (prngSeed not in [None, defaultPrngSeed]): print >>stderr, "WARNING: ignoring request to use PRNG with \"%s\"" \ % testMethod # make sure the shell commands we're gonna use have been installed if (testMethod == "chi-squared"): if (not shell_command_exists("Rscript")): exit(( "%s: Unable to run the shell command \"Rscript\";" + "\n .. Either R hasn't been installed, or the command-line shell" + " can't find it.") % os_path.basename(argv[0])) # collect the alignments; we need to collect the positional info for all # alignments, to feed to R in batches (doing them one-by-one was incredibly # slow); hopefully this won't become a memory problem (unitLength,alignmentList,mxMatrix) \ = collect_alignments(stdin,testWhich, headLimit=headLimit, subsampleK=subsampleK,subsampleN=subsampleN, requireEof=requireEof) numAlignments = len(alignmentList) if (reportProgress != None): print >>stderr, "progress: read %s alignments" \ % (commatize(numAlignments)) # assess the alignments, batch-by-batch if (reportProgress != None): progressReported = -1 accepted = [] outcomeCount = {True: 0, False: 0, None: 0} for batchStartIx in xrange(0, numAlignments, batchSize): alignmentsTested = batchStartIx if (reportProgress != None): rBlock = (progressReported + 1) / reportProgress aBlock = (alignmentsTested + 1) / reportProgress if (alignmentsTested == 0) or (aBlock != rBlock): print >>stderr, "progress: testing alignment %s (%d uniform, %d non-uniform, %d untested)" \ % (commatize(1+alignmentsTested), outcomeCount[True], outcomeCount[False], outcomeCount[None]) progressReported = alignmentsTested batchEndIx = min(batchStartIx + batchSize, numAlignments) if ("batch" in debug): print >>stderr, "using R for alignments %d thru %d" \ % (batchStartIx+1,batchEndIx) mxBatch = mxMatrix[batchStartIx:batchEndIx] aBatch = alignmentList[batchStartIx:batchEndIx] if (testMethod == "chi-squared"): batchResult = mx_significance_tests(mxBatch, testWhich, 
effectSize, power) if (type(batchResult) == str): exit(("%s: internal error: having trouble with R" + " (with alignment batch %d..%d)" + "\nHere's what R reported:\n%s") % (os_path.basename( argv[0]), batchStartIx, batchEndIx, batchResult)) else: # if (testMethod == "min-max"): batchResult = min_max_tests(aBatch, mxBatch, batchStartIx, testWhich, numTrials, numNeededToPass) if (type(batchResult) == str): exit(("%s: internal error: having trouble with min-max test" + " (with alignment batch %d..%d)" + "\nHere's what was reported:\n%s") % (os_path.basename( argv[0]), batchStartIx, batchEndIx, batchResult)) if (len(batchResult) != batchEndIx - batchStartIx): exit(( "%s: internal error: number of test outcomes reported by R (%d)" + "\n .. doesn't match the number of tests given to R (%d)") % (os_path.basename( argv[0]), len(batchResult), batchEndIx - batchStartIx)) accepted += batchResult if (warnOnUntested): for alignmentNum in xrange(batchStartIx, batchEndIx): testOutcome = accepted[alignmentNum] if (testOutcome == None): print >>stderr, "WARNING: alignment number %d (at line %d) could not be tested" \ % (alignmentNum,1+alignmentList[alignmentNum].lineNumber) for alignmentNum in xrange(batchStartIx, batchEndIx): testOutcome = accepted[alignmentNum] outcomeCount[testOutcome] += 1 # process the alignments and their assessments # $$$ untested alignments should be processed by some other test -- for # example (if we're testing by error counts), a perfect alignment # currently gets discarded because it can't be tested if (reportAs in ["matrix", "silent"]): outcomeMapping = { True: "not_rejected", False: "rejected", None: "untested" } else: # if (reportAs == "ncrf"): if (testWhich == "matches-insertions"): outcomeMapping = { True: "match-insert uniformity not rejected", False: "match-insert uniformity rejected", None: "untested" } elif (testWhich == "errors"): outcomeMapping = { True: "error uniformity not rejected", False: "error uniformity rejected", None: "untested" } 
else: # if (testWhich == "matches"): outcomeMapping = { True: "match uniformity not rejected", False: "match uniformity rejected", None: "untested" } outcomeNameW = max( [len(outcomeMapping[testOutcome]) for testOutcome in outcomeMapping]) for testOutcome in [True, False, None]: outcomeName = outcomeMapping[testOutcome] count = outcomeCount[testOutcome] reportStr = "%-*s %d" % (outcomeNameW + 1, "%s:" % outcomeName, count) if (numAlignments > 0): reportStr += " (%.2f%%)" % (100.0 * count / numAlignments) print >> stderr, reportStr if (reportAs == "matrix"): # see note [3] above for the format of the matrix file for (alignmentNum, a) in enumerate(alignmentList): testOutcome = accepted[alignmentNum] vec = [a.lineNumber, outcomeMapping[testOutcome] ] + mxMatrix[alignmentNum] print "\t".join(map(str, vec)) elif (reportAs == "silent"): pass else: # if (reportAs == "ncrf"): numKept = 0 isFirst = True for (alignmentNum, a) in enumerate(alignmentList): testOutcome = accepted[alignmentNum] if (discardWhich == "good"): if (testOutcome == True): continue elif (discardWhich == "bad"): if (testOutcome != True): continue if (discardWhich == "none"): testInfo = "# %s: %s" % (testDescription, outcomeMapping[testOutcome]) (startIx, endIx) = a.positional_stats_indexes() a.lines.insert(endIx, testInfo) if (isFirst): isFirst = False else: print print a numKept += 1 reportStr = "kept %d of %d alignments" % (numKept, numAlignments) if (numAlignments > 0): reportStr += ", %.2f%%" % (100.0 * numKept / numAlignments) print >> stderr, reportStr if (requireEof): print "# ncrf end-of-file"