def main():
    """Print one tab-separated row of positional match/mismatch counts per alignment.

    Alignments are read from stdin via alignments().  For each one the row
    is: the alignment's line number, then the "m" value of every position,
    then the "x" value of every position (taken from a.positional_stats()).

    Command line:
      --head=<n>    stop after <n> alignments (parsed by int_with_unit)
      --noendmark, --noeof, --nomark
                    (unadvertised) passed to alignments() as requireEof=False

    Raises:
      ValueError: if a position's stats lack an "m" or an "x" entry.
    """

    # parse the command line

    head_limit = None
    require_eof = True

    for arg in argv[1:]:
        # the value is only consulted by options that contain "="
        value = arg.split("=", 1)[1] if ("=" in arg) else None

        if arg.startswith("--head="):
            head_limit = int_with_unit(value)
        elif arg in ("--noendmark", "--noeof", "--nomark"):  # (unadvertised)
            require_eof = False
        else:
            usage("unrecognized option: %s" % arg)

    # process the alignments

    alignment_num = 0
    for a in alignments(stdin, require_eof):
        alignment_num += 1
        if (head_limit is not None) and (alignment_num > head_limit):
            stderr.write("limit of %d alignments reached" % head_limit + "\n")
            break

        line_number = a.lineNumber
        match_column = []
        mismatch_column = []
        for stats in a.positional_stats():
            if "m" not in stats:
                raise ValueError(
                    "\"m\" missing from positional information for alignment at line %d"
                    % line_number)
            if "x" not in stats:
                raise ValueError(
                    "\"x\" missing from positional information for alignment at line %d"
                    % line_number)
            match_column.append(stats["m"])
            mismatch_column.append(stats["x"])

        # layout: line number, all "m" values, then all "x" values
        row = [line_number] + match_column + mismatch_column
        print("\t".join(map(str, row)))
def main():
    """Print the distribution of sequence lengths found in a fasta stream.

    Lengths come from read_fasta_lengths(stdin).  The output is one
    "<length>\\t<count>" line per distinct length, in increasing length
    order.

    Command line:
      --progress=<n>   every <n> sequences, report running totals to stderr
    """

    # parse the command line

    report_progress = None

    for arg in argv[1:]:
        value = arg.split("=", 1)[1] if ("=" in arg) else None

        if arg.startswith("--progress="):
            report_progress = int_with_unit(value)
        else:
            usage("unrecognized option: %s" % arg)

    # process the fasta sequences

    length_to_count = {}
    input_count = 0
    input_bp = 0

    for seq_len in read_fasta_lengths(stdin):
        input_count += 1
        input_bp += seq_len

        if (report_progress is not None) and (input_count % report_progress == 0):
            average = int(round(float(input_bp) / input_count))
            stderr.write("%s sequences read (%s nts, avg=%s)"
                         % (commatize(input_count), commatize(input_bp),
                            commatize(average))
                         + "\n")

        length_to_count[seq_len] = length_to_count.get(seq_len, 0) + 1

    # report the distribution

    rows = ["%d\t%d" % (length, length_to_count[length])
            for length in sorted(length_to_count)]
    print("\n".join(rows))
def main():
    """Echo each alignment from stdin, followed by a tally of its aligned words.

    For every alignment the original lines are printed, then a
    "# aligned words" line listing word:count pairs, most frequent first.
    Words are the sequence chunks returned by chunkify() with gap characters
    ("-") removed; any word that differs from the motif is lower-cased.
    Only words whose count is at least (motif count * ratio) are listed.

    Command line:
      --minwordratio=<r>, --ratio=<r>, R=<r>   threshold ratio (default 1)
      --head=<n>                               stop after <n> alignments
      --noendmark, --noeof, --nomark           (unadvertised) don't require
                                               nor write the end-of-file mark
      --debug, --debug=<a,b,...>               debug flags ("noflip",
                                               "chunks" are consulted here)
    """
    global debug

    # parse the command line

    count_ratio = 1
    head_limit = None
    require_eof = True
    debug = []

    for arg in argv[1:]:
        value = arg.split("=", 1)[1] if ("=" in arg) else None

        if arg.startswith(("--minwordratio=", "--ratio=", "R=")):
            count_ratio = float_or_fraction(value)
        elif arg.startswith("--head="):
            head_limit = int_with_unit(value)
        elif arg in ("--noendmark", "--noeof", "--nomark"):  # (unadvertised)
            require_eof = False
        elif arg == "--debug":
            debug += ["debug"]
        elif arg.startswith("--debug="):
            debug += value.split(",")
        else:
            usage("unrecognized option: %s" % arg)

    # process the alignments

    alignment_num = 0
    for a in alignments(stdin, require_eof):
        alignment_num += 1
        if (head_limit is not None) and (alignment_num > head_limit):
            stderr.write("limit of %d alignments reached" % head_limit + "\n")
            break

        # blank line between consecutive alignments
        if alignment_num > 1:
            print("")
        print("\n".join(a.lines))

        motif_text = a.motifText
        seq_text = a.seqText
        if ("noflip" not in debug) and (a.strand == "-") and (a.start < a.end):
            # alignment was reported in reverse complement of motif, so flip it
            motif_text = reverse_complement(motif_text)
            seq_text = reverse_complement(seq_text)

        (motif_chunks, seq_chunks) = chunkify(a.motif, motif_text, seq_text)

        tally = Counter()
        for chunk in seq_chunks:
            word = chunk.replace("-", "")
            if word != a.motif:
                word = word.lower()
            tally[word] += 1

        # a Counter reports 0 for a word it has never seen
        motif_count = tally[a.motif]

        # rank by count, then by length difference from the motif, then by
        # the word itself -- all descending
        ranked = sorted(
            [(tally[word], abs(len(word) - len(a.motif)), word)
             for word in tally
             if tally[word] >= motif_count * count_ratio],
            reverse=True)

        print("# aligned words %s"
              % " ".join(["%s:%d" % (word, count) for (count, _, word) in ranked]))

        if "chunks" in debug:
            # NOTE(review): chunks are reverse-complemented for display only
            # when "noflip" is set, i.e. when no flip was applied above --
            # confirm this is the intended condition
            if "noflip" in debug:
                seq_chunks = [reverse_complement(word) for word in seq_chunks[::-1]]
                motif_chunks = [reverse_complement(word) for word in motif_chunks[::-1]]
            print("# words: %s" % " ".join(seq_chunks))
            print("# motif: %s" % " ".join(motif_chunks))

    if require_eof:
        print("# ncrf end-of-file")
def main():
    """Generate a mock sequence with tandem repeats embedded in fill DNA.

    The sequence is written to stdout (optionally with a fasta header and
    line wrapping); a catalog of the embedded repeats can be written to a
    file.  Repeat lengths come from an arrays file, a lengths file, or
    stdin.

    Command line (values parsed by the helpers named in the code):
      <motif>                      motif to embed (any nucleotide string)
      --arrays=<file>              explicit (length, motif) arrays; excludes
                                   <motif>, --repeats, --lengths and the
                                   --motif:* options
      --name=<name>                fasta header name
      --length=<bp>, --len=, L=    sequence length; "<x>%" is relative to the
                                   total repeat length, "+<bp>" adds to it
      --repeats=<n>, N=<n>         number of repeats to embed (default 1)
      --motif:neighbor=<p>         probability of replacing a motif by a
                                   neighbor (motif_neighbor)
      --motif:mixture=<p>          probability of mixing a motif with a
                                   neighbor
      --lengths=<file>             repeat lengths file; may contain "{motif}"
      --minfill=<bp>, F=<bp>       minimum fill between repeats
      --errors=<spec>              named error profile, "mm:i:d"-style spec
                                   (parse_error_spec), or one probability
      --catalog=<file>             where to write the repeat catalog
      --wrap=<n>                   output line width (<= 0 means no wrapping)
      --seed=<seed>                PRNG seed

    Fixes relative to the previous version:
      * the error profile is now taken from what --errors parsed; it used to
        be re-derived from the stale parser variable argVal, which raised
        NameError when no argument contained "=" and skipped the errors when
        a named profile was the last "=" argument
      * --repeats now really defaults to 1 when --arrays is not used; it
        used to stay None, which broke the embedding loop
    """

    # parse the command line

    arraysFilename = None
    motifs = []
    sequenceName = None
    sequenceLen = 0
    numRepeats = None
    genNeighbors = 0.0
    genMixture = 0.0
    lengthsFilename = None
    minFill = None
    errorProfile = None
    catalogFilename = None
    wrapLength = 100

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg.startswith("--arrays=")):
            arraysFilename = argVal
        elif (arg.startswith("--name=")):
            sequenceName = argVal
        elif (arg.startswith("--length=")) or (arg.startswith("--len=")) or (arg.startswith("L=")):
            # relative forms are kept as an (op, value) tuple and resolved
            # once the total repeat length is known
            if (argVal.endswith("%")):
                sequenceLen = float(argVal[:-1]) / 100.0
                assert (sequenceLen >= 1.0)
                sequenceLen = ("%", sequenceLen)
            elif (argVal.startswith("+")):
                sequenceLen = int_with_unit(argVal[1:])
                assert (sequenceLen >= 0)
                sequenceLen = ("+", sequenceLen)
            else:
                sequenceLen = int_with_unit(argVal)
                assert (sequenceLen >= 0)
        elif (arg.startswith("--repeats=")) or (arg.startswith("N=")):
            numRepeats = int_with_unit(argVal)
            assert (numRepeats > 0)
        elif (arg.startswith("--motif:neighbor=")):
            genNeighbors = parse_probability(argVal)
        elif (arg.startswith("--motif:mixture=")):
            genMixture = parse_probability(argVal)
        elif (arg.startswith("--lengths=")):
            lengthsFilename = argVal
        elif (arg.startswith("--minfill=")) or (arg.startswith("F=")):
            minFill = int(argVal)
            if (minFill < 0):
                stderr.write("WARNING: \"%s\" interpreted as no minimum fill" % argVal + "\n")
                minFill = None
            if (minFill == 0):
                minFill = None
        elif (arg.startswith("--errors=")):
            errorProfile = None
            if (argVal in ["pacbio", "pacbio.v3", "pacbio.GIAB", "pacbio.giab"]):
                errorProfile = errorProfilePacbioV3
            elif (argVal == "pacbio.v2"):  # for historical reasons, v2 is an alias for v3
                errorProfile = errorProfilePacbioV3
            elif (argVal in ["pacbio.v1", "pacbio.Guiblet", "pacbio.guiblet"]):
                errorProfile = errorProfilePacbioV1
            elif (argVal in ["pacbio.readsim"]):
                errorProfile = errorProfilePacbioReadsim
            elif (argVal in ["nanopore", "nanopore.v3", "nanopore.GIAB", "nanopore.giab"]):
                errorProfile = errorProfileNanoporeV3
            elif (argVal == "nanopore.v2"):  # for historical reasons, v2 is an alias for v3
                errorProfile = errorProfileNanoporeV3
            elif (argVal in ["nanopore.v1", "nanopore.Jain", "nanopore.jain"]):
                errorProfile = errorProfileNanoporeV1
            elif (argVal in ["nanopore.readsim"]):
                errorProfile = errorProfileNanoporeReadSim
            elif (":" in argVal):
                try:
                    errorProfile = parse_error_spec(argVal)
                except ValueError:
                    pass  # reported just below as an invalid spec
            else:
                p = parse_probability(argVal)
                errorProfile = {"mm": p, "i": p, "d": p}
            if (errorProfile is None):
                usage("\"%s\" is not a valid error spec" % argVal)
            # not used further in this function; the lookups fail fast if a
            # profile lacks one of the three expected keys
            subProb = errorProfile["mm"]
            insOpenProb = errorProfile["i"]
            delOpenProb = errorProfile["d"]
            insExtendProb = delExtendProb = 0.0
        elif (arg.startswith("--catalog=")):
            catalogFilename = argVal
        elif (arg.startswith("--wrap=")):
            wrapLength = int(argVal)
            if (wrapLength <= 0):
                wrapLength = None
        elif (arg.startswith("--seed=")):
            # nota bene: if the seed is a number, use it as a number, since
            #            string seeds can produce different sequences on
            #            different versions/builds of python
            seed = argVal
            try:
                seed = int(seed)
            except ValueError:
                try:
                    seed = float(seed)
                except ValueError:
                    pass
            random_seed(seed)
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        elif (is_nucleotide_string(arg)):
            motifs += [arg.upper()]
        else:
            usage("unrecognized option: %s" % arg)

    if (arraysFilename is not None):
        if (motifs != []):
            usage("command line <motif>s cannot be used with --arrays")
        if (numRepeats is not None):
            usage("--repeats cannot be used with --arrays")
        if (lengthsFilename is not None):
            usage("--lengths cannot be used with --arrays")
        if (genNeighbors != 0.0):
            usage("--motif:neighbor cannot be used with --arrays")
        if (genMixture != 0.0):
            usage("--motif:mixture cannot be used with --arrays")
    elif (motifs == []):
        usage("you have to give me at least one motif")

    # without --arrays the repeat count defaults to one (with --arrays the
    # count is derived from the arrays file below)
    if (numRepeats is None) and (arraysFilename is None):
        numRepeats = 1

    # read the arrays file, if we have one

    repeatLengths = {}
    haveSpecificArrays = False

    if (arraysFilename is not None):
        haveSpecificArrays = True
        numRepeats = 0
        f = open(arraysFilename, "rt")
        try:
            for (length, motif, _) in read_arrays(f, arraysFilename):
                numRepeats += 1
                if (motif not in repeatLengths):
                    motifs += [motif]
                    repeatLengths[motif] = [length]
                else:
                    repeatLengths[motif] += [length]
        finally:
            f.close()

        if (motifs == []):
            usage("array file \"%s\" contains no arrays" % arraysFilename)

    # read the lengths file

    if (repeatLengths == {}):
        if (lengthsFilename is None):
            lengths = read_integers(stdin)
            for motif in motifs:
                repeatLengths[motif] = lengths
        elif ("{motif}" not in lengthsFilename):
            f = open(lengthsFilename, "rt")
            try:
                lengths = read_integers(f, lengthsFilename)
            finally:
                f.close()
            for motif in motifs:
                repeatLengths[motif] = lengths
        else:
            # one lengths file per motif
            for motif in motifs:
                motifLengthsFilename = lengthsFilename.replace("{motif}", motif)
                f = open(motifLengthsFilename, "rt")
                try:
                    lengths = read_integers(f, motifLengthsFilename)
                finally:
                    f.close()
                repeatLengths[motif] = lengths

    # generate the number and type of motifs we'll embed
    #
    # note: to satisfy the requirement that the same seed generates the same
    #       pre-error sequence, we should have no variance in the use of the
    #       PRNG until after we've generated that sequence; see "point A"
    #       below.  The order of the PRNG calls here must not be changed.

    embeddings = []
    if (haveSpecificArrays):
        for motif in motifs:
            for length in repeatLengths[motif]:
                strand = choice(["+", "-"])
                offset = choice(xrange(len(motif)))
                embeddings += [(1.0, motif, motif, strand, offset, length)]
        shuffle(embeddings)
    else:
        for _ in xrange(numRepeats):
            motif = choice(motifs)
            length = choice(repeatLengths[motif])
            u = unit_random()
            if (genNeighbors > 0) and (u < genNeighbors):
                motif = motif_neighbor(motif)
                (mix, motif2) = (1.0, motif)
            elif (genMixture > 0) and (u < genNeighbors + genMixture):
                (mix, motif2) = (0.5, motif_neighbor(motif))
            else:
                (mix, motif2) = (1.0, motif)
            strand = choice(["+", "-"])
            offset = choice(xrange(len(motif)))
            embeddings += [(mix, motif, motif2, strand, offset, length)]

    totalRepeatBp = sum([length for (_, _, _, _, _, length) in embeddings])

    # assign each repeat a position within the "fill" sequence; note that we
    # might have more than one repeat assigned to the same position, in which
    # case they will be back-to-back with no fill between them

    if (type(sequenceLen) == tuple):
        (op, sequenceLen) = sequenceLen
        if (op == "%"):
            sequenceLen = int(round(totalRepeatBp * sequenceLen))
        else:  # if (op == "+"):
            sequenceLen = totalRepeatBp + sequenceLen

    if (totalRepeatBp > sequenceLen):
        fillBp = 0
        if (sequenceLen > 0):
            stderr.write("WARNING: length of embedded repeats (%d) exceeds specified" % totalRepeatBp + "\n")
            stderr.write(" sequence length (%d); there will be no fill DNA" % sequenceLen + "\n")
    elif (minFill is not None):
        fillBp = sequenceLen - totalRepeatBp
        totalMinFill = (numRepeats + 1) * minFill
        if (totalMinFill > fillBp):
            stderr.write("WARNING: minimum fill of %d cannot be achieved" % minFill + "\n")
            stderr.write(" total minimum fill (%d) exceeds total fill (%d)" % (totalMinFill, fillBp) + "\n")
            minFill = fillBp // (numRepeats + 1)
        # the guaranteed fill is set aside here and added back after the
        # random positions have been drawn
        fillBp -= minFill * (numRepeats + 1)
    else:
        fillBp = sequenceLen - totalRepeatBp

    fillPositions = [randint(0, fillBp) for _ in xrange(numRepeats)]
    fillPositions.sort()

    if (minFill is not None):
        fillBp += minFill * (numRepeats + 1)
        for rptNum in xrange(numRepeats):
            fillPositions[rptNum] += (rptNum + 1) * minFill

    # generate the sequence

    catalog = None
    if (catalogFilename is not None):
        catalog = []

    fillSeq = str(EchyDna(fillBp))

    seq = []
    seqPos = 0
    fillPos = 0
    for (ix, pos) in enumerate(fillPositions):
        if (fillPos < pos):
            seq.append(fillSeq[fillPos:pos])
            seqPos += pos - fillPos
            fillPos = pos

        (mix, motif, motif2, strand, offset, length) = embeddings[ix]
        if (catalog is not None):
            c = CatalogEntry()
            c.start = seqPos
            c.end = seqPos + length
            c.mix = mix
            c.motif = motif
            c.motif2 = motif2
            c.strand = strand
            c.repeatLength = length
            c.offset = offset
            catalog += [c]

        # enough whole copies to cover [offset, offset+length)
        enoughCopies = (length + offset + len(motif) - 1) // len(motif)
        if (strand == "-"):
            motif = reverse_complement(motif)
        if (mix >= 1.0):
            repeat = motif * enoughCopies
        else:
            copies = []
            for _ in xrange(enoughCopies):
                if (unit_random() < mix):
                    copies.append(motif)
                else:
                    copies.append(motif2)
            repeat = "".join(copies)

        seq.append(repeat[offset:offset + length])
        seqPos += length

    if (fillPos < fillBp):
        seq.append(fillSeq[fillPos:fillBp])

    seq = "".join(seq)

    #=== point A: it's now safe to make additional use of the PRNG ===

    # apply error profile
    #
    # named profiles and specs were already resolved while parsing --errors,
    # so only the parsed result is inspected here

    events = profile = None
    if (isinstance(errorProfile, float)):
        eRate = errorProfile / 3.0
        profile = {"mm": eRate, "i": eRate, "d": eRate}
    elif (isinstance(errorProfile, dict)):
        profile = dict(errorProfile)

    if (profile is not None):
        stderr.write("(applying error profile mm=%.2f%% i=%.2f%% d=%.2f%%)"
                     % (100 * profile["mm"], 100 * profile["i"], 100 * profile["d"])
                     + "\n")
        (seq, catalog, events) = apply_errors(profile, seq, catalog)

    # write the sequence

    if (sequenceName is not None):
        print(">%s" % sequenceName)
    if (wrapLength is None):
        print(seq)
    else:
        for i in range(0, len(seq), wrapLength):
            print(seq[i:i + wrapLength])

    # write the catalog

    if (catalogFilename is not None):
        if (sequenceName in [None, ""]):
            seqNameForCatalog = "seq"
        else:
            seqNameForCatalog = sequenceName

        catalogF = open(catalogFilename, "wt")
        try:
            if (events is None):
                catalogF.write("#%s\t%s\t%s\t%s\t%s\t%s\t%s"
                               % ("chrom", "start", "end", "motif", "rptLen", "len", "fill")
                               + "\n")
            else:
                catalogF.write("#%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
                               % ("chrom", "start", "end", "motif", "rptLen", "len", "fill",
                                  "mRatio", "m", "mm", "i", "d")
                               + "\n")

            prevEnd = 0
            for (catIx, c) in enumerate(catalog):
                motifStr = c.motif
                if (c.mix < 1.0):
                    motifStr += "," + c.motif2
                motifStr += ".%s%s" % (c.offset, c.strand)
                if (events is None):
                    catalogF.write("%s\t%s\t%s\t%s\t%s\t%s\t%s"
                                   % (seqNameForCatalog, c.start, c.end, motifStr,
                                      c.repeatLength, c.end - c.start, c.start - prevEnd)
                                   + "\n")
                else:
                    if (catIx in events):
                        (m, mm, i, d) = events[catIx]
                        mRatio = "%.1f%%" % (100.0 * m / (m + mm + i + d))
                    else:
                        mRatio = m = mm = i = d = "NA"
                    catalogF.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
                                   % (seqNameForCatalog, c.start, c.end, motifStr,
                                      c.repeatLength, c.end - c.start, c.start - prevEnd,
                                      mRatio, m, mm, i, d)
                                   + "\n")
                prevEnd = c.end
        finally:
            catalogF.close()
def main():
    """Print values drawn from a gaussian distribution, one per line.

    Command line:
      <n>               how many values to print (default 1; parsed by
                        int_with_unit)
      --mu=<x>          mean (default 0.0)
      --sigma=<x>       standard deviation (default 1.0)
      --round, --floor, --ceiling
                        convert each value to an integer that way
      --precision=<d>   number of digits after the decimal point; cannot be
                        combined with the integer conversions
      --seed=<seed>     PRNG seed
    """

    # parse the command line

    num_values = None
    mu = 0.0
    sigma = 1.0
    round_em = None
    precision = None

    for arg in argv[1:]:
        value = arg.split("=", 1)[1] if ("=" in arg) else None

        if arg.startswith("--mu="):
            mu = float_or_fraction(value)
        elif arg.startswith("--sigma="):
            sigma = float_or_fraction(value)
        elif arg in ("--round", "--floor", "--ceiling"):
            round_em = arg[2:]
        elif arg.startswith("--precision="):
            precision = int(value)
        elif arg.startswith("--seed="):
            # nota bene: if the seed is a number, use it as a number, since
            #            string seeds can produce different sequences on
            #            different versions/builds of python
            seed = value
            try:
                seed = int(seed)
            except ValueError:
                try:
                    seed = float(seed)
                except ValueError:
                    pass
            random_seed(seed)
        elif arg.startswith("--"):
            usage("unrecognized option: %s" % arg)
        elif num_values is None:
            num_values = int_with_unit(arg)
        else:
            usage("unrecognized option: %s" % arg)

    if num_values is None:
        num_values = 1

    if (round_em is not None) and (precision is not None):
        usage("can't use --precision with --%s" % round_em)

    # choose the output format
    if precision is None:
        value_format = "%s"
    elif precision <= 0:
        value_format = "%d."
    else:
        value_format = "%%.%df" % precision

    # generate the values

    for _ in xrange(num_values):
        sample = gauss(mu, sigma)
        if round_em == "round":
            sample = int(round(sample))
        elif round_em == "floor":
            sample = int(floor(sample))
        elif round_em == "ceiling":
            sample = int(ceil(sample))
        print(value_format % sample)
def _open_possibly_gzipped(filename):
    """Open a text file for reading, using gzip_open for .gz/.gzip names."""
    if (filename.endswith(".gz")) or (filename.endswith(".gzip")):
        return gzip_open(filename, "rt")
    return open(filename, "rt")


def main():
    """Reconstruct read-vs-genome alignments from cigar strings and print them.

    Reads a genome fasta, a reads fasta and a file of cigar strings (each may
    be gzipped), rebuilds each read's alignment with reconstruct_alignment()
    and prints it with print_alignment().  When intervals (or a catalog) are
    given, each alignment is sliced to the intervals it intersects; with a
    catalog, positional stats are also printed per slice.  The output ends
    with a "# ncrf end-of-file" line.

    Command line:
      --genome=<file>                      (required) genome fasta
      --reads=<file>, --read=              (required) reads fasta
      --cigars=<file>, --cigar=            (required) cigar strings
      --intervals=<file>, --interval=      intervals to slice alignments to
      --catalog=<file>                     like --intervals, but with a motif
                                           tag per interval
      --motif=<motif>                      only use catalog intervals with
                                           this motif (may be repeated)
      --chromosome(s)=, --chrom(s)=<list>  comma-separated chromosomes
      --minlength=<bp>, --minlen=          discard shorter alignments
      --noisygenome                        passed to positonal_stats() as
                                           noiselessGenome=False
      --progress=<n>                       report progress every <n> reads
      --fields=<a,b,c,d>, F=, --namefield=/F1=, --lengthfield=/F2=,
      --countfield=/F3=, --intervalfield=/F4=
                                           output field widths (globals)
      --debug, --debug=<a,b,...>           debug flags ("intervalsanity" is
                                           consulted here)

    Fixes relative to the previous version:
      * chromsOfInterest is no longer modified while being iterated (that
        raised RuntimeError as soon as a chromosome had to be dropped)
      * the "not enough fields" error no longer references an undefined
        variable
      * the "read doesn't appear in the cigar file" error now names the read
        rather than dumping the whole read-to-cigar table
    """
    global nameFieldW, lengthFieldW, countFieldW, rangeFieldW
    global debug

    # parse the command line

    genomeFilename = None
    readsFilename = None
    cigarFilename = None
    intervalsFilename = None
    intervalsAreCatalog = False
    motifs = None
    chromsOfInterest = None
    minLength = None
    noiselessGenome = True
    reportProgress = None
    nameFieldW = 1
    lengthFieldW = 1
    countFieldW = 1
    rangeFieldW = 1
    debug = []

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg.startswith("--genome=")):
            genomeFilename = argVal
        elif (arg.startswith("--reads=")) or (arg.startswith("--read=")):
            readsFilename = argVal
        elif (arg.startswith("--cigars=")) or (arg.startswith("--cigar=")):
            cigarFilename = argVal
        elif (arg.startswith("--intervals=")) or (arg.startswith("--interval=")):
            if (intervalsFilename is not None):
                usage("--intervals and --catalog are mutually exclusive")
            intervalsFilename = argVal
            intervalsAreCatalog = False
        elif (arg.startswith("--catalog=")):
            if (intervalsFilename is not None):
                usage("--intervals and --catalog are mutually exclusive")
            intervalsFilename = argVal
            intervalsAreCatalog = True
        elif (arg.startswith("--motif=")):
            if (motifs is None):
                motifs = set()
            motifs.add(argVal)
        elif (arg.startswith("--chromosome=")) or (arg.startswith("--chromosomes=")) \
          or (arg.startswith("--chrom=")) or (arg.startswith("--chroms=")):
            if (chromsOfInterest is None):
                chromsOfInterest = set()
            for chrom in argVal.split(","):
                chromsOfInterest.add(chrom)
        elif (arg.startswith("--minlength=")) or (arg.startswith("--minlen=")):
            try:
                minLength = int(argVal)
                if (minLength < 0):
                    raise ValueError
                if (minLength == 0):
                    minLength = None
            except ValueError:
                usage("bad length in \"%s\"" % arg)
        elif (arg == "--noisygenome"):
            noiselessGenome = False
        elif (arg.startswith("--progress=")):
            reportProgress = int_with_unit(argVal)
        elif (arg.startswith("--fields=")) or (arg.startswith("F=")):
            (nameFieldW, lengthFieldW, countFieldW, rangeFieldW) = argVal.split(",", 4)
            nameFieldW = max(int(nameFieldW), 1)
            lengthFieldW = max(int(lengthFieldW), 1)
            countFieldW = max(int(countFieldW), 1)
            rangeFieldW = max(int(rangeFieldW), 1)
        elif (arg.startswith("--namefield=")) or (arg.startswith("F1=")):
            nameFieldW = max(int(argVal), 1)
        elif (arg.startswith("--lengthfield=")) or (arg.startswith("F2=")):
            lengthFieldW = max(int(argVal), 1)
        elif (arg.startswith("--countfield=")) or (arg.startswith("F3=")):
            countFieldW = max(int(argVal), 1)
        elif (arg.startswith("--intervalfield=")) or (arg.startswith("F4=")):
            rangeFieldW = max(int(argVal), 1)
        elif (arg == "--debug"):
            debug += ["debug"]
        elif (arg.startswith("--debug=")):
            debug += argVal.split(",")
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            usage("unrecognized option: %s" % arg)

    if (genomeFilename is None):
        usage("you need to give me a genome file")
    if (readsFilename is None):
        usage("you need to give me a reads file")
    if (cigarFilename is None):
        usage("you need to give me a cigar strings file")

    if (motifs is not None) and (not intervalsAreCatalog):
        usage("--motifs requires --catalog")

    programName = os_path.basename(argv[0])

    # read the intervals
    #
    # nota bene: this can modify chromsOfInterest, restricting it to the
    #            chromosomes in the intervals list

    chromToIntervals = None
    motifsSeen = set()

    if (intervalsFilename is not None):
        chromToIntervals = {}

        intervalsF = _open_possibly_gzipped(intervalsFilename)
        for (lineNumber, chrom, gStart, gEnd, tags) in read_intervals(intervalsF):
            if (chromsOfInterest is not None) and (chrom not in chromsOfInterest):
                continue
            if (chrom not in chromToIntervals):
                chromToIntervals[chrom] = []

            if (intervalsAreCatalog):
                if (tags is None):
                    exit("%s: not enough fields at line %d (expected at least %d)"
                         % (programName, lineNumber, 4))
                # the first tag looks like <motif>[.<something>]<strand>
                (motif, strand) = (tags[0][:-1], tags[0][-1:])
                if ("." in motif):
                    motif = motif[:motif.find(".")]
                if (strand not in ["+", "-"]) or (not is_nucleotide_string(motif)):
                    exit("%s: bad motif at line %d: \"%s\""
                         % (programName, lineNumber, tags[0]))
                if (motifs is not None):
                    if (motif not in motifs):
                        continue
                    motifsSeen.add(motif)
            else:
                motif = strand = None

            chromToIntervals[chrom] += [(gStart, gEnd, motif, strand)]
        intervalsF.close()

        for chrom in chromToIntervals:
            chromToIntervals[chrom].sort()

        if (chromsOfInterest is None):
            chromsOfInterest = set(chromToIntervals)
        else:
            # build a new set; removing from the set being iterated raises
            # "Set changed size during iteration"
            chromsOfInterest = set([chrom for chrom in chromsOfInterest
                                    if (chrom in chromToIntervals)])

        if (motifs is not None):
            for motif in motifs:
                if (motif not in motifsSeen):
                    stderr.write("WARNING \"%s\" was not seen in %s"
                                 % (motif, intervalsFilename)
                                 + "\n")

    # read the genome

    chromToSequence = {}

    genomeF = _open_possibly_gzipped(genomeFilename)
    for (chrom, seq) in read_fasta_sequences(genomeF, chromsOfInterest):
        if (chrom in chromToSequence):
            exit("%s: \"%s\" appears more than once in \"%s\""
                 % (programName, chrom, genomeFilename))
        chromToSequence[chrom] = seq
    genomeF.close()

    if (chromsOfInterest is not None):
        for chrom in chromsOfInterest:
            if (chrom not in chromToSequence):
                exit("%s: \"%s\" doesn't appear in \"%s\""
                     % (programName, chrom, genomeFilename))

    # read the cigar strings

    readNameToCigar = {}

    cigarF = _open_possibly_gzipped(cigarFilename)
    for (lineNumber, line, readName, chrom, strand, gStart, gEnd, cigar) in read_cigars(cigarF):
        if (chromsOfInterest is not None) and (chrom not in chromsOfInterest):
            continue

        (rLength, gLength) = cigar_lengths(cigar)
        readNameToCigar[readName] = (chrom, gStart, gEnd, gLength, strand, rLength, cigar)

        if (gLength != gEnd - gStart):
            exit("%s: bad cigar line (at line %d); cigar doesn't match interval length (%d vs %d)\n%s"
                 % (programName, lineNumber, gLength, gEnd - gStart, line))
    cigarF.close()

    # process the reads

    readsF = _open_possibly_gzipped(readsFilename)

    readNum = alignmentsReported = 0
    for (readName, rNucs) in read_fasta_sequences(readsF):
        readNum += 1
        if (reportProgress is not None) \
           and ((readNum == 1) or (readNum % reportProgress == 0)):
            stderr.write("progress: processing read #%s %s (%s alignments reported so far)"
                         % (commatize(readNum), readName, commatize(alignmentsReported))
                         + "\n")

        if (readName not in readNameToCigar):
            exit("%s: \"%s\" doesn't appear in \"%s\""
                 % (programName, readName, cigarFilename))

        (chrom, gStart, gEnd, gLength, strand, rLength, cigar) = readNameToCigar[readName]
        gNucs = chromToSequence[chrom][gStart:gEnd]
        if (strand == "-"):
            gNucs = reverse_complement(gNucs)

        a = Alignment()
        a.readName = readName
        a.rStart = 0
        a.rEnd = rLength
        a.rLength = rLength
        a.rNucs = rNucs
        a.chrom = chrom
        a.strand = strand
        a.gStart = gStart
        a.gEnd = gEnd
        a.gNucs = gNucs
        a.score = 0
        a.motif = "%s:%d-%d%s" % (chrom, a.gStart, a.gEnd, strand)

        (a.rText, a.gText) = reconstruct_alignment(rNucs, gNucs, cigar)

        if (chromToIntervals is None):
            # no intervals: report the whole alignment
            if (minLength is not None) and (a.gEnd - a.gStart < minLength):
                continue
            print_alignment(a)
            alignmentsReported += 1
        else:
            intervals = chromToIntervals[chrom]
            for (s, e, motif, mStrand) in intersecting_intervals(intervals, gStart, gEnd):
                aSliced = slice_alignment(a, s, e)
                if (minLength is not None) and (aSliced.gEnd - aSliced.gStart < minLength):
                    continue
                print_alignment(aSliced)
                alignmentsReported += 1

                if ("intervalsanity" in debug):
                    # the slice's gap-free texts must match the underlying
                    # read and genome substrings
                    rText = remove_gaps(aSliced.rText)
                    realText = rNucs[aSliced.rStart:aSliced.rEnd]
                    if (realText != rText):
                        exit("%s: sanity check failed for read:\n\"%s\"\n\"%s\""
                             % (programName, rText, realText))

                    gText = remove_gaps(aSliced.gText).upper()
                    realText = chromToSequence[chrom][aSliced.gStart:aSliced.gEnd]
                    if (strand == "-"):
                        realText = reverse_complement(realText)
                    if (realText != gText):
                        exit("%s: sanity check failed for genome:\n\"%s\"\n\"%s\""
                             % (programName, gText, realText))

                    stderr.write("%s: sanity check passed for read %s"
                                 % (programName, readName)
                                 + "\n")

                if (motif is not None):
                    positionalStats = positonal_stats(aSliced, motif, mStrand,
                                                      noiselessGenome=noiselessGenome)
                    print_positonal_stats(positionalStats)

    readsF.close()

    print("# ncrf end-of-file")

    if (reportProgress is not None):
        stderr.write("progress: %s reads processed (%s alignments reported)"
                     % (commatize(readNum), commatize(alignmentsReported))
                     + "\n")
def main():
    """Partition alignment summaries into overlapping groups and write them out.

    Summaries are read from one or more (possibly gzipped) files with
    read_summary(), grouped per sequence by overlapping_groups(), and the
    groups are then collected by the sorted set of motifs they involve.
    Groups are written in order of increasing motif count, separated by
    blank lines; the summary header line, if one is known, is written first.

    Command line:
      <file>                     summary file (at least one; duplicates are
                                 ignored)
      --out=<template>           output file; if it contains "{motif}",
                                 single-motif groups are written to one file
                                 per motif and everything else to the file
                                 with "{motif}" replaced by "overlaps"
      --head=<n>                 limit the number of summaries read
      --debug, --debug=<a,...>   debug flags ("groups" is consulted here)

    Fix relative to the previous version: the per-motif output files are now
    closed after being written (they were left open).
    """
    global summaryHeaderLine
    global debug

    # NOTE(review): presumably read_summary() sets this global when it
    # encounters a header line -- confirm against read_summary
    summaryHeaderLine = None

    # parse the command line

    inputFilenames = []
    outTemplate = None
    headLimit = None
    debug = []

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg.startswith("--out=")):
            outTemplate = argVal
        elif (arg.startswith("--head=")):
            headLimit = int_with_unit(argVal)
        elif (arg == "--debug"):
            debug += ["debug"]
        elif (arg.startswith("--debug=")):
            debug += argVal.split(",")
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            if (arg not in inputFilenames):
                inputFilenames += [arg]

    if (inputFilenames == []):
        usage("you have to give me at least one summary file")

    writeSingletonsSeparately = (outTemplate is not None) and ("{motif}" in outTemplate)

    # collect the alignments

    seqToSummaries = {}
    seqOrder = []

    summaryNum = 0
    for filename in inputFilenames:
        if (filename.endswith(".gz")) or (filename.endswith(".gzip")):
            f = gzip_open(filename, "rt")
        else:
            f = open(filename, "rt")

        for summary in read_summary(f, filename):
            summaryNum += 1
            if (headLimit is not None) and (summaryNum > headLimit):
                stderr.write("limit of %d summaries reached" % headLimit + "\n")
                break

            if (summary.seq not in seqToSummaries):
                seqOrder += [summary.seq]
                seqToSummaries[summary.seq] = []
            seqToSummaries[summary.seq] += [summary]

        f.close()

    # partition the alignments into overlapping groups

    seqToGroups = {}
    for seq in seqOrder:
        seqToGroups[seq] = overlapping_groups(seqToSummaries[seq])

    if ("groups" in debug):
        for seq in seqOrder:
            for group in seqToGroups[seq]:
                stderr.write("===" + "\n")
                for summary in group:
                    stderr.write(str(summary.line) + "\n")

    # collect groups by motif subset

    subsetToGroups = {}
    singletons = set()
    for seq in seqOrder:
        for group in seqToGroups[seq]:
            # the key is the sorted tuple of distinct motifs in the group
            subset = tuple(sorted(set([summary.motif for summary in group])))

            if (subset not in subsetToGroups):
                subsetToGroups[subset] = [group]
            else:
                subsetToGroups[subset] += [group]

            if (len(subset) == 1):
                singletons.add(subset[0])

    # if we're to report un-overlapped alignments separately, do so now (and
    # remove them from the groups)

    singletons = sorted(singletons)

    if (writeSingletonsSeparately):
        for motif in singletons:
            subset = (motif,)
            motifFilename = outTemplate.replace("{motif}", motif)
            motifF = open(motifFilename, "wt")
            try:
                stderr.write("writing to \"%s\"" % motifFilename + "\n")
                if (summaryHeaderLine is not None):
                    motifF.write(str(summaryHeaderLine) + "\n")
                for group in subsetToGroups[subset]:
                    for summary in group:
                        motifF.write(str(summary.line) + "\n")
            finally:
                motifF.close()
            del subsetToGroups[subset]

    # report overlapping alignment groups (and un-overlapped groups if we
    # didn't report them already)

    nothingToWrite = (list(subsetToGroups) == [])

    if (outTemplate is None):
        outF = stdout
        if (nothingToWrite):
            stderr.write("no alignments to write to console" + "\n")
    else:
        if ("{motif}" not in outTemplate):
            outFilename = outTemplate
        else:
            outFilename = outTemplate.replace("{motif}", "overlaps")
        outF = open(outFilename, "wt")
        if (nothingToWrite):
            stderr.write("no alignments to write to \"%s\"" % outFilename + "\n")
        else:
            stderr.write("writing to \"%s\"" % outFilename + "\n")

    motifCountToSubsets = {}
    for subset in subsetToGroups:
        motifCount = len(subset)
        if (motifCount not in motifCountToSubsets):
            motifCountToSubsets[motifCount] = [subset]
        else:
            motifCountToSubsets[motifCount] += [subset]

    isFirstGroup = True
    for motifCount in sorted(motifCountToSubsets):
        subsets = motifCountToSubsets[motifCount]
        subsets.sort()
        for subset in subsets:
            for group in subsetToGroups[subset]:
                if (isFirstGroup):
                    if (summaryHeaderLine is not None):
                        outF.write(str(summaryHeaderLine) + "\n")
                    isFirstGroup = False
                else:
                    outF.write("\n")  # (line to separate groups)
                for summary in group:
                    outF.write(str(summary.line) + "\n")

    if (outF != stdout):
        outF.close()
def main():
    """Print parameter sets from a "ball" around the given parameter values.

    Each <name>=<integer> argument defines a parameter and its center value.
    Parameters named by --fixed are not varied.  One line is printed per
    parameter set, as space-separated name=value pairs in the order the
    names were given.

    Command line:
      <name>=<value>          a parameter and its center value
      --fixed=<a,b,...>       names that are kept at their center value
      --radius=<r>[by<step>]  ball radius, optionally in steps (the radius
                              must be a multiple of the step)
      --ball:sparse, --ball:hyper, --ball:spikey (also --ball=..., --spikey)
                              kind of ball (default is the sparse hypercube)
      --sample=<n>            number of sets to sample
      --reject=<formula>      reject sets for which evaluate() gives True
      --nocenter              exclude the center itself
      --seed=<seed>           PRNG seed
    """

    # parse the command line

    names = []
    name_to_val = {}
    fixed_names = Set()
    radius = 1
    ball_kind = "sparse hypercube"
    sample_size = None
    reject_criteria = []
    exclude_center = False

    for arg in argv[1:]:
        value = arg.split("=", 1)[1] if ("=" in arg) else None

        if arg.startswith("--fixed="):
            for name in value.split(","):
                fixed_names.add(name)
        elif arg.startswith("--radius="):
            if "by" in value:
                (radius, step) = value.split("by", 1)
                radius = abs(int(radius))
                step = abs(int(step))
                assert (radius % step == 0)
                # a stepped radius is carried as a (count, step) pair
                if step != 1:
                    radius = (radius // step, step)
            else:
                radius = abs(int(value))
        elif arg in ("--ball:sparse", "--ball=sparse"):
            ball_kind = "sparse hypercube"
        elif arg in ("--ball:hyper", "--ball=hyper"):
            ball_kind = "hypercube"
        elif arg in ("--ball:spikey", "--ball=spikey", "--spikey"):
            ball_kind = "spikey burr"
        elif arg.startswith("--sample="):
            sample_size = int_with_unit(value)
        elif arg.startswith("--reject="):
            reject_criteria += [value]
        elif arg == "--nocenter":
            exclude_center = True
        elif arg.startswith("--seed="):
            # nota bene: if the seed is a number, use it as a number, since
            #            string seeds can produce different sequences on
            #            different versions/builds of python
            seed = value
            try:
                seed = int(seed)
            except ValueError:
                try:
                    seed = float(seed)
                except ValueError:
                    pass
            random_seed(seed)
        elif arg.startswith("--"):
            usage("unrecognized option: %s" % arg)
        elif "=" in arg:
            name = arg.split("=", 1)[0]
            val = int(value)
            if (name in name_to_val) and (name_to_val[name] != val):
                usage("you have given me more than one value for %s, %d and %d"
                      % (name, name_to_val[name], val))
            if name not in name_to_val:
                names += [name]
            name_to_val[name] = val
        else:
            usage("unrecognized option: %s" % arg)

    if names == []:
        usage("you have to give me at least one parameter to vary")

    for name in fixed_names:
        if name not in name_to_val:
            stderr.write("WARNING: no value was provided for \"%s\"" % name + "\n")

    # separate fixed and varying names

    variables = [name for name in names if name not in fixed_names]
    variable_to_val = dict([(name, name_to_val[name]) for name in variables])

    # generate the parameter sets

    if ball_kind == "spikey burr":
        ball = SpikeyBurr(variables, variable_to_val, radius)
    elif ball_kind == "sparse hypercube":
        if sample_size is None:
            sample_size = 1
        ball = SparseHyperCube(variables, variable_to_val, radius,
                               sample_size, exclude_center)
    else:  # if (ball_kind == "hypercube"):
        ball = HyperCube(variables, variable_to_val, radius)

    # NOTE(review): the collapsed source does not show whether this check
    # sits at this level or inside the hypercube branch; this level is the
    # only placement that defines the ball size for every kind of ball
    if sample_size is not None:
        ball_size = ball.size(exclude_center)
        if sample_size >= ball_size:
            sample_size = None  # asking for everything: no sampling needed

    def acceptable_sets():
        # lazily yields the sets that are neither the excluded center nor
        # rejected by one of the formulas
        for params in ball.ball():
            if exclude_center and params_are_same(params, variable_to_val):
                continue
            if any(evaluate(formula, params) == True for formula in reject_criteria):
                continue
            yield params

    def report(params):
        # names that were not varied get their center value
        for name in names:
            if name not in params:
                params[name] = name_to_val[name]
        print(" ".join(["%s=%d" % (name, params[name]) for name in names]))

    if sample_size is None:
        for params in acceptable_sets():
            report(params)
    else:
        # selection sampling: each set is taken with probability
        # (still wanted) / (still available)
        left_to_sample = sample_size
        left_in_ball = ball_size
        for params in acceptable_sets():
            if randint(0, left_in_ball - 1) < left_to_sample:
                report(params)
                left_to_sample -= 1
            left_in_ball -= 1
def main():
    """Report match/mismatch/indel event counts for alignments read from stdin.

    Options (read from argv):

      --withheader   write a header line before the first row
      --sumonly      write a single "all" row of totals instead of one row
                     per alignment
      --head=N       stop after N alignments
      --noendmark    (unadvertised) passed to alignments() as requireEof=False

    Rows are tab-separated: line, motif, mRatio, m, mm, io, ix, do, dx.  The
    counts come from extract_events().  In --sumonly mode the mRatio column is
    total matches divided by total events, and the motif column is taken from
    the last alignment read.  Warnings go to stderr when the input held no
    alignments, or when no events were tallied (so no ratio can be computed).
    """

    # parse the command line

    writeHeader = False
    writeWhat = "per alignment"
    headLimit = None
    requireEof = True

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg in ["--withheader", "--with=header", "--with:header"]):
            writeHeader = True
        elif (arg in ["--sumonly", "--sum=only", "--sum:only"]):
            writeWhat = "sum only"
        elif (arg.startswith("--head=")):
            headLimit = int_with_unit(argVal)
        elif (arg in ["--noendmark", "--noeof", "--nomark"]):  # (unadvertised)
            requireEof = False
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            usage("unrecognized option: %s" % arg)

    # process the alignments

    # (named "totals" rather than "sum" so the builtin isn't shadowed)
    totals = {"m": 0, "mm": 0, "io": 0, "ix": 0, "do": 0, "dx": 0}

    alignmentNum = 0
    for a in alignments(stdin, requireEof):
        alignmentNum += 1
        if (headLimit is not None) and (alignmentNum > headLimit):
            print >>stderr, "limit of %d alignments reached" % headLimit
            break

        (nMatch, nMismatch, nInsO, nInsX, nDelO, nDelX) = extract_events(a)

        if (writeHeader):
            print("\t".join(["line", "motif", "mRatio",
                             "m", "mm", "io", "ix", "do", "dx"]))
            writeHeader = False

        if (writeWhat == "per alignment"):
            vec = [a.lineNumber, a.motif, a.mRatio,
                   nMatch, nMismatch, nInsO, nInsX, nDelO, nDelX]
            print("\t".join(map(str, vec)))

        totals["m"] += nMatch
        totals["mm"] += nMismatch
        totals["io"] += nInsO
        totals["ix"] += nInsX
        totals["do"] += nDelO
        totals["dx"] += nDelX

    numEvents = (totals["m"] + totals["mm"] + totals["io"]
                 + totals["ix"] + totals["do"] + totals["dx"])

    if (alignmentNum == 0):
        print >>stderr, "WARNING: input contained no alignments"
    elif (writeWhat == "sum only"):
        if (numEvents == 0):
            # e.g. --head=0; there is nothing to divide by
            print >>stderr, "WARNING: input contained no alignment events"
        else:
            mRatio = "%.3f" % (float(totals["m"]) / numEvents)
            vec = ["all", a.motif, mRatio,
                   totals["m"], totals["mm"], totals["io"],
                   totals["ix"], totals["do"], totals["dx"]]
            print("\t".join(map(str, vec)))
def main():
    """Sample fasta sequences from stdin to match a length distribution.

    The one positional argument names a length-distribution file, read with
    read_distribution_spec(); each entry gives a length interval, the number
    of sequences to output for it, and the number expected in the input.
    Options (read from argv):

      --remainder=FILE  write the unfulfilled counts to FILE at the end
      --wrap=N          wrap output sequence lines at N (default 100; a value
                        of zero or less disables wrapping)
      --seed=SEED       seed for the random number generator
      --progress=N      report progress to stderr every N input sequences
      --debug[=LIST]    enable debugging output

    A sequence whose length falls in an interval is written to stdout with
    probability outCount/inCount for that interval, and both counts are then
    adjusted; sequences whose length is in no interval are dropped.

    Raises AssertionError if two intervals in the distribution file overlap,
    or if the input holds more sequences for an interval than the
    distribution said to expect.
    """
    global debug

    # parse the command line

    distributionFilename = None
    remainderFilename = None
    wrapLength = 100
    reportProgress = None
    debug = []

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg.startswith("--remainder=")):
            remainderFilename = argVal
        elif (arg.startswith("--wrap=")):
            wrapLength = int(argVal)
            if (wrapLength <= 0):
                wrapLength = None
        elif (arg.startswith("--seed=")):
            random_seed(argVal)
        elif (arg.startswith("--progress=")):
            reportProgress = int_with_unit(argVal)
        elif (arg == "--debug"):
            debug += ["debug"]
        elif (arg.startswith("--debug=")):
            debug += argVal.split(",")
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        elif (distributionFilename is None):
            distributionFilename = arg
        else:
            usage("unrecognized option: %s" % arg)

    if (distributionFilename is None):
        usage("you must provide a length-distribution filename")

    # read the distribution

    intervals = IntervalDict()

    distribF = open(distributionFilename, "rt")
    try:
        for spec in read_distribution_spec(distribF, distributionFilename):
            (lineNumber, minLength, maxLength, outCount, inCount) = spec
            interval = intervals.add(minLength, maxLength)
            if (interval is None):  # interval overlaps an existing interval
                interval = Interval(minLength, maxLength)
                previous = intervals.overlapper(minLength, maxLength)
                # raised explicitly (not via assert) so that the check
                # survives python -O
                raise AssertionError(
                    "%s (line %d) overlaps %s (line %d)"
                    % (interval, lineNumber, previous, previous.lineNumber))
            interval.lineNumber = lineNumber
            interval.outCount = outCount
            interval.inCount = inCount
    finally:
        distribF.close()

    if ("distribution" in debug):
        for interval in intervals:
            print >>stderr, "%s %d %d" \
                          % (interval, interval.outCount, interval.inCount)

    def save_remainders():
        # write what is left of the distribution to the remainder file
        remainderF = open(remainderFilename, "wt")
        try:
            write_remainders(remainderF, intervals)
        finally:
            remainderF.close()

    # process the reads
    #
    # this filters reads based on the length (on the interval containing the
    # length); if we expect to see E more sequences of this length (including
    # this one), and we are to output N of those, we output this sequence with
    # probability N/E; and we adjust N and E for this length accordingly

    inputCount = outputCount = inputBp = outputBp = 0
    for (name, seq) in read_fasta_sequences(stdin):
        seqLen = len(seq)
        inputCount += 1
        inputBp += seqLen

        if (reportProgress is not None):
            if (inputCount % reportProgress == 0):
                print >>stderr, "%s sequences read, %s written (%.1f%%); %s nts read, %s written" \
                              % (commatize(inputCount), commatize(outputCount),
                                 100.0 * outputCount / inputCount,
                                 commatize(inputBp), commatize(outputBp))

        try:
            interval = intervals[seqLen]
        except KeyError:
            continue  # no interval covers this length

        if (interval.inCount <= 0):
            print >>stderr, "ERROR: for length %d (%s), actual input exceeded expected input count" \
                          % (seqLen, interval)
            if (remainderFilename is not None):
                print >>stderr, " (writing remainders to %s)" % remainderFilename
                save_remainders()
            # an explicit raise; a stripped assert would fall through to
            # randint(1,0) below
            raise AssertionError(
                "for length %d (%s), actual input exceeded expected input count"
                % (seqLen, interval))

        if (interval.outCount == 0):
            keepSeq = False
        else:
            keepSeq = (randint(1, interval.inCount) <= interval.outCount)

        interval.inCount -= 1
        if (not keepSeq):
            continue
        interval.outCount -= 1

        outputCount += 1
        outputBp += seqLen

        print(">%s" % name)
        if (wrapLength is None):
            print(seq)
        else:
            for i in range(0, seqLen, wrapLength):
                print(seq[i:i + wrapLength])

    # write the remainders

    if (remainderFilename is not None):
        save_remainders()
def main():
    """Report match/mismatch/indel event counts for sam records read from stdin.

    Options (read from argv):

      --mapq=N          skip records whose mapQ is below N (also passed to
                        read_sam_plain)
      --withheader      write a header line before the first row
      --sumonly         write a single "all" row of totals instead of one row
                        per alignment
      --warnandcontinue sets the global warnOnError
      --head=N          stop after N sam records
      --progress=N      report running totals to stderr every N records

    Rows are tab-separated: line, read, mRatio, m, mm, io, ix, do, dx, where
    mRatio is matches divided by all events.  Records with rName "*" are
    skipped.  When sam_to_events() returns a string instead of the event
    counts, that string is written to stderr and the record is skipped (it is
    still counted as an alignment).
    """
    global warnOnError

    # parse the command line

    minMapQ = None
    writeHeader = False
    writeWhat = "per alignment"
    warnOnError = False
    headLimit = None
    reportProgress = None

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg.startswith("--mapq=")) \
           or (arg.startswith("--MAPQ=")) \
           or (arg.startswith("MAPQ=")):
            minMapQ = int(argVal)
        elif (arg in ["--withheader", "--with=header", "--with:header"]):
            writeHeader = True
        elif (arg in ["--sumonly", "--sum=only", "--sum:only"]):
            writeWhat = "sum only"
        elif (arg == "--warnandcontinue"):
            warnOnError = True
        elif (arg.startswith("--head=")):
            headLimit = int_with_unit(argVal)
        elif (arg.startswith("--progress=")):
            reportProgress = int_with_unit(argVal)
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            usage("unrecognized option: %s" % arg)

    # process the sam records

    # (named "totals" rather than "sum" so the builtin isn't shadowed)
    totals = {"m": 0, "mm": 0, "io": 0, "ix": 0, "do": 0, "dx": 0}

    recordNum = alignmentNum = 0
    for a in read_sam_plain(stdin, minMapQ=minMapQ):
        recordNum += 1

        if (reportProgress is not None) and (recordNum % reportProgress == 0):
            numEvents = (totals["m"] + totals["mm"] + totals["io"]
                         + totals["ix"] + totals["do"] + totals["dx"])
            # nothing has been tallied yet if every record so far was skipped
            # (and never on the very first record), so guard the division
            if (numEvents > 0):
                mRatio = float(totals["m"]) / numEvents
            else:
                mRatio = 0.0
            print >>stderr, "progress: processing sam record %s (mRatio=%.3f m=%d mm=%d io=%d ix=%d do=%d dx=%d)" \
                          % (commatize(recordNum),
                             mRatio, totals["m"], totals["mm"], totals["io"],
                             totals["ix"], totals["do"], totals["dx"])

        if (headLimit is not None) and (recordNum > headLimit):
            print >>stderr, "limit of %s sam records reached" % commatize(headLimit)
            break

        if (a.rName == "*"):
            continue  # read did not align
        if (minMapQ is not None) and (a.mapQ < minMapQ):
            continue

        alignmentNum += 1

        events = sam_to_events(a)
        if (isinstance(events, str)):
            # a string result is an error/warning message, not event counts
            print >>stderr, events
            continue
        (nMatch, nMismatch, nInsO, nInsX, nDelO, nDelX) = events

        if (writeHeader):
            print("\t".join(["line", "read", "mRatio",
                             "m", "mm", "io", "ix", "do", "dx"]))
            writeHeader = False

        if (writeWhat == "per alignment"):
            mRatio = float(nMatch) / (nMatch + nMismatch
                                      + nInsO + nInsX + nDelO + nDelX)
            mRatio = "%.3f" % mRatio
            vec = [a.lineNumber, a.qName, mRatio,
                   nMatch, nMismatch, nInsO, nInsX, nDelO, nDelX]
            print("\t".join(map(str, vec)))

        totals["m"] += nMatch
        totals["mm"] += nMismatch
        totals["io"] += nInsO
        totals["ix"] += nInsX
        totals["do"] += nDelO
        totals["dx"] += nDelX

    numEvents = (totals["m"] + totals["mm"] + totals["io"]
                 + totals["ix"] + totals["do"] + totals["dx"])

    if (alignmentNum == 0):
        print >>stderr, "WARNING: input contained no alignments"
    elif (writeWhat == "sum only"):
        if (numEvents == 0):
            # every alignment failed conversion; there is no ratio to report
            print >>stderr, "WARNING: input contained no alignment events"
        else:
            alignmentNumStr = "(%d)" % alignmentNum
            mRatio = "%.3f" % (float(totals["m"]) / numEvents)
            vec = ["all", alignmentNumStr, mRatio,
                   totals["m"], totals["mm"], totals["io"],
                   totals["ix"], totals["do"], totals["dx"]]
            print("\t".join(map(str, vec)))
def main():
    """Mark noise clumps in alignments read from stdin and echo them to stdout.

    Options (read from argv):

      --maxMRatio=R     clumps are searched for with noise rate 1-R
                        (default R is 0.85)
      --minnoise=N      same setting expressed as a noise rate; R = 1-N
      --mincolumns=N    minimum clump length, at least 2 (default 10);
                        --mindenom=N is accepted as a synonym
      --head=N          stop after N alignments
      --report:clumps   also describe each clump on stderr
      --noendmark       (unadvertised) passed to alignments() as
                        requireEof=False, and suppresses the end-of-file line
      --debug[=LIST]    enable debugging output

    For each alignment, find_clumps() is run over the alignment's error text
    and a "# noise clumps" line is inserted into the alignment, with "*" in
    clump columns and "-" elsewhere.  Alignments are separated by a blank
    line.  The program exits if an alignment has no error text.
    """
    global debug

    # parse the command line

    maxMRatio = 0.85
    minColumns = 10
    headLimit = None
    reportClumps = False
    requireEof = True
    debug = []

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg.startswith("--maxMRatio=")):
            maxMRatio = parse_noise_rate(argVal)
            # (this used to test an undefined name, minMRatio)
            if (not (0.0 <= maxMRatio <= 1.0)):
                exit("%s: mratio has to be between 0 and 1 (e.g. 0.85 or 85%%)\n%s"
                     % (os_path.basename(argv[0]), arg))
        elif (arg.startswith("--minnoise=")):
            maxMRatio = 1 - parse_noise_rate(argVal)
            if (not (0.0 <= maxMRatio <= 1.0)):
                exit("%s: noise has to be between 0 and 1 (e.g. 0.15 or 15%%)\n%s"
                     % (os_path.basename(argv[0]), arg))
        elif (arg.startswith("--mincolumns=")) or (arg.startswith("--mindenom=")):
            minColumns = int(argVal)
            if (minColumns < 2):
                usage("minimum length has to be at least two columns\n%s" % arg)
        elif (arg.startswith("--head=")):
            headLimit = int_with_unit(argVal)
        elif (arg == "--report:clumps") or (arg == "--report=clumps"):
            reportClumps = True
        elif (arg in ["--noendmark", "--noeof", "--nomark"]):  # (unadvertised)
            requireEof = False
        elif (arg == "--debug"):
            debug += ["debug"]
        elif (arg.startswith("--debug=")):
            debug += argVal.split(",")
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            usage("unrecognized option: %s" % arg)

    # process the alignments

    alignmentNum = 0
    for a in alignments(stdin, requireEof):
        alignmentNum += 1
        if (headLimit is not None) and (alignmentNum > headLimit):
            print >>stderr, "limit of %d alignments reached" % headLimit
            break

        if (a.errorText is None):
            exit("%s: alignment at line %d doesn't include error text"
                 % (os_path.basename(argv[0]), a.lineNumber))

        if ("detail" in debug):
            print >>stderr, "\nlooking for clumps in %s %c %u-%u" \
                          % (a.seqName, a.strand, a.start, a.end)

        clumps = find_clumps(a.errorText, 1 - maxMRatio, minColumns,
                             positiveCh='x', negativeCh='=')

        # build the marker text, one character per error-text column

        clumpText = ["-"] * len(a.errorText)
        for (start, end) in clumps:
            for ix in xrange(start, end):
                clumpText[ix] = "*"
        clumpText = "".join(clumpText)

        # the marker line's label is padded using the position of " =" (or,
        # failing that, " x") in the alignment's first line; find() returns
        # -1 when there is no match, which makes prefixLen zero

        prefixLen = 1 + a.lines[0].find(" =")
        if (prefixLen < 1):
            prefixLen = 1 + a.lines[0].find(" x")

        if (alignmentNum > 1):
            print("")
        a.lines.insert(3, "# %-*s%s" % (prefixLen - 2, "noise clumps", clumpText))
        print(a)

        if (reportClumps):
            for (start, end) in clumps:
                clumpColumns = a.errorText[start:end]
                errorCount = clumpColumns.count('x')
                matchCount = clumpColumns.count('=')
                print >>stderr, "line %d (%d,%d) m=%s x=%s mRatio: %.2f%%" \
                              % (a.lineNumber,
                                 start, end, matchCount, errorCount,
                                 (100.0 * matchCount) / (matchCount + errorCount))

    if (requireEof):
        print("# ncrf end-of-file")
def main():
    """Parse the command line, then run the consensus filter over stdin.

    Option settings are stored in module globals (see the global statements
    below).  Arguments that are not options name motifs of interest, either
    as MOTIF or as NAME:MOTIF; giving one NAME two different motifs is a usage
    error.  When no motif is named, None is passed on, meaning that all
    motifs are of interest.

    With --slice the work is done by sliced_consensus_filter(), otherwise by
    simple_consensus_filter().
    """
    global headLimit, reportProgress, requireEof
    global winnerThreshold, filterToKeep, reportConsensus, reportMsa
    global canonicalizeConsensuses
    global debug

    canonicalizeConsensuses = True

    # defaults

    filterToKeep = "consensus"
    reportConsensus = False
    reportMsa = False
    winnerThreshold = 0.50  # (see derive_consensuses)
    headLimit = None
    reportProgress = None
    requireEof = True
    debug = []

    sliceWidth = sliceStep = None
    nameToMotif = {}
    motifsOfInterest = []

    # options that pick a reporting mode: option -> (filterToKeep, reportMsa);
    # every one of them also turns on reportConsensus

    reportingModes = {
        "--nonconsensus":     ("non consensus", False),  # (unadvertised)
        "--nonconsensus,msa": ("non consensus", True),   # (unadvertised)
        "--consensusonly":    ("no filter",     False),
        "--filter,consensus": ("consensus",     False),  # (unadvertised)
        "--msa":              ("no filter",     True),   # (unadvertised)
    }

    # parse the command line

    for arg in argv[1:]:
        if "=" in arg:
            argVal = arg.split("=", 1)[1]

        if arg in reportingModes:
            (filterToKeep, reportMsa) = reportingModes[arg]
            reportConsensus = True
        elif arg.startswith("--winner=") or arg.startswith("W="):  # (unadvertised)
            winnerThreshold = parse_probability(argVal)
        elif arg.startswith("--slice="):  # (unadvertised)
            if "by" not in argVal:
                sliceWidth = sliceStep = int_with_unit(argVal)
            else:
                (sliceWidth, sliceStep) = map(int_with_unit,
                                              argVal.split("by", 1))
        elif arg.startswith("--head="):
            headLimit = int_with_unit(argVal)
        elif arg.startswith("--progress="):
            reportProgress = int_with_unit(argVal)
        elif arg in ["--noendmark", "--noeof", "--nomark"]:  # (unadvertised)
            requireEof = False
        elif arg == "--debug":
            debug += ["debug"]
        elif arg.startswith("--debug="):
            debug += argVal.split(",")
        elif arg.startswith("--"):
            usage("unrecognized option: %s" % arg)
        elif ":" not in arg:
            # a bare motif
            motifsOfInterest += [arg]
        else:
            # a named motif, NAME:MOTIF
            (name, motif) = arg.split(":", 1)
            if nameToMotif.get(name, motif) != motif:
                usage("\"%s\" is given for more than one motif" % name)
            if name not in nameToMotif:
                nameToMotif[name] = motif
            motifsOfInterest += [motif]

    if motifsOfInterest == []:
        motifsOfInterest = None  # this really means all motifs are of interest
    else:
        motifsOfInterest = set(motifsOfInterest)

    # process the alignments

    if sliceWidth is None:
        simple_consensus_filter(stdin, motifsOfInterest, nameToMotif)
        return

    sliced_consensus_filter(stdin, motifsOfInterest, nameToMotif,
                            sliceWidth, sliceStep)
def main():
    """Test alignments from stdin for positional uniformity; report or filter.

    Alignments are gathered with collect_alignments() and tested in batches,
    either with min_max_tests() (the default method, "min-max") or with
    mx_significance_tests() ("chi-squared", which needs the Rscript shell
    command).  Each alignment's outcome is True (uniformity not rejected),
    False (rejected) or None (could not be tested).

    A summary of the outcome counts is written to stderr.  What is written to
    stdout depends on the report type:

      ncrf (default)    the alignments that survive --discard; with
                        --discard=none all alignments are written and each
                        gets a line describing its outcome
      --report=matrix   one tab-separated row per alignment: line number,
                        outcome, and the alignment's row of the matrix
                        returned by collect_alignments()
      --report=silent   nothing

    The matrix and silent reports force --discard=none.  The program exits
    with a message if Rscript is needed but missing, if a test routine reports
    a problem (returns a string), or if the number of outcomes for a batch
    doesn't match the batch size.
    """
    global reportProgress, batchSize
    global debug

    # parse the command line

    testMethod = "min-max"
    numTrials = 10 * 1000   # (only used for testMethod == "min-max")
    numNeededToPass = 1     # (only used for testMethod == "min-max")
    effectSize = 0.3        # (only used for testMethod == "chi-squared")
    power = 0.8             # (only used for testMethod == "chi-squared")
    discardWhich = "bad"
    testWhich = "matches-insertions"
    warnOnUntested = False
    subsampleK = None
    subsampleN = None
    headLimit = None
    batchSize = None        # (will be replaced by method-specific result)
    reportAs = "ncrf"
    requireEof = True
    prngSeed = defaultPrngSeed
    reportProgress = None
    debug = []

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg == "--method=min-max"):
            testMethod = "min-max"
        elif (arg.startswith("--trials=")):
            if ("/" in argVal):
                (numNeededToPass, numTrials) \
                    = map(int_with_unit, argVal.split("/", 1))
                if (numTrials < 1):
                    usage("bad value in: %s (trials must be at least 1)" % arg)
                if (not 1 <= numNeededToPass <= numTrials):
                    usage("bad value in: %s (num-in-bounds must be in range 1..trials)"
                          % arg)
            else:
                (numNeededToPass, numTrials) = (1, int_with_unit(argVal))
                if (numTrials < 1):
                    usage("bad value in: %s (trials must be at least 1)" % arg)
        elif (arg in ["--method=chi-squared", "--method=chi-square"]):  # (unadvertised, see [4])
            testMethod = "chi-squared"
        elif (arg.startswith("--effectsize=")):  # (unadvertised, see [4])
            effectSize = parse_probability(argVal)
        elif (arg.startswith("--power=")):  # (unadvertised, see [4])
            power = parse_probability(argVal)
        elif (arg in ["--discard:bad", "--discard=bad"]):
            discardWhich = "bad"
        elif (arg in ["--discard:good", "--discard=good"]):
            discardWhich = "good"
        elif (arg in ["--discard:none", "--discard=none"]):
            discardWhich = "none"
        elif (arg in ["--test:matches-insertions", "--test=matches-insertions",
                      "--test:m-i", "--test=m-i"]):
            testWhich = "matches-insertions"
        elif (arg in ["--test:matches", "--test=matches"]):
            testWhich = "matches"
        elif (arg in ["--test:errors", "--test=errors"]):
            testWhich = "errors"
        elif (arg in ["--warn:untested", "--warn=untested", "--warn=matrix"]):
            # "--warn=matrix" looks like a slip for "--warn=untested"; it is
            # still accepted so that existing command lines keep working
            warnOnUntested = True
        elif (arg.startswith("--head=")):
            headLimit = int_with_unit(argVal)
        elif (arg.startswith("--subsample=")):
            (subsampleK, subsampleN) = map(int, argVal.split("/", 2))
            if (not 0 < subsampleK <= subsampleN):
                usage("bad subsample description in %s" % arg)
        elif (arg.startswith("--progress=")):
            reportProgress = int_with_unit(argVal)
        elif (arg.startswith("--batch=")):  # (no longer advertised, since it only applies to R)
            batchSize = int(argVal)
        elif (arg == "--report:matrix") or (arg == "--report=matrix"):  # (unadvertised)
            reportAs = "matrix"
        elif (arg == "--report:silent") or (arg == "--report=silent"):  # (unadvertised)
            reportAs = "silent"
        elif (arg in ["--noendmark", "--noeof", "--nomark"]):  # (unadvertised)
            requireEof = False
        elif (arg.startswith("--seed=")):
            seed = argVal
            if (seed in ["none", "None", "NONE"]):
                prngSeed = None
            elif (seed in ["default", "Default", "DEFAULT"]):
                prngSeed = defaultPrngSeed
            else:
                # nota bene: if the seed is a number, use it as a number, since
                #            string seeds can produce different sequences on
                #            different versions/builds of python
                try:
                    seed = int(seed)
                except ValueError:
                    try:
                        seed = float(seed)
                    except ValueError:
                        pass
                prngSeed = seed
        elif (arg == "--debug"):
            debug += ["debug"]
        elif (arg.startswith("--debug=")):
            debug += argVal.split(",")
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            usage("unrecognized option: %s" % arg)

    if (reportAs in ["matrix", "silent"]):
        discardWhich = "none"

    if (testMethod == "chi-squared"):
        testDescription = "positional chi-squared"
        if (batchSize is None):
            batchSize = 30
    elif (testMethod == "min-max"):
        testDescription = "positional min-max"
        if (batchSize is None):
            batchSize = 1
    else:
        exit("%s: internal error: unrecognized test method: \"%s\""
             % (os_path.basename(argv[0]), testMethod))

    # initialize the PRNG, if needed

    if (testMethod == "min-max"):
        if (prngSeed is not None):
            random_seed(prngSeed)
    else:
        if (prngSeed not in [None, defaultPrngSeed]):
            print >>stderr, "WARNING: ignoring request to use PRNG with \"%s\"" \
                          % testMethod

    # make sure the shell commands we're gonna use have been installed

    if (testMethod == "chi-squared"):
        if (not shell_command_exists("Rscript")):
            exit(("%s: Unable to run the shell command \"Rscript\";"
                  + "\n .. Either R hasn't been installed, or the command-line shell"
                  + " can't find it.")
                 % os_path.basename(argv[0]))

    # collect the alignments; we need to collect the positional info for all
    # alignments, to feed to R in batches (doing them one-by-one was incredibly
    # slow); hopefully this won't become a memory problem

    (unitLength, alignmentList, mxMatrix) \
        = collect_alignments(stdin, testWhich,
                             headLimit=headLimit,
                             subsampleK=subsampleK, subsampleN=subsampleN,
                             requireEof=requireEof)

    numAlignments = len(alignmentList)
    if (reportProgress is not None):
        print >>stderr, "progress: read %s alignments" \
                      % (commatize(numAlignments))

    # assess the alignments, batch-by-batch

    if (reportProgress is not None):
        progressReported = -1

    accepted = []
    outcomeCount = {True: 0, False: 0, None: 0}
    for batchStartIx in xrange(0, numAlignments, batchSize):
        alignmentsTested = batchStartIx
        if (reportProgress is not None):
            # report whenever we cross into a new block of reportProgress
            # alignments (python 2 integer division)
            rBlock = (progressReported + 1) / reportProgress
            aBlock = (alignmentsTested + 1) / reportProgress
            if (alignmentsTested == 0) or (aBlock != rBlock):
                print >>stderr, "progress: testing alignment %s (%d uniform, %d non-uniform, %d untested)" \
                              % (commatize(1 + alignmentsTested),
                                 outcomeCount[True],
                                 outcomeCount[False],
                                 outcomeCount[None])
                progressReported = alignmentsTested

        batchEndIx = min(batchStartIx + batchSize, numAlignments)
        if ("batch" in debug):
            print >>stderr, "using R for alignments %d thru %d" \
                          % (batchStartIx + 1, batchEndIx)

        mxBatch = mxMatrix[batchStartIx:batchEndIx]
        aBatch = alignmentList[batchStartIx:batchEndIx]

        # a string result from either test routine is an error report

        if (testMethod == "chi-squared"):
            batchResult = mx_significance_tests(mxBatch, testWhich,
                                                effectSize, power)
            if (isinstance(batchResult, str)):
                exit(("%s: internal error: having trouble with R"
                      + " (with alignment batch %d..%d)"
                      + "\nHere's what R reported:\n%s")
                     % (os_path.basename(argv[0]),
                        batchStartIx, batchEndIx, batchResult))
        else:  # if (testMethod == "min-max"):
            batchResult = min_max_tests(aBatch, mxBatch, batchStartIx,
                                        testWhich, numTrials, numNeededToPass)
            if (isinstance(batchResult, str)):
                exit(("%s: internal error: having trouble with min-max test"
                      + " (with alignment batch %d..%d)"
                      + "\nHere's what was reported:\n%s")
                     % (os_path.basename(argv[0]),
                        batchStartIx, batchEndIx, batchResult))

        if (len(batchResult) != batchEndIx - batchStartIx):
            exit(("%s: internal error: number of test outcomes reported by R (%d)"
                  + "\n .. doesn't match the number of tests given to R (%d)")
                 % (os_path.basename(argv[0]),
                    len(batchResult), batchEndIx - batchStartIx))

        accepted += batchResult

        if (warnOnUntested):
            for alignmentNum in xrange(batchStartIx, batchEndIx):
                testOutcome = accepted[alignmentNum]
                if (testOutcome is None):
                    # alignmentNum is a zero-based index, so it is the
                    # alignment number (not the line number) that needs the +1
                    print >>stderr, "WARNING: alignment number %d (at line %d) could not be tested" \
                                  % (1 + alignmentNum,
                                     alignmentList[alignmentNum].lineNumber)

        for alignmentNum in xrange(batchStartIx, batchEndIx):
            testOutcome = accepted[alignmentNum]
            outcomeCount[testOutcome] += 1

    # process the alignments and their assessments
    # $$$ untested alignments should be processed by some other test -- for
    #     example (if we're testing by error counts), a perfect alignment
    #     currently gets discarded because it can't be tested

    if (reportAs in ["matrix", "silent"]):
        outcomeMapping = {True:  "not_rejected",
                          False: "rejected",
                          None:  "untested"}
    else:  # if (reportAs == "ncrf"):
        if (testWhich == "matches-insertions"):
            outcomeMapping = {True:  "match-insert uniformity not rejected",
                              False: "match-insert uniformity rejected",
                              None:  "untested"}
        elif (testWhich == "errors"):
            outcomeMapping = {True:  "error uniformity not rejected",
                              False: "error uniformity rejected",
                              None:  "untested"}
        else:  # if (testWhich == "matches"):
            outcomeMapping = {True:  "match uniformity not rejected",
                              False: "match uniformity rejected",
                              None:  "untested"}

    # summarize the outcomes on stderr, with the counts in an aligned column

    outcomeNameW = max([len(outcomeMapping[testOutcome])
                        for testOutcome in outcomeMapping])
    for testOutcome in [True, False, None]:
        outcomeName = outcomeMapping[testOutcome]
        count = outcomeCount[testOutcome]
        reportStr = "%-*s %d" % (outcomeNameW + 1, "%s:" % outcomeName, count)
        if (numAlignments > 0):
            reportStr += " (%.2f%%)" % (100.0 * count / numAlignments)
        print >>stderr, reportStr

    if (reportAs == "matrix"):
        # see note [3] above for the format of the matrix file
        for (alignmentNum, a) in enumerate(alignmentList):
            testOutcome = accepted[alignmentNum]
            vec = [a.lineNumber, outcomeMapping[testOutcome]] \
                + mxMatrix[alignmentNum]
            print("\t".join(map(str, vec)))
    elif (reportAs == "silent"):
        pass
    else:  # if (reportAs == "ncrf"):
        numKept = 0
        isFirst = True
        for (alignmentNum, a) in enumerate(alignmentList):
            testOutcome = accepted[alignmentNum]
            if (discardWhich == "good"):
                if (testOutcome == True):
                    continue
            elif (discardWhich == "bad"):
                if (testOutcome != True):
                    continue

            if (discardWhich == "none"):
                # nothing is discarded, so record the outcome in the alignment
                testInfo = "# %s: %s" % (testDescription,
                                         outcomeMapping[testOutcome])
                (startIx, endIx) = a.positional_stats_indexes()
                a.lines.insert(endIx, testInfo)

            # alignments are separated by a blank line
            if (isFirst):
                isFirst = False
            else:
                print("")
            print(a)
            numKept += 1

        reportStr = "kept %d of %d alignments" % (numKept, numAlignments)
        if (numAlignments > 0):
            reportStr += ", %.2f%%" % (100.0 * numKept / numAlignments)
        print >>stderr, reportStr

        if (requireEof):
            print("# ncrf end-of-file")