コード例 #1
0
def main():

    # parse the command line

    reportTotals = False

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg in ["--report:totals", "--report:total"]):
            reportTotals = True
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            usage("unrecognized option: %s" % arg)

    # accumulate the length distributions

    lengthToCount = {}

    for (length, count) in read_length_counts(stdin):
        if (length not in lengthToCount): lengthToCount[length] = count
        else: lengthToCount[length] += count

    # report the total distribution

    lengths = [length for length in lengthToCount]
    lengths.sort()

    print "\n".join(
        ["%d\t%d" % (length, lengthToCount[length]) for length in lengths])

    if (reportTotals):
        numSequences = sum([lengthToCount[length] for length in lengths])
        if (numSequences == 0):
            print >> stderr, "0 sequences / 0 bp total"
        else:
            totalBp = sum(
                [lengthToCount[length] * length for length in lengths])
            avgSequence = int(round(float(totalBp) / numSequences))
            print >>stderr, "%s sequences / %s bp total / %s bp average" \
                          % (commatize(numSequences),commatize(totalBp),commatize(avgSequence))
コード例 #2
0
def main():

    # parse the command line

    reportProgress = None

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg.startswith("--progress=")):
            reportProgress = int_with_unit(argVal)
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            usage("unrecognized option: %s" % arg)

    # process the fasta sequences

    lengthToCount = {}

    inputCount = inputBp = 0
    for (seqLen) in read_fasta_lengths(stdin):
        inputCount += 1
        inputBp += seqLen

        if (reportProgress != None):
            if (inputCount % reportProgress == 0):
                print >>stderr, "%s sequences read (%s nts, avg=%s)" \
                              % (commatize(inputCount),commatize(inputBp),
                                 commatize(int(round(float(inputBp)/inputCount))))

        if (seqLen not in lengthToCount): lengthToCount[seqLen] = 1
        else: lengthToCount[seqLen] += 1

    # report the distribution

    lengths = [length for length in lengthToCount]
    lengths.sort()

    print "\n".join(
        ["%d\t%d" % (length, lengthToCount[length]) for length in lengths])
コード例 #3
0
def collect_alignments(f,
                       testWhich,
                       headLimit=None,
                       subsampleK=None,
                       subsampleN=None,
                       requireEof=True):
    """Collect alignments from f along with their positional error vectors.

    f and requireEof are handed to alignments().  testWhich selects the
    vector flavor: "matches-insertions" requests the "m-i" modified vector,
    anything else the default one.  If headLimit is given, reading stops
    once more than that many alignments have been seen.  If subsampleN is
    given, only every subsampleN-th alignment is kept, namely those whose
    zero-based index is congruent to subsampleK-1.

    Progress reporting is controlled by reportProgress, which is read from
    the enclosing module rather than passed in.

    Returns (unitLength, alignmentList, mxMatrix); unitLength is half the
    length of a vector, and is None if nothing was collected.

    Raises ValueError if an alignment has no positional vector, or if the
    vectors do not all have the same length.
    """
    unitLength = None
    alignmentList = []
    mxMatrix = []

    alignmentNum = 0
    for a in alignments(f, requireEof):
        alignmentNum += 1

        if (reportProgress != None):
            if (alignmentNum == 1) or (alignmentNum % reportProgress == 0):
                print >>stderr, "progress: reading alignment %s" \
                              % (commatize(alignmentNum))

        if (headLimit != None) and (alignmentNum > headLimit):
            print >> stderr, "limit of %d alignments reached" % headLimit
            break

        # skip alignments that aren't part of the requested subsample
        if (subsampleN != None) \
          and ((alignmentNum - 1) % subsampleN != (subsampleK - 1)):
            continue

        if (testWhich == "matches-insertions"):
            # note [1]
            mxRow = positional_error_vector(a, modified="m-i")
        else:
            mxRow = positional_error_vector(a)

        if (mxRow == None):
            raise ValueError(
                "alignment at line %d does not contain positional information"
                % a.lineNumber)

        # the first vector establishes the unit length; every later vector
        # has to agree with it
        if (unitLength == None):
            unitLength = len(mxRow) / 2
        elif (len(mxRow) != 2 * unitLength):
            raise ValueError(
                "alignments have different motif lengths, %d and %d (detected at line %d)"
                % (unitLength,len(mxRow)/2,a.lineNumber))

        alignmentList.append(a)
        mxMatrix.append(mxRow)

    return (unitLength, alignmentList, mxMatrix)
コード例 #4
0
def main():
	global nameFieldW,lengthFieldW,countFieldW,rangeFieldW
	global debug

	# parse the command line

	genomeFilename      = None
	readsFilename       = None
	cigarFilename       = None
	intervalsFilename   = None
	intervalsAreCatalog = False
	motifs              = None
	chromsOfInterest    = None
	minLength           = None
	noiselessGenome     = True
	reportProgress      = None
	nameFieldW          = 1
	lengthFieldW        = 1
	countFieldW         = 1
	rangeFieldW         = 1
	debug               = []

	for arg in argv[1:]:
		if ("=" in arg):
			argVal = arg.split("=",1)[1]

		if (arg.startswith("--genome=")):
			genomeFilename = argVal
		elif (arg.startswith("--reads=")) or (arg.startswith("--read=")):
			readsFilename = argVal
		elif (arg.startswith("--cigars=")) or (arg.startswith("--cigar=")):
			cigarFilename = argVal
		elif (arg.startswith("--intervals=")) or (arg.startswith("--interval=")):
			if (intervalsFilename != None):
				usage("--intervals and --catalog are mutually exclusive")
			intervalsFilename   = argVal
			intervalsAreCatalog = False
		elif (arg.startswith("--catalog=")):
			if (intervalsFilename != None):
				usage("--intervals and --catalog are mutually exclusive")
			intervalsFilename   = argVal
			intervalsAreCatalog = True
		elif (arg.startswith("--motif=")):
			if (motifs == None): motifs = set()
			motifs.add(argVal)
		elif (arg.startswith("--chromosome=")) or (arg.startswith("--chromosomes=")) \
		  or (arg.startswith("--chrom="))      or (arg.startswith("--chroms=")):
			if (chromsOfInterest == None): chromsOfInterest = set()
			for chrom in argVal.split(","):
				chromsOfInterest.add(chrom)
		elif (arg.startswith("--minlength=")) or (arg.startswith("--minlen=")):
			try:
				minLength = int(argVal)
				if (minLength < 0): raise ValueError
				if (minLength == 0): minLength = None
			except ValueError:
				usage("bad length in \"%s\"" % arg)
		elif (arg == "--noisygenome"):
			noiselessGenome = False
		elif (arg.startswith("--progress=")):
			reportProgress = int_with_unit(argVal)
		elif (arg.startswith("--fields=")) or (arg.startswith("F=")):
			(nameFieldW,lengthFieldW,countFieldW,rangeFieldW) = argVal.split(",",4)
			nameFieldW   = max(int(nameFieldW),1)
			lengthFieldW = max(int(lengthFieldW),1)
			countFieldW  = max(int(countFieldW),1)
			rangeFieldW  = max(int(rangeFieldW),1)
		elif (arg.startswith("--namefield=")) or (arg.startswith("F1=")):
			nameFieldW = max(int(argVal),1)
		elif (arg.startswith("--lengthfield=")) or (arg.startswith("F2=")):
			lengthFieldW = max(int(argVal),1)
		elif (arg.startswith("--countfield=")) or (arg.startswith("F3=")):
			countFieldW = max(int(argVal),1)
		elif (arg.startswith("--intervalfield=")) or (arg.startswith("F4=")):
			rangeFieldW = max(int(argVal),1)
		elif (arg == "--debug"):
			debug += ["debug"]
		elif (arg.startswith("--debug=")):
			debug += argVal.split(",")
		elif (arg.startswith("--")):
			usage("unrecognized option: %s" % arg)
		else:
			usage("unrecognized option: %s" % arg)

	if (genomeFilename == None):
		usage("you need to give me a genome file")
	if (readsFilename == None):
		usage("you need to give me a reads file")
	if (cigarFilename == None):
		usage("you need to give me a cigar strings file")

	if (motifs != None) and (not intervalsAreCatalog):
		usage("--motifs requires --catalog")

	# read the intervals
	#
	# nota bene: this can modify chromsOfInterest, restricting it to the
	# chromosomes in the intervals list

	chromToIntervals = None
	motifsSeen = set()

	if (intervalsFilename != None):
		chromToIntervals = {}

		if (intervalsFilename.endswith(".gz")) or (intervalsFilename.endswith(".gzip")):
			intervalsF = gzip_open(intervalsFilename,"rt")
		else:
			intervalsF = file(intervalsFilename,"rt")

		for (lineNumber,chrom,gStart,gEnd,tags) in read_intervals(intervalsF):
			if (chromsOfInterest != None) and (chrom not in chromsOfInterest): continue
			if (chrom not in chromToIntervals): chromToIntervals[chrom] = []

			if (intervalsAreCatalog):
				if (tags == None):
					exit("%s: not enough fields at line %d (%d, expected at least %d)"
					   % (os_path.basename(argv[0]),lineNumber,len(fields),4))
				(motif,strand) = (tags[0][:-1],tags[0][-1:])
				if ("." in motif): motif = motif[:motif.find(".")]
				if (strand not in ["+","-"]) or (not is_nucleotide_string(motif)):
					exit("%s: bad motif at line %d: \"%s\""
					   % (os_path.basename(argv[0]),lineNumber,tags[0]))

				if (motifs != None):
					if (motif not in motifs): continue
					motifsSeen.add(motif)
			else:
				motif = strand = None

			chromToIntervals[chrom] += [(gStart,gEnd,motif,strand)]

		intervalsF.close()

		for chrom in chromToIntervals:
			chromToIntervals[chrom].sort()

		if (chromsOfInterest == None):
			chromsOfInterest = set(chromToIntervals)
		else:
			for chrom in chromsOfInterest:
				if (chrom not in chromToIntervals):
					chromsOfInterest.remove(chrom)

	if (motifs != None):
		for motif in motifs:
			if (motif not in motifsSeen):
				print >>stderr, "WARNING \"%s\" was not seen in %s" \
				              % (motif,intervalsFilename)

	# read the genome

	chromToSequence = {}

	if (genomeFilename.endswith(".gz")) or (genomeFilename.endswith(".gzip")):
		genomeF = gzip_open(genomeFilename,"rt")
	else:
		genomeF = file(genomeFilename,"rt")

	for (chrom,seq) in read_fasta_sequences(genomeF,chromsOfInterest):
		if (chrom in chromToSequence):
			exit("%s: \"%s\" appears more than once in \"%s\""
			   % (os_path.basename(argv[0]),chrom,genomeFilename))
		chromToSequence[chrom] = seq

	genomeF.close()

	if (chromsOfInterest != None):
		for chrom in chromsOfInterest:
			if (chrom not in chromToSequence):
				exit("%s: \"%s\" doesn't appear in \"%s\""
				   % (os_path.basename(argv[0]),chrom,genomeFilename))

	# read the cigar strings

	if (cigarFilename.endswith(".gz")) or (cigarFilename.endswith(".gzip")):
		cigarF = gzip_open(cigarFilename,"rt")
	else:
		cigarF = file(cigarFilename,"rt")

	readNameToCigar = {}

	for (lineNumber,line,readName,chrom,strand,gStart,gEnd,cigar) in read_cigars(cigarF):
		if (chromsOfInterest != None) and (chrom not in chromsOfInterest): continue
		(rLength,gLength) = cigar_lengths(cigar)
		readNameToCigar[readName] = (chrom,gStart,gEnd,gLength,strand,rLength,cigar)
		if (gLength != gEnd-gStart):
			exit("%s: bad cigar line (at line %d); cigar doesn't match interval length (%d vs %d)\n%s"
			   % (os_path.basename(argv[0]),lineNumber,gLength,gEnd-gStart,line))

	cigarF.close()

	# process the reads

	if (readsFilename.endswith(".gz")) or (readsFilename.endswith(".gzip")):
		readsF = gzip_open(readsFilename,"rt")
	else:
		readsF = file(readsFilename,"rt")

	readNum = alignmentsReported = 0
	for (readName,rNucs) in read_fasta_sequences(readsF):
		readNum += 1
		if (reportProgress != None) \
		   and ((readNum == 1) or (readNum % reportProgress == 0)):
			print >>stderr, "progress: processing read #%s %s (%s alignments reported so far)" \
			              % (commatize(readNum),readName,commatize(alignmentsReported))

		if (readName not in readNameToCigar):
			exit("%s: \"%s\" doesn't appear in \"%s\""
			   % (os_path.basename(argv[0]),readNameToCigar,cigarFilename))

		(chrom,gStart,gEnd,gLength,strand,rLength,cigar) = readNameToCigar[readName]
		gNucs = chromToSequence[chrom][gStart:gEnd]

		if (strand == "-"):
			gNucs = reverse_complement(gNucs)

		a = Alignment()
		a.readName = readName
		a.rStart   = 0
		a.rEnd     = rLength
		a.rLength  = rLength
		a.rNucs    = rNucs
		a.chrom    = chrom
		a.strand   = strand
		a.gStart   = gStart
		a.gEnd     = gEnd
		a.gNucs    = gNucs
		a.score    = 0
		a.motif    = "%s:%d-%d%s" % (chrom,a.gStart,a.gEnd,strand)

		(a.rText,a.gText) = reconstruct_alignment(rNucs,gNucs,cigar)

		if (chromToIntervals == None):
			if (minLength != None) and (a.gEnd-a.gStart < minLength):
				continue
			print_alignment(a)
			alignmentsReported += 1
		else:
			intervals = chromToIntervals[chrom]
			for (s,e,motif,mStrand) in intersecting_intervals(intervals,gStart,gEnd):
				aSliced = slice_alignment(a,s,e)
				if (minLength != None) and (aSliced.gEnd-aSliced.gStart < minLength):
					continue
				print_alignment(aSliced)
				alignmentsReported += 1

				if ("intervalsanity" in debug):
					rText    = remove_gaps(aSliced.rText)
					realText = rNucs[aSliced.rStart:aSliced.rEnd]
					if (realText != rText):
						exit("%s: sanity check failed for read:\n\"%s\"\n\"%s\""
						   % (os_path.basename(argv[0]),rText,realText))

					gText    = remove_gaps(aSliced.gText).upper()
					realText = chromToSequence[chrom][aSliced.gStart:aSliced.gEnd]
					if (strand == "-"): realText = reverse_complement(realText)
					if (realText != gText):
						exit("%s: sanity check failed for genome:\n\"%s\"\n\"%s\""
						   % (os_path.basename(argv[0]),gText,realText))
					print >>stderr, "%s: sanity check passed for read %s" \
					              % (os_path.basename(argv[0]),readName)

				if (motif != None):
					positionalStats = positonal_stats(aSliced,motif,mStrand,
					                                  noiselessGenome=noiselessGenome)
					print_positonal_stats(positionalStats)

	readsF.close()
	print "# ncrf end-of-file"

	if (reportProgress != None):
		print >>stderr, "progress: %s reads processed (%s alignments reported)" \
		              % (commatize(readNum),commatize(alignmentsReported))
コード例 #5
0
def main():
	global debug

	# parse the command line

	distributionFilename = None
	remainderFilename    = None
	wrapLength           = 100
	reportProgress       = None
	debug                = []

	for arg in argv[1:]:
		if ("=" in arg):
			argVal = arg.split("=",1)[1]

		if (arg.startswith("--remainder=")):
			remainderFilename = argVal
		elif (arg.startswith("--wrap=")):
			wrapLength = int(argVal)
			if (wrapLength <= 0): wrapLength = None
		elif (arg.startswith("--seed=")):
			random_seed(argVal)
		elif (arg.startswith("--progress=")):
			reportProgress = int_with_unit(argVal)
		elif (arg == "--debug"):
			debug += ["debug"]
		elif (arg.startswith("--debug=")):
			debug += argVal.split(",")
		elif (arg.startswith("--")):
			usage("unrecognized option: %s" % arg)
		elif (distributionFilename == None):
			distributionFilename = arg
		else:
			usage("unrecognized option: %s" % arg)

	if (distributionFilename == None):
		usage("you must provide a length-distribution filename")

	# read the distribution

	intervals = IntervalDict()

	distribF = file(distributionFilename,"rt")
	for spec in read_distribution_spec(distribF,distributionFilename):
		(lineNumber,minLength,maxLength,outCount,inCount) = spec

		interval = intervals.add(minLength,maxLength)
		if (interval == None): # interval overlaps an existing interval
			interval = Interval(minLength,maxLength)
			previous = intervals.overlapper(minLength,maxLength)
			assert (False), \
			       "%s (line %d) overlaps %s (line %d)" \
			     % (interval,lineNumber,previous,previous.lineNumber)

		interval.lineNumber = lineNumber
		interval.outCount   = outCount
		interval.inCount    = inCount

	distribF.close ()

	if ("distribution" in debug):
		for interval in intervals:
			print >>stderr, "%s %d %d" \
			              % (interval,interval.outCount,interval.inCount)

	# process the reads
	#
	# this filters reads based on the length (on the interval containing the
	# length); if we expect to see E more sequences of this length (including
	# this one), and we are to output N of those, we output this sequence with
	# probability N/E; and we adjust N and E for this length accordingly

	inputCount = outputCount = inputBp = outputBp = 0
	for (name,seq) in read_fasta_sequences(stdin):
		seqLen = len(seq)
		inputCount += 1
		inputBp    += seqLen

		if (reportProgress != None):
			if (inputCount % reportProgress == 0):
				print >>stderr, "%s sequences read, %s written (%.1f%%); %s nts read, %s written" \
				              % (commatize(inputCount),commatize(outputCount),
				                 100.0*outputCount/inputCount,
				                 commatize(inputBp),commatize(outputBp))

		try: interval = intervals[seqLen]
		except KeyError: continue

		if (interval.inCount <= 0):
			print >>stderr, "ERROR: for length %d (%s), actual input exceeded expected input count" \
			              % (seqLen,interval)
			if (remainderFilename != None):
				print >>stderr, "      (writing remainders to %s)" % remainderFilename
				remainderF = file(remainderFilename,"wt")
				write_remainders(remainderF,intervals)
				remainderF.close ()
			assert (False)

		if (interval.outCount == 0):
			keepSeq = False
		else:
			keepSeq = (randint(1,interval.inCount) <= interval.outCount)

		interval.inCount  -= 1
		if (not keepSeq): continue

		interval.outCount -= 1
		outputCount += 1
		outputBp    += seqLen
		print ">%s" % name
		if (wrapLength == None):
			print seq
		else:
			for i in range(0,seqLen,wrapLength):
				print seq[i:i+wrapLength]

	# write the remainders

	if (remainderFilename != None):
		remainderF = file(remainderFilename,"wt")
		write_remainders(remainderF,intervals)
		remainderF.close ()
コード例 #6
0
def main():
    global warnOnError

    # parse the command line

    minMapQ = None
    writeHeader = False
    writeWhat = "per alignment"
    warnOnError = False
    headLimit = None
    reportProgress = None

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg.startswith("--mapq=")) or (arg.startswith("--MAPQ=")) or (
                arg.startswith("MAPQ=")):
            minMapQ = int(argVal)
        elif (arg in ["--withheader", "--with=header", "--with:header"]):
            writeHeader = True
        elif (arg in ["--sumonly", "--sum=only", "--sum:only"]):
            writeWhat = "sum only"
        elif (arg == "--warnandcontinue"):
            warnOnError = True
        elif (arg.startswith("--head=")):
            headLimit = int_with_unit(argVal)
        elif (arg.startswith("--progress=")):
            reportProgress = int_with_unit(argVal)
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            usage("unrecognized option: %s" % arg)

    # process the sam records

    sum = {"m": 0, "mm": 0, "io": 0, "ix": 0, "do": 0, "dx": 0}

    recordNum = alignmentNum = 0
    for a in read_sam_plain(stdin, minMapQ=minMapQ):
        recordNum += 1
        if (reportProgress != None) and (recordNum % reportProgress == 0):
            sum["events"] = (sum["m"] + sum["mm"] + sum["io"] + sum["ix"] +
                             sum["do"] + sum["dx"])
            mRatio = float(sum["m"]) / sum["events"]
            vec = [
                mRatio, sum["m"], sum["mm"], sum["io"], sum["ix"], sum["do"],
                sum["dx"]
            ]
            print >>stderr, "progress: processing sam record %s (mRatio=%.3f m=%d mm=%d io=%d ix=%d do=%d dx=%d)" \
                          % (commatize(recordNum),
                             mRatio,sum["m"],sum["mm"],sum["io"],sum["ix"],sum["do"],sum["dx"])

        if (headLimit != None) and (recordNum > headLimit):
            print >> stderr, "limit of %s sam records reached" % commatize(
                headLimit)
            break

        if (a.rName == "*"): continue  # read did not align
        if (minMapQ != None) and (a.mapQ < minMapQ): continue

        alignmentNum += 1
        events = sam_to_events(a)
        if (type(events) == str):
            print >> stderr, events
            continue
        (nMatch, nMismatch, nInsO, nInsX, nDelO, nDelX) = events

        if (writeHeader):
            print "\t".join(
                ["line", "read", "mRatio", "m", "mm", "io", "ix", "do", "dx"])
            writeHeader = False

        if (writeWhat == "per alignment"):
            mRatio = float(nMatch) / (nMatch + nMismatch + nInsO + nInsX +
                                      nDelO + nDelX)
            mRatio = "%.3f" % mRatio
            vec = [
                a.lineNumber, a.qName, mRatio, nMatch, nMismatch, nInsO, nInsX,
                nDelO, nDelX
            ]
            print "\t".join(map(str, vec))

        sum["m"] += nMatch
        sum["mm"] += nMismatch
        sum["io"] += nInsO
        sum["ix"] += nInsX
        sum["do"] += nDelO
        sum["dx"] += nDelX

    sum["events"] = (sum["m"] + sum["mm"] + sum["io"] + sum["ix"] + sum["do"] +
                     sum["dx"])

    if (alignmentNum == 0):
        print >> stderr, "WARNING: input contained no alignments"
    elif (writeWhat == "sum only"):
        alignmentNumStr = "(%d)" % alignmentNum
        mRatio = float(sum["m"]) / sum["events"]
        mRatio = "%.3f" % mRatio
        vec = [
            "all", alignmentNumStr, mRatio, sum["m"], sum["mm"], sum["io"],
            sum["ix"], sum["do"], sum["dx"]
        ]
        print "\t".join(map(str, vec))
コード例 #7
0
def sliced_consensus_filter(f, motifsOfInterest, nameToMotif, sliceWidth,
                            sliceStep):
    global userHasBeenWarned

    if (reportMsa) and (not userHasBeenWarned):
        print >> stderr, "WARNING: sliced consensus doesn't report MSA, ignoring that request"
        userHasBeenWarned = True

    alignmentNum = 0
    alignmentsWritten = 0
    for a in alignments(f, requireEof):
        alignmentNum += 1

        if (reportProgress != None):
            if (alignmentNum == 1) or (alignmentNum % reportProgress == 0):
                print >>stderr, "progress: testing alignment %s" \
                              % commatize(alignmentNum)

        if (headLimit != None) and (alignmentNum > headLimit):
            print >> stderr, "limit of %d alignments reached" % headLimit
            break

        if (a.motif in nameToMotif):
            a.motif = nameToMotif[a.motif]

        if (motifsOfInterest != None) and (a.motif not in motifsOfInterest):
            continue

        if ([ch for ch in a.motif if (ch not in "ACGT")] != []):
            abort_warn_about_named_motifs(a)

        motifText = a.motifText
        seqText = a.seqText
        if ("noflip" in debug):
            pass
        elif (a.strand == "-") and (a.start < a.end):
            # alignment was reported in reverse complement of motif, so flip it
            motifText = reverse_complement(motifText)
            seqText = reverse_complement(seqText)

        # look for consensus over each slice, separately

        consensuses = set()

        numSlices = (len(motifText) + sliceStep -
                     1) / sliceStep  # (an overestimate)
        minSlice = 10 * len(a.motif)

        for sliceNum in xrange(numSlices):
            sliceStart = sliceNum * sliceStep
            sliceEnd = min(sliceStart + sliceWidth, len(motifText))
            if (sliceEnd - sliceStart < minSlice): break

            motifTextSlice = motifText[sliceStart:sliceEnd]
            seqTextSlice = seqText[sliceStart:sliceEnd]

            # derive consensus(es)

            seqChunks = chunkify(a.motif, motifTextSlice, seqTextSlice)

            if ("consensus" in debug):
                print >> stderr
                print >>stderr, "%d score=%d slice.start=%d slice.end=%d" \
                              % (a.lineNumber,a.score,sliceStart,sliceEnd)

            sliceConsensuses = derive_consensuses(
                seqChunks, winnerThreshold=winnerThreshold)
            sliceConsensuses = list(sliceConsensuses)
            if (sliceConsensuses == []):
                consensuses.add(None)
            else:
                for word in sliceConsensuses:
                    consensuses.add(word)

                if ("consensus" in debug):
                    for word in sliceConsensuses:
                        print >> stderr, "consensus %s" % word

        consensuses = list(consensuses)

        # discard the alignment if it meets the filtering criterion (if there
        # is any such criterion)

        if (filterToKeep == "consensus"):
            if (a.motif not in consensuses): continue  # (discard it)
        elif (filterToKeep == "non consensus"):
            if (a.motif in consensuses): continue  # (discard it)
        else:  # if (filterToKeep == "no filter"):
            pass

        # copy the (unfiltered) alignment to the output

        if (alignmentsWritten > 0): print
        alignmentsWritten += 1

        print "\n".join(a.lines)

        # report the consensus, if we're supposed to

        if (reportConsensus):
            if (consensuses == []):
                print "# consensus (none)"
            else:
                canonicalized = []
                for motif in consensuses:
                    if (motif == None): continue
                    if (motif != a.motif) and (canonicalizeConsensuses):
                        (motif, strand) = canonical_motif(motif)
                    canonicalized += [motif]
                if (None in consensuses):
                    canonicalized += ["(none)"]
                print "# consensus %s" % ",".join(canonicalized)

    if (requireEof):
        print "# ncrf end-of-file"
コード例 #8
0
def simple_consensus_filter(f, motifsOfInterest, nameToMotif):
    alignmentNum = 0
    alignmentsWritten = 0
    for a in alignments(f, requireEof):
        alignmentNum += 1

        if (reportProgress != None):
            if (alignmentNum == 1) or (alignmentNum % reportProgress == 0):
                print >>stderr, "progress: testing alignment %s" \
                              % commatize(alignmentNum)

        if (headLimit != None) and (alignmentNum > headLimit):
            print >> stderr, "limit of %d alignments reached" % headLimit
            break

        if (a.motif in nameToMotif):
            a.motif = nameToMotif[a.motif]

        if (motifsOfInterest != None) and (a.motif not in motifsOfInterest):
            continue

        if ([ch for ch in a.motif if (ch not in "ACGT")] != []):
            abort_warn_about_named_motifs(a)

        motifText = a.motifText
        seqText = a.seqText
        if ("noflip" in debug):
            pass
        elif (a.strand == "-") and (a.start < a.end):
            # alignment was reported in reverse complement of motif, so flip it
            motifText = reverse_complement(motifText)
            seqText = reverse_complement(seqText)

        # derive consensus(es)

        seqChunks = chunkify(a.motif, motifText, seqText)

        if ("consensus" in debug):
            print >> stderr
            print >> stderr, "%d score=%d" % (a.lineNumber, a.score)

        consensuses = derive_consensuses(seqChunks,
                                         winnerThreshold=winnerThreshold)
        consensuses = list(consensuses)

        # discard the alignment if it meets the filtering criterion (if there
        # is any such criterion)

        if (filterToKeep == "consensus"):
            if (a.motif not in consensuses): continue  # (discard it)
        elif (filterToKeep == "non consensus"):
            if (a.motif in consensuses): continue  # (discard it)
        else:  # if (filterToKeep == "no filter"):
            pass

        # copy the (unfiltered) alignment to the output

        if (alignmentsWritten > 0): print
        alignmentsWritten += 1

        print "\n".join(a.lines)

        # report the consensus, if we're supposed to

        if (reportConsensus):
            if (consensuses == []):
                print "# consensus (none)"
            else:
                canonicalized = []
                for motif in consensuses:
                    if (motif != a.motif) and (canonicalizeConsensuses):
                        (motif, strand) = canonical_motif(motif)
                    canonicalized += [motif]
                print "# consensus %s" % ",".join(canonicalized)

        # report the MSA from which the consensus was derived, if we're
        # supposed to

        if (reportMsa):
            motifLen = len(a.motif)
            positionLength = [1] * motifLen
            for chunk in seqChunks:
                for (motifIx, seqNucs) in enumerate(chunk):
                    if (seqNucs == None): continue
                    positionLength[motifIx] = max(positionLength[motifIx],
                                                  len(seqNucs))

            line = []
            for (motifIx, motifNuc) in enumerate(a.motif):
                line += [motifNuc.ljust(positionLength[motifIx], ".")]
            print "# msa.query %s" % "".join(line)

            for chunk in seqChunks:
                line = []
                for (motifIx, seqNucs) in enumerate(chunk):
                    if (seqNucs == None):
                        line += ["." * positionLength[motifIx]]
                    elif (seqNucs == a.motif[motifIx]):
                        line += ["=" * positionLength[motifIx]]
                    else:
                        line += [seqNucs.ljust(positionLength[motifIx], ".")]
                print "# msa.seq   %s" % "".join(line)

    if (requireEof):
        print "# ncrf end-of-file"
コード例 #9
0
def main():
    global reportProgress, batchSize
    global debug

    # parse the command line

    testMethod = "min-max"
    numTrials = 10 * 1000  # (only used for testMethod == "min-max")
    numNeededToPass = 1  # (only used for testMethod == "min-max")
    effectSize = 0.3  # (only used for testMethod == "chi-square")
    power = 0.8  # (only used for testMethod == "chi-square")
    discardWhich = "bad"
    testWhich = "matches-insertions"
    warnOnUntested = False
    subsampleK = None
    subsampleN = None
    headLimit = None
    batchSize = None  # (will be replace by method-specific result)
    reportAs = "ncrf"
    requireEof = True
    prngSeed = defaultPrngSeed
    reportProgress = None
    debug = []

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg == "--method=min-max"):
            testMethod = "min-max"
        elif (arg.startswith("--trials=")):
            if ("/" in argVal):
                (numNeededToPass, numTrials) = map(int_with_unit,
                                                   argVal.split("/", 1))
                if (numTrials < 1):
                    usage("bad value in: %s (trials must be at least 1)" % arg)
                if (not 1 <= numNeededToPass <= numTrials):
                    usage(
                        "bad value in: %s (num-in-bounds must be in range 1..trials)"
                        % arg)
            else:
                (numNeededToPass, numTrials) = (1, int_with_unit(argVal))
                if (numTrials < 1):
                    usage("bad value in: %s (trials must be at least 1)" % arg)
        elif (arg in ["--method=chi-squared",
                      "--method=chi-square"]):  # (unadvertised, see [4])
            testMethod = "chi-squared"
        elif (arg.startswith("--effectsize=")):  # (unadvertised, see [4])
            effectSize = parse_probability(argVal)
        elif (arg.startswith("--power=")):  # (unadvertised, see [4])
            power = parse_probability(argVal)
        elif (arg in ["--discard:bad", "--discard=bad"]):
            discardWhich = "bad"
        elif (arg in ["--discard:good", "--discard=good"]):
            discardWhich = "good"
        elif (arg in ["--discard:none", "--discard=none"]):
            discardWhich = "none"
        elif (arg in [
                "--test:matches-insertions", "--test=matches-insertions",
                "--test:m-i", "--test=m-i"
        ]):
            testWhich = "matches-insertions"
        elif (arg in ["--test:matches", "--test=matches"]):
            testWhich = "matches"
        elif (arg in ["--test:errors", "--test=errors"]):
            testWhich = "errors"
        elif (arg == "--warn:untested") or (arg == "--warn=matrix"):
            warnOnUntested = True
        elif (arg.startswith("--head=")):
            headLimit = int_with_unit(argVal)
        elif (arg.startswith("--subsample=")):
            (subsampleK, subsampleN) = map(int, argVal.split("/", 2))
            if (not 0 < subsampleK <= subsampleN):
                usage("bad subsample description in %s" % arg)
        elif (arg.startswith("--progress=")):
            reportProgress = int_with_unit(argVal)
        elif (arg.startswith("--batch=")
              ):  # (no longer advertised, since it only applies to R)
            batchSize = int(argVal)
        elif (arg == "--report:matrix") or (
                arg == "--report=matrix"):  # (unadvertised)
            reportAs = "matrix"
        elif (arg == "--report:silent") or (
                arg == "--report=silent"):  # (unadvertised)
            reportAs = "silent"
        elif (arg in ["--noendmark", "--noeof", "--nomark"]):  # (unadvertised)
            requireEof = False
        elif (arg.startswith("--seed=")):
            seed = argVal
            if (seed in ["none", "None", "NONE"]):
                prngSeed = None
            elif (seed in ["default", "Default", "DEFAULT"]):
                prngSeed = defaultPrngSeed
            else:
                # nota bene: if the seed is a number, use it as a number, since
                #            string seeds can produce different sequences on
                #            different versions/builds of python
                try:
                    seed = int(seed)
                except ValueError:
                    try:
                        seed = float(seed)
                    except ValueError:
                        pass
                prngSeed = seed
        elif (arg == "--debug"):
            debug += ["debug"]
        elif (arg.startswith("--debug=")):
            debug += argVal.split(",")
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            usage("unrecognized option: %s" % arg)

    if (reportAs in ["matrix", "silent"]):
        discardWhich = "none"

    if (testMethod == "chi-squared"):
        testDescription = "positional chi-squared"
        if (batchSize == None): batchSize = 30
    elif (testMethod == "min-max"):
        testDescription = "positional min-max"
        if (batchSize == None): batchSize = 1
    else:
        exit("%s: internal error: unrecognized test method: \"%s\"" %
             (os_path.basename(argv[0]), testMethod))

    # initialize the PRNG, if needed

    if (testMethod == "min-max"):
        if (prngSeed != None):
            random_seed(prngSeed)
    else:
        if (prngSeed not in [None, defaultPrngSeed]):
            print >>stderr, "WARNING: ignoring request to use PRNG with \"%s\"" \
                          % testMethod

    # make sure the shell commands we're gonna use have been installed

    if (testMethod == "chi-squared"):
        if (not shell_command_exists("Rscript")):
            exit((
                "%s: Unable to run the shell command \"Rscript\";" +
                "\n  .. Either R hasn't been installed, or the command-line shell"
                + " can't find it.") % os_path.basename(argv[0]))

    # collect the alignments; we need to collect the positional info for all
    # alignments, to feed to R in batches (doing them one-by-one was incredibly
    # slow); hopefully this won't become a memory problem

    (unitLength,alignmentList,mxMatrix) \
      = collect_alignments(stdin,testWhich,
                           headLimit=headLimit,
                           subsampleK=subsampleK,subsampleN=subsampleN,
                           requireEof=requireEof)

    numAlignments = len(alignmentList)
    if (reportProgress != None):
        print >>stderr, "progress: read %s alignments" \
                      % (commatize(numAlignments))

    # assess the alignments, batch-by-batch

    if (reportProgress != None):
        progressReported = -1

    accepted = []
    outcomeCount = {True: 0, False: 0, None: 0}
    for batchStartIx in xrange(0, numAlignments, batchSize):
        alignmentsTested = batchStartIx
        if (reportProgress != None):
            rBlock = (progressReported + 1) / reportProgress
            aBlock = (alignmentsTested + 1) / reportProgress
            if (alignmentsTested == 0) or (aBlock != rBlock):
                print >>stderr, "progress: testing alignment %s (%d uniform, %d non-uniform, %d untested)" \
                              % (commatize(1+alignmentsTested),
                                 outcomeCount[True],
                                 outcomeCount[False],
                                 outcomeCount[None])
                progressReported = alignmentsTested

        batchEndIx = min(batchStartIx + batchSize, numAlignments)
        if ("batch" in debug):
            print >>stderr, "using R for alignments %d thru %d" \
                          % (batchStartIx+1,batchEndIx)

        mxBatch = mxMatrix[batchStartIx:batchEndIx]
        aBatch = alignmentList[batchStartIx:batchEndIx]

        if (testMethod == "chi-squared"):
            batchResult = mx_significance_tests(mxBatch, testWhich, effectSize,
                                                power)
            if (type(batchResult) == str):
                exit(("%s: internal error: having trouble with R" +
                      " (with alignment batch %d..%d)" +
                      "\nHere's what R reported:\n%s") % (os_path.basename(
                          argv[0]), batchStartIx, batchEndIx, batchResult))
        else:  # if (testMethod == "min-max"):
            batchResult = min_max_tests(aBatch, mxBatch, batchStartIx,
                                        testWhich, numTrials, numNeededToPass)
            if (type(batchResult) == str):
                exit(("%s: internal error: having trouble with min-max test" +
                      " (with alignment batch %d..%d)" +
                      "\nHere's what was reported:\n%s") % (os_path.basename(
                          argv[0]), batchStartIx, batchEndIx, batchResult))

        if (len(batchResult) != batchEndIx - batchStartIx):
            exit((
                "%s: internal error: number of test outcomes reported by R (%d)"
                + "\n  .. doesn't match the number of tests given to R (%d)") %
                 (os_path.basename(
                     argv[0]), len(batchResult), batchEndIx - batchStartIx))
        accepted += batchResult

        if (warnOnUntested):
            for alignmentNum in xrange(batchStartIx, batchEndIx):
                testOutcome = accepted[alignmentNum]
                if (testOutcome == None):
                    print >>stderr, "WARNING: alignment number %d (at line %d) could not be tested" \
                                  % (alignmentNum,1+alignmentList[alignmentNum].lineNumber)

        for alignmentNum in xrange(batchStartIx, batchEndIx):
            testOutcome = accepted[alignmentNum]
            outcomeCount[testOutcome] += 1

    # process the alignments and their assessments
    # $$$ untested alignments should be processed by some other test -- for
    #     example (if we're testing by error counts), a perfect alignment
    #     currently gets discarded because it can't be tested

    if (reportAs in ["matrix", "silent"]):
        outcomeMapping = {
            True: "not_rejected",
            False: "rejected",
            None: "untested"
        }
    else:  # if (reportAs == "ncrf"):
        if (testWhich == "matches-insertions"):
            outcomeMapping = {
                True: "match-insert uniformity not rejected",
                False: "match-insert uniformity rejected",
                None: "untested"
            }
        elif (testWhich == "errors"):
            outcomeMapping = {
                True: "error uniformity not rejected",
                False: "error uniformity rejected",
                None: "untested"
            }
        else:  # if (testWhich == "matches"):
            outcomeMapping = {
                True: "match uniformity not rejected",
                False: "match uniformity rejected",
                None: "untested"
            }

    outcomeNameW = max(
        [len(outcomeMapping[testOutcome]) for testOutcome in outcomeMapping])
    for testOutcome in [True, False, None]:
        outcomeName = outcomeMapping[testOutcome]
        count = outcomeCount[testOutcome]
        reportStr = "%-*s %d" % (outcomeNameW + 1, "%s:" % outcomeName, count)
        if (numAlignments > 0):
            reportStr += " (%.2f%%)" % (100.0 * count / numAlignments)
        print >> stderr, reportStr

    if (reportAs == "matrix"):
        # see note [3] above for the format of the matrix file
        for (alignmentNum, a) in enumerate(alignmentList):
            testOutcome = accepted[alignmentNum]
            vec = [a.lineNumber, outcomeMapping[testOutcome]
                   ] + mxMatrix[alignmentNum]
            print "\t".join(map(str, vec))
    elif (reportAs == "silent"):
        pass
    else:  # if (reportAs == "ncrf"):
        numKept = 0
        isFirst = True
        for (alignmentNum, a) in enumerate(alignmentList):
            testOutcome = accepted[alignmentNum]
            if (discardWhich == "good"):
                if (testOutcome == True): continue
            elif (discardWhich == "bad"):
                if (testOutcome != True): continue

            if (discardWhich == "none"):
                testInfo = "# %s: %s" % (testDescription,
                                         outcomeMapping[testOutcome])
                (startIx, endIx) = a.positional_stats_indexes()
                a.lines.insert(endIx, testInfo)

            if (isFirst): isFirst = False
            else: print
            print a
            numKept += 1

        reportStr = "kept %d of %d alignments" % (numKept, numAlignments)
        if (numAlignments > 0):
            reportStr += ", %.2f%%" % (100.0 * numKept / numAlignments)
        print >> stderr, reportStr

        if (requireEof):
            print "# ncrf end-of-file"