Exemple #1
0
def main(argv=None):
    """Simulate sequencing reads from transcripts or genes in a GTF stream.

    Reads GTF records from stdin, extracts the exonic sequence of each
    transcript (``--mode=transcripts``) or merged gene (``--mode=genes``)
    from an indexed genome, and writes simulated FASTA reads to stdout.
    Per-sequence coverage and read lengths are drawn from Gaussian
    distributions.  Optionally, the template sequence is mutated before
    sampling (``--ds-mean``) and each read is mutated after sampling
    (``--error-mean``); ``getMutatedSequence`` performs the mutation and
    the percent identity of each mutated/original pair is recorded.
    With ``--output-filename-pattern``, per-sequence statistics and a
    read-to-transcript map are written to separate files.
    """

    if argv is None:
        argv = sys.argv

    # --- command line interface -------------------------------------
    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-p", "--output-filename-pattern", dest="output_filename_pattern", type="string",
                      help="OUTPUT filename with histogram information on aggregate coverages [%default].")

    parser.add_option("--read-length-mean", dest="read_length_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--read-length-std", dest="read_length_stddev", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--coverage-mean", dest="coverage_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--coverage-std", dest="coverage_stddev", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--ds-mean", dest="ds_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--ds-std", dest="ds_stddev", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--error-mean", dest="error_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--error-std", dest="error_stddev", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--min-read-length", dest="min_read_length", type="int",
                      help="minimum read length [default=%default].")

    parser.add_option("--sample-size", dest="sample_size", type="int",
                      help="randomly sample from selected transcripts [default=%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="test with # first entries [default=%default].")

    parser.add_option("--mode", dest="mode", type="choice",
                      choices=("genes", "transcripts"),
                      help="use genes or transcripts [default=%default].")

    parser.set_defaults(
        genome_file=None,
        read_length_mean=200.0,
        read_length_stddev=20.0,
        coverage_mean=2.0,
        coverage_stddev=1.0,
        ds_mean=None,
        ds_stddev=None,
        error_mean=None,
        error_stddev=None,
        min_read_length=50,
        test=None,
        mode="transcripts",
        output_filename_pattern=None,
        output_format_id="%010i",
        sample_size=0,
    )

    (options, args) = E.Start(parser, argv)

    assert options.genome_file, "please supply an indexed genome."

    # Optional auxiliary output files: the pattern must contain a single
    # "%s" which is filled with "stats" / "map" respectively.
    if options.output_filename_pattern:
        outfile_stats = IOTools.openFile(options.output_filename_pattern % "stats", "w")
        outfile_stats.write(
            "id\tlen\tnreads\tlen_mean\tlen_std\tcov_mean\tcov_std\n")
        outfile_map = IOTools.openFile(options.output_filename_pattern % "map", "w")
        outfile_map.write("id\ttranscript\n")
    else:
        outfile_stats = None
        outfile_map = None

    genome = IndexedFasta.IndexedFasta(options.genome_file)

    # counters and global accumulators across all transcripts/genes
    ninput, noutput, nskipped = 0, 0, 0

    total_counts, total_read_lengths, total_len = [], [], 0
    total_pids = []
    total_error_pids = []

    # choose iteration unit: individual transcripts or flattened genes
    if options.mode == "transcripts":
        iterator = GTF.transcript_iterator(
            GTF.iterator_filtered(GTF.iterator(options.stdin), feature="exon"))
        getId = lambda x: x.transcript_id
    elif options.mode == "genes":
        iterator = GTF.flat_gene_iterator(
            GTF.iterator_filtered(GTF.iterator(options.stdin), feature="exon"))
        getId = lambda x: x.gene_id

    # NOTE(review): sample_size is not passed to Iterators.sample here;
    # sampling is bounded by the "ninput >= options.sample_size" break in
    # the main loop below — confirm Iterators.sample's contract.
    if options.sample_size:
        iterator = Iterators.sample(iterator)

    # template-level mutation (divergence) setup
    if options.ds_mean:
        do_mutate = True
        pid_calc = SequencePairProperties.SequencePairPropertiesPID()
    else:
        do_mutate = False

    # per-read error setup; reuses/overwrites pid_calc if both are active
    if options.error_mean:
        do_error = True
        pid_calc = SequencePairProperties.SequencePairPropertiesPID()
    else:
        do_error = False

    for gtfs in iterator:

        id = getId(gtfs[0])

        try:
            sequence = GTF.toSequence(gtfs, genome)
        except KeyError as msg:
            # contig of this entry is not present in the indexed genome
            if options.loglevel >= 2:
                options.stdlog.write("# skipping %s: %s\n" % (id, msg))
            nskipped += 1
            continue

        lsequence = len(sequence)

        # too short to yield at least two minimum-length reads
        if lsequence <= options.min_read_length * 2:
            if options.loglevel >= 2:
                options.stdlog.write(
                    "# skipping %s - sequence is too short: %i\n" % (id, lsequence))
            nskipped += 1
            continue

        ninput += 1

        # mutate the template once per transcript; downstream reads are
        # sampled from the mutated sequence
        if do_mutate:
            new_sequence = getMutatedSequence(sequence, options.ds_mean)
            pid_calc.loadPair(sequence, new_sequence)
            pid = pid_calc.mPID
            total_pids.append(pid)
            sequence = new_sequence
        else:
            pid = 100.0

        if options.loglevel >= 2:
            options.stdlog.write(
                "# processing %s - len=%i\n" % (id, lsequence))
            options.stdlog.flush()

        total_len += lsequence
        # target number of sampled bases = length * sampled coverage
        lvsequence = lsequence * \
            random.gauss(options.coverage_mean, options.coverage_stddev)

        covered = 0
        counts = numpy.zeros(lsequence)
        nreads = 0

        error_pids, read_lengths = [], []

        # sample reads until the coverage target is met
        while covered < lvsequence:

            read_length = int(
                random.gauss(options.read_length_mean, options.read_length_stddev))
            positive = random.randint(0, 1)

            # anchor either the start (forward) or the end (reverse) and
            # clip the read to the sequence boundaries
            if positive:
                start = random.randint(0, lsequence)
                end = min(lsequence, start + read_length)
            else:
                end = random.randint(0, lsequence)
                start = max(0, end - read_length)

            read_length = end - start
            if read_length < options.min_read_length:
                continue

            segment = sequence[start:end]

            # NOTE(review): complement (not reverse-complement) is applied
            # for negative-strand reads — confirm this is intended.
            if not positive:
                segment = Genomics.complement(segment)

            noutput += 1

            # per-read sequencing error
            if do_error:
                new_segment = getMutatedSequence(segment, options.error_mean)
                pid_calc.loadPair(segment, new_segment)
                pid = pid_calc.mPID
                error_pids.append(pid)
                segment = new_segment
            else:
                pid = 100.0

            options.stdout.write(
                ">%s\n%s\n" % (options.output_format_id % noutput, segment))

            if outfile_map:
                outfile_map.write(
                    "%s\t%s\n" % (id, options.output_format_id % noutput))

            # per-base coverage bookkeeping
            for x in range(start, end):
                counts[x] += 1

            nreads += 1

            covered += read_length
            read_lengths.append(read_length)

        # NOTE(review): this diagnostic goes to stdout (which also carries
        # the FASTA output), unlike the other loglevel>=2 messages that go
        # to stdlog — confirm this is intentional.
        if options.loglevel >= 2:
            options.stdout.write("# transcript %s: len=%i, nreads=%i, len_mean=%.2f, len_std=%.2f, cov_mean=%.2f, cov_stddev=%.2f\n" % (id,
                                                                                                                                        lsequence,
                                                                                                                                        nreads,
                                                                                                                                        numpy.mean(
                                                                                                                                            read_lengths),
                                                                                                                                        numpy.std(
                                                                                                                                            read_lengths),
                                                                                                                                        numpy.mean(
                                                                                                                                            counts),
                                                                                                                                        numpy.std(counts)))
        if outfile_stats:
            outfile_stats.write("%s\t%i\t%i\t%.2f\t%.2f\t%.2f\t%.2f\n" % (id,
                                                                          lsequence,
                                                                          nreads,
                                                                          numpy.mean(
                                                                              read_lengths),
                                                                          numpy.std(
                                                                              read_lengths),
                                                                          numpy.mean(
                                                                              counts),
                                                                          numpy.std(counts)))

        # fold per-sequence data into the global accumulators
        total_counts += list(counts)
        total_read_lengths += read_lengths
        total_error_pids += error_pids

        if options.test and ninput >= options.test:
            break

        if options.sample_size and ninput >= options.sample_size:
            break

    # --- final summary ----------------------------------------------
    if options.loglevel >= 1:
        output = ["len=%i, nreads=%i" % (total_len,
                                         noutput)]
        output.append("len_mean=%.2f, len_std=%.2f, cov_mean=%.2f, cov_stddev=%.2f" % (
            numpy.mean(total_read_lengths),
            numpy.std(total_read_lengths),
            numpy.mean(total_counts),
            numpy.std(total_counts)))

        # coverage statistics restricted to covered positions only
        no_uncovered = [x for x in total_counts if x > 0]

        output.append("cov0_mean=%.2f, cov0_stddev=%.2f" % (numpy.mean(no_uncovered),
                                                            numpy.std(no_uncovered)))

        if do_mutate:
            output.append("pid_mean=%.2f, pid_std=%.2f" %
                          (numpy.mean(total_pids), numpy.std(total_pids)))

        if do_error:
            output.append("pid_error_mean=%.2f, pid_error_std=%.2f" %
                          (numpy.mean(total_error_pids), numpy.std(total_error_pids)))

        options.stdlog.write("# effective: %s\n" % ", ".join(output))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i\n" % (ninput, noutput, nskipped))

    E.Stop()
Exemple #2
0
def main(argv=None):
    """Simulate sequencing reads from transcripts or genes in a GTF stream.

    Variant of the read simulator using plain ``open`` for auxiliary
    output files.  Reads GTF records from stdin, extracts the exonic
    sequence of each transcript (``--mode=transcripts``) or merged gene
    (``--mode=genes``) from an indexed genome, and writes simulated
    FASTA reads to stdout.  Coverage and read lengths are Gaussian;
    ``--ds-mean`` mutates the template before sampling and
    ``--error-mean`` mutates each read after sampling (via
    ``getMutatedSequence``), recording percent identities.

    NOTE(review): this copy ends after the sampling loop without the
    summary/E.Stop() epilogue present in the sibling implementation —
    it appears truncated in the source dump; left as found.
    """

    if argv is None:
        argv = sys.argv

    # --- command line interface -------------------------------------
    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-p", "--output-filename-pattern", dest="output_filename_pattern", type="string",
                      help="OUTPUT filename with histogram information on aggregate coverages [%default].")

    parser.add_option("--read-length-mean", dest="read_length_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--read-length-std", dest="read_length_stddev", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--coverage-mean", dest="coverage_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--coverage-std", dest="coverage_stddev", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--ds-mean", dest="ds_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--ds-std", dest="ds_stddev", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--error-mean", dest="error_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--error-std", dest="error_stddev", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--min-read-length", dest="min_read_length", type="int",
                      help="minimum read length [default=%default].")

    parser.add_option("--sample-size", dest="sample_size", type="int",
                      help="randomly sample from selected transcripts [default=%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="test with # first entries [default=%default].")

    parser.add_option("--mode", dest="mode", type="choice",
                      choices=("genes", "transcripts"),
                      help="use genes or transcripts [default=%default].")

    parser.set_defaults(
        genome_file=None,
        read_length_mean=200.0,
        read_length_stddev=20.0,
        coverage_mean=2.0,
        coverage_stddev=1.0,
        ds_mean=None,
        ds_stddev=None,
        error_mean=None,
        error_stddev=None,
        min_read_length=50,
        test=None,
        mode="transcripts",
        output_filename_pattern=None,
        output_format_id="%010i",
        sample_size=0,
    )

    (options, args) = E.Start(parser, argv)

    assert options.genome_file, "please supply an indexed genome."

    # Optional auxiliary output files: the pattern must contain a single
    # "%s" which is filled with "stats" / "map" respectively.
    if options.output_filename_pattern:
        outfile_stats = open(options.output_filename_pattern % "stats", "w")
        outfile_stats.write(
            "id\tlen\tnreads\tlen_mean\tlen_std\tcov_mean\tcov_std\n")
        outfile_map = open(options.output_filename_pattern % "map", "w")
        outfile_map.write("id\ttranscript\n")
    else:
        outfile_stats = None
        outfile_map = None

    genome = IndexedFasta.IndexedFasta(options.genome_file)

    # counters and global accumulators across all transcripts/genes
    ninput, noutput, nskipped = 0, 0, 0

    total_counts, total_read_lengths, total_len = [], [], 0
    total_pids = []
    total_error_pids = []

    # choose iteration unit: individual transcripts or flattened genes
    if options.mode == "transcripts":
        iterator = GTF.transcript_iterator(
            GTF.iterator_filtered(GTF.iterator(options.stdin), feature="exon"))
        getId = lambda x: x.transcript_id
    elif options.mode == "genes":
        iterator = GTF.flat_gene_iterator(
            GTF.iterator_filtered(GTF.iterator(options.stdin), feature="exon"))
        getId = lambda x: x.gene_id

    if options.sample_size:
        iterator = Iterators.sample(iterator)

    # template-level mutation (divergence) setup
    if options.ds_mean:
        do_mutate = True
        pid_calc = SequencePairProperties.SequencePairPropertiesPID()
    else:
        do_mutate = False

    # per-read error setup; reuses/overwrites pid_calc if both are active
    if options.error_mean:
        do_error = True
        pid_calc = SequencePairProperties.SequencePairPropertiesPID()
    else:
        do_error = False

    for gtfs in iterator:

        id = getId(gtfs[0])

        try:
            sequence = GTF.toSequence(gtfs, genome)
        # BUGFIX: was Python 2-only "except KeyError, msg" — a syntax
        # error under Python 3; rewritten with the "as" form.
        except KeyError as msg:
            # contig of this entry is not present in the indexed genome
            if options.loglevel >= 2:
                options.stdlog.write("# skipping %s: %s\n" % (id, msg))
            nskipped += 1
            continue

        lsequence = len(sequence)

        # too short to yield at least two minimum-length reads
        if lsequence <= options.min_read_length * 2:
            if options.loglevel >= 2:
                options.stdlog.write(
                    "# skipping %s - sequence is too short: %i\n" % (id, lsequence))
            nskipped += 1
            continue

        ninput += 1

        # mutate the template once per transcript; downstream reads are
        # sampled from the mutated sequence
        if do_mutate:
            new_sequence = getMutatedSequence(sequence, options.ds_mean)
            pid_calc.loadPair(sequence, new_sequence)
            pid = pid_calc.mPID
            total_pids.append(pid)
            sequence = new_sequence
        else:
            pid = 100.0

        if options.loglevel >= 2:
            options.stdlog.write(
                "# processing %s - len=%i\n" % (id, lsequence))
            options.stdlog.flush()

        total_len += lsequence
        # target number of sampled bases = length * sampled coverage
        lvsequence = lsequence * \
            random.gauss(options.coverage_mean, options.coverage_stddev)

        covered = 0
        counts = numpy.zeros(lsequence)
        nreads = 0

        error_pids, read_lengths = [], []

        # sample reads until the coverage target is met
        while covered < lvsequence:

            read_length = int(
                random.gauss(options.read_length_mean, options.read_length_stddev))
            positive = random.randint(0, 1)

            # anchor either the start (forward) or the end (reverse) and
            # clip the read to the sequence boundaries
            if positive:
                start = random.randint(0, lsequence)
                end = min(lsequence, start + read_length)
            else:
                end = random.randint(0, lsequence)
                start = max(0, end - read_length)

            read_length = end - start
            if read_length < options.min_read_length:
                continue

            segment = sequence[start:end]

            # NOTE(review): complement (not reverse-complement) is applied
            # for negative-strand reads — confirm this is intended.
            if not positive:
                segment = Genomics.complement(segment)

            noutput += 1

            # per-read sequencing error
            if do_error:
                new_segment = getMutatedSequence(segment, options.error_mean)
                pid_calc.loadPair(segment, new_segment)
                pid = pid_calc.mPID
                error_pids.append(pid)
                segment = new_segment
            else:
                pid = 100.0

            options.stdout.write(
                ">%s\n%s\n" % (options.output_format_id % noutput, segment))

            if outfile_map:
                outfile_map.write(
                    "%s\t%s\n" % (id, options.output_format_id % noutput))

            # per-base coverage bookkeeping
            for x in range(start, end):
                counts[x] += 1

            nreads += 1

            covered += read_length
            read_lengths.append(read_length)

        # NOTE(review): this diagnostic goes to stdout (which also carries
        # the FASTA output), unlike the other loglevel>=2 messages that go
        # to stdlog — confirm this is intentional.
        if options.loglevel >= 2:
            options.stdout.write("# transcript %s: len=%i, nreads=%i, len_mean=%.2f, len_std=%.2f, cov_mean=%.2f, cov_stddev=%.2f\n" % (id,
                                                                                                                                        lsequence,
                                                                                                                                        nreads,
                                                                                                                                        numpy.mean(
                                                                                                                                            read_lengths),
                                                                                                                                        numpy.std(
                                                                                                                                            read_lengths),
                                                                                                                                        numpy.mean(
                                                                                                                                            counts),
                                                                                                                                        numpy.std(counts)))
        if outfile_stats:
            outfile_stats.write("%s\t%i\t%i\t%.2f\t%.2f\t%.2f\t%.2f\n" % (id,
                                                                          lsequence,
                                                                          nreads,
                                                                          numpy.mean(
                                                                              read_lengths),
                                                                          numpy.std(
                                                                              read_lengths),
                                                                          numpy.mean(
                                                                              counts),
                                                                          numpy.std(counts)))

        # fold per-sequence data into the global accumulators
        total_counts += list(counts)
        total_read_lengths += read_lengths
        total_error_pids += error_pids

        if options.test and ninput >= options.test:
            break

        if options.sample_size and ninput >= options.sample_size:
            break
def main():
    """Mask low-quality columns in multiple alignments (malis2masks).

    Reads cluster/gene/alignment triples from stdin, maps each gene's
    alignment columns to genome coordinates through a Blat match
    (``--filename-map`` via defaults), looks up per-base quality scores
    from an indexed quality file, and writes ``cluster_id\\tstart\\tend``
    intervals of alignment positions whose quality falls below the
    threshold.  With ``--random``, quality scores of aligned positions
    are shuffled before masking.
    """

    parser = E.OptionParser( version = "%prog version: $Id: malis2masks.py 2781 2009-09-10 11:33:14Z andreas $", usage = globals()["__doc__"])

    parser.add_option("--random-proportion", dest="random_proportion", type="float",
                      help="mask randomly columns in multiple alignments [default=%default]" )

    parser.add_option("--random", dest="random", action="store_true",
                      help="shuffle quality scores before masking [default=%default]" )

    parser.set_defaults(
        quality_threshold = 40,
        quality_file = "quality",
        filename_map = None,
        frame = 3,
        )

    (options, args) = E.Start( parser )

    ##################################################
    ##################################################
    ##################################################
    ## read map
    ##################################################
    # one Blat match per query id; duplicates are a hard error
    infile = open(options.filename_map)
    map_genes2genome = {}
    for match in Blat.iterator( infile ):
        assert match.mQueryId not in map_genes2genome, "duplicate entry %s" % match.mQueryId
        map_genes2genome[match.mQueryId] = match
    infile.close()

    ##################################################
    ##################################################
    ##################################################
    ## get quality scores
    ##################################################
    # TranslatorBytes: the indexed "fasta" stores integer quality values
    quality = IndexedFasta.IndexedFasta( options.quality_file )
    quality.setTranslator( IndexedFasta.TranslatorBytes() )

    ##################################################
    ##################################################
    ##################################################
    ## main loop
    ##################################################
    ninput, noutput, nmissed = 0, 0, 0

    options.stdout.write( "cluster_id\tstart\tend\n" )

    for line in options.stdin:
        # skip a possible header line
        if line.startswith("cluster_id"): continue
        ninput += 1
        cluster_id, gene_id, alignment = line[:-1].split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn( "gene_id %s not found in map." % gene_id )
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # if strand is negative, the coordinates are 
        # on the negative strand of the gene/query
        # in order to work in the right coordinate system
        # revert the sequence
        if is_negative: 
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib_lite.py_makeAlignmentVector()
        fillAlignment( map_gene2mali, alignment )

        # get quality scores for the genomic span of the match
        quality_scores = quality.getSequence( match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)


        # print str(alignlib_lite.py_AlignmentFormatEmissions( map_gene2genome))
        # print str(alignlib_lite.py_AlignmentFormatEmissions( map_gene2mali))
        # print quality_scores

        # compose mali -> genome through mali -> gene -> genome
        map_mali2genome = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_combineAlignment( map_mali2genome, map_gene2mali, map_gene2genome, alignlib_lite.py_RR )
        # print str(alignlib_lite.py_AlignmentFormatEmissions( map_mali2genome))

        # shuffle quality scores, but only those that are aligned
        if options.random:
            positions = []
            for fp,c in enumerate(alignment):
                if c == "-": continue
                # offset into quality_scores; negative = outside match span
                y = map_mali2genome.mapRowToCol( fp ) - match.mSbjctFrom 
                if y < 0: continue
                positions.append( y )
            scores = [ quality_scores[ x ] for x in positions ]
            random.shuffle(scores)
            for p,q in zip( positions,scores): quality_scores[p] = q

        # negative strand
        to_mask = []
        ## reverse position
        rp = len(alignment)
        for fp,c in enumerate(alignment):
            rp -= 1
            if c == "-": continue
            y = map_mali2genome.mapRowToCol( fp ) - match.mSbjctFrom
            if y < 0: continue
            if quality_scores[y] < options.quality_threshold:
                # report positions in the original (un-reversed) alignment
                if is_negative: p = rp
                else: p = fp
                E.debug( "low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i" % \
                             (cluster_id, p, c, match.mSbjctId, match.strand, map_mali2genome.mapRowToCol( fp ), quality_scores[y] ) )
                if options.frame > 1:
                    # widen mask to the full codon/frame containing p
                    start = (p // options.frame) * options.frame
                    to_mask.extend( list( range(start, start + options.frame) ) )
                else:
                    to_mask.append( p ) 

        # merge adjacent masked positions into intervals
        regions = Iterators.group_by_distance( sorted(to_mask) )
            
        for start,end in regions:
            options.stdout.write( "%s\t%i\t%i\n" % (cluster_id, start, end ) )

        noutput += 1

    E.info( "ninput=%i, noutput=%i, nmissed=%i" % (ninput, noutput, nmissed) )

    E.Stop()
Exemple #4
0
def main(argv=None):
    """Mask low-quality columns in multiple alignments (malis2masks).

    PEP8-formatted variant.  Reads cluster/gene/alignment triples from
    stdin, maps each gene's alignment columns to genome coordinates
    through a Blat match, looks up per-base quality scores from an
    indexed quality file, and writes ``cluster_id\\tstart\\tend``
    intervals of alignment positions whose quality falls below the
    threshold.  With ``--random``, quality scores of aligned positions
    are shuffled before masking.
    """

    parser = E.OptionParser(
        version=
        "%prog version: $Id: malis2masks.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--random-proportion",
        dest="random_proportion",
        type="float",
        help="mask randomly columns in multiple alignments [default=%default]")

    parser.add_option(
        "--random",
        dest="random",
        action="store_true",
        help="shuffle quality scores before masking [default=%default]")

    parser.set_defaults(
        quality_threshold=40,
        quality_file="quality",
        filename_map=None,
        frame=3,
    )

    (options, args) = E.Start(parser)

    ##################################################
    ##################################################
    ##################################################
    # read map
    ##################################################
    # one Blat match per query id; duplicates are a hard error
    infile = open(options.filename_map)
    map_genes2genome = {}
    for match in Blat.iterator(infile):
        assert match.mQueryId not in map_genes2genome, "duplicate entry %s" % match.mQueryId
        map_genes2genome[match.mQueryId] = match
    infile.close()

    ##################################################
    ##################################################
    ##################################################
    # get quality scores
    ##################################################
    # TranslatorBytes: the indexed "fasta" stores integer quality values
    quality = IndexedFasta.IndexedFasta(options.quality_file)
    quality.setTranslator(IndexedFasta.TranslatorBytes())

    ##################################################
    ##################################################
    ##################################################
    # main loop
    ##################################################
    ninput, noutput, nmissed = 0, 0, 0

    options.stdout.write("cluster_id\tstart\tend\n")

    for line in options.stdin:
        # skip a possible header line
        if line.startswith("cluster_id"):
            continue
        ninput += 1
        cluster_id, gene_id, alignment = line[:-1].split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn("gene_id %s not found in map." % gene_id)
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # if strand is negative, the coordinates are
        # on the negative strand of the gene/query
        # in order to work in the right coordinate system
        # revert the sequence
        if is_negative:
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib_lite.py_makeAlignmentVector()
        fillAlignment(map_gene2mali, alignment)

        # get quality scores for the genomic span of the match
        quality_scores = quality.getSequence(match.mSbjctId, "+",
                                             match.mSbjctFrom, match.mSbjctTo)

        # print str(alignlib_lite.py_AlignmentFormatEmissions( map_gene2genome))
        # print str(alignlib_lite.py_AlignmentFormatEmissions( map_gene2mali))
        # print quality_scores
        # compose mali -> genome through mali -> gene -> genome
        map_mali2genome = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_combineAlignment(map_mali2genome, map_gene2mali,
                                          map_gene2genome, alignlib_lite.py_RR)
        # print str(alignlib_lite.py_AlignmentFormatEmissions(
        # map_mali2genome))

        # shuffle quality scores, but only those that are aligned
        if options.random:
            positions = []
            for fp, c in enumerate(alignment):
                if c == "-":
                    continue
                # offset into quality_scores; negative = outside match span
                y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
                if y < 0:
                    continue
                positions.append(y)
            scores = [quality_scores[x] for x in positions]
            random.shuffle(scores)
            for p, q in zip(positions, scores):
                quality_scores[p] = q

        # negative strand
        to_mask = []
        # reverse position
        rp = len(alignment)
        for fp, c in enumerate(alignment):
            rp -= 1
            if c == "-":
                continue
            y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
            if y < 0:
                continue
            if quality_scores[y] < options.quality_threshold:
                # report positions in the original (un-reversed) alignment
                if is_negative:
                    p = rp
                else:
                    p = fp
                E.debug(
                    "low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i"
                    % (cluster_id, p, c, match.mSbjctId, match.strand,
                       map_mali2genome.mapRowToCol(fp), quality_scores[y]))
                if options.frame > 1:
                    # widen mask to the full codon/frame containing p
                    start = (p // options.frame) * options.frame
                    to_mask.extend(list(range(start, start + options.frame)))
                else:
                    to_mask.append(p)

        # merge adjacent masked positions into intervals
        regions = Iterators.group_by_distance(sorted(to_mask))

        for start, end in regions:
            options.stdout.write("%s\t%i\t%i\n" % (cluster_id, start, end))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmissed=%i" % (ninput, noutput, nmissed))

    E.Stop()