Example #1
    # excerpt from a larger script; assumes sys and the modules
    # Genomics, IndexedFasta and Exons are imported elsewhere
    # read peptide sequences
    if param_filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            open(param_filename_peptides, "r"))
    else:
        peptide_sequences = {}

    if param_genome_file:
        # read from fasta file
        fasta = IndexedFasta.IndexedFasta(param_genome_file)
        contig_sizes = fasta.getContigSizes()
        delete_missing = True
    elif param_filename_contigs:
        # read contigs
        contig_sizes = Genomics.ReadContigSizes(
            open(param_filename_contigs, "r"))
        delete_missing = True
    else:
        contig_sizes = {"dummy": 1000000000}
        delete_missing = False

    if param_loglevel >= 1:
        print "# read %i peptide sequences" % len(peptide_sequences)
        sys.stdout.flush()

    exons = Exons.ReadExonBoundaries(
        sys.stdin,
        contig_sizes=contig_sizes,
        delete_missing=delete_missing,
    )
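
Genomics.ReadContigSizes is not shown in this excerpt. A minimal stand-alone sketch of what it is assumed to do (the two-column, tab-separated name/length format and the comment-skipping rule are assumptions, not confirmed by the source):

def read_contig_sizes(infile):
    # parse 'name<TAB>length' lines into a dict of contig sizes;
    # lines starting with '#' are treated as comments (assumption)
    contig_sizes = {}
    for line in infile:
        if line.startswith("#"):
            continue
        name, length = line.rstrip("\n").split("\t")[:2]
        contig_sizes[name] = int(length)
    return contig_sizes
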
Example #2
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gff2gff.py 2868 2010-03-03 10:19:52Z andreas $")

    parser.add_option("-f",
                      "--forward-coordinates",
                      dest="forward_coordinates",
                      help="translate to forward coordinates.",
                      action="store_true")

    parser.add_option("--forward-strand",
                      dest="forward_strand",
                      help="convert to forward strand.",
                      action="store_true")

    parser.add_option("--ignore-strand",
                      dest="ignore_strand",
                      help="ignore strand information.",
                      action="store_true")

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="input will be treated as gtf [default=%default].")

    parser.add_option(
        "--add-up-flank",
        dest="add_up_flank",
        type="int",
        help="add an upstream flanking segment to first exon of a group.")

    parser.add_option(
        "--add-down-flank",
        dest="add_down_flank",
        type="int",
        help="add a downstream flanking segment to last segment of a group.")

    parser.add_option("--extend",
                      dest="extend",
                      help="extend the existing features.",
                      action="store_true")

    parser.add_option("-c",
                      "--contigs",
                      dest="input_filename_contigs",
                      type="string",
                      help="filename with contig lenghts.")

    parser.add_option(
        "--filename-agp",
        dest="input_filename_agp",
        type="string",
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option(
        "--complement-groups",
        dest="complement_groups",
        action="store_true",
        help="""complement groups. Will write introns from exons [%default]."""
    )

    parser.add_option(
        "--group-field",
        dest="group_field",
        type="string",
        help=
        """gff field/attribute to group by such as gene_id, transcript_id, ... [%default]."""
    )

    parser.add_option("--combine-groups",
                      dest="combine_groups",
                      action="store_true",
                      help="""combine groups.""")

    parser.add_option(
        "--filter-range",
        dest="filter_range",
        type="string",
        help=
        """extract all elements overlapping a range. A range is specified by eithor 'contig:from..to', 'contig:+:from..to', or 'from,to' ."""
    )

    parser.add_option(
        "--join-features",
        dest="join_features",
        type="string",
        help=
        "join features into a single transcript. Consecutive features are "
        "grouped into the same transcript/gene. This method expects a string "
        "of four numbers ``a,b,c,d`` as input with: "
        "a,b=minimum/maximum distance between features, "
        "c,d=minimum,maximum number of features.")

    parser.add_option(
        "--merge-features",
        dest="merge_features",
        type="string",
        help=
        "merge features. Consecutive features are merged into a single feature. "
        "This method expects a string of four numbers ``a,b,c,d`` as input; "
        "a,b=minimum/maximum distance between features, "
        "c,d=minimum,maximum number of features.")

    parser.add_option(
        "--crop-unique",
        dest="crop_unique",
        action="store_true",
        help=
        "crop overlapping intervals, keeping only intervals that are unique [default=%default]"
    )

    parser.add_option(
        "--crop",
        dest="crop",
        type="string",
        help=
        """crop features in gff file with features in another file. If a feature falls in the middle of another, two entries will be output."""
    )

    parser.add_option(
        "--sanitize",
        dest="sanitize",
        type="choice",
        choices=("ucsc", "ensembl", "genome"),
        help=
        "sanitize chr names for ucsc or ensembl or use the genome translator [%default]."
    )

    parser.add_option(
        "--skip-missing",
        dest="skip_missing",
        action="store_true",
        help=
        "skip entries on missing contigs. Otherwise an exception is raised [%default]."
    )

    parser.add_option(
        "--remove-contigs",
        dest="remove_contigs",
        type="string",
        action="store",
        help=
        "a comma separated list of regular expressions specifying contigs to be removed when runnnig sanitize [%default]."
    )

    parser.set_defaults(
        forward_coordinates=False,
        forward_strand=False,
        input_filename_contigs=False,
        input_filename_agp=False,
        genome_file=None,
        sanitize=None,
        add_up_flank=None,
        add_down_flank=None,
        extend=False,
        complement_groups=False,
        combine_groups=False,
        crop=None,
        crop_unique=False,
        ignore_strand=False,
        filter_range=None,
        join_features=None,
        merge_features=None,
        output_format="%06i",
        skip_missing=False,
        remove_contigs=None,
        is_gtf=False,
        group_field=None,
    )

    (options, args) = E.Start(parser, argv=argv)

    contigs = None
    if options.input_filename_contigs:
        contigs = Genomics.ReadContigSizes(
            IOTools.openFile(options.input_filename_contigs, "r"))

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()
    else:
        genome_fasta = None

    if (options.forward_coordinates or options.forward_strand) and not contigs:
        raise ValueError(
            "inverting coordinates requires a genome file or contig sizes")

    if options.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(IOTools.openFile(options.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(options.stdin)

    if options.add_up_flank or options.add_down_flank:

        if options.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, options.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            chunk.sort(key=lambda x: x.start)
            lcontig = contigs[chunk[0].contig]

            if options.extend:
                if options.add_up_flank:
                    if is_positive:
                        chunk[0].start = max(
                            0, chunk[0].start - options.add_up_flank)
                    else:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + options.add_up_flank)
                if options.add_down_flank:
                    if is_positive:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + options.add_down_flank)
                    else:
                        chunk[0].start = max(
                            0, chunk[0].start - options.add_down_flank)
            else:
                if options.add_up_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - options.add_up_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + options.add_up_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if options.add_down_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + options.add_down_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - options.add_down_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                options.stdout.write(str(gff) + "\n")

    elif options.complement_groups:

        iterator = GTF.joined_iterator(gffs, group_field=options.group_field)

        for chunk in iterator:
            if options.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort()
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                options.stdout.write(str(x) + "\n")
                x.start = c.end

    elif options.combine_groups:

        iterator = GTF.joined_iterator(gffs)

        for chunk in iterator:
            chunk.sort()
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            options.stdout.write(str(x) + "\n")

    elif options.join_features:

        combineGFF(gffs, options, merge=False)

    elif options.merge_features:

        combineGFF(gffs, options, merge=True)

    elif options.crop:

        cropGFF(gffs, options)

    elif options.crop_unique:

        cropGFFUnique(gffs, options)

    elif options.filter_range:

        contig, strand, interval = None, None, None
        try:
            contig, strand, start, sep, end = re.match(
                "(\S+):(\S+):(\d+)(\.\.|-)(\d+)",
                options.filter_range).groups()
        except AttributeError:
            pass

        if not contig:
            try:
                contig, start, sep, end = re.match(
                    "(\S+):(\d+)(\.\.|-)(\d+)", options.filter_range).groups()
                strand = None
            except AttributeError:
                pass

        if not contig:
            try:
                start, sep, end = re.match("(\d+)(\.\.|\,|\-)(\d+)",
                                           options.filter_range).groups()
            except AttributeError:
                raise ValueError("cannot parse range %s" %
                                 options.filter_range)
            contig = None
            strand = None

        if start:
            interval = (int(start), int(end))
        else:
            interval = None

        if options.loglevel >= 2:
            options.stdlog.write(
                "# filter: contig=%s, strand=%s, interval=%s\n" %
                (str(contig), str(strand), str(interval)))
            options.stdlog.flush()

        for gff in GTF.iterator_filtered(gffs,
                                         contig=contig,
                                         strand=strand,
                                         interval=interval):
            options.stdout.write(str(gff) + "\n")

    elif options.sanitize:

        def toUCSC(id):
            if not id.startswith("contig") and not id.startswith("chr"):
                id = "chr%s" % id
            return id

        def toEnsembl(id):
            if id.startswith("contig"):
                return id[len("contig"):]
            if id.startswith("chr"):
                return id[len("chr"):]
            return id

        if options.sanitize == "genome":
            if genome_fasta is None:
                raise ValueError(
                    "please specify --genome-file= when using --sanitize=genome"
                )
            f = genome_fasta.getToken
        elif options.sanitize == "ucsc":
            f = toUCSC
        elif options.sanitize == "ensembl":
            f = toEnsembl

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        # compile the removal patterns once instead of per record
        to_remove = []
        if options.remove_contigs:
            to_remove = [
                re.compile(x) for x in options.remove_contigs.split(",")
            ]

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            except KeyError, msg:
                if options.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if to_remove and any(x.match(gff.contig) for x in to_remove):
                filtered_contigs[gff.contig] += 1
                continue

            options.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()), len(skipped_contigs),
                    str(skipped_contigs)))
        if outofrange_contigs:
            E.warn(
                "skipped %i entries on %i contigs because they are out of range: %s"
                % (sum(outofrange_contigs.values()),
                   len(outofrange_contigs), str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()), len(filtered_contigs),
                    str(filtered_contigs)))
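
The three accepted --filter-range formats ('contig:strand:from..to', 'contig:from..to', 'from,to') can be exercised in isolation. A minimal stand-alone sketch of the same regex cascade (the function name parse_filter_range is ours, not part of gff2gff.py):

import re

def parse_filter_range(text):
    # mirrors the cascade of regular expressions used by --filter-range;
    # a sketch for illustration, not the script's own helper
    m = re.match(r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)", text)
    if m:
        contig, strand, start, sep, end = m.groups()
        return contig, strand, (int(start), int(end))
    m = re.match(r"(\S+):(\d+)(\.\.|-)(\d+)", text)
    if m:
        contig, start, sep, end = m.groups()
        return contig, None, (int(start), int(end))
    m = re.match(r"(\d+)(\.\.|,|-)(\d+)", text)
    if m:
        start, sep, end = m.groups()
        return None, None, (int(start), int(end))
    raise ValueError("cannot parse range %s" % text)

# e.g. parse_filter_range("chr1:+:1000..2000") == ("chr1", "+", (1000, 2000))
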
Example #3
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # the excerpt calls parser.add_option without defining ``parser``;
    # an option parser has to be created first
    parser = E.OptionParser(version="%prog version: $Id$")

    parser.add_option("-m", "--filename-map", dest="filename_map", type="string",
                      help="filename with mapping information.")
    parser.add_option("-o", "--pattern-old", dest="pattern_old", type="string",
                      help="pattern for mapping new to old identifiers: extract string from old.")
    parser.add_option("-n", "--pattern-new", dest="pattern_new", type="string",
                      help="pattern for mapping new to old identifiers: put string into new.")
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="genome_file.")
    parser.add_option("-p", "--peptides", dest="filename_peptides", type = "string",
                      help="filename with peptide sequences.")
    parser.add_option("-f", "--input-format", dest="input_format", type="choice",
                      help="format of mapping file", choices=("alignment", "offsets") )
    parser.add_option("-i", "--write-missed", dest="write_missed", type="string",
                      help="write missed identifiers to separate file.")
    parser.add_option("-a", "--filename-genes", dest="filename_genes", type="string",
                      help="filename with gene information.")
    parser.add_option("--filename-old-peptides", dest="filename_old_peptides", type="string",
                      help="filename with old peptide information.")
    parser.add_option("--no-renumber", dest="renumber", action="store_false",
                      help="do not renumber predictions.")
    parser.add_option("--contig-sizes-old", dest="contig_sizes_old", type="string",
                      help="contig sizes for old data.")
    parser.add_option("--contig-sizes-new", dest="contig_sizes_new", type="string",
                      help="contig sizes for new data.")
    parser.add_option("--skip-errors", dest="skip_errors", action="store_true",
                      help="skip entries with errors.")
    
    parser.set_defaults(
        filename_map = None,
        pattern_old = "(.+)",
        pattern_new = "%s",
        genome_file = None,
        filename_peptides = None,
        write_missed = None,
        filename_genes = None,
        filename_old_peptides = None,
        renumber = True,
        input_format = "alignment",
        contig_sizes_old = None,
        contig_sizes_new = None,
        skip_errors = False
        )

    (options, args) = E.Start( parser, add_pipe_options = True)

    predictor = PredictorExonerate()

    ## the different mapping criteria
    map_sbjcts = {}
    breakpoints = {}

    ################################################################################################
    map_transcript2gene = {}
    if options.filename_genes:
        infile = open(options.filename_genes, "r")
        for line in infile:
            if line.startswith("#"):
                continue
            gene, transcript = line[:-1].split("\t")[:2]
            map_transcript2gene[transcript] = gene
        infile.close()

    ################################################################################################
    peptides = {}
    if options.filename_peptides:
        peptides = Genomics.ReadPeptideSequences( open(options.filename_peptides, "r"))
        options.stdlog.write( "# read %i peptide sequences.\n" % len(peptides))

    ################################################################################################
    ## read old query sequences and compare against new query sequences
    ## this can be used to build a map between old and new queries
    query_map_old2new = {}
    # defaults so that the --write-missed output below does not fail
    # when no old peptide file is given
    unmappable, unmapped = [], []
    if options.filename_old_peptides:
        old_peptides = Genomics.ReadPeptideSequences( open(options.filename_old_peptides, "r"))
        options.stdlog.write( "# read %i old peptide sequences.\n" % len(old_peptides))
        query_map_old2new, unmappable, unmapped = Genomics.MapSequences( old_peptides, peptides)
        options.stdlog.write( "# built map: unmappable=%i unmapped=%i.\n" % (len(unmappable), len(unmapped)))
        if options.loglevel >= 2:
            options.stdlog.write( "# unmappable: %s.\n" % ";".join(unmappable))
            options.stdlog.write( "# unmapped: %s.\n" % ";".join(unmapped))            

    ################################################################################################
    ## read old/new contig sizes for mapping positive/negative coordinates
    contig_sizes_old = {}
    contig_sizes_new = {}
    if options.contig_sizes_old:
        contig_sizes_old = Genomics.ReadContigSizes( open(options.contig_sizes_old, "r") )
    if options.contig_sizes_new:
        contig_sizes_new = Genomics.ReadContigSizes( open(options.contig_sizes_new, "r") )
        
    ################################################################################################        
    if options.filename_map:
        
        infile = open(options.filename_map)
        if options.input_format == "alignments":
            for line in infile:
                if line[0] == "#": continue

                x, old_token, old_from, old_to, old_ali, new_from, new_to, new_ali = line[:-1].split("\t")

                map_sbjcts[old_token] = (old_from, old_ali, new_from, new_ali)

            if options.loglevel >= 1:
                options.stdlog.write( "# read %i alignments.\n" % len(map_sbjcts))

        elif options.input_format == "offsets":
            ## input is a list of segments and their offsets.

            breakpoints, endpoints, offsets = ReadOffsets( infile )
            if options.loglevel >= 1:
                options.stdlog.write( "# read breakpoints for %i chromosomes.\n" % len(breakpoints))

        infile.close()
        
    ################################################################################################
    ## end of input section
    ################################################################################################

    rx = re.compile(options.pattern_old)
    last_sbjct_token = None
    ninput = 0
    nerrors = 0
    nerrors_map = 0
    nerrors_inconsistencies = 0
    nerrors_boundaries = 0
    nerrors_translation = 0
    nerrors_inconsequential = 0
    nerrors_realigned = 0
    nmapped = 0
    nfiltered = 0
    naligned = 0
    noutput = 0
    found_transcripts = {}
    nduplicates = 0
    output = {}
    
    for line in sys.stdin:
        if line[0] == "#": continue
        
        entry = PredictionParser.PredictionParserEntry()

        entry.Read( line )
        
        ninput += 1
        is_positive = entry.mSbjctStrand == "+"
        
        is_error = False

        # defaults so the diagnostic messages below do not reference
        # undefined names when neither mapping method applies
        old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo
        f, t = old_f, old_t
        
        ## check if query token is mappable: using sequence map
        if (query_map_old2new and entry.mQueryToken not in query_map_old2new):
            options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) )
            nfiltered += 1
            continue
        else:
            ## check if query token is mappable: using filter        
            if (peptides and entry.mQueryToken not in peptides):
                options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) )
                nfiltered += 1
                continue

        new_sbjct_token = options.pattern_new % rx.search(entry.mSbjctToken).groups()[0]

        ##########################################################################################################
        ## Map via alignments
        if entry.mSbjctToken in map_sbjcts:
            nmapped += 1
            if last_sbjct_token != entry.mSbjctToken:
                old_from, old_ali, new_from, new_ali = map_sbjcts[entry.mSbjctToken]
                map_a2b = alignlib_lite.makeAlignmentVector()
                alignlib_lite.AlignmentFormatExplicit(
                    int(old_from), old_ali,
                    int(new_from), new_ali).copy( map_a2b )
                
            last_sbjct_token = entry.mSbjctToken
            
            if options.loglevel >= 3:
                print "#", str(entry)
                print "#", map_sbjcts[entry.mSbjctToken]
                sys.stdout.flush()

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            ## convert to forward coordinates:
            if is_positive:
                f, t = old_f, old_t
                first_res, last_res = f + 1, t
            else:
                f, t = map_a2b.getRowTo() - old_f, map_a2b.getRowTo() - old_t
                first_res, last_res = f, t + 1
            
            ## map first and last residues
            mfirst_res = map_a2b.mapRowToCol( first_res )
            mlast_res = map_a2b.mapRowToCol( last_res )

            if (mfirst_res == 0 and old_f != 0) or (mlast_res == 0 and old_t != map_a2b.getRowTo() ):
                
                options.stderr.write("# mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      f, t))
                
                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# %s\n" % str(entry))                
                options.stderr.flush()                
                nerrors_boundaries += 1
                is_error = True

                ## get extended boundaries for alignment later on
                while mfirst_res == 0 and first_res > 1:
                    first_res -= 1
                    mfirst_res = map_a2b.mapRowToCol(first_res)
                while mlast_res == 0 and last_res < map_a2b.getRowTo():
                    last_res += 1
                    mlast_res = map_a2b.mapRowToCol(last_res)

            ## convert to genomic coordinates            
            ## convert negative strand coordinates
            if is_positive:
                new_f = mfirst_res - 1
                new_t = mlast_res 
            else:
                new_f = mfirst_res
                new_t = mlast_res - 1
                
                new_f = map_a2b.getColTo() - new_f
                new_t = map_a2b.getColTo() - new_t

            ## Now map the alignment.
            try:
                MapAlignment( entry, map_a2b )
                
            except ValueError:
                options.stderr.write("# alignment mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      new_f, new_t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.flush()
                nerrors_map += 1
                is_error = True
            
            if new_f != entry.mSbjctGenomeFrom or new_t != entry.mSbjctGenomeTo:
                options.stderr.write("# mapping inconsistency for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i <> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,                                      
                                      new_f, new_t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                
                nerrors_inconsistencies += 1
                is_error = True

        ##########################################################################################################
        ## Map via offsets
        if entry.mSbjctToken in breakpoints:

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            ## convert to forward coordinates:
            if is_positive:
                f, t= old_f, old_t
            else:
                f, t = contig_sizes_old[entry.mSbjctToken] - old_t, contig_sizes_old[entry.mSbjctToken] - old_f

            o1 = GetOffset( f,
                            breakpoints[entry.mSbjctToken],
                            endpoints[entry.mSbjctToken],
                            offsets[entry.mSbjctToken] )
            o2 = GetOffset( t,
                            breakpoints[entry.mSbjctToken],
                            endpoints[entry.mSbjctToken],
                            offsets[entry.mSbjctToken] )            

            if o1 != o2:
                options.stderr.write("# break within gene %s\n" % str(entry))
                nerrors_map += 1
                is_error = True
                
            f += o1
            t += o2

            if not is_positive:
                f, t = contig_sizes_new[entry.mSbjctToken] - t, contig_sizes_new[entry.mSbjctToken] - f

            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = f, t

            if entry.mSbjctGenomeFrom > entry.mSbjctGenomeTo:
                options.stderr.write("# mapping error: start after end %s\n" % str(entry))
                nerrors_map += 1
                is_error = True
        
        ##########################################################################################################
        ## do translation check, if genome is given
        if options.genome_file:
            genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand,
                                                            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo,
                                                            options.genome_file,
                                                            loglevel = 0)

            map_peptide2translation, translation = Genomics.Alignment2PeptideAlignment( \
                entry.mMapPeptide2Genome, entry.mQueryFrom, 0, genomic_sequence )

            if re.sub("X", "", translation) != re.sub("X", "", entry.mTranslation):
                options.stderr.write("# translation error for prediction %i on %s %s:%i-%i -> %i-%i <> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                if map_sbjcts:
                    options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mTranslation, translation))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mAlignmentString, Genomics.Alignment2String(entry.mMapPeptide2Genome)))                    
                nerrors_translation += 1
                is_error = True

                if peptides and entry.mQueryToken in peptides:
                    naligned += 1

                    options.stdlog.write( "# aligning: %s versus %s:%s: %i-%i\n" % ( \
                        entry.mQueryToken,
                        new_sbjct_token, entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                    
                    # do a quick reprediction
                    if entry.mQueryToken in peptides:
                        genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand,
                                                                        0, 0,
                                                                        genome_file = options.genome_file,
                                                                        loglevel = 0)
                        predictor.mLogLevel = 0

                        
                        result =  predictor(entry.mQueryToken, peptides[entry.mQueryToken],
                                            entry.mSbjctToken, genomic_sequence,
                                            "--exhaustive --subopt FALSE --score '%s' " % str(80),
                                            new_f - 10, new_t + 10)
                        prediction_id = entry.mPredictionId
                        if result:
                            entry = result[0]
                            entry.mPredictionId = prediction_id
                            nerrors_realigned += 1
            else:
                if is_error:
                    nerrors_inconsequential += 1
                    
        entry.mSbjctToken = new_sbjct_token

        ## map query tokens
        if query_map_old2new:
            query_tokens = query_map_old2new[entry.mQueryToken]
        else:
            query_tokens = (entry.mQueryToken,)

        if options.skip_errors and is_error:
            continue

        for query_token in query_tokens:

            entry.mQueryToken = query_token
            
            prediction_id = entry.mPredictionId
            entry.mPredictionId = 0
            
            hid = Genomics.GetHID( str(entry) )
            if hid in output:
                nduplicates += 1
                continue
            
            noutput += 1                        
            if options.renumber: prediction_id = noutput

            entry.mPredictionId = prediction_id

            options.stdout.write( str(entry) + "\n")
            options.stdout.flush()
            found_transcripts[entry.mQueryToken] = 1

    ## write out found transcripts and genes
    nmissed_transcripts = 0
    missed_transcripts = []
    found_genes = {}
    if peptides:
        for x in peptides.keys():
            if x not in found_transcripts:
                nmissed_transcripts += 1
                missed_transcripts.append( x )
            else:
                found_genes[map_transcript2gene[x]] = 1

    missed_genes = {}
    nmissed_genes = 0
    if map_transcript2gene:

        for t in missed_transcripts:
            g = map_transcript2gene[t]
            if g not in found_genes:
                missed_genes[g] = 1
        nmissed_genes = len(missed_genes)
    
    if options.write_missed:
        outfile = open(options.write_missed, "w")
        for x in missed_transcripts:
            if x in unmapped:
                status = "unmapped"
            else:
                status = "mapped"
            outfile.write( "%s\t%s\t%s\n" % ("transcript", x, status ))
        for x in missed_genes:
            status = "unknown"
            outfile.write( "%s\t%s\t%s\n" % ("gene", x, status ))
        
        outfile.close()
        
    options.stdlog.write("# input=%i, output=%i, filtered=%i, nduplicates=%i, mapped=%i, errors=%i\n" % (\
         ninput, noutput, nfiltered, nduplicates, nmapped, nerrors ))
    options.stdlog.write("# errors: inconsequental=%i, boundaries=%i, mapping=%i, inconsistencies=%i, translation=%i, realigned=%i\n" % (\
       nerrors_inconsequential, nerrors_boundaries, nerrors_map, nerrors_inconsistencies, nerrors_translation, nerrors_realigned ))
    options.stdlog.write("# peptides: input=%i, found=%i, missed=%i, found_genes=%i, missed_genes=%i\n" % (\
        len(peptides), len(found_transcripts), nmissed_transcripts, len(found_genes), nmissed_genes) )
    
    E.Stop()
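
ReadOffsets and GetOffset are not shown in this excerpt. A plausible stand-alone sketch of the offset lookup, assuming breakpoints holds the sorted segment start coordinates, endpoints the corresponding segment ends, and offsets the shift to apply within each segment (all of these are assumptions about the missing helpers):

import bisect

def get_offset(position, breakpoints, endpoints, offsets):
    # locate the segment containing a forward-strand coordinate and
    # return its offset; a sketch of the assumed GetOffset contract
    i = bisect.bisect_right(breakpoints, position) - 1
    if i < 0 or position > endpoints[i]:
        raise ValueError("position %i not covered by any segment" % position)
    return offsets[i]

# e.g. with breakpoints=[0, 1000], endpoints=[999, 5000], offsets=[0, 500],
# get_offset(1500, ...) returns 500, so the mapped coordinate is 1500 + 500.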