Exemple #1
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/regions2graph.py 2754 2009-09-04 16:50:22Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--benchmark",
                      dest="filename_benchmark",
                      type="string",
                      help="")

    parser.add_option("-y",
                      "--benchmark-synonyms",
                      dest="benchmark_synonyms",
                      type="string",
                      help="")

    parser.add_option("-p",
                      "--peptides",
                      dest="filename_peptides",
                      type="string",
                      help="")

    parser.add_option("-c",
                      "--min-coverage-query",
                      dest="min_coverage_query",
                      type="float",
                      help="")

    parser.add_option("-s",
                      "--min-score",
                      dest="min_total_score",
                      type="float",
                      help="")

    parser.add_option("-i",
                      "--min-percent-identity",
                      dest="min_percent_identity",
                      type="float",
                      help="")

    parser.add_option("-o",
                      "--max-percent-overlap",
                      dest="max_percent_overlap",
                      type="float",
                      help="")

    parser.add_option("--overlap-min-score",
                      dest="overlap_min_score",
                      type="float",
                      help="")

    parser.add_option("--overlap-min-coverage",
                      dest="overlap_min_coverage",
                      type="float",
                      help="")

    parser.add_option("--overlap-min-identity",
                      dest="overlap_min_identity",
                      type="float",
                      help="")

    parser.add_option("--overlap-max-coverage",
                      dest="overlap_max_coverage",
                      type="float",
                      help="")

    parser.add_option("-m",
                      "--max-matches",
                      dest="max_matches",
                      type="int",
                      help="")

    parser.add_option("-j",
                      "--join-regions",
                      dest="join_regions",
                      type="int",
                      help="")

    parser.add_option("--join-regions-max-regions",
                      dest="join_regions_max_regions",
                      type="int",
                      help="")

    parser.add_option("--join-regions-max-coverage",
                      dest="join_regions_max_coverage",
                      type="float",
                      help="")

    parser.add_option("--min-length", dest="min_length", type="int", help="")

    parser.add_option("--test", dest="test", type="int", help="")

    parser.add_option("--filter-queries",
                      dest="filename_filter_queries",
                      type="string",
                      help="")

    parser.add_option("--filter-regions",
                      dest="filter_regions",
                      type="string",
                      help="")

    parser.add_option("--conserve-memory",
                      dest="conserve_memory",
                      action="store_true",
                      help="")

    parser.add_option("--filter-suboptimal",
                      dest="filter_suboptimal",
                      action="store_true",
                      help="")

    parser.set_defaults(
        # overlap allowed for matches on genomic region
        max_percent_overlap=20,
        gop=-10.0,
        gep=-2.0,
        # thresholds for joining regions
        overlap_min_score=80,
        overlap_min_coverage=80,
        overlap_max_coverage=90,
        overlap_min_identity=50,
        # threshold for filtering bad predictions:
        # minimum score
        min_total_score=80,
        # joining regions
        join_regions=0,
        # maximum coverage of query for predictions to be joined
        # (This is to ensure not to join duplications. A range check
        # would be better, but runs into trouble with repeats).
        join_regions_max_coverage=90,
        # minimum coverage of query
        min_coverage_query=10,
        # conserve memory
        conserve_memory=0,
        # minimum percent identity
        min_percent_identity=0,
        # minimum length
        min_length=0,
        max_matches=0,
        filename_peptides=None,
        filename_filter_queries=None,
        # turn on/off various filters
        filter_suboptimal=False,
        filter_regions=False,
        # parameters for filter of suboptimal predictions
        min_relative_coverage=0.5,
        min_relative_score=0.5,
        min_relative_percent_identity=0.5,
        # minimum difference between non-correlated conflicts to keep them
        # both.
        conflicts_min_difference=0.1,
        # benchmarking data
        benchmarks=None,
        benchmark_synonyms=None,
        filename_benchmark=None,
        filename_benchmark_synonyms=None,
        test=None,
        max_intron=50000)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    ##########################################################################
    # read filtering
    filter_queries = {}
    if options.filename_filter_queries:
        for line in open(options.filename_filter_queries, "r"):
            if line[0] == "#":
                continue
            query_token = line[:-1].split("\t")[0]
            filter_queries[query_token] = True

    if options.loglevel >= 1:
        options.stdlog.write("# filtering for %i queries.\n" %
                             len(filter_queries))

    ##########################################################################
    # read benchmarking regions
    if options.filename_benchmark:
        options.benchmarks = ReadBenchmarkingRegions(
            open(options.filename_benchmark, "r"))
        if options.loglevel >= 1:
            options.stdlog.write(
                "# read benchmarking regions for %i tokens\n" %
                len(options.benchmarks))
            sys.stdout.flush()
        if options.filename_benchmark_synonyms:
            infile = open(options.filename_benchmark_synonyms, "r")
            options.benchmark_synonyms = {}
            for line in infile:
                if line[0] == "#":
                    continue
                value, key = line[:-1].split("\t")
                options.benchmark_synonyms[key] = value
        else:
            options.benchmark_synonyms = {}
    else:
        options.benchmarks = {}
        options.benchmark_synonyms = {}

    ##########################################################################
    # read peptide sequences
    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            open(options.filename_peptides, "r"))
    else:
        peptide_sequences = {}

    if options.conserve_memory:
        old_predictions, filename_old_predictions = tempfile.mkstemp()
        os.close(old_predictions)
        old_predictions = PredictionFile.PredictionFile()
        old_predictions.open(filename_old_predictions, "w")
    else:
        # array with final predictions
        old_predictions = []

    if options.loglevel >= 1:
        options.stdlog.write("# reading predictions.\n")
        sys.stdout.flush()

    nread = 0
    ninput = 0
    for line in sys.stdin:

        if line[0] == "#":
            continue

        entry = PredictionParser.PredictionParserEntry(expand=0)
        entry.Read(line)
        nread += 1

        # set prediction id
        if not entry.mPredictionId:
            entry.mPredictionId = nread

        # filter bad predictions right here in order to save memory:
        if entry.score < options.min_total_score:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: score below minimum: removing: %s\n" %
                    str(entry))
            continue
        elif entry.mQueryCoverage < options.min_coverage_query:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: coverage below minimum: removing: %s\n"
                    % str(entry))
            continue
        elif entry.mPercentIdentity < options.min_percent_identity:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: percent identity below minimum: removing: %s\n"
                    % str(entry))
            continue
        elif entry.mSbjctTo - entry.mSbjctFrom < options.min_length:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: length of transcript below minimum: removing: %s\n"
                    % str(entry))
            continue

        ninput += 1

        if options.test and ninput > options.test:
            break

        old_predictions.append(entry)

    if options.loglevel >= 1:
        options.stdlog.write("# predictions after input: %i\n" % ninput)
        sys.stdout.flush()

    if options.loglevel >= 10:

        options.stdlog.write(
            "############## start: predictions after input ###################################\n"
        )
        for x in old_predictions:
            options.stdlog.write("# %s\n" % str(x))
        options.stdlog.write(
            "############## end: predictions after input #####################################\n"
        )
        sys.stdout.flush()

    if ninput == 0:
        options.stdlog.write("# ERROR: no predictions\n")
        sys.exit(1)

    ##########################################################################
    # set up stacks of regions
    if options.conserve_memory:
        old_predictions.close()
        old_predictions.open(mode="r")
        removed_predictions, filename_removed_predictions = tempfile.mkstemp()
        os.close(removed_predictions)
        removed_predictions = PredictionFile.PredictionFile()
        removed_predictions.open(filename_removed_predictions, "w")

        new_predictions, filename_new_predictions = tempfile.mkstemp()
        os.close(new_predictions)
        new_predictions = PredictionFile.PredictionFile()
        new_predictions.open(filename_new_predictions, "w")
    else:
        removed_predictions = []
        new_predictions = []

    if options.benchmarks:
        EvaluateBenchmark(old_predictions)

    ##########################################################################
    # join regions
    if options.join_regions and options.join_regions_max_coverage:
        if options.loglevel >= 1:
            options.stdlog.write(
                "# joining regions: maximum distance between segments = %i and maximum query coverage = %i\n"
                % (options.join_regions, options.join_regions_max_coverage))
            sys.stdout.flush()
        njoined = JoinRegions(old_predictions, new_predictions)
        if options.conserve_memory:
            ExchangeStreams(old_predictions, new_predictions)
        else:
            old_predictions = new_predictions
            new_predictions = []

        if options.loglevel >= 1:
            options.stdlog.write("# predictions after joining: %i\n" % njoined)
            sys.stdout.flush()

        if options.loglevel >= 10:
            options.stdlog.write(
                "############## start: predictions after joining ###################################\n"
            )
            for x in old_predictions:
                options.stdlog.write("# %s" % str(x))
            options.stdlog.write(
                "############## end: predictions after joining #####################################\n"
            )
            sys.stdout.flush()
    else:
        if options.loglevel >= 1:
            options.stdlog.write("# joining regions: skipped\n")
            sys.stdout.flush()

        njoined = ninput

    ##########################################################################
    # build map of best predictions
    if options.filter_suboptimal:
        if options.loglevel >= 1:
            options.stdlog.write("# calculating best predictions\n")
            sys.stdout.flush()
        best_predictions = GetBestPredictions(old_predictions)
    else:
        best_predictions = {}

    if options.loglevel >= 1:
        options.stdlog.write("# calculated best predictions: %i\n" %
                             len(best_predictions))
        sys.stdout.flush()

    ##########################################################################
    # get regions to eliminate
    filter_regions = {}
    if options.filter_regions:

        entry = PredictionParser.PredictionParserEntry(expand=0)

        filenames = options.filter_regions.split(",")

        for filename in filenames:
            if options.loglevel >= 1:
                options.stdlog.write("# reading regions to filter from %s.\n" %
                                     (filename))
                sys.stdout.flush()

            if filename.endswith(".gz"):
                infile = gzip.open(filename, "r")
            else:
                infile = open(filename, "r")

            for line in infile:

                if line[0] == "#":
                    continue

                entry.Read(line)

                exons = Exons.Alignment2Exons(
                    Genomics.String2Alignment(entry.mAlignmentString),
                    entry.mQueryFrom, entry.mSbjctGenomeFrom)

                key = "%s-%s" % (entry.mSbjctToken, entry.mSbjctStrand)

                if key not in filter_regions:
                    filter_regions[key] = []

                for exon in exons:
                    filter_regions[key].append(
                        (exon.mGenomeFrom, exon.mGenomeTo))

            infile.close()

        for k in filter_regions.keys():
            filter_regions[k].sort()

    ##########################################################################
    # bipartite graph construction

    ##########################################################################
    # sort predictions by genomic region
    if options.conserve_memory:
        old_predictions.sort(('mSbjctToken', 'mSbjctStrand',
                              'mSbjctGenomeFrom', 'mSbjctGenomeTo'))
    else:
        old_predictions.sort(lambda x, y: cmp(
            (x.mSbjctToken, x.mSbjctStrand, x.mSbjctGenomeFrom, x.
             mSbjctGenomeTo), (y.mSbjctToken, y.mSbjctStrand, y.
                               mSbjctGenomeFrom, y.mSbjctGenomeTo)))

    ##########################################################################
    # filter predictions and resolve conflicts based on genomic overlap
    # deleted segments are put in a temporary storage space.
    min_from, max_from = None, None
    min_to, max_to = None, None
    region_id = 0
    noverlaps = 0
    last_prediction = None
    predictions = []
    region = Region()
    nclusters = 0
    neliminated_suboptimal = 0
    neliminated_overlap = 0

    noutput, nfiltered = 0, 0

    for this_prediction in old_predictions:

        # Filter 1: skip suboptimal predictions
        if this_prediction.mQueryToken in best_predictions:

            best_prediction = best_predictions[this_prediction.mQueryToken]

            neliminated_suboptimal += 1
            if float(
                    this_prediction.mQueryCoverage
            ) / best_prediction.mQueryCoverage < options.min_relative_coverage:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: coverage below best: removing %s\n"
                        % str(this_prediction))
                continue

            if float(this_prediction.score
                     ) / best_prediction.score < options.min_relative_score:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: score below best: removing %s\n" %
                        str(this_prediction))
                continue

            if float(
                    this_prediction.mPercentIdentity
            ) / best_prediction.mPercentIdentity < options.min_relative_percent_identity:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: percent identity below best: removing %s\n"
                        % str(this_prediction))
                continue

            neliminated_suboptimal -= 1

        # Filter 2: remove predictions overlapping with certain segments
        key = "%s-%s" % (this_prediction.mSbjctToken,
                         this_prediction.mSbjctStrand)

        if key in filter_regions:

            exons = Exons.Alignment2Exons(
                Genomics.String2Alignment(this_prediction.mAlignmentString),
                this_prediction.mQueryFrom, this_prediction.mSbjctGenomeFrom)

            if CheckOverlap(map(lambda x: (x.mGenomeFrom, x.mGenomeTo), exons),
                            filter_regions[key]):
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: overlapping with taboo region: removing %s\n"
                        % str(this_prediction))
                neliminated_overlap += 1
                continue

        try:
            this_query_peptide, this_query_status, this_query_gene, this_query_transcript = \
                re.split("\s+", this_prediction.mQueryToken)
        except ValueError:
            this_query_gene = None

        # process first entry
        if min_from is None:
            min_from = this_prediction.mSbjctGenomeFrom
            max_from = this_prediction.mSbjctGenomeFrom
            max_to = this_prediction.mSbjctGenomeTo
            min_to = this_prediction.mSbjctGenomeTo
            predictions.append(this_prediction)
            last_prediction = this_prediction
            continue

        overlap = min_to > this_prediction.mSbjctGenomeFrom and \
            last_prediction.mSbjctToken == this_prediction.mSbjctToken and \
            last_prediction.mSbjctStrand == this_prediction.mSbjctStrand

        if options.loglevel >= 4:
            options.stdlog.write("# from=%i, to=%i, working on: %s\n" %
                                 (min_from, max_to, str(this_prediction)))
            options.stdlog.flush()

        # resolve overlap between different genes
        if overlap:
            noverlaps += 1
        else:
            region.mSbjctToken = last_prediction.mSbjctToken
            region.mSbjctStrand = last_prediction.mSbjctStrand
            region.mSbjctGenomeFrom = min_from
            region.mSbjctGenomeTo = max_to

            region_id, nxoutput, nxfiltered = ProcessRegion(
                predictions, region_id, region, peptide_sequences,
                filter_queries)

            noutput += nxoutput
            nfiltered += nxfiltered
            nclusters += 1
            predictions = []
            min_from = this_prediction.mSbjctGenomeFrom
            max_from = this_prediction.mSbjctGenomeFrom
            min_to = this_prediction.mSbjctGenomeTo
            max_to = this_prediction.mSbjctGenomeTo

        predictions.append(this_prediction)

        min_from = min(min_from, this_prediction.mSbjctGenomeFrom)
        max_from = max(max_from, this_prediction.mSbjctGenomeFrom)
        min_to = min(min_to, this_prediction.mSbjctGenomeTo)
        max_to = max(max_to, this_prediction.mSbjctGenomeTo)

        last_prediction = this_prediction

    if last_prediction:
        region.mSbjctToken = last_prediction.mSbjctToken
        region.mSbjctStrand = last_prediction.mSbjctStrand
        region.mSbjctGenomeFrom = min_from
        region.mSbjctGenomeTo = max_to

        region_id, nxoutput, nxfiltered = ProcessRegion(
            predictions, region_id, region, peptide_sequences, filter_queries)
        noutput += nxoutput
        nfiltered += nxfiltered

        nclusters += 1

    if options.conserve_memory:
        os.remove(filename_old_predictions)
        os.remove(filename_new_predictions)
        os.remove(filename_removed_predictions)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# pairs: nread=%i, input=%i, joined=%i, clusters=%i, regions=%i, eliminated_subopt=%i, eliminated_overlap=%i, noutput=%i, nfiltered=%i\n"
            %
            (nread, ninput, njoined, nclusters, region_id,
             neliminated_suboptimal, neliminated_overlap, noutput, nfiltered))

    E.Stop()
Exemple #2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/gff2predictions.py 2021 2008-07-10 16:00:48Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--trans",
                      dest="trans",
                      help="input is translated DNA.",
                      action="store_true")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      help="input format.",
                      type="choice",
                      choices=("exons", "psl", "gff"))

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      help="output format",
                      type="choice",
                      choices=('exontable', 'exons', 'predictions', 'cds',
                               'fasta'))

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option(
        "--predictions-file",
        dest="predictions_file",
        type="string",
        help=
        "filename with predictions. Use gene structures from this file if available."
    )

    parser.add_option("-i",
                      "--gff-field-id",
                      dest="gff_field_id",
                      type="string",
                      help="field for the feature id in the gff info section.")

    parser.add_option(
        "-p",
        "--filename-peptides",
        dest="filename_peptides",
        type="string",
        help=
        "Filename with peptide sequences. If given, it is used to check the predicted translated sequences."
    )

    parser.add_option(
        "--no-realignment",
        dest="do_realignment",
        action="store_false",
        help="do not re-align entries that do not parse correctly.")

    parser.add_option(
        "--remove-unaligned",
        dest="remove_unaligned",
        action="store_true",
        help="remove entries that have not been aligned correctly.")

    parser.add_option(
        "--input-coordinates",
        dest="input_coordinates",
        type="string",
        help=
        "specify input format for input coordinates [forward|both-zero|one-closed|open]."
    )

    parser.set_defaults(trans=False,
                        output_format="predictions",
                        format="psl",
                        gff_field_id='id',
                        input_coordinates="both-zero-open",
                        filename_peptides=None,
                        genome_file=None,
                        do_realignment=True,
                        predictions_file=None,
                        remove_unaligned=False)

    (options, args) = E.Start(parser)

    if not options.genome_file:
        raise "please specify a genome file."

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contig_sizes = fasta.getContigSizes()

    ninput, noutput, nskipped = 0, 0, 0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0, 0, 0, 0, 0, 0

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            IOTools.openFile(options.filename_peptides, "r"))
        predictor = Predictor.PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None

    converter = IndexedFasta.getConverter(options.input_coordinates)

    predictions = {}
    if options.predictions_file:
        parser = PredictionParser.iterator_predictions(
            IOTools.openFile(options.predictions_file, "r"))
        for p in parser:
            predictions[p.mPredictionId] = p

    if options.output_format == "predictions":

        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()

            nmatches = 1
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                if not re.match("^[0-9]", line):
                    continue

                try:
                    entries = parser.Parse((line, ))
                except PredictionParser.AlignmentError, e:
                    print "# %s" % str(e)
                    print "#", line[:-1]
                    sys.exit(1)

                for entry in entries:
                    entry.mPredictionId = nmatches
                    nmatches += 1

                print str(entries)

        elif options.format == "exons":
            parser = PredictionParser.PredictionParserExons(
                contig_sizes=contig_sizes)
        else:
            raise "unknown format %s for output option %s" % (
                options.format, options.output_format)

        if options.loglevel >= 2:
            options.stdlog.write("# parsing.\n")
            options.stdlog.flush()

        results = parser.Parse(sys.stdin.readlines())

        if options.loglevel >= 2:
            options.stdlog.write("# parsing finished.\n")
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# parsing: ninput=%i, noutput=%i, nerrors=%i\n" %
                (parser.GetNumInput(), parser.GetNumOutput(),
                 parser.GetNumErrors()))

            for error, msg in parser.mErrors:
                options.stdlog.write("# %s : %s\n" % (str(error), msg))
                options.stdlog.flush()

        # if genomes are given: build translation
        if options.genome_file:

            results.Sort(lambda x, y: cmp(x.mSbjctToken, y.mSbjctToken))

            new_results = PredictionParser.Predictions()

            for entry in results:

                ninput += 1

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing entry %s:%s on %s:%s %i/%i.\n" %
                        (entry.mPredictionId, entry.mQueryToken,
                         entry.mSbjctToken, entry.mSbjctStrand, ninput,
                         len(results)))
                    options.stdlog.flush()

                try:
                    lgenome = fasta.getLength(entry.mSbjctToken)
                    # added 3 residues - was a problem at split codons just before the stop.
                    # See for example the chicken sequence ENSGALP00000002741
                    genomic_sequence = fasta.getSequence(
                        entry.mSbjctToken, entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom,
                        min(entry.mSbjctGenomeTo + 3, lgenome))

                except KeyError:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# did not find entry for %s on %s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken))
                    nskipped += 1
                    continue

                if predictions and entry.mPredictionId in predictions:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# substituting entry %s on %s:%s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken,
                             entry.mSbjctStrand))
                        options.stdlog.flush()
                    entry = predictions[entry.mPredictionId]

                exons = Exons.Alignment2Exons(entry.mMapPeptide2Genome, 0,
                                              entry.mSbjctGenomeFrom)

                entry.mMapPeptide2Translation, entry.mTranslation = Genomics.Alignment2PeptideAlignment(
                    Genomics.String2Alignment(entry.mAlignmentString),
                    entry.mQueryFrom, 0, genomic_sequence)

                entry.score = entry.mMapPeptide2Translation.getColTo(
                ) - entry.mMapPeptide2Translation.getColFrom() + 1

                (entry.mNIntrons, entry.mNFrameShifts, entry.mNGaps, entry.mNSplits, entry.mNStopCodons, entry.mNDisruptions ) = \
                    Genomics.CountGeneFeatures(0,
                                               entry.mMapPeptide2Genome,
                                               genomic_sequence)

                if peptide_sequences:

                    if str(entry.mPredictionId) in peptide_sequences:

                        reference = peptide_sequences[str(
                            entry.mPredictionId)].upper()

                        translation = entry.mTranslation
                        nfound += 1

                        is_identical, nmismatches = checkIdentity(
                            reference, translation, options)

                        if is_identical:
                            nidentical += 1
                        else:
                            nmismatch += 1

                            if options.do_realignment:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches..realigning in region %i:%i\n"
                                        % (entry.mPredictionId,
                                           entry.mSbjctGenomeFrom,
                                           entry.mSbjctGenomeTo))
                                    options.stdlog.flush()

                                    result = predictor(
                                        entry.mPredictionId, reference,
                                        entry.mSbjctToken, genomic_sequence,
                                        "--subopt FALSE --score '%s'" %
                                        str(80))
                                    # "--exhaustive --subopt FALSE --score '%s'" % str(80) )

                                    if result:
                                        translation = result[0].mTranslation
                                        is_identical, nmismatches = checkIdentity(
                                            reference, translation, options)
                                    else:
                                        if options.loglevel >= 2:
                                            options.stdlog.write(
                                                "# %s: realignment returned empty result\n"
                                                % (entry.mPredictionId))
                                            options.stdlog.flush()
                                        is_identical = False

                                    if is_identical:
                                        naligned += 1
                                        prediction_id = entry.mPredictionId
                                        sbjct_genome_from = entry.mSbjctGenomeFrom
                                        entry = result[0]
                                        entry.mPredictionId = prediction_id
                                        entry.mSbjctGenomeFrom += sbjct_genome_from
                                    else:
                                        nunaligned += 1
                                        if options.loglevel >= 1:
                                            options.stdlog.write(
                                                "# %s: mismatch on %s:%s:%i-%i after realignment\n# reference =%s\n# translated=%s\n# realigned =%s\n"
                                                %
                                                (entry.mPredictionId,
                                                 entry.mSbjctToken,
                                                 entry.mSbjctStrand,
                                                 entry.mSbjctGenomeFrom,
                                                 entry.mSbjctGenomeTo,
                                                 reference, entry.mTranslation,
                                                 translation))
                                            options.stdlog.flush()
                                        if options.remove_unaligned:
                                            nskipped += 1
                                            continue

                            else:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches on %s ... no realignment\n"
                                        % (
                                            entry.mPredictionId,
                                            entry.mSbjctToken,
                                        ))
                                    if options.loglevel >= 3:
                                        options.stdlog.write(
                                            "# %s: mismatch before realignment\n# reference =%s\n# translated=%s\n"
                                            % (entry.mPredictionId, reference,
                                               translation))
                                    options.stdlog.flush()

                                if options.remove_unaligned:
                                    nskipped += 1
                                    continue

                    else:
                        nnotfound += 1

                new_results.append(entry)
                noutput += 1

            results = new_results
        if results:
            options.stdout.write(str(results) + "\n")
Exemple #3
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/predictions2introns.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-o",
                      "--output-filename-summary",
                      dest="output_filename_summary",
                      type="string",
                      help="filename with summary information.")

    parser.add_option("--skip-header",
                      dest="skip_header",
                      action="store_true",
                      help="skip header.")

    parser.add_option(
        "--fill-introns",
        dest="fill_introns",
        type="int",
        help=
        "fill intron if divisible by three and no stop codon up to a maximum length of #."
    )

    parser.add_option(
        "--introns-max-stops",
        dest="introns_max_stops",
        type="int",
        help="maximum number of stop codons to tolerate within an intron.")

    parser.add_option("--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("predictions", "extensions", "filled-introns"),
                      help="output format.")

    parser.set_defaults(
        genome_file="genome",
        start_codons=("ATG"),
        stop_codons=("TAG", "TAA", "TGA"),
        skip_header=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0
    nfilled = 0
    nseqs_filled = 0
    nseqs_extended = 0
    left_extensions = []
    right_extensions = []
    filled_introns = []

    if not options.skip_header:
        options.stdout.write("\t".join((
            "prediction_id",
            "intron",
            "contig",
            "strand",
            "start",
            "end",
            "length",
            "nstops",
            "type",
            "prime5",
            "prime3",
        )) + "\n")

    for line in sys.stdin:

        if line[0] == "#": continue

        ninput += 1
        p.Read(line)

        lsequence = fasta.getLength(p.mSbjctToken)

        genomic_sequence = fasta.getSequence(p.mSbjctToken, p.mSbjctStrand,
                                             p.mSbjctGenomeFrom,
                                             p.mSbjctGenomeTo).upper()

        exons = Exons.Alignment2Exons(p.mMapPeptide2Genome,
                                      query_from=0,
                                      sbjct_from=0)

        new_exons = []

        last_e = exons[0]

        nintron = 0

        for e in exons[1:]:

            nintron += 1
            lintron = e.mGenomeFrom - last_e.mGenomeTo

            intron_is_l3 = lintron % 3 != 0

            if intron_is_l3:
                ## get sequence, include also residues from split codons
                ## when checking for stop codons.
                ## note that e.mAlignment can sometimes be empty. This might
                ## be an exonerate bug. In the alignment string there are two
                ## consecutive exons.
                if e.mAlignment and last_e.mAlignment and e.mAlignment[0][
                        0] == "S":
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0

                sequence = genomic_sequence[last_e.mGenomeTo -
                                            offset_left:e.mGenomeFrom +
                                            offset_right]

                intron_nstops = 0
                for codon in [
                        sequence[x:x + 3] for x in range(0, len(sequence), 3)
                ]:
                    if codon in options.stop_codons:
                        intron_nstops += 1
            else:
                intron_nstops = 0

            ## check for splice signals
            sequence = genomic_sequence[last_e.mGenomeTo:e.mGenomeFrom]

            intron_type, prime5, prime3 = Genomics.GetIntronType(sequence)

            if options.loglevel >= 2:
                options.stdlog.write( "\t".join(map(str, (p.mPredictionId,
                                                          nintron,
                                                          lintron,
                                                          intron_nstops,
                                                          intron_type,
                                                          genomic_sequence[last_e.mGenomeTo-6:last_e.mGenomeTo].lower() + "|" + sequence[:5] + "..." +\
                                                          sequence[-5:] + "|" + genomic_sequence[e.mGenomeFrom:e.mGenomeFrom+6].lower()) ) ) + "\n" )

            options.stdout.write("\t".join(
                map(str, (p.mPredictionId, nintron, p.mSbjctToken,
                          p.mSbjctStrand,
                          last_e.mGenomeTo + p.mSbjctGenomeFrom,
                          e.mGenomeFrom + p.mSbjctGenomeFrom, lintron,
                          intron_nstops, intron_type, prime5, prime3))) + "\n")

            last_e = e

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i.\n" % (\
            ninput, noutput))

    E.Stop()
Exemple #4
0
                              e.mGenomeFrom, e.mGenomeTo)))

    elif options.output_format == "exons":

        if options.format == "exons":
            parser = PredictionParser.PredictionParserExons(
                contig_sizes=contig_sizes)
        else:
            raise "unknown format %s." % options.format

        results = parser.Parse(sys.stdin.readlines())
        id = 0
        for entry in results:
            exons = Exons.Alignment2Exons(
                entry.mMapPeptide2Genome,
                entry.mQueryFrom,
                entry.mSbjctGenomeFrom,
            )

            for e in exons:
                id += 1
                print "\t".join(
                    map(str,
                        (entry.mQueryToken, entry.mSbjctToken,
                         entry.mSbjctStrand, e.frame, e.mRank, e.mPeptideFrom,
                         e.mPeptideTo, e.mGenomeFrom, e.mGenomeTo)))

    elif options.output_format == "cds":

        if options.format == "exons":
Exemple #5
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/predictions2cds.py 1858 2008-05-13 15:07:05Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-o",
                      "--forward-coordinates",
                      dest="forward_coordinates",
                      action="store_true",
                      help="input uses forward coordinates.")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("default", "cds", "cdnas", "map", "gff",
                               "intron-fasta", "exons"),
                      help="output format.")

    parser.add_option("-r",
                      "--reset-to-start",
                      dest="reset_to_start",
                      action="store_true",
                      help="move genomic coordinates to begin from 0.")

    parser.add_option("--reset-query",
                      dest="reset_query",
                      action="store_true",
                      help="move peptide coordinates to begin from 0.")

    parser.set_defaults(genome_file=None,
                        forward_coordinates=False,
                        format="default",
                        reset_to_start=False,
                        reset_query=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    cds_id = 1

    entry = PredictionParser.PredictionParserEntry()

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    ninput, noutput, nskipped, nerrors = 0, 0, 0, 0

    for line in sys.stdin:

        if line[0] == "#":
            continue
        if line.startswith("id"):
            continue

        ninput += 1

        try:
            entry.Read(line)
        except ValueError, msg:
            options.stdlog.write("# parsing failed with msg %s in line %s" %
                                 (msg, line))
            nerrors += 1
            continue

        cds = Exons.Alignment2Exons(entry.mMapPeptide2Genome,
                                    query_from=entry.mQueryFrom,
                                    sbjct_from=entry.mSbjctGenomeFrom,
                                    add_stop_codon=0)

        for cd in cds:
            cd.mSbjctToken = entry.mSbjctToken
            cd.mSbjctStrand = entry.mSbjctStrand

        if cds[-1].mGenomeTo != entry.mSbjctGenomeTo:
            options.stdlog.write(
                "# WARNING: discrepancy in exon calculation!!!\n")
            for cd in cds:
                options.stdlog.write("# %s\n" % str(cd))
            options.stdlog.write("# %s\n" % entry)

        lsequence = fasta.getLength(entry.mSbjctToken)
        genomic_sequence = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        # deal with forward coordinates: convert them to negative strand
        # coordinates
        if options.forward_coordinates and \
                entry.mSbjctStrand == "-":
            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = lsequence - \
                entry.mSbjctGenomeTo, lsequence - entry.mSbjctGenomeFrom
            for cd in cds:
                cd.InvertGenomicCoordinates(lsequence)

        # attach sequence to cds
        for cd in cds:
            start = cd.mGenomeFrom - entry.mSbjctGenomeFrom
            end = cd.mGenomeTo - entry.mSbjctGenomeFrom
            cd.mSequence = genomic_sequence[start:end]

        # reset coordinates for query
        if options.reset_to_start:
            offset = entry.mPeptideFrom
            for cd in cds:
                cd.mPeptideFrom -= offset
                cd.mPeptideTo -= offset

        # play with coordinates
        if options.reset_to_start:
            offset = entry.mSbjctGenomeFrom
            for cd in cds:
                cd.mGenomeFrom -= offset
                cd.mGenomeTo -= offset
        else:
            offset = 0

        if options.format == "cds":
            rank = 0
            for cd in cds:
                rank += 1
                cd.mQueryToken = entry.mQueryToken
                cd.mSbjctToken = entry.mSbjctToken
                cd.mSbjctStrand = entry.mSbjctStrand
                cd.mRank = rank
                print str(cd)

        if options.format == "exons":
            rank = 0
            for cd in cds:
                rank += 1
                options.stdout.write("\t".join(
                    map(str, (entry.mPredictionId, cd.mSbjctToken,
                              cd.mSbjctStrand, rank, cd.frame, cd.mPeptideFrom,
                              cd.mPeptideTo, cd.mGenomeFrom, cd.mGenomeTo))) +
                                     "\n")

        elif options.format == "cdnas":
            print string.join(
                map(str,
                    (entry.mPredictionId, entry.mQueryToken, entry.mSbjctToken,
                     entry.mSbjctStrand, entry.mSbjctGenomeFrom - offset,
                     entry.mSbjctGenomeTo - offset, genomic_sequence)), "\t")

        elif options.format == "map":

            map_prediction2genome = alignlib_lite.makeAlignmentSet()

            for cd in cds:
                alignlib_lite.addDiagonal2Alignment(
                    map_prediction2genome, cd.mPeptideFrom + 1, cd.mPeptideTo,
                    (cd.mGenomeFrom - offset) - cd.mPeptideFrom)

            print string.join(
                map(str, (entry.mPredictionId, entry.mSbjctToken,
                          entry.mSbjctStrand,
                          alignlib_lite.AlignmentFormatEmissions(
                              map_prediction2genome))), "\t")

        elif options.format == "intron-fasta":
            rank = 0
            if len(cds) == 1:
                nskipped += 1
                continue

            last = cds[0].mGenomeTo
            for cd in cds[1:]:
                rank += 1
                key = "%s %i %s:%s:%i:%i" % (
                    entry.mPredictionId, rank, entry.mSbjctToken,
                    entry.mSbjctStrand, last, entry.mSbjctGenomeFrom)
                sequence = genomic_sequence[last - entry.mSbjctGenomeFrom:cd.
                                            mGenomeFrom -
                                            entry.mSbjctGenomeFrom]
                options.stdout.write(">%s\n%s\n" % (key, sequence))
                last = cd.mGenomeTo

        elif options.format == "gff-match":
            print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Introns %i; Frameshifts %i; Stops %i" % \
                  (entry.mSbjctToken,
                   "gpipe", "similarity",
                   entry.mSbjctGenomeFrom,
                   entry.mSbjctGenomeTo,
                   entry.mPercentIdentity,
                   entry.mSbjctStrand,
                   ".",
                   entry.mQueryToken,
                   entry.mQueryFrom,
                   entry.mQueryTo,
                   entry.score,
                   entry.mNIntrons,
                   entry.mNFrameShifts,
                   entry.mNStopCodons)

        elif options.format == "gff-exon":
            rank = 0
            for cd in cds:
                rank += 1
                print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Rank %i/%i; Prediction %i" % \
                      (entry.mSbjctToken,
                       "gpipe", "similarity",
                       cd.mGenomeFrom,
                       cd.mGenomeTo,
                       entry.mPercentIdentity,
                       entry.mSbjctStrand,
                       ".",
                       entry.mQueryToken,
                       cd.mPeptideFrom / 3 + 1,
                       cd.mPeptideTo / 3 + 1,
                       entry.score,
                       rank,
                       len(cds),
                       entry.mPredictionId)
        else:
            exon_from = 0
            for cd in cds:
                cd.mPeptideFrom = exon_from
                exon_from += cd.mGenomeTo - cd.mGenomeFrom
                cd.mPeptideTo = exon_from
                print string.join(
                    map(str, (cds_id, entry.mPredictionId, cd.mPeptideFrom,
                              cd.mPeptideTo, cd.frame, cd.mGenomeFrom,
                              cd.mGenomeTo, cd.mSequence)), "\t")
                cds_id += 1

        noutput += 1
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/predictions2transcripts.py 1841 2008-05-08 12:07:13Z andreas $",
                                    usage = globals()["__doc__"] )
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome."  )

    parser.add_option("-o", "--output-filename-summary", dest="output_filename_summary", type="string",
                      help="filename with summary information."  )

    parser.add_option( "--skip-header", dest="skip_header", action="store_true",
                       help="skip header."  )

    parser.add_option( "--start-codon-boundary", dest="start_codon_boundary", type="int",
                      help="maximum extension for start codon (make divisible by 3)."  )
    
    parser.add_option( "--stop-codon-boundary", dest="stop_codon_boundary", type="int",
                      help="maximum extension for stop codon (make divisible by 3)."  )

    parser.add_option( "--left-extension-mode", dest="left_extension_mode", type="choice",
                       choices=("first-start", "first-stop-backtrack"),
                       help="extension mode for 5' end.")

    parser.add_option( "--fill-introns", dest="fill_introns", type="int",
                      help="fill intron if divisible by three and no stop codon up to a maximum length of #."  )

    parser.add_option( "--introns-max-stops", dest="introns_max_stops", type="int",
                      help="maximum number of stop codons to tolerate within an intron."  )

    parser.add_option( "--output-format", dest="output_format", type="choice",
                       choices=("predictions", "extensions", "filled-introns"),
                      help="output format."  )
    
    parser.set_defaults(
        genome_file = "genome",
        start_codons = ("ATG"),
        stop_codons = ("TAG", "TAA", "TGA"),
        start_codon_boundary = 9999,
        stop_codon_boundary  = 9999,
        fill_introns = 0,
        introns_max_stops = 0,
        left_splice_signals = ("GT",),
        right_splice_signals = ("AG",),
        output_format="extensions",
        left_extension_mode = "first-start",
        skip_header = False,
        output_filename_summary = None,
        )

    (options, args) = E.Start( parser, add_pipe_options = True )

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    options.start_codon_boundary = int(options.start_codon_boundary / 3)
    options.stop_codon_boundary = int(options.stop_codon_boundary / 3)

    fasta = IndexedFasta.IndexedFasta( options.genome_file )
    
    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0
    nfilled = 0
    nseqs_filled = 0
    nseqs_extended = 0
    left_extensions = []
    right_extensions = []
    filled_introns = []

    if not options.skip_header:
        if options.output_format == "predictions":
            options.stdout.write( Prediction.Prediction().getHeader() + "\n" )
        elif options.output_format == "filled-introns":
            options.stdout.write("\t".join( ("prediction_id",
                                             "intron",
                                             "peptide_sequence",
                                             "genomic_sequence") ) + "\n" )

    if options.output_filename_summary:
        outfile_summary = open (options.output_filename_summary, "w" )
        outfile_summary.write( "id\ttype\tnumber\tlength\tfrom\tto\tsequence\n" )
    else:
        outfile_summary = None

    for line in options.stdin:
        
        if line[0] == "#": continue

        ninput += 1
        p.Read(line)

        lsequence = fasta.getLength( p.mSbjctToken )

        genome_from = max( 0, p.mSbjctGenomeFrom - options.start_codon_boundary)
        genome_to = min( lsequence, p.mSbjctGenomeTo + options.stop_codon_boundary)
        
        genomic_sequence = fasta.getSequence( p.mSbjctToken, p.mSbjctStrand,
                                              genome_from,
                                              genome_to ).upper()

        ########################################################################
        ########################################################################
        ########################################################################            
        ## Do extensions
        
        if options.start_codon_boundary or options.stop_codon_boundary:
            
            extension_start = p.mSbjctGenomeFrom - genome_from 
            extension_stop  = genome_to - p.mSbjctGenomeTo
            
            fragment_to = extension_start + p.mSbjctGenomeTo - p.mSbjctGenomeFrom

            lfragment = len(genomic_sequence)

            ########################################################################
            ########################################################################
            ########################################################################            
            ## find start codon
            start = extension_start
            found_start = False
            if options.left_extension_mode == "first-start":

                found_start, start = findCodonReverse( genomic_sequence,
                                                       start,
                                                       options.start_codons,
                                                       options.stop_codons )
                
            elif options.left_extension_mode == "first-stop-backtrack":

                if genomic_sequence[start:start+3] in options.start_codons:
                    found_start = True
                else:
                    found_start, start = findCodonReverse( genomic_sequence,
                                                           start,
                                                           options.stop_codons )
                    
                    if found_start:
                        E.info("prediction %s: stop found at %i (%i) backtracking ..." % ( p.mPredictionId, start, extension_start - start) )
                        
                        ## bracktrack to first start codon
                        found_start = False
                        while start < extension_start:
                            start += 3
                            if genomic_sequence[start:start+3] in options.start_codons:
                                found_start = True
                                break
                        else:
                            start = extension_start

                        if found_start:
                            E.info("start codon found at %i (%i)." % ( start, extension_start - start) )
                        else:
                            E.info("no start codon found." )
                    else:
                        E.info("prediction %s: no stop found ... backtracking to start codon." % ( p.mPredictionId ) )

                        found_start, start = findCodonReverse( genomic_sequence, start, options.start_codons )

                        E.info("prediction %s: no start codon found." % ( p.mPredictionId ) )

            if found_start:
                start += genome_from
            else:
                start = p.mSbjctGenomeFrom

            dstart = p.mSbjctGenomeFrom - start
            
            ########################################################################
            ########################################################################
            ########################################################################            
            ## find stop codon
            ## stop points to the beginning of the codon, thus the stop codon will
            ## not be part of the sequence.
            stop = fragment_to
            found_stop = 0
            while stop < lfragment and \
                      genomic_sequence[stop:stop+3] not in ("NNN", "XXX"):
                if genomic_sequence[stop:stop+3] in options.stop_codons:
                    found_stop = 1
                    break

                stop += 3

            if found_stop:
                stop += genome_from 
            else:
                stop = p.mSbjctGenomeTo

            dstop = stop - p.mSbjctGenomeTo 

            ########################################################################
            ########################################################################
            ########################################################################            
            ## build new prediction
            map_peptide2genome = []
            if dstart: map_peptide2genome.append( ("G", 0, dstart) )
            map_peptide2genome += p.mMapPeptide2Genome
            if dstop: map_peptide2genome.append( ("G", 0, dstop) )

            E.info("prediction %s: extension: found_start=%i, found_stop=%i, left=%i, right=%i" % ( p.mPredictionId, found_start, found_stop, dstart, dstop ) )

            ## save results
            p.mMapPeptide2Genome = map_peptide2genome
            p.mAlignmentString = Genomics.Alignment2String( map_peptide2genome )
            p.mSbjctGenomeFrom -= dstart
            p.mSbjctGenomeTo += dstop
            p.mSbjctFrom += dstart / 3
            p.mSbjctTo += dstart / 3 + dstop / 3            
            
            if dstart or dstop:
                if dstart: left_extensions.append( dstart )
                if dstop: right_extensions.append( dstop )
                
                nseqs_extended += 1

        ## update genomic sequence because borders might have changed.
        genomic_sequence = fasta.getSequence( p.mSbjctToken,
                                              p.mSbjctStrand,
                                              p.mSbjctGenomeFrom,
                                              p.mSbjctGenomeTo ).upper()

        if options.fill_introns:
            
            has_filled = False

            exons = Exons.Alignment2Exons( p.mMapPeptide2Genome,
                                           query_from = 0,
                                           sbjct_from = 0 )

            new_exons = []

            last_e = exons[0]

            nintron = 0

            for e in exons[1:]:

                nintron += 1
                lintron = e.mGenomeFrom - last_e.mGenomeTo
                
                if lintron > options.fill_introns or (lintron) % 3 != 0:
                    E.debug( "prediction %s: intron %i of size %i discarded." % \
                                 (p.mPredictionId,
                                  nintron, lintron ) )
                    
                    new_exons.append(last_e)
                    last_e = e
                    continue

                ## get sequence, include also residues from split codons
                ## when checking for stop codons.
                if e.mAlignment[0][0] == "S":
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0
                    
                sequence = genomic_sequence[last_e.mGenomeTo - offset_left:e.mGenomeFrom+offset_right]
                
                ## check for splice sites
                for signal in options.left_splice_signals:
                    if sequence[offset_left:offset_left+len(signal)] == signal:
                        left_signal = True
                        break
                else:
                    left_signal = False
                    
                for signal in options.right_splice_signals:
                    if sequence[-(len(signal)+offset_right):-offset_right] == signal:
                        right_signal = True
                        break
                else:
                    right_signal = False

                nstops, ngaps = 0, 0
                for codon in [ sequence[x:x+3] for x in range(0,len(sequence),3) ]:
                    if codon in options.stop_codons: nstops += 1
                    if "N" in codon.upper(): ngaps += 1
                        
                    E.debug( "prediction %s: intron %i of size %i (%i-%i) (%s:%s:%i:%i): stops=%i, gaps=%i, signals=%s,%s." % \
                                 (p.mPredictionId,
                                  nintron, lintron,
                                  offset_left, offset_right,
                                  p.mSbjctToken, p.mSbjctStrand,
                                  p.mSbjctGenomeFrom + last_e.mGenomeTo,
                                  p.mSbjctGenomeFrom + e.mGenomeFrom,
                                  nstops,
                                  ngaps,
                                  left_signal, right_signal ) )

                if nstops + ngaps > options.introns_max_stops:
                    new_exons.append(last_e)                                        
                    last_e = e
                    continue
                
                E.info( "prediction %s: filling intron %i of size %i: stops=%i, gaps=%i, signals=%s,%s" % \
                            (p.mPredictionId,
                             nintron, lintron,
                             nstops,
                             ngaps,
                             left_signal, right_signal))

                e.Merge( last_e )
                has_filled = True
                nfilled += 1
                last_e = e

                if options.output_format == "filled-introns":
                    options.stdout.write( "\t".join( map(str, ( p.mPredictionId,
                                                                nintron,
                                                                Genomics.TranslateDNA2Protein( sequence ),
                                                                sequence ) ) ) + "\n" )
                                                                
                
                filled_introns.append(lintron)
                p.mNIntrons -= 1
                
            new_exons.append(last_e)

            if has_filled: nseqs_filled += 1

            Exons.UpdatePeptideCoordinates( new_exons )
            
            p.mMapPeptide2Genome = Exons.Exons2Alignment( new_exons )
            p.mAlignmentString = Genomics.Alignment2String( p.mMapPeptide2Genome )

        ## build translated sequence
        p.mMapPeptide2Translation, p.mTranslation = Genomics.Alignment2PeptideAlignment( \
               p.mMapPeptide2Genome, p.mQueryFrom, 0, genomic_sequence )

        ## output info
        if options.output_format == "predictions":
            options.stdout.write( str(p) + "\n" )
        elif options.output_format == "extensions":
            if found_start: found_start = 1
            if found_stop: found_stop = 1
            options.stdout.write( "\t".join( map(str, ( p.mPredictionId,
                                                        found_start, found_stop, 
                                                        dstart, dstop,
                                                        p.mTranslation,
                                                        p.mSbjctGenomeFrom, p.mSbjctGenomeTo,
                                                        p.mAlignmentString ))) + "\n" )

        noutput += 1
        options.stdout.flush()

    E.info("stats  : %s" % "\t".join(Stats.DistributionalParameters().getHeaders() ))
    E.info("left   : %s" % str(Stats.DistributionalParameters(left_extensions)) )
    E.info("right  : %s" % str(Stats.DistributionalParameters(right_extensions)) )
    E.info("introns: %s" % str(Stats.DistributionalParameters(filled_introns)) )        
    E.info("ninput=%i, noutput=%i, nextended=%i, nfilled=%i, nexons_filled=%i" % (\
            ninput, noutput, nseqs_extended, nseqs_filled, nfilled))
        
    E.Stop()
Exemple #7
0
def ResolveExonOverlaps(gene_id, predictions):
    """resolve overlaps between predictions based
    on exonic overlap."""

    all_exons = []
    n = 1

    if len(predictions) == 0:
        return gene_id

    for p in predictions:

        exons = Exons.Alignment2Exons(Genomics.String2Alignment(
            p.mAlignmentString),
                                      query_from=0,
                                      sbjct_from=p.mSbjctGenomeFrom)
        for exon in exons:
            all_exons.append((exon.mGenomeFrom, exon.mGenomeTo, n))
        else:
            all_exons.append((p.mSbjctGenomeFrom, p.mSbjctGenomeTo, n))

        n += 1

    map_prediction2gene = range(0, len(predictions) + 1)
    map_gene2predictions = [None]
    for x in range(1, len(predictions) + 1):
        map_gene2predictions.append([x])

    all_exons.sort()
    # print all_exons

    # cluster exons by overlap
    last_exon_from, last_exon_to, last_p = all_exons[0]

    for exon_from, exon_to, p in all_exons[1:]:
        # if overlap
        overlap = min(exon_to, last_exon_to) - max(exon_from, last_exon_from)

        if overlap and param_exon_identity:
            overlap = (exon_from == last_exon_from) and (exon_to
                                                         == last_exon_to)

        if overlap > 0:
            # print "# overlap between %i and %i" % (p, last_p)
            # rewire pointers to point to gene of previous prediction
            # if they belong to different genes
            new_g = map_prediction2gene[last_p]
            old_g = map_prediction2gene[p]

            if new_g != old_g:
                for x in map_gene2predictions[old_g]:
                    map_gene2predictions[new_g].append(x)
                    map_prediction2gene[x] = new_g
                map_gene2predictions[old_g] = []

        # if no overlap: create new gene, if predictions has no gene
        # associated with it yet.
        else:
            # print "# no overlap between %i and %i" % (p, last_p)
            if not map_prediction2gene[p]:
                map_prediction2gene[p] = len(map_gene2predictions)
                map_gene2predictions.append([p])

        if param_exon_identity:
            last_exon_to = exon_to
            last_exon_from = exon_from
        else:
            last_exon_to = max(last_exon_to, exon_to)

        last_p = p

    for x in range(1, len(map_gene2predictions)):
        if map_gene2predictions[x]:
            for p in map_gene2predictions[x]:
                print "%i\t%i" % (gene_id, predictions[p - 1].mPredictionId)
            gene_id += 1

    return gene_id
Exemple #8
0
        alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep)
    map_reference2target = alignlib_lite.makeAlignmentVector()
    assignment_id = 0

    for line in cr.fetchall():

        reference = PredictionParser.PredictionParserEntry()
        reference.FillFromTable(line)

        ct = dbhandle.cursor()
        ct.execute(statement %
                   (param_tablename_predictions_target, reference.mSbjctToken,
                    reference.mSbjctStrand, reference.mSbjctGenomeFrom,
                    reference.mSbjctGenomeTo))

        reference_exons = Exons.Alignment2Exons(reference.mMapPeptide2Genome,
                                                0, reference.mSbjctFrom)

        for line2 in ct.fetchall():
            target = PredictionParser.PredictionParserEntry()
            target.FillFromTable(line2)

            target_exons = Exons.Alignment2Exons(target.mMapPeptide2Genome, 0,
                                                 target.mSbjctFrom)

            ## check for exon overlap
            rr, tt = 0, 0
            overlap = 0
            while rr < len(reference_exons) and tt < len(target_exons):

                r = reference_exons[rr]
                t = target_exons[tt]