Esempio n. 1
0
def CheckAlignments(peptide_sequences, query_token, other_tokens):
    """Check whether the query aligns to at least one of the other tokens.

    The peptide of *query_token* is locally aligned against the peptide
    of each token in *other_tokens*. Tokens without an entry in
    *peptide_sequences* are skipped.

    Returns True if the query has no sequence in *peptide_sequences*
    (nothing can be checked) or as soon as one alignment scores above
    ``param_min_alignment_score``. Returns False otherwise.
    """
    if param_loglevel >= 3:
        print("# checking query %s and sbjcts %s" %
              (query_token, str(other_tokens)))
        sys.stdout.flush()

    # a query without sequence can not be tested and is accepted
    if query_token not in peptide_sequences:
        return True

    alignment = alignlib_lite.makeAlignmentVector()
    # local alignment; -10.0/-1.0 are presumably the gap opening and
    # gap extension penalties
    aligner = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0)
    query_sequence = alignlib_lite.makeSequence(
        peptide_sequences[query_token])

    for sbjct_token in other_tokens:
        if sbjct_token in peptide_sequences:
            sbjct_sequence = alignlib_lite.makeSequence(
                peptide_sequences[sbjct_token])
            aligner.align(alignment, query_sequence, sbjct_sequence)
            if param_loglevel >= 5:
                print("# %s - %s = %f" %
                      (query_token, sbjct_token, alignment.getScore()))
            # the first sufficiently good alignment settles the question
            if alignment.getScore() > param_min_alignment_score:
                return True

    return False
Esempio n. 2
0
def CheckAlignments(peptide_sequences, query_token, other_tokens):
    """Report whether *query_token* aligns to any token in *other_tokens*.

    Sequences are looked up in *peptide_sequences*. A query without a
    sequence yields True; subjects without a sequence are ignored.
    An alignment counts as a hit if its score is larger than
    ``param_min_alignment_score``; the first hit returns True, no hit
    at all returns False.
    """
    verbosity = param_loglevel

    if verbosity >= 3:
        message = "# checking query %s and sbjcts %s" % (query_token,
                                                         str(other_tokens))
        print(message)
        sys.stdout.flush()

    if query_token not in peptide_sequences:
        # nothing to align: treat the query as compatible
        return True

    ali = alignlib_lite.makeAlignmentVector()
    dp = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0)
    row = alignlib_lite.makeSequence(peptide_sequences[query_token])

    # only subjects with a known peptide sequence can be aligned
    for token in other_tokens:
        if token not in peptide_sequences:
            continue

        col = alignlib_lite.makeSequence(peptide_sequences[token])
        dp.align(ali, row, col)

        if verbosity >= 5:
            print("# %s - %s = %f" % (query_token, token, ali.getScore()))

        if ali.getScore() > param_min_alignment_score:
            return True

    return False
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Predictions are read from stdin. Their subject coordinates are
    optionally mapped from old to new coordinates, either through
    explicit alignments or through segment offsets (``--filename-map``
    together with ``--input-format``). If a genome is given, the
    translation of each mapped prediction is compared with the stored
    translation and, on mismatch, the prediction is re-predicted.
    Surviving predictions are written to ``options.stdout``, summary
    counters to ``options.stdlog`` and diagnostics to ``options.stderr``.
    """

    if argv is None:
        argv = sys.argv

    # NOTE(review): ``parser`` is not created in this function and *argv*
    # is not forwarded to E.Start() - presumably ``parser`` is a
    # module-level option parser; confirm against the rest of the file.
    parser.add_option("-m", "--filename-map", dest="filename_map", type="string",
                      help="filename with mapping information.")
    parser.add_option("-o", "--pattern-old", dest="pattern_old", type="string",
                      help="pattern for mapping new to old identifiers: extract string from old.")
    parser.add_option("-n", "--pattern-new", dest="pattern_new", type="string",
                      help="pattern for mapping new to old identifiers: put string into new.")
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="genome_file.")
    parser.add_option("-p", "--peptides", dest="filename_peptides", type="string",
                      help="filename with peptide sequences.")
    parser.add_option("-f", "--input-format", dest="input_format", type="choice",
                      help="format of mapping file", choices=("alignment", "offsets"))
    parser.add_option("-i", "--write-missed", dest="write_missed", type="string",
                      help="write missed identifiers to separate file.")
    parser.add_option("-a", "--filename-genes", dest="filename_genes", type="string",
                      help="filename with gene information.")
    parser.add_option("--filename-old-peptides", dest="filename_old_peptides", type="string",
                      help="filename with old peptide information.")
    parser.add_option("--no-renumber", dest="renumber", action="store_false",
                      help="do not renumber predictions.")
    parser.add_option("--contig-sizes-old", dest="contig_sizes_old", type="string",
                      help="contig sizes for old data.")
    parser.add_option("--contig-sizes-new", dest="contig_sizes_new", type="string",
                      help="contig sizes for new data.")
    parser.add_option("--skip-errors", dest="skip_errors", action="store_true",
                      help="skip entries with errors.")

    parser.set_defaults(
        filename_map=None,
        pattern_old="(.+)",
        pattern_new="%s",
        genome_file=None,
        filename_peptides=None,
        write_missed=None,
        filename_genes=None,
        filename_old_peptides=None,
        renumber=True,
        input_format="alignment",
        contig_sizes_old=None,
        contig_sizes_new=None,
        skip_errors=None
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    predictor = PredictorExonerate()

    # the different mapping criteria
    map_sbjcts = {}
    breakpoints = {}

    ##########################################################################
    # read map of transcripts to genes (columns: gene, transcript)
    map_transcript2gene = {}
    if options.filename_genes:
        with open(options.filename_genes, "r") as infile:
            for line in infile:
                if line[0] == "#":
                    continue
                gene, transcript = line[:-1].split("\t")[:2]
                map_transcript2gene[transcript] = gene

    ##########################################################################
    peptides = {}
    if options.filename_peptides:
        with open(options.filename_peptides, "r") as infile:
            peptides = Genomics.ReadPeptideSequences(infile)
        options.stdlog.write("# read %i peptide sequences.\n" % len(peptides))

    ##########################################################################
    # read old query sequences and compare against new query sequences
    # this can be used to build a map between old and new queries
    query_map_old2new = {}
    # defined up-front: ``unmapped`` is consulted when writing missed
    # identifiers, also if no old peptides have been supplied.
    unmappable, unmapped = [], []
    if options.filename_old_peptides:
        with open(options.filename_old_peptides, "r") as infile:
            old_peptides = Genomics.ReadPeptideSequences(infile)
        options.stdlog.write(
            "# read %i old peptide sequences.\n" % len(old_peptides))
        query_map_old2new, unmappable, unmapped = Genomics.MapSequences(
            old_peptides, peptides)
        options.stdlog.write(
            "# built map: unmappable=%i unmapped=%i.\n" % (len(unmappable), len(unmapped)))
        if options.loglevel >= 2:
            options.stdlog.write("# unmappable: %s.\n" % ";".join(unmappable))
            options.stdlog.write("# unmapped: %s.\n" % ";".join(unmapped))

    ##########################################################################
    # read old/new contig sizes for mapping positive/negative coordinates
    contig_sizes_old = {}
    contig_sizes_new = {}
    if options.contig_sizes_old:
        with open(options.contig_sizes_old, "r") as infile:
            contig_sizes_old = Genomics.ReadContigSizes(infile)
    if options.contig_sizes_new:
        with open(options.contig_sizes_new, "r") as infile:
            contig_sizes_new = Genomics.ReadContigSizes(infile)

    ##########################################################################
    if options.filename_map:

        with open(options.filename_map) as infile:
            # The option is declared with the choice "alignment"; the
            # plural spelling is accepted as well for safety.
            if options.input_format in ("alignment", "alignments"):
                for line in infile:
                    if line[0] == "#":
                        continue

                    (_unused, old_token, old_from, old_to, old_ali,
                     new_from, new_to, new_ali) = line[:-1].split("\t")

                    map_sbjcts[old_token] = (
                        old_from, old_ali, new_from, new_ali)

                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# read %i alignments.\n" % len(map_sbjcts))

            elif options.input_format == "offsets":
                # input is a list of segments and their offsets.

                breakpoints, endpoints, offsets = ReadOffsets(infile)
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# read breakpoints for %i chromosomes.\n" % len(breakpoints))

    ##########################################################################
    ##########################################################################
    ##########################################################################
    # end of input section
    ##########################################################################
    ##########################################################################
    ##########################################################################

    rx = re.compile(options.pattern_old)
    last_sbjct_token = None
    ninput = 0
    nerrors = 0
    nerrors_map = 0
    nerrors_inconsistencies = 0
    nerrors_boundaries = 0
    nerrors_translation = 0
    nerrors_inconsequential = 0
    nerrors_realigned = 0
    nmapped = 0
    nfiltered = 0
    naligned = 0
    noutput = 0
    found_transcripts = {}
    nduplicates = 0
    # hash identifiers of all predictions written so far
    output = {}

    for line in sys.stdin:
        if line[0] == "#":
            continue

        entry = PredictionParser.PredictionParserEntry()

        entry.Read(line)

        ninput += 1
        is_positive = entry.mSbjctStrand == "+"

        is_error = False

        # check if query token is mappable: using the sequence map first
        # and the peptide filter second.
        if (query_map_old2new and entry.mQueryToken not in query_map_old2new) or \
                (peptides and entry.mQueryToken not in peptides):
            options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (
                entry.mPredictionId, entry.mQueryToken))
            nfiltered += 1
            continue

        new_sbjct_token = options.pattern_new % rx.search(
            entry.mSbjctToken).groups()[0]

        # Defaults, so that the diagnostics and the reprediction window
        # below are defined even if the prediction is not remapped.
        old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo
        f, t = old_f, old_t
        new_f, new_t = old_f, old_t

        #######################################################################
        # Map via alignments
        if entry.mSbjctToken in map_sbjcts:
            nmapped += 1
            # the expanded alignment is reused for consecutive predictions
            # on the same subject
            if last_sbjct_token != entry.mSbjctToken:
                old_from, old_ali, new_from, new_ali = map_sbjcts[
                    entry.mSbjctToken]
                map_a2b = alignlib_lite.makeAlignmentVector()
                alignlib_lite.AlignmentFormatExplicit(
                    int(old_from), old_ali,
                    int(new_from), new_ali).copy(map_a2b)

            last_sbjct_token = entry.mSbjctToken

            if options.loglevel >= 3:
                print("# %s" % str(entry))
                print("# %s" % str(map_sbjcts[entry.mSbjctToken]))
                sys.stdout.flush()

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            # convert to forward coordinates:
            if is_positive:
                f, t = old_f, old_t
                first_res, last_res = f + 1, t
            else:
                f, t = map_a2b.getRowTo() - old_f, map_a2b.getRowTo() - old_t
                first_res, last_res = f, t + 1

            # map first and last residues
            mfirst_res = map_a2b.mapRowToCol(first_res)
            mlast_res = map_a2b.mapRowToCol(last_res)

            if (mfirst_res == 0 and old_f != 0) or (mlast_res == 0 and old_t != map_a2b.getRowTo()):

                options.stderr.write("# mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" %
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      f, t))

                options.stderr.write(
                    "# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# %s\n" % str(entry))
                options.stderr.flush()
                nerrors_boundaries += 1
                is_error = True

                # get extended boundaries for alignment later on
                while mfirst_res == 0 and first_res > 1:
                    first_res -= 1
                    mfirst_res = map_a2b.mapRowToCol(first_res)
                while mlast_res == 0 and last_res < map_a2b.getRowTo():
                    last_res += 1
                    mlast_res = map_a2b.mapRowToCol(last_res)

            # convert to genomic coordinates
            # convert negative strand coordinates
            if is_positive:
                new_f = mfirst_res - 1
                new_t = mlast_res
            else:
                new_f = mfirst_res
                new_t = mlast_res - 1

                new_f = map_a2b.getColTo() - new_f
                new_t = map_a2b.getColTo() - new_t

            # Now map the alignment.
            try:
                MapAlignment(entry, map_a2b)

            except ValueError:
                options.stderr.write("# alignment mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" %
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      new_f, new_t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                options.stderr.write(
                    "# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.flush()
                nerrors_map += 1
                is_error = True

            # the boundaries mapped here must agree with the coordinates
            # found on the entry after MapAlignment()
            if new_f != entry.mSbjctGenomeFrom or new_t != entry.mSbjctGenomeTo:
                options.stderr.write("# mapping inconsistency for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i <> %i-%i\n" %
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      new_f, new_t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))

                nerrors_inconsistencies += 1
                is_error = True

        #######################################################################
        # Map via offsets
        if entry.mSbjctToken in breakpoints:

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            # convert to forward coordinates:
            if is_positive:
                f, t = old_f, old_t
            else:
                f, t = contig_sizes_old[
                    entry.mSbjctToken] - old_t, contig_sizes_old[entry.mSbjctToken] - old_f

            o1 = GetOffset(f,
                           breakpoints[entry.mSbjctToken],
                           endpoints[entry.mSbjctToken],
                           offsets[entry.mSbjctToken])
            o2 = GetOffset(t,
                           breakpoints[entry.mSbjctToken],
                           endpoints[entry.mSbjctToken],
                           offsets[entry.mSbjctToken])

            # different offsets for start and end: a breakpoint lies
            # within the prediction
            if o1 != o2:
                options.stderr.write("# break within gene %s\n" % str(entry))
                nerrors_map += 1
                is_error = True

            f += o1
            t += o2

            if not is_positive:
                f, t = contig_sizes_new[
                    entry.mSbjctToken] - t, contig_sizes_new[entry.mSbjctToken] - f

            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = f, t
            # mapped location, used as window for a possible reprediction
            new_f, new_t = f, t

            if entry.mSbjctGenomeFrom > entry.mSbjctGenomeTo:
                options.stderr.write(
                    "# mapping error: start after end %s\n" % str(entry))
                nerrors_map += 1
                is_error = True

        #######################################################################
        # do translation check, if genome is given
        if options.genome_file:
            genomic_sequence = Genomics.GetGenomicSequence(new_sbjct_token, entry.mSbjctStrand,
                                                           entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo,
                                                           options.genome_file,
                                                           loglevel=0)

            map_peptide2translation, translation = Genomics.Alignment2PeptideAlignment(
                entry.mMapPeptide2Genome, entry.mQueryFrom, 0, genomic_sequence)

            # compare translations ignoring all X characters
            if re.sub("X", "", translation) != re.sub("X", "", entry.mTranslation):
                options.stderr.write("# translation error for prediction %i on %s %s:%i-%i -> %i-%i <> %i-%i\n" %
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                if entry.mSbjctToken in map_sbjcts:
                    options.stderr.write(
                        "# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write(
                    "# old=%s\n# new=%s\n" % (entry.mTranslation, translation))
                options.stderr.write("# old=%s\n# new=%s\n" % (
                    entry.mAlignmentString, Genomics.Alignment2String(entry.mMapPeptide2Genome)))
                nerrors_translation += 1
                is_error = True

                if peptides and entry.mQueryToken in peptides:
                    naligned += 1

                    options.stdlog.write("# aligning: %s versus %s:%s: %i-%i\n" % (
                        entry.mQueryToken,
                        new_sbjct_token, entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))

                    # do a quick reprediction on the full subject sequence,
                    # restricted to the mapped location +/- 10 residues
                    genomic_sequence = Genomics.GetGenomicSequence(new_sbjct_token, entry.mSbjctStrand,
                                                                   0, 0,
                                                                   genome_file=options.genome_file,
                                                                   loglevel=0)
                    predictor.mLogLevel = 0

                    result = predictor(entry.mQueryToken, peptides[entry.mQueryToken],
                                       entry.mSbjctToken, genomic_sequence,
                                       "--exhaustive --subopt FALSE --score '%s' " % str(
                                           80),
                                       new_f - 10, new_t + 10)
                    prediction_id = entry.mPredictionId
                    if result:
                        entry = result[0]
                        entry.mPredictionId = prediction_id
                        nerrors_realigned += 1
            else:
                # mapping complained, but the translation is unchanged
                if is_error:
                    nerrors_inconsequential += 1

        entry.mSbjctToken = new_sbjct_token

        # map query tokens
        if query_map_old2new:
            query_tokens = query_map_old2new[entry.mQueryToken]
        else:
            query_tokens = (entry.mQueryToken,)

        if is_error:
            nerrors += 1
            if options.skip_errors:
                continue

        original_prediction_id = entry.mPredictionId

        for query_token in query_tokens:

            entry.mQueryToken = query_token

            # the hash identifier is computed with a neutral prediction id,
            # so that identical predictions get identical identifiers
            entry.mPredictionId = 0

            hid = Genomics.GetHID(str(entry))
            if hid in output:
                nduplicates += 1
                continue
            output[hid] = 1

            noutput += 1
            if options.renumber:
                entry.mPredictionId = noutput
            else:
                entry.mPredictionId = original_prediction_id

            options.stdout.write(str(entry) + "\n")
            options.stdout.flush()
            found_transcripts[entry.mQueryToken] = 1

    # write out found transcripts and genes
    nmissed_transcripts = 0
    missed_transcripts = []
    found_genes = {}
    if peptides:
        for x in peptides:
            if x not in found_transcripts:
                nmissed_transcripts += 1
                missed_transcripts.append(x)
            elif x in map_transcript2gene:
                # transcripts without gene information can not
                # contribute to the gene counts
                found_genes[map_transcript2gene[x]] = 1

    missed_genes = {}
    nmissed_genes = 0
    if map_transcript2gene:

        for transcript in missed_transcripts:
            g = map_transcript2gene.get(transcript)
            if g is not None and g not in found_genes:
                missed_genes[g] = 1
        nmissed_genes = len(missed_genes)

    if options.write_missed:
        with open(options.write_missed, "w") as outfile:
            for x in missed_transcripts:
                if x in unmapped:
                    status = "unmapped"
                else:
                    status = "mapped"
                outfile.write("%s\t%s\t%s\n" % ("transcript", x, status))
            for x in missed_genes:
                status = "unknown"
                outfile.write("%s\t%s\t%s\n" % ("gene", x, status))

    options.stdlog.write("# input=%i, output=%i, filtered=%i, nduplicates=%i, mapped=%i, errors=%i\n" % (
        ninput, noutput, nfiltered, nduplicates, nmapped, nerrors))
    options.stdlog.write("# errors: inconsequental=%i, boundaries=%i, mapping=%i, inconsistencies=%i, translation=%i, realigned=%i\n" % (
        nerrors_inconsequential, nerrors_boundaries, nerrors_map, nerrors_inconsistencies, nerrors_translation, nerrors_realigned))
    options.stdlog.write("# peptides: input=%i, found=%i, missed=%i, found_genes=%i, missed_genes=%i\n" % (
        len(peptides), len(found_transcripts), nmissed_transcripts, len(found_genes), nmissed_genes))

    E.Stop()
Esempio n. 4
0
	nstopcodons, 
	pidentity, 
	psimilarity, 
	sequence, 
	sbjct_genome_from, 
	sbjct_genome_to, 
	map_query2genome
    FROM %s AS p 
    WHERE p.sbjct_token = '%s' AND
    p.sbjct_strand = '%s' AND 
    OVERLAP( %i, %i, p.sbjct_genome_from, sbjct_genome_to) > 0 
    """

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep)
    map_reference2target = alignlib_lite.makeAlignmentVector()
    assignment_id = 0

    for line in cr.fetchall():

        reference = PredictionParser.PredictionParserEntry()
        reference.FillFromTable(line)

        ct = dbhandle.cursor()
        ct.execute(statement %
                   (param_tablename_predictions_target, reference.mSbjctToken,
                    reference.mSbjctStrand, reference.mSbjctGenomeFrom,
                    reference.mSbjctGenomeTo))

        reference_exons = Exons.Alignment2Exons(reference.mMapPeptide2Genome,
                                                0, reference.mSbjctFrom)
Esempio n. 5
0
 def Expand(self):
     """Build ``self.mMapOld2New`` from the stored alignment strings.

     A new alignment vector is created and filled from the emissions
     format given by ``mOldFrom``/``mOldAli`` and ``mNewFrom``/``mNewAli``.
     """
     expanded = alignlib_lite.makeAlignmentVector()
     self.mMapOld2New = expanded
     compressed = alignlib_lite.AlignmentFormatEmissions(
         self.mOldFrom, self.mOldAli, self.mNewFrom, self.mNewAli)
     compressed.copy(expanded)
Esempio n. 6
0
 def Expand(self):
     """Expand the stored old/new alignment into ``self.mMapOld2New``.

     The attribute is replaced by a fresh alignment vector, into which
     the alignment in emissions format (start and alignment string for
     both old and new) is copied.
     """
     self.mMapOld2New = alignlib_lite.makeAlignmentVector()
     old_part = (self.mOldFrom, self.mOldAli)
     new_part = (self.mNewFrom, self.mNewAli)
     alignlib_lite.AlignmentFormatEmissions(
         *(old_part + new_part)).copy(self.mMapOld2New)
Esempio n. 7
0
def IsParalogLink(link, cds1, cds2):
    """Check whether *link* has to be rejected as a paralog link.

    The alignment stored in *link* is used to map the exon boundaries
    of the query (*cds1*) onto the sbjct and to compare them with the
    exon boundaries of the sbjct (*cds2*). The first exon of each
    transcript is skipped.

    Arguments
        link: provides ``mQueryFrom``, ``mQueryAli``, ``mSbjctFrom``,
            ``mSbjctAli``, ``mQueryLength`` and ``mSbjctLength``.
        cds1, cds2: exon lists of query and sbjct; each exon provides
            ``mPeptideFrom``.

    Returns a tuple ``(is_paralog, reason)``:
        ``(True, "different exon boundaries")`` if too few boundaries agree,
        ``(True, "low coverage")`` if at least one transcript has a single
        exon and the smaller coverage is below ``param_min_coverage``,
        ``(False, None)`` otherwise.

    Raises ValueError if the aligned region is longer than the sequence
    length recorded in *link*.
    """

    map_a2b = alignlib_lite.makeAlignmentVector()
    alignlib_lite.AlignmentFormatEmissions(link.mQueryFrom, link.mQueryAli,
                                           link.mSbjctFrom,
                                           link.mSbjctAli).copy(map_a2b)

    aligned_rows = map_a2b.getRowTo() - map_a2b.getRowFrom() + 1
    aligned_cols = map_a2b.getColTo() - map_a2b.getColFrom() + 1

    if link.mQueryLength < aligned_rows or link.mSbjctLength < aligned_cols:
        print("ERRONEOUS LINK: %s" % str(link))
        # a proper exception class: string exceptions are not supported
        # by Python 2.6 and later
        raise ValueError("length discrepancy")

    coverage_a = 100.0 * aligned_rows / link.mQueryLength
    coverage_b = 100.0 * aligned_cols / link.mSbjctLength

    def map_or_next(alignment, row):
        """Map *row* to its column; an unaligned row maps to the column
        of the next aligned row. Returns 0 if there is none."""
        if row < alignment.getRowFrom():
            return 0
        while row <= alignment.getRowTo():
            col = alignment.mapRowToCol(row)
            if col:
                return col
            row += 1
        return 0

    # check exon boundaries, look at starts, skip first exon.
    # ``//`` keeps the integer division of the original Python 2 code.
    mapped_boundaries = UniquifyList(
        [map_or_next(map_a2b, exon.mPeptideFrom // 3 + 1)
         for exon in cds1[1:]])
    reference_boundaries = UniquifyList(
        [exon.mPeptideFrom // 3 + 1 for exon in cds2[1:]])

    nmin = min(len(mapped_boundaries), len(reference_boundaries))
    single_exon = len(cds1) == 1 or len(cds2) == 1

    # look up each boundary of the shorter list in the longer list
    if len(mapped_boundaries) < len(reference_boundaries):
        mless, mmore = mapped_boundaries, reference_boundaries
    else:
        mless, mmore = reference_boundaries, mapped_boundaries

    # check if exon boundaries are ok
    nmissed = 0
    nfound = 0
    for boundary in mless:
        if any(abs(boundary - other) < param_boundaries_max_slippage
               for other in mmore):
            nfound += 1
        else:
            nmissed += 1

    # set is_ok dependent on exon boundaries
    # in single exon cases, require a check of coverage
    is_ok = False
    check_coverage = False
    if single_exon:
        is_ok = True
        check_coverage = True
    elif nmin == 1:
        is_ok = nmissed == 0
    elif nmin == 2:
        is_ok = nmissed <= 1
    elif nmin > 2:
        is_ok = nfound >= 2

    cc = min(coverage_a, coverage_b)

    if param_loglevel >= 3:
        # space separated items, as a Python 2 print statement writes them
        print(" ".join([str(item) for item in (
            "# nquery=", len(cds1), "nsbjct=", len(cds2),
            "nmin=", nmin, "nmissed=", nmissed, "nfound=", nfound,
            "is_ok=", is_ok, "check_cov=", check_coverage,
            "min_cov=", cc, coverage_a, coverage_b,
            "mapped=", mapped_boundaries,
            "reference=", reference_boundaries)]))

    if not is_ok:
        return True, "different exon boundaries"

    if check_coverage and cc < param_min_coverage:
        return True, "low coverage"

    return False, None
Esempio n. 8
0
def FilterConflicts(old_predictions, new_predictions, removed_predictions,
                    min_overlap, peptide_sequences):
    """remove conflicts.

    Remove overlapping entries between different queries.

    Only remove those sequences, which are alignable.

    If they are alignable, take the sequence with the highest score and highest coverage.
    (Take both, if score and coverage are not correlated.)

    Arguments
        old_predictions: predictions to filter; sorted in place by
            sbjct token, strand and genomic coordinates.
        new_predictions: container, to which kept predictions are appended.
        removed_predictions: container, to which removed predictions are
            appended.
        min_overlap: not used by this function; the overlap threshold is
            read from ``param_max_percent_overlap``.
        peptide_sequences: map of query token to peptide sequence. If
            empty, overlapping predictions are in conflict without
            checking if their queries are alignable.

    Returns the number of predictions appended to *new_predictions*.
    """
    ##########################################################################
    # sort predictions by genomic region
    if isinstance(old_predictions, PredictionFile.PredictionFile):
        old_predictions.sort(
            ('mSbjctToken', 'mSbjctStrand', 'mSbjctGenomeFrom', 'mSbjctGenomeTo'))
    else:
        old_predictions.sort(key=lambda p: (p.mSbjctToken, p.mSbjctStrand,
                                            p.mSbjctGenomeFrom,
                                            p.mSbjctGenomeTo))

    def is_negligible(delta, reference):
        """True, if *delta* is small relative to *reference*."""
        reference = float(reference)
        # no relative difference can be computed for a reference of 0;
        # keep the difference instead of failing with a division by zero
        if reference == 0:
            return False
        return abs(delta) / reference < param_conflicts_min_difference

    ##########################################################################
    # filter predictions and resolve conflicts based on genomic overlap
    # deleted segments are put in a temporary storage space.
    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep)
    result = alignlib_lite.makeAlignmentVector()
    # cache of alignment scores, keyed by the sorted pair of query tokens
    alignments = {}
    noverlaps = 0
    nredundants = 0

    nnew = 0
    last_prediction = None
    last_query_gene = None

    for this_prediction in old_predictions:
        # query tokens of four whitespace separated fields carry the gene
        # in the third field; all other tokens have no gene
        fields = re.split(r"\s+", this_prediction.mQueryToken)
        if len(fields) == 4:
            this_query_gene = fields[2]
        else:
            this_query_gene = None

        if last_prediction is None:
            last_prediction = this_prediction
            last_query_gene = this_query_gene
            continue

        overlap = min(last_prediction.mSbjctGenomeTo,
                      this_prediction.mSbjctGenomeTo) -\
            max(last_prediction.mSbjctGenomeFrom,
                this_prediction.mSbjctGenomeFrom)
        union = max(last_prediction.mSbjctGenomeTo,
                    this_prediction.mSbjctGenomeTo) -\
            min(last_prediction.mSbjctGenomeFrom,
                this_prediction.mSbjctGenomeFrom)

        # coordinates are only comparable on the same sbjct and strand
        same_region = \
            last_prediction.mSbjctToken == this_prediction.mSbjctToken and \
            last_prediction.mSbjctStrand == this_prediction.mSbjctStrand

        is_overlap = False

        # resolve overlap between different genes
        if same_region and overlap > 0 and \
                (last_query_gene != this_query_gene or
                 last_query_gene is None):

            noverlaps += 1
            # integer percentage, as computed by Python 2 division
            relative_overlap = 100 * overlap // union

            # Start conflict resolution, if overlap is above threshold.
            # Keep higher scoring segment.
            #
            # Check if queries are homologous.
            if relative_overlap >= param_max_percent_overlap:

                if peptide_sequences:
                    key = "%s-%s" % tuple(sorted(
                        (last_prediction.mQueryToken,
                         this_prediction.mQueryToken)))

                    if key not in alignments:
                        result.clear()
                        alignator.align(
                            result,
                            alignlib_lite.makeSequence(
                                peptide_sequences[this_prediction.mQueryToken]),
                            alignlib_lite.makeSequence(
                                peptide_sequences[last_prediction.mQueryToken]))
                        alignments[key] = result.getScore()
                        if alignments[key] >= param_min_score_overlap:
                            nredundants += 1

                    is_overlap = alignments[key] >= param_min_score_overlap
                else:
                    is_overlap = True

        if is_overlap:
            # take best prediction. If difference is very small, set
            # difference to 0 (difference does not matter). In this case,
            # the first prediction is taken.
            d1 = last_prediction.mQueryCoverage - \
                this_prediction.mQueryCoverage
            if is_negligible(d1, last_prediction.mQueryCoverage):
                d1 = 0
            d2 = last_prediction.score - this_prediction.score
            if is_negligible(d2, this_prediction.score):
                d2 = 0

            if d1 >= 0 and d2 >= 0:
                # last prediction wins
                if param_loglevel >= 2:
                    print("# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" %
                          (last_prediction.mPredictionId,
                           last_prediction.mQueryToken,
                           last_prediction.mSbjctGenomeFrom,
                           overlap, relative_overlap,
                           str(this_prediction)))
                if param_benchmarks:
                    if CheckBenchmark(this_prediction, last_prediction):
                        print("# BENCHMARK KEPT with overlap=%i(%5.2f): %s" %
                              (overlap, relative_overlap,
                               str(last_prediction)))

                removed_predictions.append(this_prediction)
                continue
            elif d1 <= 0 and d2 <= 0:
                # this prediction wins and becomes the new reference
                if param_loglevel >= 2:
                    print("# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" %
                          (this_prediction.mPredictionId,
                           this_prediction.mQueryToken,
                           this_prediction.mSbjctGenomeFrom,
                           overlap, relative_overlap,
                           str(last_prediction)))
                if param_benchmarks:
                    if CheckBenchmark(last_prediction, this_prediction):
                        print("# BENCHMARK KEPT with overlap=%i(%5.2f): %s" %
                              (overlap, relative_overlap,
                               str(this_prediction)))
                removed_predictions.append(last_prediction)
                last_prediction = this_prediction
                last_query_gene = this_query_gene
                continue
            else:
                if param_loglevel >= 2:
                    print("# CONFLICT: non-correlated score/coverage. Keeping both %i(%s-%i) (%5.2f/%i/%i) and %i(%s-%i) (%5.2f/%i/%i)" %
                          (this_prediction.mPredictionId,
                           this_prediction.mQueryToken, this_prediction.mSbjctGenomeFrom,
                           this_prediction.score, this_prediction.mQueryCoverage,
                           this_prediction.mPercentIdentity,
                           last_prediction.mPredictionId,
                           last_prediction.mQueryToken, last_prediction.mSbjctGenomeFrom,
                           last_prediction.score, last_prediction.mQueryCoverage,
                           last_prediction.mPercentIdentity))

        # no conflict (or both are kept): the last prediction is final
        new_predictions.append(last_prediction)
        nnew += 1
        last_query_gene = this_query_gene
        last_prediction = this_prediction

    # the last prediction is still pending; there is none for empty input
    if last_prediction is not None:
        new_predictions.append(last_prediction)
        nnew += 1

    if param_loglevel >= 1:
        print("# calculated %i alignments for %i potential conflicts (%i above threshold)" %
              (len(alignments), noverlaps, nredundants))

    return nnew
def GetOrthologTranscripts(transcripts1, peptides1, cds1,
                           transcripts2, peptides2, cds2):
    """sort out ortholog relationships between
    transcripts of orthologous genes.

    Orthologs have:
        the same number of exons        
        compatible intron/exon boundaries

    For the remaining transcript pairs, take reciprocal bet hits.

    I see the following:
    0: 0(100%), 1: 0(94%), 2: 0,1(100%)
    0: 0(100%), 1: 0,1,2(100%)

    Selecting 1-0 first, would result in a suboptimal match, because one transcript
    is longer than the other, while matching up 0-0 and 2-1 would be better.

    Objective function: it is the maximal matching/assignment problem. Use greedy
    implementation instead. Assign as much as possible according to descending weights.
    """

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0)

    # for long sequence: use dot alignment with tuple size of three
    dottor = alignlib_lite.makeAlignatorTuples(3)
    alignator_dots = alignlib_lite.makeAlignatorDotsSquared(
        param_gop, param_gep, dottor)

    seqs1 = map(lambda x: alignlib_lite.makeSequence(
        peptides1[x[0]]), transcripts1)
    seqs2 = map(lambda x: alignlib_lite.makeSequence(
        peptides2[x[0]]), transcripts2)

    if param_loglevel >= 4:
        print "# building sequence 1"
    for i in range(len(seqs1)):
        if not cds1.has_key(transcripts1[i][0]):
            if param_loglevel >= 4:
                print "# %s not found" % transcripts1[i][0]

    if param_loglevel >= 4:
        print "# building sequence 2"

    for i in range(len(seqs2)):
        if not cds2.has_key(transcripts2[i][0]):
            if param_loglevel >= 4:
                print "# %s not found" % transcripts1[i][0]

    if param_loglevel >= 4:
        print "# all-vs-all alignment"

    # do all versus all alignment
    alis1 = []
    alis2 = []
    for i in range(len(seqs1)):
        alis1.append([])
    for i in range(len(seqs2)):
        alis2.append([])

    if param_loglevel >= 3:

        print "#################################"

        for i in range(len(seqs1)):
            for cd in cds1[transcripts1[i][0]]:
                print "#", str(cd)
        print "# versus"
        for i in range(len(seqs2)):
            for cd in cds2[transcripts2[i][0]]:
                print "#", str(cd)
        sys.stdout.flush()

    weights = {}
    for i in range(len(seqs1)):
        prediction_id1, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1 = transcripts1[
            i]

        for j in range(len(seqs2)):
            prediction_id2, sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2 = transcripts2[
                j]
            map_a2b = alignlib_lite.makeAlignmentVector()

            m = seqs1[i].getLength() * seqs2[j].getLength()

            if param_loglevel >= 3:
                print "# Starting alignment of pair (%i,%i) of lengths %s:%i and %s:%i" %\
                      (i, j, prediction_id1, seqs1[
                       i].getLength(), prediction_id2, seqs2[j].getLength())
                sys.stdout.flush()

            if m > param_max_matrix_size:
                # switch to tuple alignment if sequences are too large
                if param_loglevel >= 2:
                    print "# WARNING: sequences are of length %i and %i: switching to dot alignment." % (seqs1[i].getLength(), seqs2[j].getLength())
                    sys.stdout.flush()

                alignator_dots.align(map_a2b, seqs1[i], seqs2[j])
            else:
                alignator.align(map_a2b, seqs1[i], seqs2[j])

            coverage_a = 100.0 * \
                (map_a2b.getRowTo() - map_a2b.getRowFrom() + 1) / \
                seqs1[i].getLength()
            coverage_b = 100.0 * \
                (map_a2b.getColTo() - map_a2b.getColFrom() + 1) / \
                seqs2[j].getLength()

            # get copy of cds, but only those overlapping with alignment
            c1 = Exons.GetExonsRange(cds1[prediction_id1],
                                     (map_a2b.getRowFrom() - 1) * 3,
                                     (map_a2b.getRowTo()) * 3 + 1,
                                     full=False,
                                     min_overlap=param_min_alignment_exon_overlap,
                                     min_exon_size=param_min_exon_size)
            c2 = Exons.GetExonsRange(cds2[prediction_id2],
                                     (map_a2b.getColFrom() - 1) * 3,
                                     (map_a2b.getColTo()) * 3 + 1,
                                     full=False,
                                     min_overlap=param_min_alignment_exon_overlap,
                                     min_exon_size=param_min_exon_size)

            # check exon boundaries, look at starts, skip first exon
            def MyMap(a, x):
                while x <= a.getRowTo():
                    c = a.mapRowToCol(x)
                    if c:
                        return c
                    x += 1
                else:
                    return 0

            mapped_boundaries = map(
                lambda x: MyMap(map_a2b, x.mPeptideFrom / 3 + 1), c1[1:])
            mapped_boundaries.sort()
            reference_boundaries = map(
                lambda x: x.mPeptideFrom / 3 + 1, c2[1:])
            reference_boundaries.sort()

            nmissed_cmp2ref = Exons.CountMissedBoundaries(
                mapped_boundaries, reference_boundaries, param_boundaries_max_slippage)
            nmissed_ref2cmp = Exons.CountMissedBoundaries(
                reference_boundaries, mapped_boundaries, param_boundaries_max_slippage)

            min_nmissed = min(nmissed_cmp2ref, nmissed_ref2cmp)

            # set is_ok for the whole thing
            # no intron: is ok
            is_ok = 0
            if (len(c1) == 1 and len(c2) == 1):
                is_ok = 1
            else:
                # allow for missed boundaries, if param_boundaries_allow_missed
                # > 0
                if min_nmissed == 0:
                    is_ok = 1
                else:
                    if param_boundaries_allow_missed and \
                            len(mapped_boundaries) >= param_boundaries_allow_missed and \
                            min_nmissed <= param_boundaries_max_missed:
                        is_ok = 1

            cc = min(coverage_a, coverage_b)
            if cc >= param_min_coverage:
                is_ok_coverage = 1
            else:
                is_ok_coverage = 0

            # check for missing introns
            is_ok_exons = 1
            if abs(len(c1) - len(c2)) != 0:
                if param_missing_max_missing:
                    if ((abs(len(c1) - len(c2)) > param_missing_max_missing) or
                            (min(len(c1), len(c2)) < param_missing_min_present)):
                        is_ok_exons = 0
                else:
                    is_ok_exons = 0

            if param_loglevel >= 3:
                print "# i=", i, "li=", len(c1), "j=", j, "lj=", len(c2), \
                      "boundaries_ok=", is_ok, \
                      "nexons_ok=", is_ok_exons, \
                      "missed_c2r=", nmissed_cmp2ref, \
                      "missed_r2c=", nmissed_ref2cmp, \
                      "min_cov=", cc, \
                      "mapped=", mapped_boundaries, \
                      "reference=", reference_boundaries

                print "#", string.join(map(str, (alignlib_lite.AlignmentFormatEmissions(map_a2b),
                                                 map_a2b.getNumGaps(), coverage_a, coverage_b)), "\t")
                sys.stdout.flush()

            # dump out pairs
            for method in param_write_pairs:
                if method == "all":
                    print string.join(map(str, (
                        "pair", method,
                        prediction_id1,
                        prediction_id2,
                        sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1, seqs1[
                            i].getLength(),
                        sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2, seqs2[
                            j].getLength(),
                        map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali,
                        map_a2b.getColFrom(), map_a2b.getColTo(), col_ali,
                        map_a2b.getNumGaps(), coverage_a, coverage_b,
                        nmissed_cmp2ref, mapped_boundaries,
                        nmissed_ref2cmp, reference_boundaries,
                        i, j, len(c1), len(c2), cc, is_ok, is_ok_exons, is_ok_coverage)), "\t")
                elif method == "alignment":
                    print string.join(map(str, (
                        "pair", method,
                        prediction_id1, prediction_id2,
                        map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali,
                        map_a2b.getColFrom(), map_a2b.getColTo(), col_ali,
                        map_a2b.getNumGaps(), coverage_a, coverage_b)), "\t")
                elif method == "location":
                    print string.join(map(str, (
                        "pair", method,
                        prediction_id1,
                        prediction_id2,
                        sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1, seqs1[
                            i].getLength(),
                        sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2, seqs2[j].getLength())), "\t")
            if not is_ok_exons:
                if param_loglevel >= 4:
                    print "# rejected %i and %i: too many exons difference." % (i, j)
                continue

            if param_check_exon_boundaries:
                if not is_ok:
                    continue

            if cc < param_min_coverage:
                continue

            if not weights.has_key(cc):
                weights[cc] = []

            alis1[i].append((coverage_a, j))
            alis2[j].append((coverage_b, i))

            weights[cc].append((i, j, map_a2b))

    # sort out alignments
    ww = weights.keys()
    ww.sort()
    ww.reverse()

    pairs = []
    assigned1 = {}
    assigned2 = {}

    if param_loglevel >= 3:
        print "# alis1=", alis1
        print "# alis2=", alis2
        print "# --------------------------------------"

    for w in ww:
        for i, j, map_a2b in weights[w]:
            if not assigned1.has_key(i) and not assigned2.has_key(j):
                pairs.append((transcripts1[i], transcripts2[j], w, map_a2b))
                assigned1[i] = 1
                assigned2[j] = 1
        if len(assigned1) == len(transcripts1):
            break
        if len(assigned2) == len(transcripts2):
            break

    return pairs
Esempio n. 10
0
def WriteExons(token1, peptide1, cds1, transcript1,
               token2, peptide2, cds2, transcript2,
               peptide_map_a2b):

    if param_loglevel >= 3:
        for cd in cds1:
            print "#", str(cd)
        for cd in cds2:
            print "#", str(cd)
        print "# peptide_map_a2b", str(alignlib_lite.AlignmentFormatExplicit(peptide_map_a2b))
        sys.stdout.flush()

    dna_map_a2b = Genomics.AlignmentProtein2CDNA(peptide_map_a2b,
                                                 cds1, cds2)

    if len(cds1) != len(cds2):
        if param_loglevel >= 4:
            print ""  # WARNING: different number of exons!"

    seq1 = alignlib_lite.makeSequence(transcript1)
    seq2 = alignlib_lite.makeSequence(transcript2)
    tmp_map_a2b = alignlib_lite.makeAlignmentVector()

    dialign = WrapperDialign.Dialign("-n")
    dialignlgs = WrapperDialign.Dialign("-n -it -thr 2 -lmax 30 -smin 8")
    dba = WrapperDBA.DBA()
    #clustal = WrapperClustal.Clustal()

    matrix, gop, gep = global_substitution_matrix
    alignator_nw = alignlib_lite.makeAlignatorDPFullDP(
        alignlib_lite.ALIGNMENT_GLOBAL, gop, gep, matrix)
    alignator_sw = alignlib_lite.makeAlignatorDPFullDP(
        alignlib_lite.ALIGNMENT_LOCAL, gop, gep, matrix)

    # concatenated alignments for exons:
    # 1: only the common parts
    ali_common1 = ""
    ali_common2 = ""

    e1, e2 = 0, 0
    while cds1[e1].mGenomeTo <= dna_map_a2b.getRowFrom():
        e1 += 1
    while cds2[e2].mGenomeTo <= dna_map_a2b.getColFrom():
        e2 += 1

    nskipped, nerrors = 0, 0

    if param_loglevel >= 5:
        nmapped = 0
        for x in range(dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo() + 1):
            if dna_map_a2b.mapRowToCol(x) >= 0:
                nmapped += 1
        print "# nmapped=", nmapped
        print str(alignlib_lite.AlignmentFormatEmissions(dna_map_a2b))

    # declare alignments used
    map_intron_a2b = alignlib_lite.makeAlignmentVector()

    result = Exons.CompareGeneStructures(
        cds1, cds2, map_cmp2ref=peptide_map_a2b)

    if param_loglevel >= 2:
        print result.Pretty("#")

    nskipped_exons, nskipped_introns = 0, 0

    last_e1, last_e2 = None, None

    for link in result.mEquivalences:

        if link.mCoverage <= param_min_exon_coverage:
            nskipped_exons += 1
            continue

        e1, e2 = link.mId1, link.mId2

        c1 = cds1[e1]
        c2 = cds2[e2]
        exon_fragment1 = transcript1[c1.mGenomeFrom:c1.mGenomeTo]
        exon_fragment2 = transcript2[c2.mGenomeFrom:c2.mGenomeTo]

        #######################################################################
        # write unaligned exons
        if param_write_exons:
            pair = AlignedPairs.UnalignedPair()

            pair.mCategory = "exon"
            pair.mToken1 = token1
            pair.mId1 = e1 + 1
            pair.mNum1 = len(cds1)
            pair.mLen1 = len(exon_fragment1)
            pair.mSequence1 = exon_fragment1
            pair.mToken2 = token2
            pair.mId2 = e2 + 1
            pair.mNum2 = len(cds2)
            pair.mLen2 = len(exon_fragment2)
            pair.mSequence2 = exon_fragment2
            pair.mFrom1, pair.mTo1 = c1.mGenomeFrom, c1.mGenomeTo,
            pair.mFrom2, pair.mTo2 = c2.mGenomeFrom, c2.mGenomeTo,

            print str(pair)
            sys.stdout.flush()

        #######################################################################
        # build alignment for overlap of both exons
# tmp_map_a2b.clear()
# alignlib_lite.copyAlignment( tmp_map_a2b, dna_map_a2b,
# c1.mGenomeFrom + 1, c1.mGenomeTo )

# if param_loglevel >= 5:
# print "# alignment: %i-%i" % (c1.mGenomeFrom + 1, c1.mGenomeTo)
# for x in alignlib_lite.writeAlignmentTable( tmp_map_a2b ).split("\n"):
# print "#", x
# if tmp_map_a2b.getLength() == 0:
# if param_loglevel >= 1:
# print "# WARNING: empty alignment between exon %i (from %i to %i) and exon %i" % \
##                       (e1,c1.mGenomeFrom + 1, c1.mGenomeTo, e2)
# print "## peptide_map_a2b", peptide_map_a2b.getRowFrom(), peptide_map_a2b.getRowTo(),\
##                       peptide_map_a2b.getColFrom(), peptide_map_a2b.getColTo(), \
# Alignlib.writeAlignmentCompressed(peptide_map_a2b)
# print "## dna_map_a2b", dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo(),\
##                       dna_map_a2b.getColFrom(), dna_map_a2b.getColTo(), \
# Alignlib.writeAlignmentCompressed(dna_map_a2b)
# for cd in cds1: print "##", str(cd)
# for cd in cds2: print "##", str(cd)
##             nerrors += 1
# continue
##         data = map(lambda x: x.split("\t"), alignlib_lite.writePairAlignment( seq1, seq2, tmp_map_a2b  ).split("\n"))
# if "caligned" in param_write_exons :
# print "exon\tcaligned\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, e1,
##                                                                                token2, e2,
##                                                                                data[0][0], data[0][2],
##                                                                                data[1][0], data[1][2],
# data[0][1], data[1][1] )
##         ali_common1 += data[0][1]
##         ali_common2 += data[1][1]
        #######################################################################
        # write alignment of introns for orthologous introns
        # orthologous introns are between orthologous exons
        if param_write_introns:

            if last_e1 is not None:
                if e1 - last_e1 != 1 or e2 - last_e2 != 1:
                    nskipped_introns += 1
                else:
                    pair = AlignedPairs.UnalignedPair()

                    intron_from1 = cds1[e1 - 1].mGenomeTo
                    intron_to1 = cds1[e1].mGenomeFrom
                    intron_from2 = cds2[e2 - 1].mGenomeTo
                    intron_to2 = cds2[e2].mGenomeFrom

                    intron_fragment1 = transcript1[intron_from1:intron_to1]
                    intron_fragment2 = transcript2[intron_from2:intron_to2]

                    if len(intron_fragment1) == 0 or len(intron_fragment2) == 0:
                        print "## ERROR: empty intron fragments: %i-%i out of %i and %i-%i out of %i." %\
                              (intron_from1, intron_to1, len(transcript1),
                               intron_from2, intron_to2, len(transcript2))
                        continue

                    pair.mCategory = "intron"
                    pair.mToken1 = token1
                    pair.mId1 = e1 + 1
                    pair.mNum1 = len(cds1) - 1
                    pair.mLen1 = len(intron_fragment1)
                    pair.mFrom1 = intron_from1
                    pair.mTo1 = intron_to1
                    pair.mSequence1 = intron_fragment1
                    pair.mToken2 = token2
                    pair.mId2 = e2 + 1
                    pair.mNum1 = len(cds2) - 1
                    pair.mLen2 = len(intron_fragment2)
                    pair.mFrom2 = intron_from2
                    pair.mTo2 = intron_to2
                    pair.mSequence2 = intron_fragment2

                    if (param_min_intron_length and len(intron_fragment1) < param_min_intron_length) or \
                            (param_min_intron_length and len(intron_fragment2) < param_min_intron_length) or \
                            (param_max_intron_length and len(intron_fragment1) > param_max_intron_length) or \
                            (param_max_intron_length and len(intron_fragment2) > param_max_intron_length):
                        if param_loglevel >= 1:
                            print "# skipped: fragment lengths out of bounds for: %s\t%s\t%s\t%s\t%i\t%i" %\
                                  (token1, e1, token2, e2,
                                   len(intron_fragment1),
                                   len(intron_fragment2))
                            sys.stdout.flush()
                            nskipped += 1

                    print str(pair)

# else:
##                         anchored_from1 = intron_from1 - param_extend_introns
##                         anchored_to1 = intron_to1 + param_extend_introns
##                         anchored_from2 = intron_from2 - param_extend_introns
##                         anchored_to2 = intron_to2 + param_extend_introns

##                         anchored_fragment1 = transcript1[anchored_from1:anchored_to1]
##                         anchored_fragment2 = transcript2[anchored_from2:anchored_to2]

# for method in param_write_introns:

# if param_loglevel >= 2:
# print "## aligning with method %s" % method
# sys.stdout.flush

# map_intron_a2b.clear()

# if method == "unaligned":

##                                 from1, to1, ali1, from2, to2, ali2 = 0, 0, intron_fragment1, 0, 0, intron_fragment2

# elif method in ("dialigned", "dbaligned", "clusaligned", "dialignedlgs"):

##                                 tmp_intron_a2b = alignlib_lite.makeAlignmentVector()

# if param_loglevel >= 1:
# print "# aligning with method %s two fragments of length %i and %i" % (method,
# len(anchored_fragment1),
# len(anchored_fragment2))
# sys.stdout.flush()

# if method == "dialigned":
##                                     result = dialign.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b )
# elif method == "dialignedlgs":
##                                     result = dialignlgs.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b )
# elif method == "dbaligned":
##                                     result = dba.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b )
# elif method == "clusaligned":
##                                     result = clustal.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b )
# if not result or result.getLength() == 0:
# if param_loglevel >= 1:
# print "# Error: empty intron alignment"
# sys.stdout.flush()
##                                     nerrors += 1
# continue
##                                 tmp_intron_a2b.moveAlignment( anchored_from1, anchored_from2 )
# alignlib_lite.copyAlignment( map_intron_a2b, tmp_intron_a2b,
##                                                        intron_from1 + 1, intron_to1,
# intron_from2 + 1, intron_to2 )
# elif method == "nwaligned":
##                                 seq1.useSegment( cds1[e1-1].mGenomeTo + 1, cds1[e1].mGenomeFrom )
##                                 seq2.useSegment( cds2[e2-1].mGenomeTo + 1, cds2[e2].mGenomeFrom )
##                                 alignator_nw.Align( seq1, seq2, map_intron_a2b )
# seq1.useFullLength()
# seq2.useFullLength()
# elif method == "swaligned":
##                                 seq1.useSegment( cds1[e1-1].mGenomeTo + 1, cds1[e1].mGenomeFrom )
##                                 seq2.useSegment( cds2[e2-1].mGenomeTo + 1, cds2[e2].mGenomeFrom )
##                                 alignlib_lite.performIterativeAlignment( map_intron_a2b, seq1, seq2, alignator_sw, param_min_score_sw )
# seq1.useFullLength()
# seq2.useFullLength()
# else:
##                                 raise "unknown method %s" % method
# if map_intron_a2b.getLength() > 0:
# if param_compress:
##                                     from1, to1 = map_intron_a2b.getRowFrom(), map_intron_a2b.getRowTo()
##                                     from2, to2 = map_intron_a2b.getColFrom(), map_intron_a2b.getColTo()
##                                     ali1, ali2 = Alignlib.writeAlignmentCompressed( map_intron_a2b )
# else:
# data = map(lambda x: x.split("\t"),
# alignlib_lite.writePairAlignment( seq1, seq2, map_intron_a2b  ).split("\n"))
# if len(data) < 2:
##                                         data=[ ( 0, "", 0), (0, "", 0)]
##                                     from1, ali1, to1 = data[0]
##                                     from2, ali2, to2 = data[1]
# print string.join(map(str, ("intron",
# method,
##                                                         token1, e1, len(cds1) - 1, len(intron_fragment1),
##                                                         token2, e2, len(cds2) - 1, len(intron_fragment2),
# map_intron_a2b.getNumGaps(),
# map_intron_a2b.getLength(),
##                                                         map_intron_a2b.getLength() - map_intron_a2b.getNumGaps(),
##                                                         from1, to1, ali1,
##                                                         from2, to2, ali2,
##                                                         intron_from1, intron_to1,
# intron_from2, intron_to2)), "\t")
# sys.stdout.flush()
        last_e1, last_e2 = e1, e2

    ##########################################################################
    # write concatenated exons
# for method in param_write_exons:
# if method == "common":
# print "exon\tcommon\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, 0,
##                                                                            token2, 0,
##                                                                            0, 0,
##                                                                            0, 0,
# ali_common1, ali_common2 )
# elif method == "exons":
# Write full alignment without gaps.
# This will not care about exon boundaries and gaps.
# data = map(lambda x: x.split("\t"),
# alignlib_lite.writePairAlignment( seq1, seq2, dna_map_a2b  ).split("\n"))

# try:
##                 from1, s1, to1, from2, s2, to2 = data[0] + data[1]
# except ValueError:
##                 from1, to1, from2, to2 = 0, 0, 0, 0
##                 s1, s2 = "", ""
##                 nerrors += 1
# except IndexError:
##                 from1, to1, from2, to2 = 0, 0, 0, 0
##                 s1, s2 = "", ""
##                 nerrors += 1

# if from1:
# if len(s1) != len(s2):
# print "# WARNING: alignment of different lengths: %i and %i" % (len(s1), len(s2))
##                     nerrors += 1
##                     from1, to1, from2, to2 = 0, 0, 0, 0
##                     s1, s2 = "", ""
# else:
##                     a1, a2 = [], []
# for x in range( min(len(s1), len(s2)) ):
# if s1[x] != "-" and s2[x] != "-":
##                             a1.append( s1[x] )
##                             a2.append( s2[x] )
##                     s1 = string.join(a1, "")
##                     s2 = string.join(a2, "")

# print "exon\texons\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( (token1, 0,
##                                                                              token2, 0,
##                                                                              from1, to1,
##                                                                              from2, to2,
# s1, s2 ) )
# elif method == "full":
# write full alignment (do not care about exon boundaries)
# data = map(lambda x: x.split("\t"),
# alignlib_lite.writePairAlignment( seq1, seq2, dna_map_a2b  ).split("\n"))
##             if len(data) < 2: data=[ ( 0, "", 0), (0, "", 0)]
# print "exon\tfull\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, 0,
##                                                                            token2, 0,
##                                                                            data[0][0], data[0][2],
##                                                                            data[1][0], data[1][2],
# data[0][1], data[1][1] )

    if param_loglevel >= 3:
        print "# skipped_exons=%i, skipped_introns=%i" % (nskipped_exons, nskipped_introns)

    return nerrors, nskipped
Esempio n. 11
0
        if param_loglevel >= 1:
            print "# reading has finished."
            sys.stdout.flush()

        alignator = alignlib_lite.makeAlignatorDPFull(
            alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0)

        for q1, q2 in pairs:

            ninput += 1

            if param_loglevel >= 1:
                print "# processing %s and %s" % (q1, q2)

            if q1 in transcripts1 and q2 in transcripts2:
                map_a2b = alignlib_lite.makeAlignmentVector()

                alignator.align(map_a2b,
                                alignlib_lite.makeSequence(peptides1[q1]),
                                alignlib_lite.makeSequence(peptides2[q2]))

                if map_a2b.getLength() == 0:
                    if param_loglevel >= 1:
                        print "# Alignment failed between %s and %s" % (q1, q2)
                        sys.stdout.flush()

                    ntotal_errors += 1
                    continue

                nerrors, nskipped = WriteExons(q1, peptides1[q1], cds1[q1], transcripts1[q1],
                                               q2, peptides2[q2], cds2[
Esempio n. 12
0
def main(argv=None):
    """script main.

    Remaps predictions read from stdin onto new identifiers/coordinates.

    Every non-comment line on stdin is parsed as a prediction entry.
    Entries whose query is absent from the old-to-new query map or from
    the new peptide set are filtered out.  Subject coordinates are
    remapped either via per-subject alignments
    (``--input-format=alignment``) or via breakpoint offsets
    (``--input-format=offsets``).  If a genome file is given, the
    translation of the remapped prediction is compared with the stored
    translation and, on mismatch, a quick reprediction is attempted.
    Remapped entries are written to ``options.stdout``, diagnostics to
    ``options.stderr`` and summary counts to ``options.stdlog``.

    parses command line options in sys.argv, unless *argv* is given.
    NOTE(review): *argv* is not forwarded to ``E.Start`` here - confirm
    that ``E.Start`` falls back to ``sys.argv`` on its own.
    """

    if argv is None:
        argv = sys.argv

    # NOTE(review): ``parser`` is not created inside this function; it is
    # presumably a module-level option parser - confirm against the file.
    parser.add_option("-m", "--filename-map", dest="filename_map", type="string",
                      help="filename with mapping information.")
    parser.add_option("-o", "--pattern-old", dest="pattern_old", type="string",
                      help="pattern for mapping new to old identifiers: extract string from old.")
    parser.add_option("-n", "--pattern-new", dest="pattern_new", type="string",
                      help="pattern for mapping new to old identifiers: put string into new.")
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="genome_file.")
    parser.add_option("-p", "--peptides", dest="filename_peptides", type="string",
                      help="filename with peptide sequences.")
    parser.add_option("-f", "--input-format", dest="input_format", type="choice",
                      help="format of mapping file", choices=("alignment", "offsets"))
    parser.add_option("-i", "--write-missed", dest="write_missed", type="string",
                      help="write missed identifiers to separate file.")
    parser.add_option("-a", "--filename-genes", dest="filename_genes", type="string",
                      help="filename with gene information.")
    parser.add_option("--filename-old-peptides", dest="filename_old_peptides", type="string",
                      help="filename with old peptide information.")
    parser.add_option("--no-renumber", dest="renumber", action="store_false",
                      help="do not renumber predictions.")
    parser.add_option("--contig-sizes-old", dest="contig_sizes_old", type="string",
                      help="contig sizes for old data.")
    parser.add_option("--contig-sizes-new", dest="contig_sizes_new", type="string",
                      help="contig sizes for new data.")
    parser.add_option("--skip-errors", dest="skip_errors", action="store_true",
                      help="skip entries with errors.")

    parser.set_defaults(
        filename_map=None,
        pattern_old="(.+)",
        pattern_new="%s",
        genome_file=None,
        filename_peptides=None,
        write_missed=None,
        filename_genes=None,
        filename_old_peptides=None,
        renumber=True,
        input_format="alignment",
        contig_sizes_old=None,
        contig_sizes_new=None,
        skip_errors=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    predictor = PredictorExonerate()

    # the different mapping criteria
    map_sbjcts = {}
    breakpoints = {}

    ##########################################################################
    # transcript -> gene map (first two tab-separated columns: gene, transcript)
    map_transcript2gene = {}
    if options.filename_genes:
        with open(options.filename_genes, "r") as infile:
            for gene, transcript in map(lambda x: x[:-1].split("\t")[:2],
                                        filter(lambda x: x[0] != "#", infile.readlines())):
                map_transcript2gene[transcript] = gene

    ##########################################################################
    peptides = {}
    if options.filename_peptides:
        with open(options.filename_peptides, "r") as infile:
            peptides = Genomics.ReadPeptideSequences(infile)
        options.stdlog.write("# read %i peptide sequences.\n" % len(peptides))

    ##########################################################################
    # read old query sequences and compare against new query sequences
    # this can be used to build a map between old and new queries
    query_map_old2new = {}
    # Defined up-front so that --write-missed works without
    # --filename-old-peptides (previously a NameError on ``unmapped``).
    unmappable, unmapped = [], []
    if options.filename_old_peptides:
        with open(options.filename_old_peptides, "r") as infile:
            old_peptides = Genomics.ReadPeptideSequences(infile)
        options.stdlog.write("# read %i old peptide sequences.\n" % len(old_peptides))
        query_map_old2new, unmappable, unmapped = Genomics.MapSequences(old_peptides, peptides)
        options.stdlog.write("# built map: unmappable=%i unmapped=%i.\n" % (len(unmappable), len(unmapped)))
        if options.loglevel >= 2:
            options.stdlog.write("# unmappable: %s.\n" % ";".join(unmappable))
            options.stdlog.write("# unmapped: %s.\n" % ";".join(unmapped))

    ##########################################################################
    # read old/new contig sizes for mapping positive/negative coordinates
    contig_sizes_old = {}
    contig_sizes_new = {}
    if options.contig_sizes_old:
        with open(options.contig_sizes_old, "r") as infile:
            contig_sizes_old = Genomics.ReadContigSizes(infile)
    if options.contig_sizes_new:
        with open(options.contig_sizes_new, "r") as infile:
            contig_sizes_new = Genomics.ReadContigSizes(infile)

    ##########################################################################
    if options.filename_map:

        with open(options.filename_map) as infile:
            # The declared choice is "alignment"; the previous comparison
            # against "alignments" could never match, so the alignment map
            # was silently never loaded.
            if options.input_format == "alignment":
                for line in infile:
                    if line[0] == "#":
                        continue

                    (_, old_token, old_from, old_to, old_ali,
                     new_from, new_to, new_ali) = line[:-1].split("\t")

                    map_sbjcts[old_token] = (old_from, old_ali, new_from, new_ali)

                if options.loglevel >= 1:
                    options.stdlog.write("# read %i alignments.\n" % len(map_sbjcts))

            elif options.input_format == "offsets":
                # input is a list of segments and their offsets.
                breakpoints, endpoints, offsets = ReadOffsets(infile)
                if options.loglevel >= 1:
                    options.stdlog.write("# read breakpoints for %i chromosomes.\n" % len(breakpoints))

    ##########################################################################
    # end of input section
    ##########################################################################

    rx = re.compile(options.pattern_old)
    last_sbjct_token = None
    ninput = 0
    nerrors = 0
    nerrors_map = 0
    nerrors_inconsistencies = 0
    nerrors_boundaries = 0
    nerrors_translation = 0
    nerrors_inconsequential = 0
    nerrors_realigned = 0
    nmapped = 0
    nfiltered = 0
    naligned = 0
    noutput = 0
    found_transcripts = {}
    nduplicates = 0
    # hash identifiers of entries already written, for duplicate detection
    seen_hids = set()

    for line in sys.stdin:
        if line[0] == "#":
            continue

        entry = PredictionParser.PredictionParserEntry()
        entry.Read(line)

        ninput += 1
        is_positive = entry.mSbjctStrand == "+"
        is_error = False

        # Per-entry defaults, so that the diagnostics and the reprediction
        # below never see undefined or stale values from a previous entry
        # when this entry is not remapped.
        old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo
        f, t = old_f, old_t
        new_f, new_t = old_f, old_t

        # check if query token is mappable: using sequence map, then
        # using the peptide filter
        if ((query_map_old2new and entry.mQueryToken not in query_map_old2new)
                or (peptides and entry.mQueryToken not in peptides)):
            options.stdlog.write("# skipping prediction %i: obsolete query %s\n" %
                                 (entry.mPredictionId, entry.mQueryToken))
            nfiltered += 1
            continue

        new_sbjct_token = options.pattern_new % rx.search(entry.mSbjctToken).groups()[0]

        ######################################################################
        # Map via alignments
        if entry.mSbjctToken in map_sbjcts:
            nmapped += 1
            # rebuild the alignment only when the subject changes
            if last_sbjct_token != entry.mSbjctToken:
                old_from, old_ali, new_from, new_ali = map_sbjcts[entry.mSbjctToken]
                map_a2b = alignlib_lite.makeAlignmentVector()
                alignlib_lite.AlignmentFormatExplicit(
                    int(old_from), old_ali,
                    int(new_from), new_ali).copy(map_a2b)

            last_sbjct_token = entry.mSbjctToken

            if options.loglevel >= 3:
                sys.stdout.write("# %s\n" % str(entry))
                sys.stdout.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                sys.stdout.flush()

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            # convert to forward coordinates:
            if is_positive:
                f, t = old_f, old_t
                first_res, last_res = f + 1, t
            else:
                f, t = map_a2b.getRowTo() - old_f, map_a2b.getRowTo() - old_t
                first_res, last_res = f, t + 1

            # map first and last residues
            mfirst_res = map_a2b.mapRowToCol(first_res)
            mlast_res = map_a2b.mapRowToCol(last_res)

            if (mfirst_res == 0 and old_f != 0) or (mlast_res == 0 and old_t != map_a2b.getRowTo()):

                options.stderr.write(
                    "# mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" %
                    (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                     old_f, old_t,
                     f, t,
                     first_res, last_res,
                     mfirst_res, mlast_res,
                     f, t))

                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# %s\n" % str(entry))
                options.stderr.flush()
                nerrors_boundaries += 1
                is_error = True

                # get extended boundaries for alignment later on
                while mfirst_res == 0 and first_res > 1:
                    first_res -= 1
                    mfirst_res = map_a2b.mapRowToCol(first_res)
                while mlast_res == 0 and last_res < map_a2b.getRowTo():
                    last_res += 1
                    mlast_res = map_a2b.mapRowToCol(last_res)

            # convert to genomic coordinates
            # convert negative strand coordinates
            if is_positive:
                new_f = mfirst_res - 1
                new_t = mlast_res
            else:
                new_f = map_a2b.getColTo() - mfirst_res
                new_t = map_a2b.getColTo() - (mlast_res - 1)

            # Now map the alignment.
            try:
                MapAlignment(entry, map_a2b)
            except ValueError:
                options.stderr.write(
                    "# alignment mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" %
                    (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                     old_f, old_t,
                     f, t,
                     first_res, last_res,
                     mfirst_res, mlast_res,
                     new_f, new_t,
                     entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.flush()
                nerrors_map += 1
                is_error = True

            if new_f != entry.mSbjctGenomeFrom or new_t != entry.mSbjctGenomeTo:
                options.stderr.write(
                    "# mapping inconsistency for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i <> %i-%i\n" %
                    (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                     old_f, old_t,
                     f, t,
                     first_res, last_res,
                     mfirst_res, mlast_res,
                     new_f, new_t,
                     entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))

                nerrors_inconsistencies += 1
                is_error = True

        ######################################################################
        # Map via offsets
        if entry.mSbjctToken in breakpoints:

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            # convert to forward coordinates:
            if is_positive:
                f, t = old_f, old_t
            else:
                f, t = (contig_sizes_old[entry.mSbjctToken] - old_t,
                        contig_sizes_old[entry.mSbjctToken] - old_f)

            o1 = GetOffset(f,
                           breakpoints[entry.mSbjctToken],
                           endpoints[entry.mSbjctToken],
                           offsets[entry.mSbjctToken])
            o2 = GetOffset(t,
                           breakpoints[entry.mSbjctToken],
                           endpoints[entry.mSbjctToken],
                           offsets[entry.mSbjctToken])

            # different offsets for start and end: a breakpoint lies
            # inside the prediction
            if o1 != o2:
                options.stderr.write("# break within gene %s\n" % str(entry))
                nerrors_map += 1
                is_error = True

            f += o1
            t += o2

            if not is_positive:
                f, t = (contig_sizes_new[entry.mSbjctToken] - t,
                        contig_sizes_new[entry.mSbjctToken] - f)

            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = f, t
            # keep the reprediction window below in sync with the
            # remapped coordinates
            new_f, new_t = f, t

            if entry.mSbjctGenomeFrom > entry.mSbjctGenomeTo:
                options.stderr.write("# mapping error: start after end %s\n" % str(entry))
                nerrors_map += 1
                is_error = True

        ######################################################################
        # do translation check, if genome is given
        if options.genome_file:
            genomic_sequence = Genomics.GetGenomicSequence(new_sbjct_token, entry.mSbjctStrand,
                                                           entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo,
                                                           options.genome_file,
                                                           loglevel=0)

            map_peptide2translation, translation = Genomics.Alignment2PeptideAlignment(
                entry.mMapPeptide2Genome, entry.mQueryFrom, 0, genomic_sequence)

            # compare translations ignoring masked residues
            if re.sub("X", "", translation) != re.sub("X", "", entry.mTranslation):
                options.stderr.write(
                    "# translation error for prediction %i on %s %s:%i-%i -> %i-%i <> %i-%i\n" %
                    (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                     old_f, old_t,
                     f, t,
                     entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                # only report the alignment if this subject actually has one
                if entry.mSbjctToken in map_sbjcts:
                    options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mTranslation, translation))
                options.stderr.write("# old=%s\n# new=%s\n" % (
                    entry.mAlignmentString, Genomics.Alignment2String(entry.mMapPeptide2Genome)))
                nerrors_translation += 1
                is_error = True

                if peptides and entry.mQueryToken in peptides:
                    naligned += 1

                    options.stdlog.write("# aligning: %s versus %s:%s: %i-%i\n" % (
                        entry.mQueryToken,
                        new_sbjct_token, entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))

                    # do a quick reprediction on the whole subject
                    # (``genome_pattern`` is not a defined option; the
                    # genome is configured through ``genome_file``)
                    genomic_sequence = Genomics.GetGenomicSequence(new_sbjct_token, entry.mSbjctStrand,
                                                                   0, 0,
                                                                   genome_file=options.genome_file,
                                                                   loglevel=0)
                    predictor.mLogLevel = 0

                    result = predictor(entry.mQueryToken, peptides[entry.mQueryToken],
                                       entry.mSbjctToken, genomic_sequence,
                                       "--exhaustive --subopt FALSE --score '%s' " % str(80),
                                       new_f - 10, new_t + 10)
                    prediction_id = entry.mPredictionId
                    if result:
                        entry = result[0]
                        entry.mPredictionId = prediction_id
                        nerrors_realigned += 1
            else:
                # mapping complained, but the translation is unchanged
                if is_error:
                    nerrors_inconsequential += 1

        entry.mSbjctToken = new_sbjct_token

        # map query tokens
        if query_map_old2new:
            query_tokens = query_map_old2new[entry.mQueryToken]
        else:
            query_tokens = (entry.mQueryToken,)

        if is_error:
            nerrors += 1
            if options.skip_errors:
                continue

        # remember the id once: it is blanked while hashing below and must
        # survive a skipped duplicate
        original_prediction_id = entry.mPredictionId

        for query_token in query_tokens:

            entry.mQueryToken = query_token

            # hash the entry without its id so that identical predictions
            # with different ids are recognized as duplicates
            entry.mPredictionId = 0
            hid = Genomics.GetHID(str(entry))
            if hid in seen_hids:
                nduplicates += 1
                entry.mPredictionId = original_prediction_id
                continue
            seen_hids.add(hid)

            noutput += 1
            if options.renumber:
                entry.mPredictionId = noutput
            else:
                entry.mPredictionId = original_prediction_id

            options.stdout.write(str(entry) + "\n")
            options.stdout.flush()
            found_transcripts[entry.mQueryToken] = 1

    # write out found transcripts and genes
    nmissed_transcripts = 0
    missed_transcripts = []
    found_genes = {}
    if peptides:
        for x in peptides.keys():
            if x not in found_transcripts:
                nmissed_transcripts += 1
                missed_transcripts.append(x)
            elif x in map_transcript2gene:
                # guard: no (or an incomplete) gene map must not abort
                # the summary
                found_genes[map_transcript2gene[x]] = 1

    missed_genes = {}
    nmissed_genes = 0
    if map_transcript2gene:
        for transcript in missed_transcripts:
            if transcript not in map_transcript2gene:
                continue
            g = map_transcript2gene[transcript]
            if g not in found_genes:
                missed_genes[g] = 1
        nmissed_genes = len(missed_genes)

    if options.write_missed:
        with open(options.write_missed, "w") as outfile:
            for x in missed_transcripts:
                if x in unmapped:
                    status = "unmapped"
                else:
                    status = "mapped"
                outfile.write("%s\t%s\t%s\n" % ("transcript", x, status))
            for x in missed_genes:
                status = "unknown"
                outfile.write("%s\t%s\t%s\n" % ("gene", x, status))

    options.stdlog.write("# input=%i, output=%i, filtered=%i, nduplicates=%i, mapped=%i, errors=%i\n" % (
        ninput, noutput, nfiltered, nduplicates, nmapped, nerrors))
    options.stdlog.write("# errors: inconsequental=%i, boundaries=%i, mapping=%i, inconsistencies=%i, translation=%i, realigned=%i\n" % (
        nerrors_inconsequential, nerrors_boundaries, nerrors_map, nerrors_inconsistencies, nerrors_translation, nerrors_realigned))
    options.stdlog.write("# peptides: input=%i, found=%i, missed=%i, found_genes=%i, missed_genes=%i\n" % (
        len(peptides), len(found_transcripts), nmissed_transcripts, len(found_genes), nmissed_genes))

    E.Stop()
Esempio n. 13
0
    print E.GetParams()
    sys.stdout.flush()

    if param_loglevel >= 1:
        print "# reading exon boundaries."
        sys.stdout.flush()

    cds = Exons.ReadExonBoundaries( open(param_filename_cds, "r") )

    if param_loglevel >= 1:
        print "# read %i cds" % (len(cds))
        sys.stdout.flush()

    ninput, npairs, nskipped = 0, 0, 0

    map_row2col = alignlib_lite.makeAlignmentVector()
    tmp_map_row2col = alignlib_lite.makeAlignmentVector()    
    
    for line in sys.stdin:
        if line[0] == "#": continue
        ninput += 1
        link = BlastAlignments.Link()

        link.Read(line)
        
        if link.mQueryToken == link.mSbjctToken: continue
        
        if link.mQueryToken in cds and \
               link.mSbjctToken in cds:
            
            ## expand to codons
Esempio n. 14
0
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/compare_predictions2exons.py 2011 2008-07-04 10:40:51Z andreas $",
                                    usage = globals()["__doc__"] )

    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genome."  )

    parser.add_option( "-b", "--boundaries", dest="filename_boundaries", type="string",
                       help="filename with exon boundaries."  )

    parser.add_option( "-e", "--exons", dest="filename_exons", type="string",
                       help="filename with exons (output)."  )

    parser.add_option( "-p", "--peptides", dest="filename_peptides", type="string",
                       help="filename with peptide sequences."  )

    parser.add_option( "-w", "--write-notfound", dest="write_notfound", action="store_true",
                       help="print exons for predictions not found in reference."  )

    parser.add_option( "-q", "--quality-pide", dest="quality_threshold_pide", type="int",
                       help="quality threshold (pide) for exons."  )

    parser.set_defaults( 
        genome_file = "genome",
        filename_boundaries = None,
        filename_exons = None,
        filename_peptides = None,
        quality_threshold_pide = 0,
        write_notfound = False,
        ## allowed number of nucleotides for exon boundaries to
        ## be considered equivalent.
        slipping_exon_boundary = 9,
        ## stop codons to search for        
        stop_codons = ("TAG", "TAA", "TGA"), )


    (options, args) = E.Start( parser, add_pipe_options = True )

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    reference_exon_boundaries = {}
    if options.filename_boundaries:
        reference_exon_boundaries = Exons.ReadExonBoundaries( open( options.filename_boundaries, "r"),
                                                              do_invert = 1,
                                                              remove_utr = 1)
        E.info( "read exon boundaries for %i queries" % len(reference_exon_boundaries) )
                
    if options.filename_exons:
        outfile_exons = open( options.filename_exons, "w")
        outfile_exons.write( "%s\n" % "\t".join( (
                    "prediction_id",
                    "exon_id",
                    "exon_from",
                    "exon_to",
                    "exon_frame",
                    "reference_id",
                    "reference_from",
                    "reference_to",
                    "reference_phase",
                    "pidentity",
                    "psimilarity",
                    "nframeshifts",
                    "ngaps",
                    "nstopcodons",
                    "is_ok",
                    "genome_exon_from",
                    "genome_exon_to") ) )

    else:
        outfile_exons = None

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences( open(options.filename_peptides, "r") )
        E.info("read peptide sequences for %i queries" % len(peptide_sequences) )
    else:
        peptide_sequences = {}

    entry = PredictionParser.PredictionParserEntry()
    last_filename_genome = None
    
    nfound, nmissed_exons, nmissed_length = 0, 0, 0
    nempty_alignments = 0

    fasta = IndexedFasta.IndexedFasta( options.genome_file )

    options.stdout.write( "%s\n" % "\t".join( (
                "prediction_id", 
                "number",
                "dubious_exons",
                "boundaries_sum",
                "boundaries_max",
                "identical_exons",
                "inserted_exons",
                "deleted_exons",
                "inserted_introns",
                "deleted_introns",
                "truncated_Nterminus",
                "truncated_Cterminus",
                "deleted_Nexons",
                "deleted_Cexons",
                "inserted_Nexons",
                "inserted_Cexons" ) ) )

    for line in sys.stdin:

        if line[0] == "#": continue
        
        try:
            entry.Read(line)
        except ValueError, msg:
            print "# parsing failed with msg %s in line %s" % (msg, line[:-1])
            sys.exit(1)

        exons = Genomics.Alignment2ExonBoundaries( entry.mMapPeptide2Genome,
                                                   query_from = entry.mQueryFrom,
                                                   sbjct_from = entry.mSbjctGenomeFrom,
                                                   add_stop_codon = 0 )

        if exons[-1][4] != entry.mSbjctGenomeTo:
            print "# WARNING: discrepancy in exon calculation!!!"
            for e in exons:
                print "#", str(e)
            print "#", str(entry)

        if options.loglevel >= 5:
            for e in exons:
                print "#", str(e)
        
        genomic_fragment = fasta.getSequence( entry.mSbjctToken, entry.mSbjctStrand,
                                              entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo )
        
        skip = False
        if peptide_sequences.has_key( entry.mQueryToken ):
            
            query_sequence = alignlib_lite.makeSequence(peptide_sequences[entry.mQueryToken])
            sbjct_sequence = alignlib_lite.makeSequence(entry.mTranslation)
            
            percent_similarity, percent_identity = 0, 0
            if query_sequence.getLength() < entry.mMapPeptide2Translation.getRowTo():
                print "# WARNING: query sequence %s is too short: %i %i" % ( entry.mQueryToken,
                                                                             query_sequence.getLength(),
                                                                             entry.mMapPeptide2Translation.getRowTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True
                
            elif sbjct_sequence.getLength() < entry.mMapPeptide2Translation.getColTo():
                print "# WARNING: sbjct sequence %s is too short: %i %i" % ( entry.mSbjctToken,
                                                                       sbjct_sequence.getLength(),
                                                                       entry.mMapPeptide2Translation.getColTo())
                sys.stdout.flush()                
                nmissed_length += 1
                skip = True
            else:
                alignlib_lite.rescoreAlignment( entry.mMapPeptide2Translation, 
                                           query_sequence, 
                                           sbjct_sequence,
                                           alignlib_lite.makeScorer( query_sequence, sbjct_sequence ) )
                percent_identity = alignlib_lite.calculatePercentIdentity( entry.mMapPeptide2Translation,
                                                                      query_sequence,
                                                                      sbjct_sequence ) * 100
                percent_similarity = alignlib_lite.calculatePercentSimilarity( entry.mMapPeptide2Translation ) * 100
                
            E.debug( "prediction %s: percent identity/similarity: before=%5.2f/%5.2f, realigned=%5.2f/%5.2f" % (
                    str(entry.mPredictionId), 
                    entry.mPercentSimilarity,
                    entry.mPercentIdentity,
                    percent_similarity,
                    percent_identity ) )
                
        else:
            query_sequence = None
            sbjct_sequence = None

        # default values
        exons_num_exons = "na"
        exons_boundaries_sum = "na"
        exons_boundaries_max = "na"
        dubious_exons = "na"

        ndeleted_exons, ninserted_exons, ndeleted_introns, ninserted_introns, nidentical_exons = 0,0,0,0,0
        truncated_Nterminal_exon, truncated_Cterminal_exon = 0,0
        ndeleted_Nexons, ndeleted_Cexons = 0, 0
        ninserted_Nexons, ninserted_Cexons = 0, 0
        
        exons_offset = exons[0][3]

        if not reference_exon_boundaries.has_key( entry.mQueryToken ):
            print "# WARNING: sequence %s has no exon boundaries" % ( entry.mQueryToken )
            sys.stdout.flush()
            nmissed_exons += 1
            skip = True
        
        if not skip:

            nfound += 1
            
            ref_exons = reference_exon_boundaries[entry.mQueryToken]

            ref_exons_offset = ref_exons[0].mGenomeFrom
            
            exons_num_exons = len(ref_exons) - len(exons)
            exons_boundaries_sum = 0
            exons_phase = 0
            exons_boundaries_max = 0
            dubious_exons = 0
            
            inserted_exons = 0
            temp_inserted_exons = 0
            
            if options.loglevel >= 3:
                for e in exons:
                    options.stdlog.write( "# %s\n" % str(e) )
                for e in ref_exons:
                    options.stdlog.write( "# %s\n" % str(e) )

            min_pide = entry.mPercentIdentity * options.quality_threshold_pide / 100

            in_sync = 0
            e,r = 0,0

            while e < len(exons) and r < len(ref_exons):

                this_e, this_r = e+1, r+1
                percent_identity = 0
                percent_similarity = 0
                is_good_exon = 0

                if options.loglevel >= 4:
                    options.stdlog.write( "# current exons: %i and %i\n" % (e, r) )
                    sys.stdout.flush()
                    
                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exons[e][0:6]
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (ref_exons[r].mPeptideFrom,
                                                                               ref_exons[r].mPeptideTo,
                                                                               ref_exons[r].frame,
                                                                               ref_exons[r].mGenomeFrom,
                                                                               ref_exons[r].mGenomeTo)

                ref_genome_from -= ref_exons_offset
                ref_genome_to   -= ref_exons_offset

                ## get percent identity for exon
                exon_percent_identity = 0
                exon_percent_similarity = 0
                
                if query_sequence and sbjct_sequence:
                    
                    tmp_ali = alignlib_lite.makeAlignmentVector()

                    xquery_from = exon_from / 3
                    xquery_to = exon_to / 3

                    alignlib_lite.copyAlignment( tmp_ali, entry.mMapPeptide2Translation, xquery_from, xquery_to )

                    if tmp_ali.getLength() == 0:
                        options.stdlog.write( "# WARNING: empty alignment %s\n" % str((ref_from, exon_from, ref_to, exon_to, xquery_from, xquery_to)))
                        nempty_alignments += 1
                    else:
                        if options.loglevel >= 5:
                            options.stdlog.write( "# %s\n" % str( alignlib_lite.AlignmentFormatExplicit( tmp_ali, query_sequence, sbjct_sequence ) ) )

                        exon_percent_identity = alignlib_lite.calculatePercentIdentity( tmp_ali,
                                                                                   query_sequence,
                                                                                   sbjct_sequence ) * 100
                        exon_percent_similarity = alignlib_lite.calculatePercentSimilarity( tmp_ali ) * 100

                if exon_percent_identity >= min_pide:
                    is_good_exon = 1
                else:
                    is_good_exon = 0
                    
                if e < len(exons) -1 :
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to, next_exon_ali) = exons[e+1][0:6]
                else:
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to, next_exon_ali) = 0, 0, 0, 0, 0, []
                    
                if r < len(ref_exons) - 1:
                    next_ref_from, next_ref_to, next_ref_phase = (ref_exons[r+1].mPeptideFrom,
                                                                  ref_exons[r+1].mPeptideTo,
                                                                  ref_exons[r+1].frame)
                else:
                    next_ref_from, next_ref_to, next_ref_phase = 0, 0, 0
                    
                if options.loglevel >= 2:
                    options.stdlog.write( "# %s\n" % "\t".join( map(str, (entry.mQueryToken,
                                                                          exon_from, exon_to, exon_phase,
                                                                          exon_genome_from, exon_genome_to,
                                                                          ref_from, ref_to, ref_phase ))))
                    sys.stdout.flush()                    

                # beware of small exons.
                # if less than options.slipping_exon_boundary: boundary is 0
                # check if end is more than options.slipping_exon_boundary apart as well.
                if exon_to - exon_from <= options.slipping_exon_boundary or \
                        ref_to - ref_from <= options.slipping_exon_boundary:
                    boundary = 0
                else:
                    boundary = options.slipping_exon_boundary
                    
                if ref_to <= exon_from + boundary and \
                   ref_to <= exon_to - options.slipping_exon_boundary:
                    ## no overlap 
                    is_good_exon = 0
                    if e == 0:
                        ndeleted_Nexons += 1
                    else:
                        ndeleted_exons += 1
                    r += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = 0, 0, 0, 0, 0
                    overlap = 0
                elif exon_to <= ref_from + boundary and \
                         exon_to <= ref_to - options.slipping_exon_boundary:
                    ## no overlap
                    is_good_exon = 0
                    if r == 0:
                        ninserted_Nexons += 1
                    else:
                        ninserted_exons += 1
                    e += 1
                    ref_from, ref_to, ref_phase = 0, 0, 0
                    overlap = 0
                else:
                    ## overlap
                    overlap = 1
                    dfrom = int(math.fabs(exon_from - ref_from))
                    dto = int(math.fabs(exon_to - ref_to))

                    ## get percent identity for overlapping fragment 
                    if query_sequence and sbjct_sequence:
                        ## this the problem
                        tmp_ali = alignlib_lite.makeAlignmentVector()
                        
                        xquery_from = max( ref_from / 3, exon_from / 3)
                        xquery_to = min(ref_to / 3, exon_to / 3)

                        alignlib_lite.copyAlignment( tmp_ali, entry.mMapPeptide2Translation, xquery_from, xquery_to )

                        if tmp_ali.getLength() == 0:
                            options.stdlog.write( "# warning: empty alignment %s\n" % str((ref_from, exon_from, ref_to, exon_to, xquery_from, xquery_to )))
                            percent_identity = 0
                            percent_similarity = 0
                        else:
                            if options.loglevel >= 5:
                                print str( alignlib_lite.AlignmentFormatExplicit( tmp_ali, query_sequence, sbjct_sequence ) )

                            percent_identity = alignlib_lite.calculatePercentIdentity( tmp_ali,
                                                                                  query_sequence,
                                                                                  sbjct_sequence ) * 100
                            percent_similarity = alignlib_lite.calculatePercentSimilarity( tmp_ali ) * 100
                            
                    if percent_identity >= min_pide:
                        is_good_exon = 1
                    else:
                        is_good_exon = 0
                        dubious_exons += 1

                    ## adjust regions for terminal exons
                    if e == 0 and r == 0 and dfrom <= (entry.mQueryFrom - 1) * 3 and dfrom > 0:
                        if is_good_exon:                        
                            truncated_Nterminal_exon = dfrom
                        dfrom = 0
                            
                    ## truncated terminal exons
                    if e == len(exons)-1 and r == len(ref_exons)-1 and dto <= (entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0:
                        if is_good_exon:                        
                            truncated_Cterminal_exon = dto
                        dto = 0

                    ## do not count deviations for terminal query exons
                    if e == 0 and dfrom <= entry.mQueryFrom * 3 and dfrom > 0:
                        dfrom = 0
                            
                    if e == len(exons)-1 and dto <= (entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0:
                        dto = 0

                    ## permit difference of one codon (assumed to be stop)
                    if e == len(exons)-1 and r == len(ref_exons)-1 and dto == 3:
                        dto = 0

                    ## deal with different boundary conditions:
                    if dfrom == 0 and dto == 0:
                        if is_good_exon: nidentical_exons += 1
                        e += 1
                        r += 1
                    ## next exon within this ref_exon
                    elif exon_to < ref_to and next_exon_to and next_exon_to <= ref_to + options.slipping_exon_boundary:
                        if is_good_exon: ninserted_introns += 1
                        e += 1
                        in_sync = 1
                        dto = 0
                    ## next ref_exon within this exon
                    elif ref_to < exon_to and next_ref_to and next_ref_to <= exon_to + options.slipping_exon_boundary:
                        if is_good_exon: ndeleted_introns += 1
                        r += 1
                        in_sync = 1
                        dto = 0
                    else:
                        e += 1
                        r += 1
                        if in_sync:
                            dfrom = 0

                    if is_good_exon:
                        exons_boundaries_sum += dfrom + dto
                        exons_boundaries_max = max( dfrom, exons_boundaries_max )
                        exons_boundaries_max = max( dto, exons_boundaries_max )
                    
                        
                    ###########################################################
                    ## count inserted/deleted introns and misplaced boundaries
                    ##
                    ## if exon and next_exon in ref_exon: inserted intron
                    ## if ref_exon and next_ref_exon in exon: deleted intron
                    
                if outfile_exons:

                    if genomic_fragment and exon_genome_to:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures( exon_genome_from - entry.mSbjctGenomeFrom,
                                                                                                                       exon_ali,
                                                                                                                       genomic_fragment,
                                                                                                                       border_stop_codon = 0
                                                                                                                       )
                    else:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons = 0, 0, 0, 0, 0

                    if exon_to == 0: this_e = 0
                    if ref_to == 0: this_r = 0
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId,
                                                                this_e, exon_from, exon_to, exon_phase,
                                                                this_r, ref_from, ref_to, ref_phase,
                                                                percent_identity, percent_similarity,
                                                                nframeshifts, ngaps, nstopcodons,
                                                                is_good_exon,
                                                                exon_genome_from, exon_genome_to,
                                                                )), "\t") + "\n")
                    
            while e < len(exons):
                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = exons[e][0:5]
                e += 1
                ninserted_Cexons += 1

                if outfile_exons:
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId, 
                                                                e, exon_from, exon_to, exon_phase,
                                                                0, 0, 0, 0,
                                                                0, 0,
                                                                0, 0, 0,
                                                                1,
                                                                exon_genome_from, exon_genome_to,
                                                                )), "\t") + "\n")
                    
            while r < len(ref_exons):
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (ref_exons[r].mPeptideFrom,
                                                                               ref_exons[r].mPeptideTo,
                                                                               ref_exons[r].frame,
                                                                               ref_exons[r].mGenomeFrom,
                                                                               ref_exons[r].mGenomeTo)
                ndeleted_Cexons += 1
                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset
                r += 1
                if outfile_exons:
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId,
                                                                0, 0, 0, 0,
                                                                r, ref_from, ref_to, ref_phase, 
                                                                0, 0,
                                                                0, 0, 0,
                                                                0,
                                                                0, 0,
                                                                )), "\t") + "\n")
        else:
            if options.write_notfound:
                this_e = 0
                ## use prediction's identity/similarity for exons.
                ## This will still then flag stop-codons in later analysis
                percent_identity = entry.mPercentIdentity
                percent_similarity = entry.mPercentSimilarity
            
                for exon in exons:
                    this_e += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exon[0:6]
                    if genomic_fragment:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures( exon_genome_from - entry.mSbjctGenomeFrom,
                                                                                                                       exon_ali,
                                                                                                                       genomic_fragment )
                    
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId,
                                                                this_e, exon_from, exon_to, exon_phase,
                                                                0, 0, 0, 0,
                                                                percent_identity, percent_similarity,
                                                                nframeshifts, ngaps, nstopcodons,
                                                                1,
                                                                exon_genome_from, exon_genome_to,
                                                                )), "\t") + "\n")
            
        options.stdout.write( "\t".join(map(str,
                              (entry.mPredictionId,
                               exons_num_exons,
                               dubious_exons,
                               exons_boundaries_sum,
                               exons_boundaries_max,
                               nidentical_exons,
                               ninserted_exons, ndeleted_exons,
                               ninserted_introns, ndeleted_introns,
                               truncated_Nterminal_exon, truncated_Cterminal_exon,
                               ndeleted_Nexons, ndeleted_Cexons,
                               ninserted_Nexons, ninserted_Cexons))) + "\n" )
Esempio n. 15
0
def ProcessRegion(predictions, region_id, region,
                  peptide_sequences=None,
                  filter_queries=None):
    """process a set of matches to a region.

    resolve region according to homology.

    *predictions* is sorted in place by descending ``score``.  Going
    through the sorted list, every prediction that has not yet been
    assigned to a cluster seeds a new cluster (and a new region id).
    A later prediction joins the seed's cluster if both overlap on the
    genomic sequence and the local alignment of their query peptides
    passes the ``options.overlap_*`` thresholds.  Alignment verdicts are
    cached in the module-level ``global_alignments`` dictionary under
    the key ``"<smaller token>-<larger token>"``.

    Each cluster is handed to ``PrintEdges(region_id, region, edges)``;
    predictions whose query token is in *filter_queries* are left out
    of ``edges`` and counted as skipped.

    Arguments
    ---------
    predictions : list
        Prediction entries; the attributes read here are ``score``,
        ``mPredictionId``, ``mQueryToken``, ``mSbjctGenomeFrom`` and
        ``mSbjctGenomeTo``.  The list is re-ordered in place.
    region_id : int
        Last region id used so far; incremented once per cluster.
    region :
        Passed through unchanged to ``PrintEdges``.
    peptide_sequences : dict, optional
        Maps query token to peptide sequence.  If empty or None, no
        clustering is done and nothing is printed.
    filter_queries : container, optional
        Query tokens to suppress from the output.  Only membership is
        tested.  Defaults to an empty collection.

    Returns
    -------
    tuple
        ``(region_id, noutput, nskipped)``: the updated region id, the
        sum of the values returned by ``PrintEdges`` and the number of
        predictions suppressed because of *filter_queries*.
    """

    # None replaces a shared mutable default; only membership is tested.
    if filter_queries is None:
        filter_queries = ()

    if options.loglevel >= 3:
        options.stdlog.write(
            "###################################################################\n")
        options.stdlog.write(
            "# resolving %i predictions in region %s\n" % (len(predictions), str(region)))
        # NOTE(review): flushes sys.stdout although the message was written
        # to options.stdlog - confirm this is intended.
        sys.stdout.flush()

    # Ascending stable sort followed by reverse(): best score first, and
    # predictions with equal scores end up in reverse input order.
    # (sort(reverse=True) would order ties differently.)
    predictions.sort(key=lambda prediction: prediction.score)
    predictions.reverse()

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    # map_sequence2cluster[i] == i  <=>  prediction i is not yet assigned
    # to the cluster of a better scoring prediction.  A real list is
    # needed because entries are assigned below.
    map_sequence2cluster = list(range(len(predictions)))

    noutput, nskipped = 0, 0

    if peptide_sequences:
        for x, pred_x in enumerate(predictions):
            if options.loglevel >= 5:
                options.stdlog.write("# filtering from %i with prediction %i: %s\n" % (
                    x, pred_x.mPredictionId, pred_x.mQueryToken))
                sys.stdout.flush()

            # already a member of an earlier cluster
            if map_sequence2cluster[x] != x:
                continue

            # prediction x seeds a new cluster
            region_id += 1
            edges = []

            if pred_x.mQueryToken not in filter_queries:
                edges.append(pred_x)
            else:
                nskipped += 1

            for y in range(x + 1, len(predictions)):

                if map_sequence2cluster[y] != y:
                    continue

                pred_y = predictions[y]

                # cache key is independent of the order of the two queries
                if pred_x.mQueryToken < pred_y.mQueryToken:
                    key = "%s-%s" % (pred_x.mQueryToken,
                                     pred_y.mQueryToken)
                else:
                    key = "%s-%s" % (pred_y.mQueryToken,
                                     pred_x.mQueryToken)

                # check if predictions are overlapping on the genomic sequence
                if min(pred_x.mSbjctGenomeTo, pred_y.mSbjctGenomeTo) - \
                   max(pred_x.mSbjctGenomeFrom, pred_y.mSbjctGenomeFrom) < 0:
                    if options.loglevel >= 4:
                        options.stdlog.write("# alignment of predictions %i and %i: no overlap on genomic sequence, thus skipped\n" %
                                             (pred_x.mPredictionId,
                                              pred_y.mPredictionId))
                        sys.stdout.flush()
                    continue

                if key not in global_alignments:

                    seq1 = peptide_sequences[pred_x.mQueryToken]
                    seq2 = peptide_sequences[pred_y.mQueryToken]
                    result.clear()
                    s1 = alignlib_lite.makeSequence(seq1)
                    s2 = alignlib_lite.makeSequence(seq2)
                    alignator.align(result, s1, s2)

                    # percentage of each sequence covered by the alignment.
                    # NOTE(review): with integer coordinates "/" is floor
                    # division under Python 2 only - confirm before porting.
                    c1 = 100 * \
                        (result.getRowTo() - result.getRowFrom()) / len(seq1)
                    c2 = 100 * \
                        (result.getColTo() - result.getColFrom()) / len(seq2)
                    min_cov = min(c1, c2)
                    max_cov = max(c1, c2)

                    identity = alignlib_lite.calculatePercentIdentity(
                        result, s1, s2) * 100

                    # check if predictions overlap and they are homologous
                    if result.getScore() >= options.overlap_min_score and \
                       max_cov >= options.overlap_max_coverage and \
                       min_cov >= options.overlap_min_coverage and \
                       identity >= options.overlap_min_identity:
                        global_alignments[key] = True
                    else:
                        global_alignments[key] = False

                    if options.loglevel >= 4:
                        options.stdlog.write("# alignment=%s score=%i pid=%5.2f c1=%i c2=%i min_cov=%i max_cov=%i homolog=%s\n" %
                                             (key,
                                              result.getScore(),
                                              identity,
                                              c1, c2, min_cov, max_cov,
                                              global_alignments[key]))
                        sys.stdout.flush()

                if global_alignments[key]:
                    # y joins the cluster seeded by x
                    map_sequence2cluster[y] = x
                    if pred_y.mQueryToken not in filter_queries:
                        edges.append(pred_y)
                    else:
                        nskipped += 1

            noutput += PrintEdges(region_id, region, edges)

    return region_id, noutput, nskipped
Esempio n. 16
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/compare_predictions2exons.py 2011 2008-07-04 10:40:51Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-b",
                      "--boundaries",
                      dest="filename_boundaries",
                      type="string",
                      help="filename with exon boundaries.")

    parser.add_option("-e",
                      "--exons",
                      dest="filename_exons",
                      type="string",
                      help="filename with exons (output).")

    parser.add_option("-p",
                      "--peptides",
                      dest="filename_peptides",
                      type="string",
                      help="filename with peptide sequences.")

    parser.add_option(
        "-w",
        "--write-notfound",
        dest="write_notfound",
        action="store_true",
        help="print exons for predictions not found in reference.")

    parser.add_option("-q",
                      "--quality-pide",
                      dest="quality_threshold_pide",
                      type="int",
                      help="quality threshold (pide) for exons.")

    parser.set_defaults(
        genome_file="genome",
        filename_boundaries=None,
        filename_exons=None,
        filename_peptides=None,
        quality_threshold_pide=0,
        write_notfound=False,
        ## allowed number of nucleotides for exon boundaries to
        ## be considered equivalent.
        slipping_exon_boundary=9,
        ## stop codons to search for
        stop_codons=("TAG", "TAA", "TGA"),
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    reference_exon_boundaries = {}
    if options.filename_boundaries:
        reference_exon_boundaries = Exons.ReadExonBoundaries(open(
            options.filename_boundaries, "r"),
                                                             do_invert=1,
                                                             remove_utr=1)
        E.info("read exon boundaries for %i queries" %
               len(reference_exon_boundaries))

    if options.filename_exons:
        outfile_exons = open(options.filename_exons, "w")
        outfile_exons.write("%s\n" % "\t".join(
            ("prediction_id", "exon_id", "exon_from", "exon_to", "exon_frame",
             "reference_id", "reference_from", "reference_to",
             "reference_phase", "pidentity", "psimilarity", "nframeshifts",
             "ngaps", "nstopcodons", "is_ok", "genome_exon_from",
             "genome_exon_to")))

    else:
        outfile_exons = None

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            open(options.filename_peptides, "r"))
        E.info("read peptide sequences for %i queries" %
               len(peptide_sequences))
    else:
        peptide_sequences = {}

    entry = PredictionParser.PredictionParserEntry()
    last_filename_genome = None

    nfound, nmissed_exons, nmissed_length = 0, 0, 0
    nempty_alignments = 0

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    options.stdout.write("%s\n" % "\t".join(
        ("prediction_id", "number", "dubious_exons", "boundaries_sum",
         "boundaries_max", "identical_exons", "inserted_exons",
         "deleted_exons", "inserted_introns", "deleted_introns",
         "truncated_Nterminus", "truncated_Cterminus", "deleted_Nexons",
         "deleted_Cexons", "inserted_Nexons", "inserted_Cexons")))

    for line in sys.stdin:

        if line[0] == "#": continue

        try:
            entry.Read(line)
        except ValueError, msg:
            print "# parsing failed with msg %s in line %s" % (msg, line[:-1])
            sys.exit(1)

        exons = Genomics.Alignment2ExonBoundaries(
            entry.mMapPeptide2Genome,
            query_from=entry.mQueryFrom,
            sbjct_from=entry.mSbjctGenomeFrom,
            add_stop_codon=0)

        if exons[-1][4] != entry.mSbjctGenomeTo:
            print "# WARNING: discrepancy in exon calculation!!!"
            for e in exons:
                print "#", str(e)
            print "#", str(entry)

        if options.loglevel >= 5:
            for e in exons:
                print "#", str(e)

        genomic_fragment = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        skip = False
        if peptide_sequences.has_key(entry.mQueryToken):

            query_sequence = alignlib_lite.makeSequence(
                peptide_sequences[entry.mQueryToken])
            sbjct_sequence = alignlib_lite.makeSequence(entry.mTranslation)

            percent_similarity, percent_identity = 0, 0
            if query_sequence.getLength(
            ) < entry.mMapPeptide2Translation.getRowTo():
                print "# WARNING: query sequence %s is too short: %i %i" % (
                    entry.mQueryToken, query_sequence.getLength(),
                    entry.mMapPeptide2Translation.getRowTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True

            elif sbjct_sequence.getLength(
            ) < entry.mMapPeptide2Translation.getColTo():
                print "# WARNING: sbjct sequence %s is too short: %i %i" % (
                    entry.mSbjctToken, sbjct_sequence.getLength(),
                    entry.mMapPeptide2Translation.getColTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True
            else:
                alignlib_lite.rescoreAlignment(
                    entry.mMapPeptide2Translation, query_sequence,
                    sbjct_sequence,
                    alignlib_lite.makeScorer(query_sequence, sbjct_sequence))
                percent_identity = alignlib_lite.calculatePercentIdentity(
                    entry.mMapPeptide2Translation, query_sequence,
                    sbjct_sequence) * 100
                percent_similarity = alignlib_lite.calculatePercentSimilarity(
                    entry.mMapPeptide2Translation) * 100

            E.debug(
                "prediction %s: percent identity/similarity: before=%5.2f/%5.2f, realigned=%5.2f/%5.2f"
                %
                (str(entry.mPredictionId), entry.mPercentSimilarity,
                 entry.mPercentIdentity, percent_similarity, percent_identity))

        else:
            query_sequence = None
            sbjct_sequence = None

        # default values
        exons_num_exons = "na"
        exons_boundaries_sum = "na"
        exons_boundaries_max = "na"
        dubious_exons = "na"

        ndeleted_exons, ninserted_exons, ndeleted_introns, ninserted_introns, nidentical_exons = 0, 0, 0, 0, 0
        truncated_Nterminal_exon, truncated_Cterminal_exon = 0, 0
        ndeleted_Nexons, ndeleted_Cexons = 0, 0
        ninserted_Nexons, ninserted_Cexons = 0, 0

        exons_offset = exons[0][3]

        if not reference_exon_boundaries.has_key(entry.mQueryToken):
            print "# WARNING: sequence %s has no exon boundaries" % (
                entry.mQueryToken)
            sys.stdout.flush()
            nmissed_exons += 1
            skip = True

        if not skip:

            nfound += 1

            ref_exons = reference_exon_boundaries[entry.mQueryToken]

            ref_exons_offset = ref_exons[0].mGenomeFrom

            exons_num_exons = len(ref_exons) - len(exons)
            exons_boundaries_sum = 0
            exons_phase = 0
            exons_boundaries_max = 0
            dubious_exons = 0

            inserted_exons = 0
            temp_inserted_exons = 0

            if options.loglevel >= 3:
                for e in exons:
                    options.stdlog.write("# %s\n" % str(e))
                for e in ref_exons:
                    options.stdlog.write("# %s\n" % str(e))

            min_pide = entry.mPercentIdentity * options.quality_threshold_pide / 100

            in_sync = 0
            e, r = 0, 0

            while e < len(exons) and r < len(ref_exons):

                this_e, this_r = e + 1, r + 1
                percent_identity = 0
                percent_similarity = 0
                is_good_exon = 0

                if options.loglevel >= 4:
                    options.stdlog.write("# current exons: %i and %i\n" %
                                         (e, r))
                    sys.stdout.flush()

                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exons[
                    e][0:6]
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (
                    ref_exons[r].mPeptideFrom, ref_exons[r].mPeptideTo,
                    ref_exons[r].frame, ref_exons[r].mGenomeFrom,
                    ref_exons[r].mGenomeTo)

                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset

                ## get percent identity for exon
                exon_percent_identity = 0
                exon_percent_similarity = 0

                if query_sequence and sbjct_sequence:

                    tmp_ali = alignlib_lite.makeAlignmentVector()

                    xquery_from = exon_from / 3
                    xquery_to = exon_to / 3

                    alignlib_lite.copyAlignment(tmp_ali,
                                                entry.mMapPeptide2Translation,
                                                xquery_from, xquery_to)

                    if tmp_ali.getLength() == 0:
                        options.stdlog.write(
                            "# WARNING: empty alignment %s\n" % str(
                                (ref_from, exon_from, ref_to, exon_to,
                                 xquery_from, xquery_to)))
                        nempty_alignments += 1
                    else:
                        if options.loglevel >= 5:
                            options.stdlog.write("# %s\n" % str(
                                alignlib_lite.AlignmentFormatExplicit(
                                    tmp_ali, query_sequence, sbjct_sequence)))

                        exon_percent_identity = alignlib_lite.calculatePercentIdentity(
                            tmp_ali, query_sequence, sbjct_sequence) * 100
                        exon_percent_similarity = alignlib_lite.calculatePercentSimilarity(
                            tmp_ali) * 100

                if exon_percent_identity >= min_pide:
                    is_good_exon = 1
                else:
                    is_good_exon = 0

                if e < len(exons) - 1:
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to,
                     next_exon_ali) = exons[e + 1][0:6]
                else:
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to,
                     next_exon_ali) = 0, 0, 0, 0, 0, []

                if r < len(ref_exons) - 1:
                    next_ref_from, next_ref_to, next_ref_phase = (
                        ref_exons[r + 1].mPeptideFrom,
                        ref_exons[r + 1].mPeptideTo, ref_exons[r + 1].frame)
                else:
                    next_ref_from, next_ref_to, next_ref_phase = 0, 0, 0

                if options.loglevel >= 2:
                    options.stdlog.write("# %s\n" % "\t".join(
                        map(str, (entry.mQueryToken, exon_from, exon_to,
                                  exon_phase, exon_genome_from, exon_genome_to,
                                  ref_from, ref_to, ref_phase))))
                    sys.stdout.flush()

                # beware of small exons.
                # if less than options.slipping_exon_boundary: boundary is 0
                # check if end is more than options.slipping_exon_boundary apart as well.
                if exon_to - exon_from <= options.slipping_exon_boundary or \
                        ref_to - ref_from <= options.slipping_exon_boundary:
                    boundary = 0
                else:
                    boundary = options.slipping_exon_boundary

                if ref_to <= exon_from + boundary and \
                   ref_to <= exon_to - options.slipping_exon_boundary:
                    ## no overlap
                    is_good_exon = 0
                    if e == 0:
                        ndeleted_Nexons += 1
                    else:
                        ndeleted_exons += 1
                    r += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = 0, 0, 0, 0, 0
                    overlap = 0
                elif exon_to <= ref_from + boundary and \
                         exon_to <= ref_to - options.slipping_exon_boundary:
                    ## no overlap
                    is_good_exon = 0
                    if r == 0:
                        ninserted_Nexons += 1
                    else:
                        ninserted_exons += 1
                    e += 1
                    ref_from, ref_to, ref_phase = 0, 0, 0
                    overlap = 0
                else:
                    ## overlap
                    overlap = 1
                    dfrom = int(math.fabs(exon_from - ref_from))
                    dto = int(math.fabs(exon_to - ref_to))

                    ## get percent identity for overlapping fragment
                    if query_sequence and sbjct_sequence:
                        ## this the problem
                        tmp_ali = alignlib_lite.makeAlignmentVector()

                        xquery_from = max(ref_from / 3, exon_from / 3)
                        xquery_to = min(ref_to / 3, exon_to / 3)

                        alignlib_lite.copyAlignment(
                            tmp_ali, entry.mMapPeptide2Translation,
                            xquery_from, xquery_to)

                        if tmp_ali.getLength() == 0:
                            options.stdlog.write(
                                "# warning: empty alignment %s\n" % str(
                                    (ref_from, exon_from, ref_to, exon_to,
                                     xquery_from, xquery_to)))
                            percent_identity = 0
                            percent_similarity = 0
                        else:
                            if options.loglevel >= 5:
                                print str(
                                    alignlib_lite.AlignmentFormatExplicit(
                                        tmp_ali, query_sequence,
                                        sbjct_sequence))

                            percent_identity = alignlib_lite.calculatePercentIdentity(
                                tmp_ali, query_sequence, sbjct_sequence) * 100
                            percent_similarity = alignlib_lite.calculatePercentSimilarity(
                                tmp_ali) * 100

                    if percent_identity >= min_pide:
                        is_good_exon = 1
                    else:
                        is_good_exon = 0
                        dubious_exons += 1

                    ## adjust regions for terminal exons
                    if e == 0 and r == 0 and dfrom <= (entry.mQueryFrom -
                                                       1) * 3 and dfrom > 0:
                        if is_good_exon:
                            truncated_Nterminal_exon = dfrom
                        dfrom = 0

                    ## truncated terminal exons
                    if e == len(exons) - 1 and r == len(
                            ref_exons) - 1 and dto <= (
                                entry.mQueryLength -
                                entry.mQueryTo) * 3 and dto > 0:
                        if is_good_exon:
                            truncated_Cterminal_exon = dto
                        dto = 0

                    ## do not count deviations for terminal query exons
                    if e == 0 and dfrom <= entry.mQueryFrom * 3 and dfrom > 0:
                        dfrom = 0

                    if e == len(exons) - 1 and dto <= (
                            entry.mQueryLength -
                            entry.mQueryTo) * 3 and dto > 0:
                        dto = 0

                    ## permit difference of one codon (assumed to be stop)
                    if e == len(exons) - 1 and r == len(
                            ref_exons) - 1 and dto == 3:
                        dto = 0

                    ## deal with different boundary conditions:
                    if dfrom == 0 and dto == 0:
                        if is_good_exon: nidentical_exons += 1
                        e += 1
                        r += 1
                    ## next exon within this ref_exon
                    elif exon_to < ref_to and next_exon_to and next_exon_to <= ref_to + options.slipping_exon_boundary:
                        if is_good_exon: ninserted_introns += 1
                        e += 1
                        in_sync = 1
                        dto = 0
                    ## next ref_exon within this exon
                    elif ref_to < exon_to and next_ref_to and next_ref_to <= exon_to + options.slipping_exon_boundary:
                        if is_good_exon: ndeleted_introns += 1
                        r += 1
                        in_sync = 1
                        dto = 0
                    else:
                        e += 1
                        r += 1
                        if in_sync:
                            dfrom = 0

                    if is_good_exon:
                        exons_boundaries_sum += dfrom + dto
                        exons_boundaries_max = max(dfrom, exons_boundaries_max)
                        exons_boundaries_max = max(dto, exons_boundaries_max)

                    ###########################################################
                    ## count inserted/deleted introns and misplaced boundaries
                    ##
                    ## if exon and next_exon in ref_exon: inserted intron
                    ## if ref_exon and next_ref_exon in exon: deleted intron

                if outfile_exons:

                    if genomic_fragment and exon_genome_to:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures(
                            exon_genome_from - entry.mSbjctGenomeFrom,
                            exon_ali,
                            genomic_fragment,
                            border_stop_codon=0)
                    else:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons = 0, 0, 0, 0, 0

                    if exon_to == 0: this_e = 0
                    if ref_to == 0: this_r = 0
                    outfile_exons.write(
                        string.join(
                            map(str, (
                                entry.mPredictionId,
                                this_e,
                                exon_from,
                                exon_to,
                                exon_phase,
                                this_r,
                                ref_from,
                                ref_to,
                                ref_phase,
                                percent_identity,
                                percent_similarity,
                                nframeshifts,
                                ngaps,
                                nstopcodons,
                                is_good_exon,
                                exon_genome_from,
                                exon_genome_to,
                            )), "\t") + "\n")

            while e < len(exons):
                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = exons[
                    e][0:5]
                e += 1
                ninserted_Cexons += 1

                if outfile_exons:
                    outfile_exons.write(
                        string.join(
                            map(str, (
                                entry.mPredictionId,
                                e,
                                exon_from,
                                exon_to,
                                exon_phase,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                1,
                                exon_genome_from,
                                exon_genome_to,
                            )), "\t") + "\n")

            while r < len(ref_exons):
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (
                    ref_exons[r].mPeptideFrom, ref_exons[r].mPeptideTo,
                    ref_exons[r].frame, ref_exons[r].mGenomeFrom,
                    ref_exons[r].mGenomeTo)
                ndeleted_Cexons += 1
                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset
                r += 1
                if outfile_exons:
                    outfile_exons.write(
                        string.join(
                            map(str, (
                                entry.mPredictionId,
                                0,
                                0,
                                0,
                                0,
                                r,
                                ref_from,
                                ref_to,
                                ref_phase,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                            )), "\t") + "\n")
        else:
            if options.write_notfound:
                this_e = 0
                ## use prediction's identity/similarity for exons.
                ## This will still then flag stop-codons in later analysis
                percent_identity = entry.mPercentIdentity
                percent_similarity = entry.mPercentSimilarity

                for exon in exons:
                    this_e += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exon[
                        0:6]
                    if genomic_fragment:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures(
                            exon_genome_from - entry.mSbjctGenomeFrom,
                            exon_ali, genomic_fragment)

                    outfile_exons.write(
                        string.join(
                            map(str, (
                                entry.mPredictionId,
                                this_e,
                                exon_from,
                                exon_to,
                                exon_phase,
                                0,
                                0,
                                0,
                                0,
                                percent_identity,
                                percent_similarity,
                                nframeshifts,
                                ngaps,
                                nstopcodons,
                                1,
                                exon_genome_from,
                                exon_genome_to,
                            )), "\t") + "\n")

        options.stdout.write("\t".join(
            map(str, (entry.mPredictionId, exons_num_exons, dubious_exons,
                      exons_boundaries_sum, exons_boundaries_max,
                      nidentical_exons, ninserted_exons, ndeleted_exons,
                      ninserted_introns, ndeleted_introns,
                      truncated_Nterminal_exon, truncated_Cterminal_exon,
                      ndeleted_Nexons, ndeleted_Cexons, ninserted_Nexons,
                      ninserted_Cexons))) + "\n")
Esempio n. 17
0
    print E.GetParams()
    sys.stdout.flush()

    if param_loglevel >= 1:
        print "# reading exon boundaries."
        sys.stdout.flush()

    cds = Exons.ReadExonBoundaries(open(param_filename_cds, "r"))

    if param_loglevel >= 1:
        print "# read %i cds" % (len(cds))
        sys.stdout.flush()

    ninput, npairs, nskipped = 0, 0, 0

    map_row2col = alignlib_lite.makeAlignmentVector()
    tmp_map_row2col = alignlib_lite.makeAlignmentVector()

    for line in sys.stdin:
        if line[0] == "#":
            continue
        ninput += 1
        link = BlastAlignments.Link()

        link.Read(line)

        if link.mQueryToken == link.mSbjctToken:
            continue

        if link.mQueryToken in cds and \
                link.mSbjctToken in cds:
Esempio n. 18
0
	nstopcodons, 
	pidentity, 
	psimilarity, 
	sequence, 
	sbjct_genome_from, 
	sbjct_genome_to, 
	map_query2genome
    FROM %s AS p 
    WHERE p.sbjct_token = '%s' AND
    p.sbjct_strand = '%s' AND 
    OVERLAP( %i, %i, p.sbjct_genome_from, sbjct_genome_to) > 0 
    """

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep)
    map_reference2target = alignlib_lite.makeAlignmentVector()
    assignment_id = 0

    for line in cr.fetchall():

        reference = PredictionParser.PredictionParserEntry()
        reference.FillFromTable(line)

        ct = dbhandle.cursor()
        ct.execute(statement % (param_tablename_predictions_target,
                                reference.mSbjctToken, reference.mSbjctStrand,
                                reference.mSbjctGenomeFrom, reference.mSbjctGenomeTo))

        reference_exons = Exons.Alignment2Exons(reference.mMapPeptide2Genome,
                                                0,
                                                reference.mSbjctFrom)
Esempio n. 19
0
def IsParalogLink(link, cds1, cds2):
    """Decide whether an alignment link should be treated as paralogous.

    The alignment described by ``link`` is rebuilt and used to map the
    exon starts of ``cds1`` (query, alignment rows) onto the sbjct
    sequence (alignment columns).  The mapped starts are then compared
    with the exon starts of ``cds2``.  The first exon of each
    transcript is always skipped.

    Two boundaries match if they differ by less than the module level
    ``param_boundaries_max_slippage``.

    * If at least one transcript has a single exon, boundaries are not
      used for the decision; instead the smaller of the two alignment
      coverages must reach ``param_min_coverage``.
    * Otherwise the number of matching boundaries decides: with one
      boundary to compare none may be missed, with two at most one may
      be missed, with more than two at least two have to be found.

    Arguments
    ---------
    link : object
        Provides ``mQueryFrom``, ``mQueryAli``, ``mSbjctFrom``,
        ``mSbjctAli``, ``mQueryLength`` and ``mSbjctLength``.
    cds1, cds2 : sequence
        Exons of query and sbjct; each entry has an ``mPeptideFrom``
        attribute (presumably a nucleotide offset within the coding
        sequence, as it is divided by 3 -- confirm against the caller).

    Returns
    -------
    tuple
        ``(True, reason)`` if the link is rejected, where reason is
        ``"different exon boundaries"`` or ``"low coverage"``;
        ``(False, None)`` otherwise.

    Raises
    ------
    ValueError
        If the aligned region is longer than the sequence length
        recorded in ``link``.
    """

    map_a2b = alignlib_lite.makeAlignmentVector()
    alignlib_lite.AlignmentFormatEmissions(
        link.mQueryFrom, link.mQueryAli,
        link.mSbjctFrom, link.mSbjctAli).copy(map_a2b)

    row_span = map_a2b.getRowTo() - map_a2b.getRowFrom() + 1
    col_span = map_a2b.getColTo() - map_a2b.getColFrom() + 1

    if link.mQueryLength < row_span or link.mSbjctLength < col_span:
        print("ERRONEOUS LINK: %s" % str(link))
        # string exceptions are rejected by python >= 2.6, raise a
        # proper exception instead.
        raise ValueError("length discrepancy")

    coverage_a = 100.0 * row_span / link.mQueryLength
    coverage_b = 100.0 * col_span / link.mSbjctLength

    def map_row(alignment, residue):
        """map residue to its column, sliding right over gaps.

        Returns 0 if the residue lies before the alignment or if no
        aligned residue is found up to the end of the alignment.
        """
        if residue < alignment.getRowFrom():
            return 0
        while residue <= alignment.getRowTo():
            col = alignment.mapRowToCol(residue)
            if col:
                return col
            residue += 1
        return 0

    # check exon boundaries, look at starts, skip first exon.
    # "//" keeps the integer division explicit.
    mapped_boundaries = UniquifyList(
        [map_row(map_a2b, exon.mPeptideFrom // 3 + 1) for exon in cds1[1:]])
    reference_boundaries = UniquifyList(
        [exon.mPeptideFrom // 3 + 1 for exon in cds2[1:]])

    nmin = min(len(mapped_boundaries), len(reference_boundaries))

    # look up each boundary of the shorter list in the longer one
    if len(mapped_boundaries) < len(reference_boundaries):
        mless, mmore = mapped_boundaries, reference_boundaries
    else:
        mless, mmore = reference_boundaries, mapped_boundaries

    nfound = sum(
        1 for x in mless
        if any(abs(x - c) < param_boundaries_max_slippage for c in mmore))
    nmissed = len(mless) - nfound

    # set is_ok dependent on exon boundaries.
    # in single exon cases, require a check of coverage
    is_ok = False
    check_coverage = False
    if len(cds1) == 1 or len(cds2) == 1:
        is_ok = True
        check_coverage = True
    elif nmin == 1:
        is_ok = nmissed == 0
    elif nmin == 2:
        is_ok = nmissed <= 1
    elif nmin > 2:
        is_ok = nfound >= 2

    cc = min(coverage_a, coverage_b)

    if param_loglevel >= 3:
        print(" ".join(map(str, (
            "# nquery=", len(cds1),
            "nsbjct=", len(cds2),
            "nmin=", nmin,
            "nmissed=", nmissed,
            "nfound=", nfound,
            "is_ok=", is_ok,
            "check_cov=", check_coverage,
            "min_cov=", cc, coverage_a, coverage_b,
            "mapped=", mapped_boundaries,
            "reference=", reference_boundaries))))

    if not is_ok:
        return True, "different exon boundaries"

    if check_coverage and cc < param_min_coverage:
        return True, "low coverage"

    return False, None
Esempio n. 20
0
def PrintCluster(cluster,
                 cluster_id,
                 lengths,
                 peptide_sequences=None,
                 regex_preferred=None):
    """print a cluster.

    The longest member of the cluster is taken as representative. If
    ``regex_preferred`` is given and a member matching it has a length
    greater than 0, the longest matching member is taken instead.
    Members missing from ``lengths`` count as length 0; on ties the
    first member wins.

    One tab-separated line is written to stdout per member:
    representative, member, member length and, if ``peptide_sequences``
    is given, the alignment of representative to member. The
    representative is aligned to itself as a diagonal; other members
    are aligned with a local alignment if both sequences are known,
    otherwise the alignment is left empty.

    Arguments
    ---------
    cluster : iterable of member identifiers (iterated twice).
    cluster_id : returned unchanged.
    lengths : mapping of identifier to sequence length.
    peptide_sequences : optional mapping of identifier to sequence.
    regex_preferred : optional regular expression for preferred
        identifiers.

    Returns
    -------
    cluster_id
    """

    rx = re.compile(regex_preferred) if regex_preferred else None

    max_al = 0
    max_pl = 0
    rep_a = None
    rep_p = None
    for c in cluster:
        l = lengths[c] if c in lengths else 0

        if l > max_al:
            max_al = l
            rep_a = c

        if rx and rx.search(c) and l > max_pl:
            max_pl = l
            rep_p = c

    rep = rep_p if max_pl > 0 else rep_a

    # aligner and representative sequence do not change between
    # members: build them once, on first use, instead of per member.
    alignator = None
    rep_seq = None

    for mem in cluster:
        l = lengths[mem] if mem in lengths else 0
        fields = [rep, mem, l]

        if peptide_sequences:
            map_rep2mem = alignlib_lite.makeAlignmentVector()

            if rep == mem and rep in lengths:
                alignlib_lite.addDiagonal2Alignment(map_rep2mem, 1,
                                                    lengths[rep], 0)
            elif mem in peptide_sequences and \
                    rep in peptide_sequences:
                if alignator is None:
                    alignator = alignlib_lite.makeAlignatorDPFull(
                        alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0)
                    rep_seq = alignlib_lite.makeSequence(
                        peptide_sequences[rep])
                alignator.align(
                    map_rep2mem,
                    rep_seq,
                    alignlib_lite.makeSequence(peptide_sequences[mem]))

            fields.append(alignlib_lite.AlignmentFormatEmissions(map_rep2mem))

        print("\t".join(map(str, fields)))

    sys.stdout.flush()

    return cluster_id
Esempio n. 21
0
def PrintCluster(cluster,
                 cluster_id,
                 lengths,
                 peptide_sequences=None,
                 regex_preferred=None):
    """print a cluster.

    The longest member of the cluster is taken as representative. If
    ``regex_preferred`` is given and a member matching it has a length
    greater than 0, the longest matching member is taken instead.
    Members missing from ``lengths`` count as length 0; on ties the
    first member wins.

    One tab-separated line is written to stdout per member:
    representative, member, member length and, if ``peptide_sequences``
    is given, the alignment of representative to member. The
    representative is aligned to itself as a diagonal; other members
    are aligned with a local alignment if both sequences are known,
    otherwise the alignment is left empty.

    Arguments
    ---------
    cluster : iterable of member identifiers (iterated twice).
    cluster_id : returned unchanged.
    lengths : mapping of identifier to sequence length.
    peptide_sequences : optional mapping of identifier to sequence.
    regex_preferred : optional regular expression for preferred
        identifiers.

    Returns
    -------
    cluster_id
    """

    rx = re.compile(regex_preferred) if regex_preferred else None

    max_al = 0
    max_pl = 0
    rep_a = None
    rep_p = None
    for c in cluster:
        l = lengths[c] if c in lengths else 0

        if l > max_al:
            max_al = l
            rep_a = c

        if rx and rx.search(c) and l > max_pl:
            max_pl = l
            rep_p = c

    rep = rep_p if max_pl > 0 else rep_a

    # aligner and representative sequence do not change between
    # members: build them once, on first use, instead of per member.
    alignator = None
    rep_seq = None

    for mem in cluster:
        l = lengths[mem] if mem in lengths else 0
        fields = [rep, mem, l]

        if peptide_sequences:
            map_rep2mem = alignlib_lite.makeAlignmentVector()

            if rep == mem and rep in lengths:
                alignlib_lite.addDiagonal2Alignment(
                    map_rep2mem, 1, lengths[rep], 0)
            elif mem in peptide_sequences and \
                    rep in peptide_sequences:
                if alignator is None:
                    alignator = alignlib_lite.makeAlignatorDPFull(
                        alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0)
                    rep_seq = alignlib_lite.makeSequence(
                        peptide_sequences[rep])
                alignator.align(map_rep2mem,
                                rep_seq,
                                alignlib_lite.makeSequence(
                                    peptide_sequences[mem]))

            fields.append(alignlib_lite.AlignmentFormatEmissions(map_rep2mem))

        print("\t".join(map(str, fields)))

    sys.stdout.flush()

    return cluster_id
Esempio n. 22
0
        if param_loglevel >= 1:
            print "# reading has finished."
            sys.stdout.flush()

        alignator = alignlib_lite.makeAlignatorDPFull(
            alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0)

        for q1, q2 in pairs:

            ninput += 1

            if param_loglevel >= 1:
                print "# processing %s and %s" % (q1, q2)

            if q1 in transcripts1 and q2 in transcripts2:
                map_a2b = alignlib_lite.makeAlignmentVector()

                alignator.align(map_a2b,
                                alignlib_lite.makeSequence(peptides1[q1]),
                                alignlib_lite.makeSequence(peptides2[q2]))

                if map_a2b.getLength() == 0:
                    if param_loglevel >= 1:
                        print "# Alignment failed between %s and %s" % (q1, q2)
                        sys.stdout.flush()

                    ntotal_errors += 1
                    continue

                nerrors, nskipped = WriteExons(q1, peptides1[q1], cds1[q1],
                                               transcripts1[q1], q2,
Esempio n. 23
0
def EliminateRedundantEntries(
    rep, data, eliminated_predictions, options, peptides, extended_peptides, filter_quality=None, this_quality=None
):
    """Eliminate entries in ``data`` that are redundant with ``rep``.

    Every entry not yet present in ``eliminated_predictions`` is
    compared with the representative:

    * ``"i"``: its extended sequence is identical to the representative's.
    * ``"p"``: its extended sequence is contained in the representative's.
    * ``"h"``: the peptide alignment reaches ``options.min_identity``
      and none of the keep rules (coverage/identity margin, number of
      gaps, member coverage) applies.
    * ``"l"``: the peptide alignment reaches
      ``options.min_identity_non_genes``, ``this_quality`` is in
      ``options.quality_genes``, the member's quality is not, and the
      representative is not worse than the member beyond the safety
      margins.

    Peptides are only aligned if the member's quality differs from
    ``this_quality`` or is listed in ``options.quality_exclude_same``.
    Members that are kept although similar are reported as warnings
    on ``options.stdlog``.

    ``eliminated_predictions`` is updated in place, mapping each
    eliminated id to the id of the representative.  ``filter_quality``
    is accepted but not used here.

    Returns a list of ``(transcript_id, code)`` tuples for the entries
    eliminated in this call.
    """

    removed = []

    rep_id = rep.transcript_id
    rep_coverage = rep.mQueryCoverage
    rep_pid = rep.mPid

    def discard(member_id, code):
        # record the elimination both globally and for this call
        eliminated_predictions[member_id] = rep_id
        removed.append((member_id, code))

    def rep_is_worse(member_coverage, member_pid):
        # representative trails the member by more than the safety margins
        return (
            rep_coverage < member_coverage - options.safety_coverage
            or rep_pid < member_pid - options.safety_pide
        )

    alignator = alignlib_lite.makeAlignatorDPFull(alignlib_lite.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    rep_seq = peptides[rep_id]
    rep_extended_seq = extended_peptides[rep_id]

    for entry in data:

        mem_id = entry.transcript_id
        mem_coverage = entry.mQueryCoverage
        mem_pid = entry.mPid
        mem_quality = entry.mQuality

        mem_seq = peptides[mem_id]
        mem_extended_seq = extended_peptides[mem_id]

        if options.loglevel >= 4:
            options.stdlog.write("# processing: id=%s class=%s\n" % (mem_id, mem_quality))

        if mem_id in eliminated_predictions:
            continue

        # cheap checks on the extended sequences first
        if mem_extended_seq == rep_extended_seq:
            discard(mem_id, "i")
            continue

        if mem_extended_seq in rep_extended_seq:
            discard(mem_id, "p")
            continue

        needs_alignment = mem_quality != this_quality or mem_quality in options.quality_exclude_same
        if not needs_alignment:
            continue

        seq1 = alignlib_lite.makeSequence(str(rep_seq))
        seq2 = alignlib_lite.makeSequence(str(mem_seq))

        alignator.align(result, seq1, seq2)

        if options.loglevel >= 5:
            options.stdlog.write("# ali\n%s\n" % alignlib_lite.AlignmentFormatExplicit(result, seq1, seq2))

        pidentity = 100 * alignlib_lite.calculatePercentIdentity(result, seq1, seq2)

        num_gaps = result.getNumGaps()

        if options.loglevel >= 4:
            options.stdlog.write(
                "# processing: id=%s class=%s pid=%5.2f rep_cov=%i mem_cov=%i\n"
                % (mem_id, mem_quality, pidentity, rep_coverage, mem_coverage)
            )

        if pidentity >= options.min_identity:

            # find the first rule, if any, that protects the member
            if rep_is_worse(mem_coverage, mem_pid):
                reason = "covpid"
            elif num_gaps >= options.max_gaps and mem_coverage > rep_coverage - options.safety_coverage:
                reason = "gaps"
            elif (
                mem_coverage >= rep_coverage - options.safety_coverage
                and 100 * (result.getColTo() - result.getColFrom()) / len(mem_seq) < options.max_member_coverage
            ):
                reason = "memcov"
            else:
                reason = None

            if reason is None:
                discard(mem_id, "h")
            else:
                options.stdlog.write(
                    "# WARNING: not removing possibly good prediction: %s: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n"
                    % (reason, rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid)
                )

        elif (
            pidentity >= options.min_identity_non_genes
            and this_quality in options.quality_genes
            and mem_quality not in options.quality_genes
        ):
            if rep_is_worse(mem_coverage, mem_pid):
                options.stdlog.write(
                    "# WARNING: not removing possibly good prediction: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n"
                    % (rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid)
                )
            else:
                discard(mem_id, "l")

    return removed
Esempio n. 24
0
def WriteExons(token1, peptide1, cds1, transcript1, token2, peptide2, cds2,
               transcript2, peptide_map_a2b):

    if param_loglevel >= 3:
        for cd in cds1:
            print "#", str(cd)
        for cd in cds2:
            print "#", str(cd)
        print "# peptide_map_a2b", str(
            alignlib_lite.AlignmentFormatExplicit(peptide_map_a2b))
        sys.stdout.flush()

    dna_map_a2b = Genomics.AlignmentProtein2CDNA(peptide_map_a2b, cds1, cds2)

    if len(cds1) != len(cds2):
        if param_loglevel >= 4:
            print ""  # WARNING: different number of exons!"

    seq1 = alignlib_lite.makeSequence(transcript1)
    seq2 = alignlib_lite.makeSequence(transcript2)
    tmp_map_a2b = alignlib_lite.makeAlignmentVector()

    dialign = WrapperDialign.Dialign("-n")
    dialignlgs = WrapperDialign.Dialign("-n -it -thr 2 -lmax 30 -smin 8")
    dba = WrapperDBA.DBA()
    #clustal = WrapperClustal.Clustal()

    matrix, gop, gep = global_substitution_matrix
    alignator_nw = alignlib_lite.makeAlignatorDPFullDP(
        alignlib_lite.ALIGNMENT_GLOBAL, gop, gep, matrix)
    alignator_sw = alignlib_lite.makeAlignatorDPFullDP(
        alignlib_lite.ALIGNMENT_LOCAL, gop, gep, matrix)

    # concatenated alignments for exons:
    # 1: only the common parts
    ali_common1 = ""
    ali_common2 = ""

    e1, e2 = 0, 0
    while cds1[e1].mGenomeTo <= dna_map_a2b.getRowFrom():
        e1 += 1
    while cds2[e2].mGenomeTo <= dna_map_a2b.getColFrom():
        e2 += 1

    nskipped, nerrors = 0, 0

    if param_loglevel >= 5:
        nmapped = 0
        for x in range(dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo() + 1):
            if dna_map_a2b.mapRowToCol(x) >= 0:
                nmapped += 1
        print "# nmapped=", nmapped
        print str(alignlib_lite.AlignmentFormatEmissions(dna_map_a2b))

    # declare alignments used
    map_intron_a2b = alignlib_lite.makeAlignmentVector()

    result = Exons.CompareGeneStructures(cds1,
                                         cds2,
                                         map_cmp2ref=peptide_map_a2b)

    if param_loglevel >= 2:
        print result.Pretty("#")

    nskipped_exons, nskipped_introns = 0, 0

    last_e1, last_e2 = None, None

    for link in result.mEquivalences:

        if link.mCoverage <= param_min_exon_coverage:
            nskipped_exons += 1
            continue

        e1, e2 = link.mId1, link.mId2

        c1 = cds1[e1]
        c2 = cds2[e2]
        exon_fragment1 = transcript1[c1.mGenomeFrom:c1.mGenomeTo]
        exon_fragment2 = transcript2[c2.mGenomeFrom:c2.mGenomeTo]

        #######################################################################
        # write unaligned exons
        if param_write_exons:
            pair = AlignedPairs.UnalignedPair()

            pair.mCategory = "exon"
            pair.mToken1 = token1
            pair.mId1 = e1 + 1
            pair.mNum1 = len(cds1)
            pair.mLen1 = len(exon_fragment1)
            pair.mSequence1 = exon_fragment1
            pair.mToken2 = token2
            pair.mId2 = e2 + 1
            pair.mNum2 = len(cds2)
            pair.mLen2 = len(exon_fragment2)
            pair.mSequence2 = exon_fragment2
            pair.mFrom1, pair.mTo1 = c1.mGenomeFrom, c1.mGenomeTo,
            pair.mFrom2, pair.mTo2 = c2.mGenomeFrom, c2.mGenomeTo,

            print str(pair)
            sys.stdout.flush()

        #######################################################################
        # build alignment for overlap of both exons
# tmp_map_a2b.clear()
# alignlib_lite.copyAlignment( tmp_map_a2b, dna_map_a2b,
# c1.mGenomeFrom + 1, c1.mGenomeTo )

# if param_loglevel >= 5:
# print "# alignment: %i-%i" % (c1.mGenomeFrom + 1, c1.mGenomeTo)
# for x in alignlib_lite.writeAlignmentTable( tmp_map_a2b ).split("\n"):
# print "#", x
# if tmp_map_a2b.getLength() == 0:
# if param_loglevel >= 1:
# print "# WARNING: empty alignment between exon %i (from %i to %i) and exon %i" % \
##                       (e1,c1.mGenomeFrom + 1, c1.mGenomeTo, e2)
# print "## peptide_map_a2b", peptide_map_a2b.getRowFrom(), peptide_map_a2b.getRowTo(),\
##                       peptide_map_a2b.getColFrom(), peptide_map_a2b.getColTo(), \
# Alignlib.writeAlignmentCompressed(peptide_map_a2b)
# print "## dna_map_a2b", dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo(),\
##                       dna_map_a2b.getColFrom(), dna_map_a2b.getColTo(), \
# Alignlib.writeAlignmentCompressed(dna_map_a2b)
# for cd in cds1: print "##", str(cd)
# for cd in cds2: print "##", str(cd)
##             nerrors += 1
# continue
##         data = map(lambda x: x.split("\t"), alignlib_lite.writePairAlignment( seq1, seq2, tmp_map_a2b  ).split("\n"))
# if "caligned" in param_write_exons :
# print "exon\tcaligned\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, e1,
##                                                                                token2, e2,
##                                                                                data[0][0], data[0][2],
##                                                                                data[1][0], data[1][2],
# data[0][1], data[1][1] )
##         ali_common1 += data[0][1]
##         ali_common2 += data[1][1]
#######################################################################
# write alignment of introns for orthologous introns
# orthologous introns are between orthologous exons
        if param_write_introns:

            if last_e1 is not None:
                if e1 - last_e1 != 1 or e2 - last_e2 != 1:
                    nskipped_introns += 1
                else:
                    pair = AlignedPairs.UnalignedPair()

                    intron_from1 = cds1[e1 - 1].mGenomeTo
                    intron_to1 = cds1[e1].mGenomeFrom
                    intron_from2 = cds2[e2 - 1].mGenomeTo
                    intron_to2 = cds2[e2].mGenomeFrom

                    intron_fragment1 = transcript1[intron_from1:intron_to1]
                    intron_fragment2 = transcript2[intron_from2:intron_to2]

                    if len(intron_fragment1) == 0 or len(
                            intron_fragment2) == 0:
                        print "## ERROR: empty intron fragments: %i-%i out of %i and %i-%i out of %i." %\
                              (intron_from1, intron_to1, len(transcript1),
                               intron_from2, intron_to2, len(transcript2))
                        continue

                    pair.mCategory = "intron"
                    pair.mToken1 = token1
                    pair.mId1 = e1 + 1
                    pair.mNum1 = len(cds1) - 1
                    pair.mLen1 = len(intron_fragment1)
                    pair.mFrom1 = intron_from1
                    pair.mTo1 = intron_to1
                    pair.mSequence1 = intron_fragment1
                    pair.mToken2 = token2
                    pair.mId2 = e2 + 1
                    pair.mNum1 = len(cds2) - 1
                    pair.mLen2 = len(intron_fragment2)
                    pair.mFrom2 = intron_from2
                    pair.mTo2 = intron_to2
                    pair.mSequence2 = intron_fragment2

                    if (param_min_intron_length and len(intron_fragment1) < param_min_intron_length) or \
                            (param_min_intron_length and len(intron_fragment2) < param_min_intron_length) or \
                            (param_max_intron_length and len(intron_fragment1) > param_max_intron_length) or \
                            (param_max_intron_length and len(intron_fragment2) > param_max_intron_length):
                        if param_loglevel >= 1:
                            print "# skipped: fragment lengths out of bounds for: %s\t%s\t%s\t%s\t%i\t%i" %\
                                  (token1, e1, token2, e2,
                                   len(intron_fragment1),
                                   len(intron_fragment2))
                            sys.stdout.flush()
                            nskipped += 1

                    print str(pair)

# else:
##                         anchored_from1 = intron_from1 - param_extend_introns
##                         anchored_to1 = intron_to1 + param_extend_introns
##                         anchored_from2 = intron_from2 - param_extend_introns
##                         anchored_to2 = intron_to2 + param_extend_introns

##                         anchored_fragment1 = transcript1[anchored_from1:anchored_to1]
##                         anchored_fragment2 = transcript2[anchored_from2:anchored_to2]

# for method in param_write_introns:

# if param_loglevel >= 2:
# print "## aligning with method %s" % method
# sys.stdout.flush

# map_intron_a2b.clear()

# if method == "unaligned":

##                                 from1, to1, ali1, from2, to2, ali2 = 0, 0, intron_fragment1, 0, 0, intron_fragment2

# elif method in ("dialigned", "dbaligned", "clusaligned", "dialignedlgs"):

##                                 tmp_intron_a2b = alignlib_lite.makeAlignmentVector()

# if param_loglevel >= 1:
# print "# aligning with method %s two fragments of length %i and %i" % (method,
# len(anchored_fragment1),
# len(anchored_fragment2))
# sys.stdout.flush()

# if method == "dialigned":
##                                     result = dialign.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b )
# elif method == "dialignedlgs":
##                                     result = dialignlgs.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b )
# elif method == "dbaligned":
##                                     result = dba.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b )
# elif method == "clusaligned":
##                                     result = clustal.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b )
# if not result or result.getLength() == 0:
# if param_loglevel >= 1:
# print "# Error: empty intron alignment"
# sys.stdout.flush()
##                                     nerrors += 1
# continue
##                                 tmp_intron_a2b.moveAlignment( anchored_from1, anchored_from2 )
# alignlib_lite.copyAlignment( map_intron_a2b, tmp_intron_a2b,
##                                                        intron_from1 + 1, intron_to1,
# intron_from2 + 1, intron_to2 )
# elif method == "nwaligned":
##                                 seq1.useSegment( cds1[e1-1].mGenomeTo + 1, cds1[e1].mGenomeFrom )
##                                 seq2.useSegment( cds2[e2-1].mGenomeTo + 1, cds2[e2].mGenomeFrom )
##                                 alignator_nw.Align( seq1, seq2, map_intron_a2b )
# seq1.useFullLength()
# seq2.useFullLength()
# elif method == "swaligned":
##                                 seq1.useSegment( cds1[e1-1].mGenomeTo + 1, cds1[e1].mGenomeFrom )
##                                 seq2.useSegment( cds2[e2-1].mGenomeTo + 1, cds2[e2].mGenomeFrom )
##                                 alignlib_lite.performIterativeAlignment( map_intron_a2b, seq1, seq2, alignator_sw, param_min_score_sw )
# seq1.useFullLength()
# seq2.useFullLength()
# else:
##                                 raise "unknown method %s" % method
# if map_intron_a2b.getLength() > 0:
# if param_compress:
##                                     from1, to1 = map_intron_a2b.getRowFrom(), map_intron_a2b.getRowTo()
##                                     from2, to2 = map_intron_a2b.getColFrom(), map_intron_a2b.getColTo()
##                                     ali1, ali2 = Alignlib.writeAlignmentCompressed( map_intron_a2b )
# else:
# data = map(lambda x: x.split("\t"),
# alignlib_lite.writePairAlignment( seq1, seq2, map_intron_a2b  ).split("\n"))
# if len(data) < 2:
##                                         data=[ ( 0, "", 0), (0, "", 0)]
##                                     from1, ali1, to1 = data[0]
##                                     from2, ali2, to2 = data[1]
# print string.join(map(str, ("intron",
# method,
##                                                         token1, e1, len(cds1) - 1, len(intron_fragment1),
##                                                         token2, e2, len(cds2) - 1, len(intron_fragment2),
# map_intron_a2b.getNumGaps(),
# map_intron_a2b.getLength(),
##                                                         map_intron_a2b.getLength() - map_intron_a2b.getNumGaps(),
##                                                         from1, to1, ali1,
##                                                         from2, to2, ali2,
##                                                         intron_from1, intron_to1,
# intron_from2, intron_to2)), "\t")
# sys.stdout.flush()
        last_e1, last_e2 = e1, e2

    ##########################################################################
    # write concatenated exons
# for method in param_write_exons:
# if method == "common":
# print "exon\tcommon\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, 0,
##                                                                            token2, 0,
##                                                                            0, 0,
##                                                                            0, 0,
# ali_common1, ali_common2 )
# elif method == "exons":
# Write full alignment without gaps.
# This will not care about exon boundaries and gaps.
# data = map(lambda x: x.split("\t"),
# alignlib_lite.writePairAlignment( seq1, seq2, dna_map_a2b  ).split("\n"))

# try:
##                 from1, s1, to1, from2, s2, to2 = data[0] + data[1]
# except ValueError:
##                 from1, to1, from2, to2 = 0, 0, 0, 0
##                 s1, s2 = "", ""
##                 nerrors += 1
# except IndexError:
##                 from1, to1, from2, to2 = 0, 0, 0, 0
##                 s1, s2 = "", ""
##                 nerrors += 1

# if from1:
# if len(s1) != len(s2):
# print "# WARNING: alignment of different lengths: %i and %i" % (len(s1), len(s2))
##                     nerrors += 1
##                     from1, to1, from2, to2 = 0, 0, 0, 0
##                     s1, s2 = "", ""
# else:
##                     a1, a2 = [], []
# for x in range( min(len(s1), len(s2)) ):
# if s1[x] != "-" and s2[x] != "-":
##                             a1.append( s1[x] )
##                             a2.append( s2[x] )
##                     s1 = string.join(a1, "")
##                     s2 = string.join(a2, "")

# print "exon\texons\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( (token1, 0,
##                                                                              token2, 0,
##                                                                              from1, to1,
##                                                                              from2, to2,
# s1, s2 ) )
# elif method == "full":
# write full alignment (do not care about exon boundaries)
# data = map(lambda x: x.split("\t"),
# alignlib_lite.writePairAlignment( seq1, seq2, dna_map_a2b  ).split("\n"))
##             if len(data) < 2: data=[ ( 0, "", 0), (0, "", 0)]
# print "exon\tfull\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, 0,
##                                                                            token2, 0,
##                                                                            data[0][0], data[0][2],
##                                                                            data[1][0], data[1][2],
# data[0][1], data[1][1] )

    if param_loglevel >= 3:
        print "# skipped_exons=%i, skipped_introns=%i" % (nskipped_exons,
                                                          nskipped_introns)

    return nerrors, nskipped
Esempio n. 25
0
def ProcessRegion(predictions,
                  region_id,
                  region,
                  peptide_sequences=None,
                  filter_queries=None):
    """process a set of matches to a region.

    resolve region according to homology.

    predictions is sorted in place by descending score. Without
    peptide_sequences nothing else is done. Otherwise each prediction
    that has not been assigned to a better scoring prediction starts a
    new cluster (incrementing region_id) and collects all later
    predictions that overlap with it on the genomic sequence and whose
    query peptides align to its query peptide within the thresholds
    options.overlap_min_score, options.overlap_max_coverage,
    options.overlap_min_coverage and options.overlap_min_identity. The
    outcome of each peptide comparison is cached in global_alignments
    under the key "<token>-<token>" (smaller token first). Each cluster
    is written with PrintEdges().

    peptide_sequences maps query tokens to peptide sequences.
    Predictions whose query token is in filter_queries are left out of
    the output and counted as skipped.

    returns a tuple (region_id, noutput, nskipped) with the last region
    id used, the sum of the values returned by PrintEdges() and the
    number of predictions removed via filter_queries.
    """

    # no shared mutable default argument: create the empty filter per call
    if filter_queries is None:
        filter_queries = {}

    if options.loglevel >= 3:
        options.stdlog.write(
            "###################################################################\n"
        )
        options.stdlog.write("# resolving %i predictions in region %s\n" %
                             (len(predictions), str(region)))
        # flush the stream that has been written to
        options.stdlog.flush()

    # best scoring predictions first
    predictions.sort(lambda x, y: cmp(x.score, y.score))
    predictions.reverse()

    noutput, nskipped = 0, 0

    if not peptide_sequences:
        return region_id, noutput, nskipped

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    npredictions = len(predictions)

    # map_sequence2cluster[i] == i: prediction i is not part of a cluster
    # of a better scoring prediction (yet).
    map_sequence2cluster = list(range(npredictions))

    for x in range(npredictions):
        master = predictions[x]

        if options.loglevel >= 5:
            options.stdlog.write(
                "# filtering from %i with prediction %i: %s\n" %
                (x, master.mPredictionId, master.mQueryToken))
            options.stdlog.flush()

        if map_sequence2cluster[x] != x:
            continue

        region_id += 1
        edges = []

        if master.mQueryToken not in filter_queries:
            edges.append(master)
        else:
            nskipped += 1

        for y in range(x + 1, npredictions):

            if map_sequence2cluster[y] != y:
                continue

            other = predictions[y]

            # check if predictions are overlapping on the genomic sequence
            overlap = min(master.mSbjctGenomeTo, other.mSbjctGenomeTo) - \
                max(master.mSbjctGenomeFrom, other.mSbjctGenomeFrom)
            if overlap < 0:
                if options.loglevel >= 4:
                    options.stdlog.write(
                        "# alignment of predictions %i and %i: no overlap on genomic sequence, thus skipped\n"
                        % (master.mPredictionId, other.mPredictionId))
                    options.stdlog.flush()
                continue

            # the key is independent of the order of the two queries
            if master.mQueryToken < other.mQueryToken:
                key = "%s-%s" % (master.mQueryToken, other.mQueryToken)
            else:
                key = "%s-%s" % (other.mQueryToken, master.mQueryToken)

            if key not in global_alignments:

                seq1 = peptide_sequences[master.mQueryToken]
                seq2 = peptide_sequences[other.mQueryToken]
                result.clear()
                s1 = alignlib_lite.makeSequence(seq1)
                s2 = alignlib_lite.makeSequence(seq2)
                alignator.align(result, s1, s2)

                # percentage of each peptide covered by the alignment
                c1 = 100 * \
                    (result.getRowTo() - result.getRowFrom()) / len(seq1)
                c2 = 100 * \
                    (result.getColTo() - result.getColFrom()) / len(seq2)
                min_cov = min(c1, c2)
                max_cov = max(c1, c2)

                identity = alignlib_lite.calculatePercentIdentity(
                    result, s1, s2) * 100

                # check if predictions overlap and they are homologous
                if result.getScore() >= options.overlap_min_score and \
                   max_cov >= options.overlap_max_coverage and \
                   min_cov >= options.overlap_min_coverage and \
                   identity >= options.overlap_min_identity:
                    global_alignments[key] = True
                else:
                    global_alignments[key] = False

                if options.loglevel >= 4:
                    options.stdlog.write(
                        "# alignment=%s score=%i pid=%5.2f c1=%i c2=%i min_cov=%i max_cov=%i homolog=%s\n"
                        % (key, result.getScore(), identity, c1, c2,
                           min_cov, max_cov, global_alignments[key]))
                    options.stdlog.flush()

            if global_alignments[key]:
                map_sequence2cluster[y] = x
                if other.mQueryToken not in filter_queries:
                    edges.append(other)
                else:
                    nskipped += 1

        noutput += PrintEdges(region_id, region, edges)

    return region_id, noutput, nskipped
Esempio n. 26
0
def GetOrthologTranscripts(transcripts1, peptides1, cds1, transcripts2,
                           peptides2, cds2):
    """sort out ortholog relationships between
    transcripts of orthologous genes.

    Orthologs have:
        the same number of exons        
        compatible intron/exon boundaries

    For the remaining transcript pairs, take reciprocal bet hits.

    I see the following:
    0: 0(100%), 1: 0(94%), 2: 0,1(100%)
    0: 0(100%), 1: 0,1,2(100%)

    Selecting 1-0 first, would result in a suboptimal match, because one transcript
    is longer than the other, while matching up 0-0 and 2-1 would be better.

    Objective function: it is the maximal matching/assignment problem. Use greedy
    implementation instead. Assign as much as possible according to descending weights.
    """

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0)

    # for long sequence: use dot alignment with tuple size of three
    dottor = alignlib_lite.makeAlignatorTuples(3)
    alignator_dots = alignlib_lite.makeAlignatorDotsSquared(
        param_gop, param_gep, dottor)

    seqs1 = map(lambda x: alignlib_lite.makeSequence(peptides1[x[0]]),
                transcripts1)
    seqs2 = map(lambda x: alignlib_lite.makeSequence(peptides2[x[0]]),
                transcripts2)

    if param_loglevel >= 4:
        print "# building sequence 1"
    for i in range(len(seqs1)):
        if not cds1.has_key(transcripts1[i][0]):
            if param_loglevel >= 4:
                print "# %s not found" % transcripts1[i][0]

    if param_loglevel >= 4:
        print "# building sequence 2"

    for i in range(len(seqs2)):
        if not cds2.has_key(transcripts2[i][0]):
            if param_loglevel >= 4:
                print "# %s not found" % transcripts1[i][0]

    if param_loglevel >= 4:
        print "# all-vs-all alignment"

    # do all versus all alignment
    alis1 = []
    alis2 = []
    for i in range(len(seqs1)):
        alis1.append([])
    for i in range(len(seqs2)):
        alis2.append([])

    if param_loglevel >= 3:

        print "#################################"

        for i in range(len(seqs1)):
            for cd in cds1[transcripts1[i][0]]:
                print "#", str(cd)
        print "# versus"
        for i in range(len(seqs2)):
            for cd in cds2[transcripts2[i][0]]:
                print "#", str(cd)
        sys.stdout.flush()

    weights = {}
    for i in range(len(seqs1)):
        prediction_id1, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1 = transcripts1[
            i]

        for j in range(len(seqs2)):
            prediction_id2, sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2 = transcripts2[
                j]
            map_a2b = alignlib_lite.makeAlignmentVector()

            m = seqs1[i].getLength() * seqs2[j].getLength()

            if param_loglevel >= 3:
                print "# Starting alignment of pair (%i,%i) of lengths %s:%i and %s:%i" %\
                      (i, j, prediction_id1, seqs1[
                       i].getLength(), prediction_id2, seqs2[j].getLength())
                sys.stdout.flush()

            if m > param_max_matrix_size:
                # switch to tuple alignment if sequences are too large
                if param_loglevel >= 2:
                    print "# WARNING: sequences are of length %i and %i: switching to dot alignment." % (
                        seqs1[i].getLength(), seqs2[j].getLength())
                    sys.stdout.flush()

                alignator_dots.align(map_a2b, seqs1[i], seqs2[j])
            else:
                alignator.align(map_a2b, seqs1[i], seqs2[j])

            coverage_a = 100.0 * \
                (map_a2b.getRowTo() - map_a2b.getRowFrom() + 1) / \
                seqs1[i].getLength()
            coverage_b = 100.0 * \
                (map_a2b.getColTo() - map_a2b.getColFrom() + 1) / \
                seqs2[j].getLength()

            # get copy of cds, but only those overlapping with alignment
            c1 = Exons.GetExonsRange(
                cds1[prediction_id1], (map_a2b.getRowFrom() - 1) * 3,
                (map_a2b.getRowTo()) * 3 + 1,
                full=False,
                min_overlap=param_min_alignment_exon_overlap,
                min_exon_size=param_min_exon_size)
            c2 = Exons.GetExonsRange(
                cds2[prediction_id2], (map_a2b.getColFrom() - 1) * 3,
                (map_a2b.getColTo()) * 3 + 1,
                full=False,
                min_overlap=param_min_alignment_exon_overlap,
                min_exon_size=param_min_exon_size)

            # check exon boundaries, look at starts, skip first exon
            def MyMap(a, x):
                while x <= a.getRowTo():
                    c = a.mapRowToCol(x)
                    if c:
                        return c
                    x += 1
                else:
                    return 0

            mapped_boundaries = map(
                lambda x: MyMap(map_a2b, x.mPeptideFrom / 3 + 1), c1[1:])
            mapped_boundaries.sort()
            reference_boundaries = map(lambda x: x.mPeptideFrom / 3 + 1,
                                       c2[1:])
            reference_boundaries.sort()

            nmissed_cmp2ref = Exons.CountMissedBoundaries(
                mapped_boundaries, reference_boundaries,
                param_boundaries_max_slippage)
            nmissed_ref2cmp = Exons.CountMissedBoundaries(
                reference_boundaries, mapped_boundaries,
                param_boundaries_max_slippage)

            min_nmissed = min(nmissed_cmp2ref, nmissed_ref2cmp)

            # set is_ok for the whole thing
            # no intron: is ok
            is_ok = 0
            if (len(c1) == 1 and len(c2) == 1):
                is_ok = 1
            else:
                # allow for missed boundaries, if param_boundaries_allow_missed
                # > 0
                if min_nmissed == 0:
                    is_ok = 1
                else:
                    if param_boundaries_allow_missed and \
                            len(mapped_boundaries) >= param_boundaries_allow_missed and \
                            min_nmissed <= param_boundaries_max_missed:
                        is_ok = 1

            cc = min(coverage_a, coverage_b)
            if cc >= param_min_coverage:
                is_ok_coverage = 1
            else:
                is_ok_coverage = 0

            # check for missing introns
            is_ok_exons = 1
            if abs(len(c1) - len(c2)) != 0:
                if param_missing_max_missing:
                    if ((abs(len(c1) - len(c2)) > param_missing_max_missing) or
                        (min(len(c1), len(c2)) < param_missing_min_present)):
                        is_ok_exons = 0
                else:
                    is_ok_exons = 0

            if param_loglevel >= 3:
                print "# i=", i, "li=", len(c1), "j=", j, "lj=", len(c2), \
                      "boundaries_ok=", is_ok, \
                      "nexons_ok=", is_ok_exons, \
                      "missed_c2r=", nmissed_cmp2ref, \
                      "missed_r2c=", nmissed_ref2cmp, \
                      "min_cov=", cc, \
                      "mapped=", mapped_boundaries, \
                      "reference=", reference_boundaries

                print "#", string.join(
                    map(str, (alignlib_lite.AlignmentFormatEmissions(map_a2b),
                              map_a2b.getNumGaps(), coverage_a, coverage_b)),
                    "\t")
                sys.stdout.flush()

            # dump out pairs
            for method in param_write_pairs:
                if method == "all":
                    print string.join(
                        map(str,
                            ("pair", method, prediction_id1, prediction_id2,
                             sbjct_token1, sbjct_strand1, sbjct_from1,
                             sbjct_to1, seqs1[i].getLength(), sbjct_token2,
                             sbjct_strand2, sbjct_from2, sbjct_to2,
                             seqs2[j].getLength(), map_a2b.getRowFrom(),
                             map_a2b.getRowTo(), row_ali, map_a2b.getColFrom(),
                             map_a2b.getColTo(), col_ali, map_a2b.getNumGaps(),
                             coverage_a, coverage_b, nmissed_cmp2ref,
                             mapped_boundaries, nmissed_ref2cmp,
                             reference_boundaries, i, j, len(c1), len(c2), cc,
                             is_ok, is_ok_exons, is_ok_coverage)), "\t")
                elif method == "alignment":
                    print string.join(
                        map(str,
                            ("pair", method, prediction_id1, prediction_id2,
                             map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali,
                             map_a2b.getColFrom(), map_a2b.getColTo(), col_ali,
                             map_a2b.getNumGaps(), coverage_a, coverage_b)),
                        "\t")
                elif method == "location":
                    print string.join(
                        map(str, ("pair", method, prediction_id1,
                                  prediction_id2, sbjct_token1, sbjct_strand1,
                                  sbjct_from1, sbjct_to1, seqs1[i].getLength(),
                                  sbjct_token2, sbjct_strand2, sbjct_from2,
                                  sbjct_to2, seqs2[j].getLength())), "\t")
            if not is_ok_exons:
                if param_loglevel >= 4:
                    print "# rejected %i and %i: too many exons difference." % (
                        i, j)
                continue

            if param_check_exon_boundaries:
                if not is_ok:
                    continue

            if cc < param_min_coverage:
                continue

            if not weights.has_key(cc):
                weights[cc] = []

            alis1[i].append((coverage_a, j))
            alis2[j].append((coverage_b, i))

            weights[cc].append((i, j, map_a2b))

    # sort out alignments
    ww = weights.keys()
    ww.sort()
    ww.reverse()

    pairs = []
    assigned1 = {}
    assigned2 = {}

    if param_loglevel >= 3:
        print "# alis1=", alis1
        print "# alis2=", alis2
        print "# --------------------------------------"

    for w in ww:
        for i, j, map_a2b in weights[w]:
            if not assigned1.has_key(i) and not assigned2.has_key(j):
                pairs.append((transcripts1[i], transcripts2[j], w, map_a2b))
                assigned1[i] = 1
                assigned2[j] = 1
        if len(assigned1) == len(transcripts1):
            break
        if len(assigned2) == len(transcripts2):
            break

    return pairs
Esempio n. 27
0
def FilterConflicts(old_predictions, new_predictions, removed_predictions,
                    min_overlap, peptide_sequences):
    """Remove conflicting (overlapping) predictions of different queries.

    ``old_predictions`` is sorted in place by (mSbjctToken, mSbjctStrand,
    mSbjctGenomeFrom, mSbjctGenomeTo) and then walked once, comparing the
    currently retained prediction with the next one.

    Two predictions are in conflict if

    * their genomic coordinates overlap (overlap > 0),
    * their query genes differ, or the gene of the retained prediction is
      unknown.  The gene is the third whitespace separated field of
      ``mQueryToken`` if the token has exactly four fields, else None,
    * the overlap relative to the union of both ranges (in percent) is at
      least ``param_max_percent_overlap``, and
    * if ``peptide_sequences`` is given, the local alignment score of the
      two query peptides is at least ``param_min_score_overlap``.

    Of two conflicting predictions the one with higher query coverage and
    higher score is kept.  Differences whose relative size is below
    ``param_conflicts_min_difference`` count as ties; on a full tie the
    first prediction is kept.  If coverage and score disagree, both
    predictions are kept.

    Arguments:
        old_predictions: predictions to filter, either a
            PredictionFile.PredictionFile or an object whose ``sort``
            accepts a comparison function (e.g. a list).
        new_predictions: container with ``append``; receives the kept
            predictions.
        removed_predictions: container with ``append``; receives the
            discarded predictions.
        min_overlap: not used by this function; the threshold is read from
            the module level ``param_max_percent_overlap``.
        peptide_sequences: mapping of query token to peptide sequence.  If
            empty or None, every sufficiently overlapping pair is treated
            as a conflict without aligning the queries.

    Returns:
        The number of predictions appended to ``new_predictions``.  For
        empty input nothing is appended and 0 is returned.

    Raises:
        KeyError: if a query token that needs aligning is missing from
            ``peptide_sequences``.
        ZeroDivisionError: if the coverage of the retained prediction or
            the score of the current prediction is zero while resolving a
            conflict.
    """
    ##################################################################################################
    ## sort predictions by genomic region
    if isinstance(old_predictions, PredictionFile.PredictionFile):
        old_predictions.sort(('mSbjctToken', 'mSbjctStrand',
                              'mSbjctGenomeFrom', 'mSbjctGenomeTo'))
    else:
        old_predictions.sort(lambda x, y: cmp(
            (x.mSbjctToken, x.mSbjctStrand, x.mSbjctGenomeFrom, x.
             mSbjctGenomeTo), (y.mSbjctToken, y.mSbjctStrand, y.
                               mSbjctGenomeFrom, y.mSbjctGenomeTo)))

    ##################################################################################################
    ## filter predictions and resolve conflicts based on genomic overlap
    ## deleted segments are put in a temporary storage space.
    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep)
    result = alignlib_lite.makeAlignmentVector()
    # cache of alignment scores, keyed by the ordered pair of query tokens
    alignments = {}
    noverlaps = 0
    nredundants = 0

    nnew = 0
    last_prediction = None

    for this_prediction in old_predictions:
        # the query token is expected to hold four whitespace separated
        # fields: peptide, status, gene, transcript. Anything else means
        # the gene is unknown.
        fields = re.split(r"\s+", this_prediction.mQueryToken)
        if len(fields) == 4:
            this_query_gene = fields[2]
        else:
            this_query_gene = None

        # the first prediction only primes the comparison
        if not last_prediction:
            last_prediction = this_prediction
            last_query_gene = this_query_gene
            continue

        # NOTE(review): only the coordinates are compared here; sbjct token
        # and strand are not checked, so neighbours on different
        # contigs/strands may be reported as overlapping - confirm that
        # this is intended.
        overlap = min(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) - \
                  max(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom)
        union   = max(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) - \
                  min(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom)

        is_overlap = 0

        # resolve overlap between different genes
        if overlap > 0 and \
               (last_query_gene != this_query_gene or last_query_gene is None):

            noverlaps += 1
            relative_overlap = 100 * overlap / union

            # Start conflict resolution, if overlap is above threshold.
            # Keep higher scoring segment.
            #
            # Check if queries are homologous.
            if relative_overlap >= param_max_percent_overlap:

                if peptide_sequences:
                    # order the tokens so that (a,b) and (b,a) share one
                    # cache entry
                    if last_prediction.mQueryToken < this_prediction.mQueryToken:
                        key = "%s-%s" % (last_prediction.mQueryToken,
                                         this_prediction.mQueryToken)
                    else:
                        key = "%s-%s" % (this_prediction.mQueryToken,
                                         last_prediction.mQueryToken)

                    if key not in alignments:
                        result.clear()
                        alignator.align(
                            result,
                            alignlib_lite.makeSequence(peptide_sequences[
                                this_prediction.mQueryToken]),
                            alignlib_lite.makeSequence(peptide_sequences[
                                last_prediction.mQueryToken]))
                        alignments[key] = result.getScore()
                        if result.getScore() >= param_min_score_overlap:
                            nredundants += 1

                    if alignments[key] >= param_min_score_overlap:
                        is_overlap = 1
                else:
                    # no sequences available: overlap alone decides
                    is_overlap = 1

        if is_overlap:
            # take best prediction. If difference is very small, set
            # difference to 0 (difference does not matter). In this case,
            # the first prediction is taken.
            d1 = last_prediction.mQueryCoverage - this_prediction.mQueryCoverage
            if float(abs(d1)) / float(last_prediction.mQueryCoverage
                                      ) < param_conflicts_min_difference:
                d1 = 0
            d2 = last_prediction.score - this_prediction.score
            if float(abs(d2)) / float(
                    this_prediction.score) < param_conflicts_min_difference:
                d2 = 0
            if d1 >= 0 and d2 >= 0:
                # retained prediction is at least as good on both counts:
                # drop the current one and keep comparing against the
                # retained one.
                if param_loglevel >= 2:
                    print("# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (
                        last_prediction.mPredictionId,
                        last_prediction.mQueryToken,
                        last_prediction.mSbjctGenomeFrom, overlap,
                        relative_overlap, str(this_prediction)))
                if param_benchmarks:
                    if CheckBenchmark(this_prediction, last_prediction):
                        print("# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % (
                            overlap, relative_overlap, str(last_prediction)))

                removed_predictions.append(this_prediction)
                continue
            elif d1 <= 0 and d2 <= 0:
                # current prediction is better: it replaces the retained one
                if param_loglevel >= 2:
                    print("# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (
                        this_prediction.mPredictionId,
                        this_prediction.mQueryToken,
                        this_prediction.mSbjctGenomeFrom, overlap,
                        relative_overlap, str(last_prediction)))
                if param_benchmarks:
                    if CheckBenchmark(last_prediction, this_prediction):
                        print("# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % (
                            overlap, relative_overlap, str(this_prediction)))
                removed_predictions.append(last_prediction)
                last_prediction = this_prediction
                last_query_gene = this_query_gene
                continue
            else:
                # coverage and score disagree: fall through and keep both
                if param_loglevel >= 2:
                    print("# CONFLICT: non-correlated score/coverage. Keeping both %i(%s-%i) (%5.2f/%i/%i) and %i(%s-%i) (%5.2f/%i/%i)" % \
                          (this_prediction.mPredictionId,
                           this_prediction.mQueryToken, this_prediction.mSbjctGenomeFrom,
                           this_prediction.score, this_prediction.mQueryCoverage,
                           this_prediction.mPercentIdentity,
                           last_prediction.mPredictionId,
                           last_prediction.mQueryToken, last_prediction.mSbjctGenomeFrom,
                           last_prediction.score, last_prediction.mQueryCoverage,
                           last_prediction.mPercentIdentity))

        # no conflict (or both kept): the retained prediction is final
        new_predictions.append(last_prediction)
        nnew += 1
        last_query_gene = this_query_gene
        last_prediction = this_prediction

    # flush the prediction still being held back. For empty input there is
    # none, and None must not be appended to the output.
    if last_prediction is not None:
        new_predictions.append(last_prediction)
        nnew += 1

    if param_loglevel >= 1:
        print("# calculated %i alignments for %i potential conflicts (%i above threshold)" % \
              (len(alignments), noverlaps, nredundants))

    return nnew
Esempio n. 28
0
def EliminateRedundantEntries(rep,
                              data,
                              eliminated_predictions,
                              options,
                              peptides,
                              extended_peptides,
                              filter_quality=None,
                              this_quality=None):
    """Eliminate entries of ``data`` that are redundant with ``rep``.

    Each entry of ``data`` not yet listed in ``eliminated_predictions`` is
    compared with the representative ``rep``.  An eliminated entry is
    recorded in ``eliminated_predictions`` (member id -> id of ``rep``) and
    in the returned list together with a one letter code:

    * ``i``: extended sequence identical to the one of ``rep``
    * ``p``: extended sequence contained in the one of ``rep``
    * ``h``: percent identity of the peptide alignment is at least
      ``options.min_identity`` and none of the safety checks on coverage,
      percent identity, gaps and aligned member range applies
    * ``l``: percent identity is at least ``options.min_identity_non_genes``,
      ``this_quality`` is in ``options.quality_genes`` while the quality of
      the member is not, and the member is not clearly better than ``rep``

    Peptides are only aligned if the quality of the member differs from
    ``this_quality`` or is listed in ``options.quality_exclude_same``.

    ``filter_quality`` is accepted but not used here.

    Returns a list of tuples (member id, code).
    """

    removed = []

    rep_id = rep.transcript_id
    rep_coverage = rep.mQueryCoverage
    rep_pid = rep.mPid

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    rep_seq = peptides[rep_id]
    rep_extended_seq = extended_peptides[rep_id]

    def discard(member_id, code):
        # register the member as redundant with the representative
        eliminated_predictions[member_id] = rep_id
        removed.append((member_id, code))

    for entry in data:

        mem_id = entry.transcript_id
        mem_coverage = entry.mQueryCoverage
        mem_pid = entry.mPid
        mem_quality = entry.mQuality

        mem_seq = peptides[mem_id]
        mem_extended_seq = extended_peptides[mem_id]

        if options.loglevel >= 4:
            options.stdlog.write(
                "# processing: id=%s class=%s\n" % (mem_id, mem_quality))

        # already removed on behalf of some representative
        if mem_id in eliminated_predictions:
            continue

        # identical extended sequence
        if mem_extended_seq == rep_extended_seq:
            discard(mem_id, "i")
            continue

        # extended sequence is part of the representative's one
        if mem_extended_seq in rep_extended_seq:
            discard(mem_id, "p")
            continue

        # members of the same quality class are only aligned if the class
        # is explicitly listed for it
        needs_alignment = (mem_quality != this_quality or
                           mem_quality in options.quality_exclude_same)
        if not needs_alignment:
            continue

        rep_sequence = alignlib_lite.makeSequence(str(rep_seq))
        mem_sequence = alignlib_lite.makeSequence(str(mem_seq))

        alignator.align(result, rep_sequence, mem_sequence)

        if options.loglevel >= 5:
            options.stdlog.write(
                "# ali\n%s\n" % alignlib_lite.AlignmentFormatExplicit(
                    result, rep_sequence, mem_sequence))

        pidentity = 100 * alignlib_lite.calculatePercentIdentity(
            result, rep_sequence, mem_sequence)

        num_gaps = result.getNumGaps()

        if options.loglevel >= 4:
            options.stdlog.write(
                "# processing: id=%s class=%s pid=%5.2f rep_cov=%i mem_cov=%i\n" %
                (mem_id, mem_quality, pidentity, rep_coverage, mem_coverage))

        if pidentity >= options.min_identity:

            # reason stays None unless one of the safety checks asks to
            # keep the member
            reason = None
            if rep_coverage < mem_coverage - options.safety_coverage or \
                    rep_pid < mem_pid - options.safety_pide:
                reason = "covpid"
            elif num_gaps >= options.max_gaps and \
                    mem_coverage > rep_coverage - options.safety_coverage:
                reason = "gaps"
            elif mem_coverage >= rep_coverage - options.safety_coverage and \
                    100 * (result.getColTo() - result.getColFrom()) / len(mem_seq) < options.max_member_coverage:
                reason = "memcov"

            if reason is None:
                discard(mem_id, "h")
            else:
                options.stdlog.write(
                    "# WARNING: not removing possibly good prediction: %s: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %
                    (reason, rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid))

        elif pidentity >= options.min_identity_non_genes and \
                this_quality in options.quality_genes and \
                mem_quality not in options.quality_genes:

            member_is_better = (
                rep_coverage < mem_coverage - options.safety_coverage or
                rep_pid < mem_pid - options.safety_pide)

            if member_is_better:
                options.stdlog.write(
                    "# WARNING: not removing possibly good prediction: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %
                    (rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid))
            else:
                discard(mem_id, "l")

    return removed