Python Genomics.Alignment2PeptideAlignment Examples

Programming Language: Python

Namespace/Package Name: CGAT

Class/Type: Genomics

Method/Function: Alignment2PeptideAlignment

Examples at hotexamples.com: 6

Python Genomics.Alignment2PeptideAlignment - 6 examples found. These are the top-rated real-world Python examples of CGAT.Genomics.Alignment2PeptideAlignment extracted from open-source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

IsPositiveStrand(13)

IsNegativeStrand(10)

MapCodon2AA(10)

IsStopCodon(7)

CalculatePairIndices(7)

Alignment2PeptideAlignment(6)

GetHID(6)

ReadGenomicSequences(5)

CalculateCodonFrequenciesFromCounts(4)

CountGeneFeatures(4)

GetDegeneracy(4)

Alignment2ExonBoundaries(3)

GetUniformCodonUsage(3)

ReadContigSizes(3)

ParseFasta2Hash(2)

Protein2Wobble(2)

MaskStopCodons(2)

Alignment2CDNA(2)

CountCodons(2)

Alignment2String(2)

GetIntronType(2)

GetMapAA2Codons(1)

GetGenomicSequence(1)

GetDegenerateSites(1)

MapSequences(1)

CalculateCAIWeightsFromCounts(1)

ParseFasta2HashFromIndex(1)

AlignmentProtein2CDNA(1)

ReadClusters(1)

Example #1

Show file

File: patch_translations.py Project: santayana/cgat

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/patch_translations.py 1841 2008-05-08 12:07:13Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.set_defaults(genome_file=None, )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    entry = PredictionParser.PredictionParserEntry()

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    ninput, noutput = 0, 0
    for line in sys.stdin:
        if line[0] == "#":
            print line[:-1]
            continue

        entry.Read(line)

        ninput += 1

        # get genomic sequence
        genomic_sequence = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        entry.mMapPeptide2Translation, entry.mTranslation = Genomics.Alignment2PeptideAlignment(
            entry.mMapPeptide2Genome, entry.mQueryFrom, 0, genomic_sequence)

        options.stdout.write(str(entry) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i\n" % (ninput, noutput))

    E.Stop()

Example #2

Show file

File: gff2predictions.py Project: santayana/cgat

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/gff2predictions.py 2021 2008-07-10 16:00:48Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--trans",
                      dest="trans",
                      help="input is translated DNA.",
                      action="store_true")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      help="input format.",
                      type="choice",
                      choices=("exons", "psl", "gff"))

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      help="output format",
                      type="choice",
                      choices=('exontable', 'exons', 'predictions', 'cds',
                               'fasta'))

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option(
        "--predictions-file",
        dest="predictions_file",
        type="string",
        help=
        "filename with predictions. Use gene structures from this file if available."
    )

    parser.add_option("-i",
                      "--gff-field-id",
                      dest="gff_field_id",
                      type="string",
                      help="field for the feature id in the gff info section.")

    parser.add_option(
        "-p",
        "--filename-peptides",
        dest="filename_peptides",
        type="string",
        help=
        "Filename with peptide sequences. If given, it is used to check the predicted translated sequences."
    )

    parser.add_option(
        "--no-realignment",
        dest="do_realignment",
        action="store_false",
        help="do not re-align entries that do not parse correctly.")

    parser.add_option(
        "--remove-unaligned",
        dest="remove_unaligned",
        action="store_true",
        help="remove entries that have not been aligned correctly.")

    parser.add_option(
        "--input-coordinates",
        dest="input_coordinates",
        type="string",
        help=
        "specify input format for input coordinates [forward|both-zero|one-closed|open]."
    )

    parser.set_defaults(trans=False,
                        output_format="predictions",
                        format="psl",
                        gff_field_id='id',
                        input_coordinates="both-zero-open",
                        filename_peptides=None,
                        genome_file=None,
                        do_realignment=True,
                        predictions_file=None,
                        remove_unaligned=False)

    (options, args) = E.Start(parser)

    if not options.genome_file:
        raise "please specify a genome file."

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contig_sizes = fasta.getContigSizes()

    ninput, noutput, nskipped = 0, 0, 0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0, 0, 0, 0, 0, 0

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            IOTools.openFile(options.filename_peptides, "r"))
        predictor = Predictor.PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None

    converter = IndexedFasta.getConverter(options.input_coordinates)

    predictions = {}
    if options.predictions_file:
        parser = PredictionParser.iterator_predictions(
            IOTools.openFile(options.predictions_file, "r"))
        for p in parser:
            predictions[p.mPredictionId] = p

    if options.output_format == "predictions":

        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()

            nmatches = 1
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                if not re.match("^[0-9]", line):
                    continue

                try:
                    entries = parser.Parse((line, ))
                except PredictionParser.AlignmentError, e:
                    print "# %s" % str(e)
                    print "#", line[:-1]
                    sys.exit(1)

                for entry in entries:
                    entry.mPredictionId = nmatches
                    nmatches += 1

                print str(entries)

        elif options.format == "exons":
            parser = PredictionParser.PredictionParserExons(
                contig_sizes=contig_sizes)
        else:
            raise "unknown format %s for output option %s" % (
                options.format, options.output_format)

        if options.loglevel >= 2:
            options.stdlog.write("# parsing.\n")
            options.stdlog.flush()

        results = parser.Parse(sys.stdin.readlines())

        if options.loglevel >= 2:
            options.stdlog.write("# parsing finished.\n")
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# parsing: ninput=%i, noutput=%i, nerrors=%i\n" %
                (parser.GetNumInput(), parser.GetNumOutput(),
                 parser.GetNumErrors()))

            for error, msg in parser.mErrors:
                options.stdlog.write("# %s : %s\n" % (str(error), msg))
                options.stdlog.flush()

        # if genomes are given: build translation
        if options.genome_file:

            results.Sort(lambda x, y: cmp(x.mSbjctToken, y.mSbjctToken))

            new_results = PredictionParser.Predictions()

            for entry in results:

                ninput += 1

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing entry %s:%s on %s:%s %i/%i.\n" %
                        (entry.mPredictionId, entry.mQueryToken,
                         entry.mSbjctToken, entry.mSbjctStrand, ninput,
                         len(results)))
                    options.stdlog.flush()

                try:
                    lgenome = fasta.getLength(entry.mSbjctToken)
                    # added 3 residues - was a problem at split codons just before the stop.
                    # See for example the chicken sequence ENSGALP00000002741
                    genomic_sequence = fasta.getSequence(
                        entry.mSbjctToken, entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom,
                        min(entry.mSbjctGenomeTo + 3, lgenome))

                except KeyError:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# did not find entry for %s on %s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken))
                    nskipped += 1
                    continue

                if predictions and entry.mPredictionId in predictions:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# substituting entry %s on %s:%s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken,
                             entry.mSbjctStrand))
                        options.stdlog.flush()
                    entry = predictions[entry.mPredictionId]

                exons = Exons.Alignment2Exons(entry.mMapPeptide2Genome, 0,
                                              entry.mSbjctGenomeFrom)

                entry.mMapPeptide2Translation, entry.mTranslation = Genomics.Alignment2PeptideAlignment(
                    Genomics.String2Alignment(entry.mAlignmentString),
                    entry.mQueryFrom, 0, genomic_sequence)

                entry.score = entry.mMapPeptide2Translation.getColTo(
                ) - entry.mMapPeptide2Translation.getColFrom() + 1

                (entry.mNIntrons, entry.mNFrameShifts, entry.mNGaps, entry.mNSplits, entry.mNStopCodons, entry.mNDisruptions ) = \
                    Genomics.CountGeneFeatures(0,
                                               entry.mMapPeptide2Genome,
                                               genomic_sequence)

                if peptide_sequences:

                    if str(entry.mPredictionId) in peptide_sequences:

                        reference = peptide_sequences[str(
                            entry.mPredictionId)].upper()

                        translation = entry.mTranslation
                        nfound += 1

                        is_identical, nmismatches = checkIdentity(
                            reference, translation, options)

                        if is_identical:
                            nidentical += 1
                        else:
                            nmismatch += 1

                            if options.do_realignment:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches..realigning in region %i:%i\n"
                                        % (entry.mPredictionId,
                                           entry.mSbjctGenomeFrom,
                                           entry.mSbjctGenomeTo))
                                    options.stdlog.flush()

                                    result = predictor(
                                        entry.mPredictionId, reference,
                                        entry.mSbjctToken, genomic_sequence,
                                        "--subopt FALSE --score '%s'" %
                                        str(80))
                                    # "--exhaustive --subopt FALSE --score '%s'" % str(80) )

                                    if result:
                                        translation = result[0].mTranslation
                                        is_identical, nmismatches = checkIdentity(
                                            reference, translation, options)
                                    else:
                                        if options.loglevel >= 2:
                                            options.stdlog.write(
                                                "# %s: realignment returned empty result\n"
                                                % (entry.mPredictionId))
                                            options.stdlog.flush()
                                        is_identical = False

                                    if is_identical:
                                        naligned += 1
                                        prediction_id = entry.mPredictionId
                                        sbjct_genome_from = entry.mSbjctGenomeFrom
                                        entry = result[0]
                                        entry.mPredictionId = prediction_id
                                        entry.mSbjctGenomeFrom += sbjct_genome_from
                                    else:
                                        nunaligned += 1
                                        if options.loglevel >= 1:
                                            options.stdlog.write(
                                                "# %s: mismatch on %s:%s:%i-%i after realignment\n# reference =%s\n# translated=%s\n# realigned =%s\n"
                                                %
                                                (entry.mPredictionId,
                                                 entry.mSbjctToken,
                                                 entry.mSbjctStrand,
                                                 entry.mSbjctGenomeFrom,
                                                 entry.mSbjctGenomeTo,
                                                 reference, entry.mTranslation,
                                                 translation))
                                            options.stdlog.flush()
                                        if options.remove_unaligned:
                                            nskipped += 1
                                            continue

                            else:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches on %s ... no realignment\n"
                                        % (
                                            entry.mPredictionId,
                                            entry.mSbjctToken,
                                        ))
                                    if options.loglevel >= 3:
                                        options.stdlog.write(
                                            "# %s: mismatch before realignment\n# reference =%s\n# translated=%s\n"
                                            % (entry.mPredictionId, reference,
                                               translation))
                                    options.stdlog.flush()

                                if options.remove_unaligned:
                                    nskipped += 1
                                    continue

                    else:
                        nnotfound += 1

                new_results.append(entry)
                noutput += 1

            results = new_results
        if results:
            options.stdout.write(str(results) + "\n")

Example #3

Show file

 def setTranslation(self, genomic_sequence):
     """Compute and store the translation of this entry.

     Passes the entry's peptide-to-genome map, query start and genomic
     start together with *genomic_sequence* to
     Genomics.Alignment2PeptideAlignment and stores the two returned
     values in mMapPeptide2Translation and mTranslation.
     """
     aligned = Genomics.Alignment2PeptideAlignment(
         self.mMapPeptide2Genome,
         self.mQueryFrom,
         self.mSbjctGenomeFrom,
         genomic_sequence)
     # unpack first, so that nothing is stored unless both values exist
     peptide_map, translated = aligned
     self.mMapPeptide2Translation = peptide_map
     self.mTranslation = translated

Example #4

Show file

File: liftover_predictions.py Project: yangjl/cgat

def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser.add_option("-m", "--filename-map", dest="filename_map", type="string",
                      help="filename with mapping information.")
    parser.add_option("-o", "--pattern-old", dest="pattern_old", type="string",
                      help="pattern for mapping new to old identifiers: extract string from old.")
    parser.add_option("-n", "--pattern-new", dest="pattern_new", type="string",
                      help="pattern for mapping new to old identifiers: put string into new.")
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="genome_file.")
    parser.add_option("-p", "--peptides", dest="filename_peptides", type = "string",
                      help="filename with peptide sequences.")
    parser.add_option("-f", "--input-format", dest="input_format", type="choice",
                      help="format of mapping file", choices=("alignment", "offsets") )
    parser.add_option("-i", "--write-missed", dest="write_missed", type="string",
                      help="write missed identifiers to separate file.")
    parser.add_option("-a", "--filename-genes", dest="filename_genes", type="string",
                      help="filename with gene information.")
    parser.add_option("--filename-old-peptides", dest="filename_old_peptides", type="string",
                      help="filename with old peptide information.")
    parser.add_option("--no-renumber", dest="renumber", action="store_false",
                      help="do not renumber predictions.")
    parser.add_option("--contig-sizes-old", dest="contig_sizes_old", type="string",
                      help="contig sizes for old data.")
    parser.add_option("--contig-sizes-new", dest="contig_sizes_new", type="string",
                      help="contig sizes for new data.")
    parser.add_option("--skip-errors", dest="skip_errors", action="store_true",
                      help="skip entries with errors.")
    
    parser.set_defaults(
        filename_map = None,
        pattern_old = "(.+)",
        pattern_new = "%s",
        genome_file = None,
        filename_peptides = None,
        write_missed = None,
        filename_genes = None,
        filename_old_peptides = None,
        renumber = True,
        input_format = "alignment",
        contig_sizes_old = None,
        contig_sizes_new = None,
        skip_errors = None
        )

    (options, args) = E.Start( parser, add_pipe_options = True)

    predictor = PredictorExonerate()

    ## the different mapping criteria
    map_sbjcts = {}
    breakpoints = {}

    ################################################################################################
    map_transcript2gene = {}
    if options.filename_genes:
        infile = open(options.filename_genes, "r")
        for gene, transcript in map( lambda x: x[:-1].split("\t")[:2], filter( lambda x: x[0] != "#", infile.readlines())):
            map_transcript2gene[transcript] = gene
        infile.close()

    ################################################################################################
    peptides = {}
    if options.filename_peptides:
        peptides = Genomics.ReadPeptideSequences( open(options.filename_peptides, "r"))
        options.stdlog.write( "# read %i peptide sequences.\n" % len(peptides))

    ################################################################################################
    ## read old query sequences and compare against new query sequences
    ## this can be used to build a map between old and new queries
    query_map_old2new = {}        
    if options.filename_old_peptides:
        old_peptides = Genomics.ReadPeptideSequences( open(options.filename_old_peptides, "r"))
        options.stdlog.write( "# read %i old peptide sequences.\n" % len(old_peptides))
        query_map_old2new, unmappable, unmapped = Genomics.MapSequences( old_peptides, peptides)
        options.stdlog.write( "# built map: unmappable=%i unmapped=%i.\n" % (len(unmappable), len(unmapped)))
        if options.loglevel >= 2:
            options.stdlog.write( "# unmappable: %s.\n" % ";".join(unmappable))
            options.stdlog.write( "# unmapped: %s.\n" % ";".join(unmapped))            

    ################################################################################################
    ## read old/new contig sizes for mapping positive/negative coordinates
    contig_sizes_old = {}
    contig_sizes_new = {}
    if options.contig_sizes_old:
        contig_sizes_old = Genomics.ReadContigSizes( open(options.contig_sizes_old, "r") )
    if options.contig_sizes_new:
        contig_sizes_new = Genomics.ReadContigSizes( open(options.contig_sizes_new, "r") )
        
    ################################################################################################        
    if options.filename_map:
        
        infile = open(options.filename_map)
        if options.input_format == "alignments":
            for line in infile:
                if line[0] == "#": continue

                x, old_token, old_from, old_to, old_ali, new_from, new_to, new_ali = line[:-1].split("\t")

                map_sbjcts[old_token] = (old_from, old_ali, new_from, new_ali)

            if options.loglevel >= 1:
                options.stdlog.write( "# read %i alignments.\n" % len(map_sbjcts))

        elif options.input_format == "offsets":
            ## input is a list of segments and their offsets.

            breakpoints, endpoints, offsets = ReadOffsets( infile )
            if options.loglevel >= 1:
                options.stdlog.write( "# read breakpoints for %i chromosomes.\n" % len(breakpoints))

        infile.close()
        
    ################################################################################################
    ################################################################################################
    ################################################################################################
    ## end of input section
    ################################################################################################
    ################################################################################################
    ################################################################################################        

    rx = re.compile(options.pattern_old)
    last_sbjct_token = None
    ninput = 0
    nerrors = 0
    nerrors_map = 0
    nerrors_inconsistencies = 0
    nerrors_boundaries = 0
    nerrors_translation = 0
    nerrors_inconsequential = 0
    nerrors_realigned = 0
    nmapped = 0
    nfiltered = 0
    naligned = 0
    noutput = 0
    found_transcripts = {}
    nduplicates = 0
    output = {}
    
    for line in sys.stdin:
        if line[0] == "#": continue
        
        entry = PredictionParser.PredictionParserEntry()

        entry.Read( line )
        
        ninput += 1
        is_positive = entry.mSbjctStrand == "+"
        
        is_error = False
        
        ## check if query token is mappable: using sequence map
        if (query_map_old2new and entry.mQueryToken not in query_map_old2new):
            options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) )
            nfiltered += 1
            continue
        else:
            ## check if query token is mappable: using filter        
            if (peptides and entry.mQueryToken not in peptides):
                options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) )
                nfiltered += 1
                continue

        new_sbjct_token = options.pattern_new % rx.search(entry.mSbjctToken).groups()[0]

        ##########################################################################################################
        ## Map via alignments
        if entry.mSbjctToken in map_sbjcts:
            nmapped += 1
            if last_sbjct_token != entry.mSbjctToken:
                old_from, old_ali, new_from, new_ali = map_sbjcts[entry.mSbjctToken]
                map_a2b = alignlib_lite.makeAlignmentVector()
                alignlib_lite.AlignmentFormatExplicit(
                    int(old_from), old_ali,
                    int(new_from), new_ali).copy( map_a2b )
                
            last_sbjct_token = entry.mSbjctToken
            
            if options.loglevel >= 3:
                print "#", str(entry)
                print "#", map_sbjcts[entry.mSbjctToken]
                sys.stdout.flush()

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            ## convert to forward coordinates:
            if is_positive:
                f, t= old_f, old_t
                first_res, last_res = f + 1, t                
            else:
                f, t = map_a2b.getRowTo() - old_f, map_a2b.getRowTo() - old_t 
                first_res, last_res = f, t + 1 
            
            ## map first and last residues
            mfirst_res = map_a2b.mapRowToCol( first_res )
            mlast_res = map_a2b.mapRowToCol( last_res )

            if (mfirst_res == 0 and old_f != 0) or (mlast_res == 0 and old_t != map_a2b.getRowTo() ):
                
                options.stderr.write("# mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      f, t))
                
                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# %s\n" % str(entry))                
                options.stderr.flush()                
                nerrors_boundaries += 1
                is_error = True

                ## get extended boundaries for alignment later on
                while mfirst_res == 0 and first_res > 1:
                    first_res -= 1
                    mfirst_res = map_a2b.mapRowToCol(first_res)
                while mlast_res == 0 and last_res < map_a2b.getRowTo():
                    last_res += 1
                    mlast_res = map_a2b.mapRowToCol(last_res)

            ## convert to genomic coordinates            
            ## convert negative strand coordinates
            if is_positive:
                new_f = mfirst_res - 1
                new_t = mlast_res 
            else:
                new_f = mfirst_res
                new_t = mlast_res - 1
                
                new_f = map_a2b.getColTo() - new_f
                new_t = map_a2b.getColTo() - new_t

            ## Now map the alignment.
            try:
                MapAlignment( entry, map_a2b )
                
            except ValueError:
                options.stderr.write("# alignment mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      new_f, new_t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.flush()
                nerrors_map += 1
                is_error= True
            
            if new_f != entry.mSbjctGenomeFrom or new_t != entry.mSbjctGenomeTo:
                options.stderr.write("# mapping inconsistency for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i <> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,                                      
                                      new_f, new_t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                
                nerrors_inconsistencies += 1
                is_error = True

        ##########################################################################################################
        ## Map via offsets
        if entry.mSbjctToken in breakpoints:

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            ## convert to forward coordinates:
            if is_positive:
                f, t= old_f, old_t
            else:
                f, t = contig_sizes_old[entry.mSbjctToken] - old_t, contig_sizes_old[entry.mSbjctToken] - old_f

            o1 = GetOffset( f,
                            breakpoints[entry.mSbjctToken],
                            endpoints[entry.mSbjctToken],
                            offsets[entry.mSbjctToken] )
            o2 = GetOffset( t,
                            breakpoints[entry.mSbjctToken],
                            endpoints[entry.mSbjctToken],
                            offsets[entry.mSbjctToken] )            

            if o1 != o2:
                options.stderr.write("# break within gene %s\n" % str(entry))
                nerrors_map += 1
                is_error = True
                
            f += o1
            t += o2

            if not is_positive:
                f, t = contig_sizes_new[entry.mSbjctToken] - t, contig_sizes_new[entry.mSbjctToken] - f

            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = f, t

            if entry.mSbjctGenomeFrom > entry.mSbjctGenomeTo:
                options.stderr.write("# mapping error: start after end %s\n" % str(entry))
                nerrors_map += 1
                is_error = True
        
        ##########################################################################################################
        ## do translation check, if genome is given
        if options.genome_file:
            genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand,
                                                            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo,
                                                            options.genome_file,
                                                            loglevel = 0)

            map_peptide2translation, translation = Genomics.Alignment2PeptideAlignment( \
                entry.mMapPeptide2Genome, entry.mQueryFrom, 0, genomic_sequence )

            if re.sub("X", "", translation) != re.sub("X", "", entry.mTranslation):
                options.stderr.write("# translation error for prediction %i on %s %s:%i-%i -> %i-%i <> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                if map_sbjcts:
                    options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mTranslation, translation))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mAlignmentString, Genomics.Alignment2String(entry.mMapPeptide2Genome)))                    
                nerrors_translation += 1
                is_error = True

                if peptides and entry.mQueryToken in peptides:
                    naligned += 1

                    options.stdlog.write( "# aligning: %s versus %s:%s: %i-%i\n" % ( \
                        entry.mQueryToken,
                        new_sbjct_token, entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                    
                    # do a quick reprediction
                    if entry.mQueryToken in peptides:
                        genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand,
                                                                        0, 0,
                                                                        genome_file = options.genome_pattern,                                                                        
                                                                        loglevel = 0)
                        predictor.mLogLevel = 0

                        
                        result =  predictor(entry.mQueryToken, peptides[entry.mQueryToken],
                                            entry.mSbjctToken, genomic_sequence,
                                            "--exhaustive --subopt FALSE --score '%s' " % str(80),
                                            new_f - 10, new_t + 10)
                        prediction_id = entry.mPredictionId
                        if result:
                            entry = result[0]
                            entry.mPredictionId = prediction_id
                            nerrors_realigned += 1
            else:
                if is_error:
                    nerrors_inconsequential += 1
                    
        entry.mSbjctToken = new_sbjct_token

        ## map query tokens
        if query_map_old2new:
            query_tokens = query_map_old2new[entry.mQueryToken]
        else:
            query_tokens = (entry.mQueryToken,)

        if options.skip_errors and is_error:
            continue

        for query_token in query_tokens:

            entry.mQueryToken = query_token
            
            prediction_id = entry.mPredictionId
            entry.mPredictionId = 0
            
            hid = Genomics.GetHID( str(entry) )
            if hid in output:
                nduplicates += 1
                continue
            
            noutput += 1                        
            if options.renumber: prediction_id = noutput

            entry.mPredictionId = prediction_id

            options.stdout.write( str(entry) + "\n")
            options.stdout.flush()
            found_transcripts[entry.mQueryToken] = 1

    ## write out found transcripts and genes
    nmissed_transcripts = 0
    missed_transcripts = []
    found_genes = {}
    if peptides:
        for x in peptides.keys():
            if x not in found_transcripts:
                nmissed_transcripts += 1
                missed_transcripts.append( x )
            else:
                found_genes[map_transcript2gene[x]] = 1

    missed_genes = {}
    nmissed_genes = 0
    if map_transcript2gene:

        for t in missed_transcripts:
            g = map_transcript2gene[t]
            if g not in found_genes:
                missed_genes[g] = 1
        nmissed_genes = len(missed_genes)
    
    if options.write_missed:
        outfile = open(options.write_missed, "w")
        for x in missed_transcripts:
            if x in unmapped:
                status = "unmapped"
            else:
                status = "mapped"
            outfile.write( "%s\t%s\t%s\n" % ("transcript", x, status ))
        for x in missed_genes:
            status = "unknown"
            outfile.write( "%s\t%s\t%s\n" % ("gene", x, status ))
        
        outfile.close()
        
    options.stdlog.write("# input=%i, output=%i, filtered=%i, nduplicates=%i, mapped=%i, errors=%i\n" % (\
         ninput, noutput, nfiltered, nduplicates, nmapped, nerrors ))
    options.stdlog.write("# errors: inconsequental=%i, boundaries=%i, mapping=%i, inconsistencies=%i, translation=%i, realigned=%i\n" % (\
       nerrors_inconsequential, nerrors_boundaries, nerrors_map, nerrors_inconsistencies, nerrors_translation, nerrors_realigned ))
    options.stdlog.write("# peptides: input=%i, found=%i, missed=%i, found_genes=%i, missed_genes=%i\n" % (\
        len(peptides), len(found_transcripts), nmissed_transcripts, len(found_genes), nmissed_genes) )
    
    E.Stop()

Example #5

Show file

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2predictions.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-l",
                      "--filename-locations",
                      dest="filename_locations",
                      type="string",
                      help="filename with locations")

    parser.add_option("-m",
                      "--master",
                      dest="master",
                      type="string",
                      help="the master determines the frame.")

    parser.set_defaults(filename_locations=None, gap_chars="-.", master=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    mali = Mali.Mali()

    mali.readFromFile(sys.stdin)

    identifiers = mali.getIdentifiers()

    aligned_columns, aligned_exons = getAlignedColumns(mali, options)

    map_id2location = {}

    if options.filename_locations:
        map_id2location = IOTools.ReadMap(open(options.filename_locations,
                                               "r"))

    options.stdout.write(Prediction.Prediction().getHeader() + "\n")

    nid = 1

    for identifier in identifiers:

        if options.loglevel >= 2:
            options.stdlog.write("# processing %s\n" % (identifier))

        entry = mali.getEntry(identifier)

        sequence = entry.mString
        if sequence[0] not in string.lowercase:
            raise "all sequences should start with an exon."

        was_exon = True
        d = 0
        alignment = []
        carry_over = 0

        last_codon = []
        codon = []
        nchars_in_codon = 0
        n = 0

        last_master_residue = 0
        master_residue = 0
        for column in range(len(sequence)):

            c = sequence[column]
            is_gap = c in options.gap_chars
            is_aligned = column in aligned_columns
            is_exon = column in aligned_exons

            if is_gap:
                continue

            if is_exon:
                master_residue = aligned_exons[column]
                codon.append((n, master_residue))

            n += 1

            # check if we have a complete codon
            if is_exon:
                # A codon is complete, if it ends at frame 2 or
                # it spans more than one codons in the master.
                # Gaps in the master that are a multiple of 3 are ignored
                d = master_residue - last_master_residue - 1

                if master_residue % 3 == 2 or (d % 3 != 0 and d > 0):

                    if last_codon:
                        d = codon[0][0] - last_codon[-1][0] - 1
                        if d > 0:
                            # add in-frame introns
                            if d > 10:
                                alignment.append(["5", 0, 2])
                                alignment.append(["I", 0, d - 4])
                                alignment.append(["3", 0, 2])
                            else:
                                raise "untreated case"

                    alignment += processCodon(codon)
                    last_codon = codon
                    codon = []

            last_master_residue = master_residue

        last = alignment[0]
        new_alignment = []
        for this in alignment[1:]:
            if this[0] == last[0]:
                last[1] += this[1]
                last[2] += this[2]
                continue

            new_alignment.append(last)
            last = this

        new_alignment.append(last)

        if options.loglevel >= 4:
            options.stdlog.write("# output=%s\n" % (str(new_alignment)))

        assert (new_alignment[-1][2] % 3 == 0)

        lalignment = sum(map(lambda x: x[2], new_alignment))

        prediction = Prediction.Prediction()

        prediction.mQueryToken = identifier

        genomic_sequence = re.sub("[%s]" % options.gap_chars, "",
                                  mali[identifier])

        prediction.mPredictionId = nid
        nid += 1

        if identifier in map_id2location:

            prediction.mSbjctToken, prediction.mSbjctStrand, sfrom, sto = map_id2location[
                identifier].split(":")[:4]

            prediction.mSbjctGenomeFrom = int(sfrom) + entry.mFrom
            prediction.mSbjctGenomeTo = int(sto)

        else:
            prediction.mSbjctToken = "unk"
            prediction.mSbjctStrand = "+"
            prediction.mSbjctGenomeFrom = 0

        prediction.mQueryCoverage = 100
        prediction.mPercentIdentity = 100
        prediction.mPercentSimilarity = 100

        prediction.mQueryLength = prediction.mQueryTo

        prediction.mSbjctGenomeTo = prediction.mSbjctGenomeFrom + lalignment

        prediction.mMapPeptide2Genome = new_alignment
        prediction.mAlignmentString = string.join(
            map(lambda x: string.join(map(str, x), " "),
                prediction.mMapPeptide2Genome), " ")

        prediction.mMapPeptide2Translation, prediction.mTranslation = Genomics.Alignment2PeptideAlignment(
            prediction.mMapPeptide2Genome, 0, 0, genomic_sequence)

        (prediction.mNIntrons, prediction.mNFrameShifts, prediction.mNGaps, prediction.mNSplits, prediction.mNStopCodons, disruptions) = \
            Genomics.CountGeneFeatures(0,
                                       prediction.mMapPeptide2Genome,
                                       genomic_sequence)

        options.stdout.write(str(prediction) + "\n")

    E.Stop()

Example #6

Show file

File: predictions2transcripts.py Project: yangjl/cgat

def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/predictions2transcripts.py 1841 2008-05-08 12:07:13Z andreas $",
                                    usage = globals()["__doc__"] )
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome."  )

    parser.add_option("-o", "--output-filename-summary", dest="output_filename_summary", type="string",
                      help="filename with summary information."  )

    parser.add_option( "--skip-header", dest="skip_header", action="store_true",
                       help="skip header."  )

    parser.add_option( "--start-codon-boundary", dest="start_codon_boundary", type="int",
                      help="maximum extension for start codon (make divisible by 3)."  )
    
    parser.add_option( "--stop-codon-boundary", dest="stop_codon_boundary", type="int",
                      help="maximum extension for stop codon (make divisible by 3)."  )

    parser.add_option( "--left-extension-mode", dest="left_extension_mode", type="choice",
                       choices=("first-start", "first-stop-backtrack"),
                       help="extension mode for 5' end.")

    parser.add_option( "--fill-introns", dest="fill_introns", type="int",
                      help="fill intron if divisible by three and no stop codon up to a maximum length of #."  )

    parser.add_option( "--introns-max-stops", dest="introns_max_stops", type="int",
                      help="maximum number of stop codons to tolerate within an intron."  )

    parser.add_option( "--output-format", dest="output_format", type="choice",
                       choices=("predictions", "extensions", "filled-introns"),
                      help="output format."  )
    
    parser.set_defaults(
        genome_file = "genome",
        start_codons = ("ATG"),
        stop_codons = ("TAG", "TAA", "TGA"),
        start_codon_boundary = 9999,
        stop_codon_boundary  = 9999,
        fill_introns = 0,
        introns_max_stops = 0,
        left_splice_signals = ("GT",),
        right_splice_signals = ("AG",),
        output_format="extensions",
        left_extension_mode = "first-start",
        skip_header = False,
        output_filename_summary = None,
        )

    (options, args) = E.Start( parser, add_pipe_options = True )

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    options.start_codon_boundary = int(options.start_codon_boundary / 3)
    options.stop_codon_boundary = int(options.stop_codon_boundary / 3)

    fasta = IndexedFasta.IndexedFasta( options.genome_file )
    
    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0
    nfilled = 0
    nseqs_filled = 0
    nseqs_extended = 0
    left_extensions = []
    right_extensions = []
    filled_introns = []

    if not options.skip_header:
        if options.output_format == "predictions":
            options.stdout.write( Prediction.Prediction().getHeader() + "\n" )
        elif options.output_format == "filled-introns":
            options.stdout.write("\t".join( ("prediction_id",
                                             "intron",
                                             "peptide_sequence",
                                             "genomic_sequence") ) + "\n" )

    if options.output_filename_summary:
        outfile_summary = open (options.output_filename_summary, "w" )
        outfile_summary.write( "id\ttype\tnumber\tlength\tfrom\tto\tsequence\n" )
    else:
        outfile_summary = None

    for line in options.stdin:
        
        if line[0] == "#": continue

        ninput += 1
        p.Read(line)

        lsequence = fasta.getLength( p.mSbjctToken )

        genome_from = max( 0, p.mSbjctGenomeFrom - options.start_codon_boundary)
        genome_to = min( lsequence, p.mSbjctGenomeTo + options.stop_codon_boundary)
        
        genomic_sequence = fasta.getSequence( p.mSbjctToken, p.mSbjctStrand,
                                              genome_from,
                                              genome_to ).upper()

        ########################################################################
        ########################################################################
        ########################################################################            
        ## Do extensions
        
        if options.start_codon_boundary or options.stop_codon_boundary:
            
            extension_start = p.mSbjctGenomeFrom - genome_from 
            extension_stop  = genome_to - p.mSbjctGenomeTo
            
            fragment_to = extension_start + p.mSbjctGenomeTo - p.mSbjctGenomeFrom

            lfragment = len(genomic_sequence)

            ########################################################################
            ########################################################################
            ########################################################################            
            ## find start codon
            start = extension_start
            found_start = False
            if options.left_extension_mode == "first-start":

                found_start, start = findCodonReverse( genomic_sequence,
                                                       start,
                                                       options.start_codons,
                                                       options.stop_codons )
                
            elif options.left_extension_mode == "first-stop-backtrack":

                if genomic_sequence[start:start+3] in options.start_codons:
                    found_start = True
                else:
                    found_start, start = findCodonReverse( genomic_sequence,
                                                           start,
                                                           options.stop_codons )
                    
                    if found_start:
                        E.info("prediction %s: stop found at %i (%i) backtracking ..." % ( p.mPredictionId, start, extension_start - start) )
                        
                        ## bracktrack to first start codon
                        found_start = False
                        while start < extension_start:
                            start += 3
                            if genomic_sequence[start:start+3] in options.start_codons:
                                found_start = True
                                break
                        else:
                            start = extension_start

                        if found_start:
                            E.info("start codon found at %i (%i)." % ( start, extension_start - start) )
                        else:
                            E.info("no start codon found." )
                    else:
                        E.info("prediction %s: no stop found ... backtracking to start codon." % ( p.mPredictionId ) )

                        found_start, start = findCodonReverse( genomic_sequence, start, options.start_codons )

                        E.info("prediction %s: no start codon found." % ( p.mPredictionId ) )

            if found_start:
                start += genome_from
            else:
                start = p.mSbjctGenomeFrom

            dstart = p.mSbjctGenomeFrom - start
            
            ########################################################################
            ########################################################################
            ########################################################################            
            ## find stop codon
            ## stop points to the beginning of the codon, thus the stop codon will
            ## not be part of the sequence.
            stop = fragment_to
            found_stop = 0
            while stop < lfragment and \
                      genomic_sequence[stop:stop+3] not in ("NNN", "XXX"):
                if genomic_sequence[stop:stop+3] in options.stop_codons:
                    found_stop = 1
                    break

                stop += 3

            if found_stop:
                stop += genome_from 
            else:
                stop = p.mSbjctGenomeTo

            dstop = stop - p.mSbjctGenomeTo 

            ########################################################################
            ########################################################################
            ########################################################################            
            ## build new prediction
            map_peptide2genome = []
            if dstart: map_peptide2genome.append( ("G", 0, dstart) )
            map_peptide2genome += p.mMapPeptide2Genome
            if dstop: map_peptide2genome.append( ("G", 0, dstop) )

            E.info("prediction %s: extension: found_start=%i, found_stop=%i, left=%i, right=%i" % ( p.mPredictionId, found_start, found_stop, dstart, dstop ) )

            ## save results
            p.mMapPeptide2Genome = map_peptide2genome
            p.mAlignmentString = Genomics.Alignment2String( map_peptide2genome )
            p.mSbjctGenomeFrom -= dstart
            p.mSbjctGenomeTo += dstop
            p.mSbjctFrom += dstart / 3
            p.mSbjctTo += dstart / 3 + dstop / 3            
            
            if dstart or dstop:
                if dstart: left_extensions.append( dstart )
                if dstop: right_extensions.append( dstop )
                
                nseqs_extended += 1

        ## update genomic sequence because borders might have changed.
        genomic_sequence = fasta.getSequence( p.mSbjctToken,
                                              p.mSbjctStrand,
                                              p.mSbjctGenomeFrom,
                                              p.mSbjctGenomeTo ).upper()

        if options.fill_introns:
            
            has_filled = False

            exons = Exons.Alignment2Exons( p.mMapPeptide2Genome,
                                           query_from = 0,
                                           sbjct_from = 0 )

            new_exons = []

            last_e = exons[0]

            nintron = 0

            for e in exons[1:]:

                nintron += 1
                lintron = e.mGenomeFrom - last_e.mGenomeTo
                
                if lintron > options.fill_introns or (lintron) % 3 != 0:
                    E.debug( "prediction %s: intron %i of size %i discarded." % \
                                 (p.mPredictionId,
                                  nintron, lintron ) )
                    
                    new_exons.append(last_e)
                    last_e = e
                    continue

                ## get sequence, include also residues from split codons
                ## when checking for stop codons.
                if e.mAlignment[0][0] == "S":
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0
                    
                sequence = genomic_sequence[last_e.mGenomeTo - offset_left:e.mGenomeFrom+offset_right]
                
                ## check for splice sites
                for signal in options.left_splice_signals:
                    if sequence[offset_left:offset_left+len(signal)] == signal:
                        left_signal = True
                        break
                else:
                    left_signal = False
                    
                for signal in options.right_splice_signals:
                    if sequence[-(len(signal)+offset_right):-offset_right] == signal:
                        right_signal = True
                        break
                else:
                    right_signal = False

                nstops, ngaps = 0, 0
                for codon in [ sequence[x:x+3] for x in range(0,len(sequence),3) ]:
                    if codon in options.stop_codons: nstops += 1
                    if "N" in codon.upper(): ngaps += 1
                        
                    E.debug( "prediction %s: intron %i of size %i (%i-%i) (%s:%s:%i:%i): stops=%i, gaps=%i, signals=%s,%s." % \
                                 (p.mPredictionId,
                                  nintron, lintron,
                                  offset_left, offset_right,
                                  p.mSbjctToken, p.mSbjctStrand,
                                  p.mSbjctGenomeFrom + last_e.mGenomeTo,
                                  p.mSbjctGenomeFrom + e.mGenomeFrom,
                                  nstops,
                                  ngaps,
                                  left_signal, right_signal ) )

                if nstops + ngaps > options.introns_max_stops:
                    new_exons.append(last_e)                                        
                    last_e = e
                    continue
                
                E.info( "prediction %s: filling intron %i of size %i: stops=%i, gaps=%i, signals=%s,%s" % \
                            (p.mPredictionId,
                             nintron, lintron,
                             nstops,
                             ngaps,
                             left_signal, right_signal))

                e.Merge( last_e )
                has_filled = True
                nfilled += 1
                last_e = e

                if options.output_format == "filled-introns":
                    options.stdout.write( "\t".join( map(str, ( p.mPredictionId,
                                                                nintron,
                                                                Genomics.TranslateDNA2Protein( sequence ),
                                                                sequence ) ) ) + "\n" )
                                                                
                
                filled_introns.append(lintron)
                p.mNIntrons -= 1
                
            new_exons.append(last_e)

            if has_filled: nseqs_filled += 1

            Exons.UpdatePeptideCoordinates( new_exons )
            
            p.mMapPeptide2Genome = Exons.Exons2Alignment( new_exons )
            p.mAlignmentString = Genomics.Alignment2String( p.mMapPeptide2Genome )

        ## build translated sequence
        p.mMapPeptide2Translation, p.mTranslation = Genomics.Alignment2PeptideAlignment( \
               p.mMapPeptide2Genome, p.mQueryFrom, 0, genomic_sequence )

        ## output info
        if options.output_format == "predictions":
            options.stdout.write( str(p) + "\n" )
        elif options.output_format == "extensions":
            if found_start: found_start = 1
            if found_stop: found_stop = 1
            options.stdout.write( "\t".join( map(str, ( p.mPredictionId,
                                                        found_start, found_stop, 
                                                        dstart, dstop,
                                                        p.mTranslation,
                                                        p.mSbjctGenomeFrom, p.mSbjctGenomeTo,
                                                        p.mAlignmentString ))) + "\n" )

        noutput += 1
        options.stdout.flush()

    E.info("stats  : %s" % "\t".join(Stats.DistributionalParameters().getHeaders() ))
    E.info("left   : %s" % str(Stats.DistributionalParameters(left_extensions)) )
    E.info("right  : %s" % str(Stats.DistributionalParameters(right_extensions)) )
    E.info("introns: %s" % str(Stats.DistributionalParameters(filled_introns)) )        
    E.info("ninput=%i, noutput=%i, nextended=%i, nfilled=%i, nexons_filled=%i" % (\
            ninput, noutput, nseqs_extended, nseqs_filled, nfilled))
        
    E.Stop()