Code example #1
File: table2table.py Project: yangjl/cgat
def readAndGroupTable( infile, options ):
    """read table from infile and group.
    """
    fields, table  = CSV.ReadTable( infile, with_header = options.has_headers, as_rows = True )
    options.columns = getColumns( fields, options.columns )
    assert options.group_column not in options.columns

    converter = float
    new_fields = [ fields[options.group_column] ] + [ fields[x] for x in options.columns ]

    if options.group_function == "min":
        f = min
    elif options.group_function == "max":
        f = max
    elif options.group_function == "sum":
        f = sum
    elif options.group_function == "mean":
        f = scipy.mean
    elif options.group_function == "cat":
        f = lambda x: ";".join( [ y for y in x if y != "" ] )
        converter = str
    elif options.group_function == "uniq":
        f = lambda x: ";".join( [ y for y in set(x) if y != "" ] )
        converter = str
    elif options.group_function == "stats":
        f = lambda x: str(Stats.DistributionalParameters(x))
        # update headers
        new_fields = [ fields[options.group_column] ]
        for c in options.columns:
            new_fields += list( map(lambda x: "%s_%s" % (fields[c], x), Stats.DistributionalParameters().getHeaders() ) )

    ## convert values to floats (except for the group column).
    ## Rows with values in options.columns that cannot be converted are dropped.
    new_table = []
    for row in table:
        skip = False
        new_row = [ row[options.group_column] ]

        for c in options.columns:
            if row[c] == options.missing_value:
                new_row.append(row[c])
            else:
                try:
                    new_row.append( converter(row[c]) )
                except ValueError:
                    skip = True
                    break
        if not skip: new_table.append(new_row)
    table = new_table

    new_rows = CSV.GroupTable( table,
                               group_column = 0,
                               group_function = f )

    options.stdout.write("\t".join(new_fields) + "\n")        
    for row in new_rows:
        options.stdout.write( "\t".join( map(str,row) ) + "\n")
Code example #2
File: analyze_genetrees.py Project: yangjl/cgat
    def printHeightsPerTree(values, section, options, prefix_header,
                            prefix_row):

        if not values: return

        outfile, is_new = TreeReconciliation.getFile(options, section)
        if is_new:
            outfile.write("%s%s\theights\n" % (prefix_header, "\t".join(
                Stats.DistributionalParameters().getHeaders())))

        s = Stats.DistributionalParameters(values)
        s.setFormat(options.format_branch_length)
        outfile.write("%s%s\t%s\n" % (prefix_row, str(s), ",".join(
            map(lambda x: options.format_branch_length % x, values))))
Code example #3
File: maq2assembly.py Project: santayana/cgat
    def process(self, contig, start, end, reads, qualities):

        aligned = filter(lambda x: x > 0, reads)
        self.mOutFile.write(
            "%s\t%s\t%i\t%i\t%i\t%i\t%i\t%s\n" %
            (self.mOutputId, contig, start, end, end - start, len(reads),
             len(aligned), str(Stats.DistributionalParameters(aligned))))
Code example #4
File: maq2assembly.py Project: yangjl/cgat
    def process( self, contig, start, end, reads, qualities ):

        self.mOutFile.write( "%s\t%s\t%i\t%i\t%i\t%i\t%i\t%s\n" % (self.mOutputId, 
                                                                   contig, start, end, end - start, 
                                                                   len(reads),
                                                                   len(qualities),
                                                                   str(Stats.DistributionalParameters( qualities ) )))
Code example #5
    def process(self, contig, start, end, reads, qualities):

        aligned = [x for x in reads if x > 0]
        self.mOutFile.write("%s\t%s\t%i\t%i\t%i\t%i\t%i\t%s\n" % (self.mOutputId,
                                                                  contig, start, end, end -
                                                                  start,
                                                                  len(reads),
                                                                  len(aligned),
                                                                  str(Stats.DistributionalParameters(aligned))))
Code example #6
File: analyze_genetrees.py Project: yangjl/cgat
    def printHeightsPerSpecies(values, section, options, prefix_header,
                               prefix_row):

        if not values: return

        ## distributions of distance to node
        outfile, is_new = TreeReconciliation.getFile(options, section)
        if is_new:
            outfile.write("%sspecies\t%s\theights\n" %
                          (prefix_header, "\t".join(
                              Stats.DistributionalParameters().getHeaders())))

        for species in sorted(values.keys()):
            s = Stats.DistributionalParameters(values[species])
            s.setFormat(options.format_branch_length)
            outfile.write("%s%s\t%s\t%s\n" %
                          (prefix_row, species, str(s), ",".join(
                              map(lambda x: options.format_branch_length % x,
                                  values[species]))))
Code example #7
File: gff2table.py Project: kathrinjansen/cgat
def decorator_median_length(intervals, start, end, contig, fasta):
    """compute length distribution."""
    d = Stats.DistributionalParameters([x[1] - x[0] for x in intervals])
    return d['median'], str(d)
Code example #8
File: gff2table.py Project: kathrinjansen/cgat
def decorator_median_score(values, start, end, contig):
    """compute median of values."""
    d = Stats.DistributionalParameters(values)
    return d['median'], str(d)
Code example #9
File: gff2table.py Project: kathrinjansen/cgat
def decorator_max_score(values, start, end, contig):
    """compute minumum of values."""
    d = Stats.DistributionalParameters(values)
    return d['max'], str(d)
Code example #10
File: gff2table.py Project: kathrinjansen/cgat
def decorator_percent_coverage(intervals, start, end, contig, fasta):
    """compute length of intervals."""
    d = Stats.DistributionalParameters([x[1] - x[0] for x in intervals])
    return 100.0 * float(d['sum']) / (end - start), str(d)
Code example #11
File: gff2table.py Project: zpeng1989/cgat
def decorator_median_length(intervals, start, end, contig, fasta):
    """compute length distribution."""
    d = Stats.DistributionalParameters(map(lambda x: x[1] - x[0], intervals))
    return d['median'], str(d)
Code example #12
File: gff2table.py Project: kathrinjansen/cgat
def decorator_stddev_score(values, start, end, contig):
    """compute stddev of values."""
    d = Stats.DistributionalParameters(values)
    return d['stddev'], str(d)
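Every gff2table decorator above touches the same small surface of Stats.DistributionalParameters: build it from a sequence of values, index it like a mapping ("median", "max", "sum", "stddev"), and str() it to get a row aligned with getHeaders(). Below is a minimal mock of that apparent interface, handy for exercising the decorators without cgat installed; the header names, their order, and the even-length median rule are assumptions inferred from these examples, not the real class:

import math

class MockDistributionalParameters(object):
    """Hypothetical stand-in mimicking how these examples use
    Stats.DistributionalParameters; not the real cgat class."""

    HEADERS = ("nval", "min", "max", "mean", "median", "stddev", "sum")

    def __init__(self, values=None):
        values = sorted(values or [])
        n = len(values)
        mean = sum(values) / n if n else 0.0
        var = sum((x - mean) ** 2 for x in values) / n if n else 0.0
        self._data = {
            "nval": n,
            "min": values[0] if n else 0.0,
            "max": values[-1] if n else 0.0,
            "mean": mean,
            "median": values[n // 2] if n else 0.0,
            "stddev": math.sqrt(var),
            "sum": sum(values),
        }
        self._format = "%f"

    def getHeaders(self):
        return list(self.HEADERS)

    def setFormat(self, format):
        self._format = format

    def getZScore(self, v):
        # z-score of v against this distribution, as used in example #16
        s = self._data["stddev"]
        return (v - self._data["mean"]) / s if s else 0.0

    def __getitem__(self, key):
        return self._data[key]

    def __str__(self):
        return "\t".join(self._format % self._data[h] for h in self.HEADERS)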
Code example #13
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2wiggle_stats.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("--wiggle-files", dest="wiggle_files", type="string",
                      help="glob expression for wiggle files [%default].")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix to add to contig names before lookup [%default].")

    parser.add_option("-z", "--from-zipped", dest="from_zipped", action="store_true",
                      help="input is zipped.")

    parser.add_option("--test", dest="test", type="int",
                      help="test - stop after # rows of parsing [%default].")

    parser.add_option("--with-values", dest="with_values", action="store_true",
                      help="output values in last column [%default].")

    parser.set_defaults(wiggle_files="*.data.bz2",
                        from_zipped=False,
                        prefix="",
                        with_values=False,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    # open indexed access to wiggles
    wiggle_files = glob.glob(options.wiggle_files)
    if not wiggle_files:
        raise IOError("could not find wiggle files with '%s'" %
                      options.wiggle_files)

    index = Wiggle.WiggleMultiIndexedAccess(wiggle_files,
                                            keep_open=True,
                                            use_cache=False)

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    options.stdout.write(
        "query\tnali\t%s" % ("\t".join(Stats.DistributionalParameters().getHeaders())))
    if options.with_values:
        options.stdout.write("\tvalues")
    options.stdout.write("\n")

    while 1:

        if options.test and ninput >= options.test:
            break

        match = iterator.next()

        if match is None:
            break

        ninput += 1

        if options.loglevel >= 2:
            options.stdlog.write(str(match) + "\n")

        # psl always matches on the forward strand

        map_genome2query = alignlib_lite.py_makeAlignmentBlocks()
        f = alignlib_lite.py_AlignmentFormatBlat("%i\t%i\t%i\t%i\t%s\t%s\t%s\n" % (
            match.mSbjctFrom,
            match.mSbjctTo,
            match.mQueryFrom,
            match.mQueryTo,
            match.mSbjctBlockStarts,
            match.mQueryBlockStarts,
            match.mBlockSizes))
        f.copy(map_genome2query)

        data = index.get(options.prefix + match.mSbjctId,
                         match.mSbjctFrom,
                         match.mSbjctTo)

        values = []
        for x, vv in data:
            for v in vv:
                if map_genome2query.mapRowToCol(x) >= 0:
                    values.append(v)
                x += 1
        if len(values) == 0:
            nskipped += 1
            continue

        noutput += 1

        if options.loglevel >= 2:
            options.stdlog.write(
                "# %s\n" % ",".join(["%5.3f" % v for v in values]))

        s = Stats.DistributionalParameters(values)
        options.stdout.write("%s\t%i\t%s" % (match.mQueryId,
                                             match.mNMismatches +
                                             match.mNMatches,
                                             str(s)))

        if options.with_values:
            options.stdout.write(
                "\t%s" % ",".join(["%5.3f" % v for v in values]))

        options.stdout.write("\n")

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i\n" % (ninput, noutput, nskipped))

    E.Stop()
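The only piece of alignlib_lite behavior the value loop above depends on is the genome-to-query test map_genome2query.mapRowToCol(x) >= 0, i.e. "is genome position x inside an aligned block". A small sketch of that test built directly from the BLAT block coordinates; the helper name and list-based inputs are assumptions for illustration, not the alignlib_lite API:

# Hypothetical helper: a genome position is aligned iff it falls
# inside one of the (start, size) match blocks of the PSL record.
def aligned_positions(sbjct_block_starts, block_sizes):
    covered = set()
    for start, size in zip(sbjct_block_starts, block_sizes):
        covered.update(range(start, start + size))
    return covered

# blocks at 100 and 210 of sizes 5 and 3:
# 102 in aligned_positions([100, 210], [5, 3])  -> True
# 105 in aligned_positions([100, 210], [5, 3])  -> False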
Code example #14
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: tree2stats.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("branchlengths", ),
                      help="methods to apply.")

    parser.set_defaults(
        methods=[],
        filtered_branch_length=-999,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    nexus = TreeTools.Newick2Nexus(sys.stdin)
    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from stdin.\n" %
                             len(nexus.trees))

    ninput = len(nexus.trees)

    nskipped = 0

    for method in options.methods:

        outfile = options.stdout

        if method == "branchlengths":

            outfile.write(
                "tree\t%s\n" %
                "\t".join(Stats.DistributionalParameters().getHeaders()))

            for tree in nexus.trees:
                branchlengths = []
                for node in tree.chain.values():
                    # ignore branch length of root if it is zero
                    if not node.prev and node.data.branchlength == 0: continue

                    if node.data.branchlength == options.filtered_branch_length:
                        continue

                    branchlengths.append(node.data.branchlength)

                s = Stats.DistributionalParameters(branchlengths)
                outfile.write("%s\t%s\n" % (tree.name, str(s)))

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, nskipped=%i\n" % (ninput, nskipped))

    E.Stop()
Code example #15
    def getHeader(self):
        return "id\tcontig\tstart\tend\tsize\tnmatches\tncovered\t%s" % ("\t".join(Stats.DistributionalParameters().getHeaders()))
Code example #16
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/analyze_ribosomes.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--schemas",
                      dest="schemas",
                      type="string",
                      help="schemas in the set.")

    parser.add_option("-e",
                      "--field-extract",
                      dest="field_extract",
                      type="string",
                      help="pattern for the field to extract.")

    parser.add_option("-c",
                      "--field-compare",
                      dest="field_compare",
                      type="string",
                      help="pattern for the field to compare.")

    parser.add_option("-i",
                      "--filename-identifiers",
                      dest="filename_identifiers",
                      type="string",
                      help="identifiers in the positive set.")

    parser.add_option("-u",
                      "--filename-subset",
                      dest="filename_subset",
                      type="string",
                      help="subset in the positive set.")

    parser.add_option("--filter-min-ratio",
                      dest="filter_min_ratio",
                      type="float",
                      help="minimum boundary for filter.")

    parser.add_option("--filter-max-ratio",
                      dest="filter_max_ratio",
                      type="float",
                      help="maximum boundary for filter.")

    parser.add_option(
        "-o",
        "--output-fields",
        dest="output_fields",
        type="string",
        help=
        "output fields, choices are: zscore, val, nvals, sum, min, max, stddev, mean, median."
    )

    parser.add_option(
        "--output-pattern",
        dest="output_pattern",
        type="string",
        help=
        "pattern for table headers, should contain %s for schema and %s for field anme."
    )

    parser.add_option(
        "-f",
        "--output-format",
        dest="output_format",
        type="choice",
        choices=("table", "list", "values"),
        help="output format. Tabular form (one row per ortholog) or list form."
    )

    parser.add_option("--format",
                      dest="format",
                      type="string",
                      help="output format for numbers.")

    parser.add_option("--remove-na",
                      dest="remove_na",
                      action="store_true",
                      help="remove entries with any na values.")

    parser.set_defaults(
        field_extract="%s_length",
        field_compare="%s_length",
        filename_identifiers=None,
        filename_subset=None,
        filter_min_ratio=0.00,
        filter_max_ratio=0.00,
        schemas="",
        output_fields="",
        output_pattern="%s_%s",
        output_format="table",
        format="%6.4f",
        remove_na=False,
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    options.schemas = [x for x in options.schemas.split(",") if x]
    if not options.schemas:
        raise ValueError("please supply schemas.")

    if options.output_fields:
        options.output_fields = options.output_fields.split(",")
    else:
        options.output_fields = ()

    fields, table = CSV.ReadTable(sys.stdin)

    map_fields2column = {}
    for x in fields:
        map_fields2column[x] = len(map_fields2column)

    if options.loglevel >= 1:
        options.stdlog.write("# read a %i x %i table.\n" %
                             (len(table), len(fields)))

    if options.filename_subset:
        subset, nerrors = IOTools.ReadList(open(options.filename_subset, "r"))
        subset = set(subset)

        table = filter(lambda x: x[0] in subset, table)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# subset of %i entries reduced table to a %i x %i table.\n" %
                (len(subset), len(table), len(fields)))

    if options.filename_identifiers:
        identifiers, nerrors = IOTools.ReadList(
            open(options.filename_identifiers, "r"))
    else:
        identifiers = []

    identifiers = set(identifiers)

    # extract rows with positive identifiers
    positive_rows = filter(lambda x: x[0] in identifiers, table)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# subset of %i identifiers gives %i positive entries.\n" %
            (len(identifiers), len(positive_rows)))

    if options.output_format == "table":
        options.stdout.write("id")
        for schema in options.schemas:
            if options.output_fields:
                for field in options.output_fields:
                    options.stdout.write("\t" + options.output_pattern %
                                         (schema, field))
            else:
                options.stdout.write("\t%s" % (schema))

        options.stdout.write("\n")
    else:
        options.stdout.write("schema\tvalue\n")

    if identifiers:
        for row in positive_rows:

            if options.output_format == "table":
                options.stdout.write(row[0])

            for schema in options.schemas:

                # set fields for extraction
                f_extract = map_fields2column[options.field_extract % schema]
                f_compare = map_fields2column[options.field_compare % schema]

                # get region for extraction
                if row[f_compare] != "na":
                    r = float(row[f_compare])
                    if options.filter_min_ratio or options.filter_max_ratio:
                        mi = r * options.filter_min_ratio
                        ma = r * options.filter_max_ratio
                        f = lambda x: (x[f_compare] != "na" and
                                       mi <= float(x[f_compare]) <= ma and
                                       x[0] not in identifiers and
                                       x[f_extract] != "na")
                    else:
                        f = lambda x: (x[0] not in identifiers and
                                       x[f_extract] != "na")
                    # extract values: filter by minimum and maximum range and remove
                    # positive identifiers.
                    v = float(row[f_extract])
                    values = map(lambda x: float(x[f_extract]),
                                 filter(f, table))

                    stats = Stats.DistributionalParameters(values)
                else:
                    v = None

                for field in options.output_fields:

                    if v is not None:
                        if field == "zscore":
                            f = options.format % stats.getZScore(v)
                        elif field == "diff":
                            f = options.format % (v - stats["mean"])
                        elif field == "reldiff":
                            f = options.format % (
                                (v - stats["mean"]) / stats["mean"])
                        elif field == "val":
                            f = options.format % v
                        else:
                            f = options.format % stats[field]
                    else:
                        f = "na"

                    if options.output_format == "table":
                        options.stdout.write("\t%s" % f)
                    elif options.output_format == "list":
                        options.stdout.write("%s\t%s\n" % (schema, f))
                    elif options.output_format == "values":
                        options.stdout.write(
                            "%s\t%s\t%5.2f\t%s\n" %
                            (row[0], schema, v, ",".join(
                                map(lambda x: options.format % x, values))))

            if options.output_format == "table":
                options.stdout.write("\n")

    else:

        extract_columns = []

        for schema in options.schemas:
            extract_columns.append(map_fields2column[options.field_extract %
                                                     schema])

        # simply dump a subset of values
        for row in table:

            skip = False

            if options.filter_min_ratio or options.filter_max_ratio:

                master = options.schemas[0]

                v = row[map_fields2column[options.field_compare % master]]

                if v == "na":
                    continue

                v = float(v)

                mi = v * options.filter_min_ratio
                ma = v * options.filter_max_ratio

                for schema in options.schemas[1:]:

                    r = row[map_fields2column[options.field_compare % schema]]

                    if r == "na":
                        if options.remove_na:
                            skip = True
                        continue

                    r = float(r)

                    if r < mi or r > ma:
                        skip = True
                        if options.loglevel >= 3:
                            if options.format == "table":
                                options.stdout.write("* ")
                                options.stdout.write("%s\t" % row[0])
                                options.stdout.write("\t".join(
                                    [row[y] for y in extract_columns]))
                                options.stdout.write("\n")
                        break

            if skip:
                continue

            if options.output_format == "table":
                options.stdout.write("%s\t" % row[0])
                options.stdout.write("\t".join(
                    [row[y] for y in extract_columns]))
                options.stdout.write("\n")

            elif options.output_format == "list":
                has_na = False
                for x in range(len(options.schemas)):
                    v = row[extract_columns[x]]
                    if v == "na":
                        has_na = True

                if has_na and options.remove_na:
                    continue

                for x in range(len(options.schemas)):
                    options.stdout.write(
                        "%s\t%s\n" %
                        (options.schemas[x], row[extract_columns[x]]))

    E.Stop()
Code example #17
    index = Wiggle.WiggleMultiIndexedAccess(wiggle_files,
                                            keep_open=True,
                                            use_cache=False)

    if options.as_gtf:
        iterator = GTF.flat_gene_iterator(GTF.iterator(sys.stdin))
        id = "gene_id"
    else:
        iterator = GTF.chunk_iterator(GTF.iterator(sys.stdin))
        id = "query"

    ninput, noutput, nskipped = 0, 0, 0

    options.stdout.write(
        "%s\tnali\t%s" %
        (id, "\t".join(Stats.DistributionalParameters().getHeaders())))
    if options.with_values:
        options.stdout.write("\tvalues")
    options.stdout.write("\n")

    for gffs in iterator:

        if options.test and ninput >= options.test:
            break

        ninput += 1

        if options.loglevel >= 2:
            for gff in gffs:
                options.stdlog.write(str(gff) + "\n")
Code example #18
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/predictions2transcripts.py 1841 2008-05-08 12:07:13Z andreas $",
                                    usage = globals()["__doc__"] )
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome."  )

    parser.add_option("-o", "--output-filename-summary", dest="output_filename_summary", type="string",
                      help="filename with summary information."  )

    parser.add_option( "--skip-header", dest="skip_header", action="store_true",
                       help="skip header."  )

    parser.add_option( "--start-codon-boundary", dest="start_codon_boundary", type="int",
                      help="maximum extension for start codon (make divisible by 3)."  )
    
    parser.add_option( "--stop-codon-boundary", dest="stop_codon_boundary", type="int",
                      help="maximum extension for stop codon (make divisible by 3)."  )

    parser.add_option( "--left-extension-mode", dest="left_extension_mode", type="choice",
                       choices=("first-start", "first-stop-backtrack"),
                       help="extension mode for 5' end.")

    parser.add_option( "--fill-introns", dest="fill_introns", type="int",
                      help="fill intron if divisible by three and no stop codon up to a maximum length of #."  )

    parser.add_option( "--introns-max-stops", dest="introns_max_stops", type="int",
                      help="maximum number of stop codons to tolerate within an intron."  )

    parser.add_option( "--output-format", dest="output_format", type="choice",
                       choices=("predictions", "extensions", "filled-introns"),
                      help="output format."  )
    
    parser.set_defaults(
        genome_file = "genome",
        start_codons = ("ATG"),
        stop_codons = ("TAG", "TAA", "TGA"),
        start_codon_boundary = 9999,
        stop_codon_boundary  = 9999,
        fill_introns = 0,
        introns_max_stops = 0,
        left_splice_signals = ("GT",),
        right_splice_signals = ("AG",),
        output_format="extensions",
        left_extension_mode = "first-start",
        skip_header = False,
        output_filename_summary = None,
        )

    (options, args) = E.Start( parser, add_pipe_options = True )

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    options.start_codon_boundary = int(options.start_codon_boundary / 3)
    options.stop_codon_boundary = int(options.stop_codon_boundary / 3)

    fasta = IndexedFasta.IndexedFasta( options.genome_file )
    
    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0
    nfilled = 0
    nseqs_filled = 0
    nseqs_extended = 0
    left_extensions = []
    right_extensions = []
    filled_introns = []

    if not options.skip_header:
        if options.output_format == "predictions":
            options.stdout.write( Prediction.Prediction().getHeader() + "\n" )
        elif options.output_format == "filled-introns":
            options.stdout.write("\t".join( ("prediction_id",
                                             "intron",
                                             "peptide_sequence",
                                             "genomic_sequence") ) + "\n" )

    if options.output_filename_summary:
        outfile_summary = open (options.output_filename_summary, "w" )
        outfile_summary.write( "id\ttype\tnumber\tlength\tfrom\tto\tsequence\n" )
    else:
        outfile_summary = None

    for line in options.stdin:
        
        if line[0] == "#": continue

        ninput += 1
        p.Read(line)

        lsequence = fasta.getLength( p.mSbjctToken )

        genome_from = max( 0, p.mSbjctGenomeFrom - options.start_codon_boundary)
        genome_to = min( lsequence, p.mSbjctGenomeTo + options.stop_codon_boundary)
        
        genomic_sequence = fasta.getSequence( p.mSbjctToken, p.mSbjctStrand,
                                              genome_from,
                                              genome_to ).upper()

        ########################################################################
        ########################################################################
        ########################################################################            
        ## Do extensions
        
        if options.start_codon_boundary or options.stop_codon_boundary:
            
            extension_start = p.mSbjctGenomeFrom - genome_from 
            extension_stop  = genome_to - p.mSbjctGenomeTo
            
            fragment_to = extension_start + p.mSbjctGenomeTo - p.mSbjctGenomeFrom

            lfragment = len(genomic_sequence)

            ########################################################################
            ########################################################################
            ########################################################################            
            ## find start codon
            start = extension_start
            found_start = False
            if options.left_extension_mode == "first-start":

                found_start, start = findCodonReverse( genomic_sequence,
                                                       start,
                                                       options.start_codons,
                                                       options.stop_codons )
                
            elif options.left_extension_mode == "first-stop-backtrack":

                if genomic_sequence[start:start+3] in options.start_codons:
                    found_start = True
                else:
                    found_start, start = findCodonReverse( genomic_sequence,
                                                           start,
                                                           options.stop_codons )
                    
                    if found_start:
                        E.info("prediction %s: stop found at %i (%i) backtracking ..." % ( p.mPredictionId, start, extension_start - start) )
                        
                        ## backtrack to the first start codon
                        found_start = False
                        while start < extension_start:
                            start += 3
                            if genomic_sequence[start:start+3] in options.start_codons:
                                found_start = True
                                break
                        else:
                            start = extension_start

                        if found_start:
                            E.info("start codon found at %i (%i)." % ( start, extension_start - start) )
                        else:
                            E.info("no start codon found." )
                    else:
                        E.info("prediction %s: no stop found ... backtracking to start codon." % ( p.mPredictionId ) )

                        found_start, start = findCodonReverse( genomic_sequence, start, options.start_codons )

                        E.info("prediction %s: no start codon found." % ( p.mPredictionId ) )

            if found_start:
                start += genome_from
            else:
                start = p.mSbjctGenomeFrom

            dstart = p.mSbjctGenomeFrom - start
            
            ########################################################################
            ########################################################################
            ########################################################################            
            ## find stop codon
            ## stop points to the beginning of the codon, thus the stop codon will
            ## not be part of the sequence.
            stop = fragment_to
            found_stop = 0
            while stop < lfragment and \
                      genomic_sequence[stop:stop+3] not in ("NNN", "XXX"):
                if genomic_sequence[stop:stop+3] in options.stop_codons:
                    found_stop = 1
                    break

                stop += 3

            if found_stop:
                stop += genome_from 
            else:
                stop = p.mSbjctGenomeTo

            dstop = stop - p.mSbjctGenomeTo 

            ########################################################################
            ########################################################################
            ########################################################################            
            ## build new prediction
            map_peptide2genome = []
            if dstart: map_peptide2genome.append( ("G", 0, dstart) )
            map_peptide2genome += p.mMapPeptide2Genome
            if dstop: map_peptide2genome.append( ("G", 0, dstop) )

            E.info("prediction %s: extension: found_start=%i, found_stop=%i, left=%i, right=%i" % ( p.mPredictionId, found_start, found_stop, dstart, dstop ) )

            ## save results
            p.mMapPeptide2Genome = map_peptide2genome
            p.mAlignmentString = Genomics.Alignment2String( map_peptide2genome )
            p.mSbjctGenomeFrom -= dstart
            p.mSbjctGenomeTo += dstop
            p.mSbjctFrom += dstart / 3
            p.mSbjctTo += dstart / 3 + dstop / 3            
            
            if dstart or dstop:
                if dstart: left_extensions.append( dstart )
                if dstop: right_extensions.append( dstop )
                
                nseqs_extended += 1

        ## update genomic sequence because borders might have changed.
        genomic_sequence = fasta.getSequence( p.mSbjctToken,
                                              p.mSbjctStrand,
                                              p.mSbjctGenomeFrom,
                                              p.mSbjctGenomeTo ).upper()

        if options.fill_introns:
            
            has_filled = False

            exons = Exons.Alignment2Exons( p.mMapPeptide2Genome,
                                           query_from = 0,
                                           sbjct_from = 0 )

            new_exons = []

            last_e = exons[0]

            nintron = 0

            for e in exons[1:]:

                nintron += 1
                lintron = e.mGenomeFrom - last_e.mGenomeTo
                
                if lintron > options.fill_introns or lintron % 3 != 0:
                    E.debug( "prediction %s: intron %i of size %i discarded." % \
                                 (p.mPredictionId,
                                  nintron, lintron ) )
                    
                    new_exons.append(last_e)
                    last_e = e
                    continue

                ## get sequence, include also residues from split codons
                ## when checking for stop codons.
                if e.mAlignment[0][0] == "S":
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0
                    
                sequence = genomic_sequence[last_e.mGenomeTo - offset_left:e.mGenomeFrom+offset_right]
                
                ## check for splice sites
                for signal in options.left_splice_signals:
                    if sequence[offset_left:offset_left+len(signal)] == signal:
                        left_signal = True
                        break
                else:
                    left_signal = False
                    
                for signal in options.right_splice_signals:
                    ## use an explicit end index: a slice ending at -0 would be empty
                    end = len(sequence) - offset_right
                    if sequence[end - len(signal):end] == signal:
                        right_signal = True
                        break
                else:
                    right_signal = False

                nstops, ngaps = 0, 0
                for codon in [ sequence[x:x+3] for x in range(0,len(sequence),3) ]:
                    if codon in options.stop_codons: nstops += 1
                    if "N" in codon.upper(): ngaps += 1

                ## report totals once per intron, after scanning all codons
                E.debug( "prediction %s: intron %i of size %i (%i-%i) (%s:%s:%i:%i): stops=%i, gaps=%i, signals=%s,%s." % \
                             (p.mPredictionId,
                              nintron, lintron,
                              offset_left, offset_right,
                              p.mSbjctToken, p.mSbjctStrand,
                              p.mSbjctGenomeFrom + last_e.mGenomeTo,
                              p.mSbjctGenomeFrom + e.mGenomeFrom,
                              nstops,
                              ngaps,
                              left_signal, right_signal ) )

                if nstops + ngaps > options.introns_max_stops:
                    new_exons.append(last_e)                                        
                    last_e = e
                    continue
                
                E.info( "prediction %s: filling intron %i of size %i: stops=%i, gaps=%i, signals=%s,%s" % \
                            (p.mPredictionId,
                             nintron, lintron,
                             nstops,
                             ngaps,
                             left_signal, right_signal))

                e.Merge( last_e )
                has_filled = True
                nfilled += 1
                last_e = e

                if options.output_format == "filled-introns":
                    options.stdout.write( "\t".join( map(str, ( p.mPredictionId,
                                                                nintron,
                                                                Genomics.TranslateDNA2Protein( sequence ),
                                                                sequence ) ) ) + "\n" )
                                                                
                
                filled_introns.append(lintron)
                p.mNIntrons -= 1
                
            new_exons.append(last_e)

            if has_filled: nseqs_filled += 1

            Exons.UpdatePeptideCoordinates( new_exons )
            
            p.mMapPeptide2Genome = Exons.Exons2Alignment( new_exons )
            p.mAlignmentString = Genomics.Alignment2String( p.mMapPeptide2Genome )

        ## build translated sequence
        p.mMapPeptide2Translation, p.mTranslation = Genomics.Alignment2PeptideAlignment( \
               p.mMapPeptide2Genome, p.mQueryFrom, 0, genomic_sequence )

        ## output info
        if options.output_format == "predictions":
            options.stdout.write( str(p) + "\n" )
        elif options.output_format == "extensions":
            if found_start: found_start = 1
            if found_stop: found_stop = 1
            options.stdout.write( "\t".join( map(str, ( p.mPredictionId,
                                                        found_start, found_stop, 
                                                        dstart, dstop,
                                                        p.mTranslation,
                                                        p.mSbjctGenomeFrom, p.mSbjctGenomeTo,
                                                        p.mAlignmentString ))) + "\n" )

        noutput += 1
        options.stdout.flush()

    E.info("stats  : %s" % "\t".join(Stats.DistributionalParameters().getHeaders() ))
    E.info("left   : %s" % str(Stats.DistributionalParameters(left_extensions)) )
    E.info("right  : %s" % str(Stats.DistributionalParameters(right_extensions)) )
    E.info("introns: %s" % str(Stats.DistributionalParameters(filled_introns)) )        
    E.info("ninput=%i, noutput=%i, nextended=%i, nfilled=%i, nexons_filled=%i" % (\
            ninput, noutput, nseqs_extended, nseqs_filled, nfilled))
        
    E.Stop()
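The stop-codon search in the extension step above is the script's core in-frame scan: advance three bases at a time, halt at the first stop codon or at masked sequence. The same scan as a standalone helper, a sketch under the script's own assumptions (the scan starts in frame; TAG/TAA/TGA are the stop codons):

def find_stop(sequence, start, stop_codons=("TAG", "TAA", "TGA")):
    """Scan in-frame codons from `start`; return (found, position), where
    position points at the first stop codon or at the masked/short
    tail that ended the scan."""
    pos = start
    while pos < len(sequence) and sequence[pos:pos + 3] not in ("NNN", "XXX"):
        if sequence[pos:pos + 3] in stop_codons:
            return True, pos
        pos += 3
    return False, pos

# find_stop("ATGAAACCCTAAGGG", 0) -> (True, 9)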
Code example #19
                           options.verify_num_iterations, options.verify_fragment_size,                           
                           quiet = True )        

        options.stdout.write("%s\t%i\t%i\t%i\n" % (compression, t, nerrors1, nerrors2 ))
        options.stdout.flush()
        
        dbfiles.append( dbfile )

    ##############################################################################
    ##############################################################################
    ##############################################################################
    ## random sampling of data points
    ##############################################################################    
    options.stdout.write("//\n")
    
    options.stdout.write( "method\tsize\t%s\tvalues\n" % ("\t".join(Stats.DistributionalParameters().getHeaders())))
    options.stdout.flush()
        
    for fragment_size in options.fragment_sizes:

        times = [ [] for x in range(len(options.methods)+1)] 

        for iteration in range(options.num_iterations):

            for x in range(len(options.methods)):

                if options.loglevel >= 1:
                    options.stdlog.write("# fragment_size=%i, iteration=%i/%i, method=%s.\n" % (fragment_size, iteration, options.num_iterations,options.methods[x]) )
                    options.stdlog.flush()

                timer = timeit.Timer( stmt="benchmarkRandomFragment( fasta = fasta, size = %i)" % (fragment_size),