def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: bed2annotator2tsv.py 2885 2010-04-07 08:46:50Z andreas $", 
                             usage = globals()["__doc__"])
    
    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genome."  )

    parser.add_option( "-f", "--features", dest="features", type="string", 
                       help="feature to collect [default=None]."  )

    parser.add_option( "-i", "--files", dest="files", action="append",
                       help="use multiple annotations [default=None]."  )

    parser.add_option(  "-a", "--annotations", dest="annotations", type="string", 
                       help="aggregate name for annotations if only single file is provided from STDIN [default=None]."  )

    parser.add_option(  "--input-filename-map", dest="input_filename_map", type="string", 
                       help="filename with a map of gene_ids to categories [default=None]."  )

    parser.add_option( "-l", "--max-length", dest="max_length", type="string", 
                       help="maximum segment length [default=None]."  )

    parser.add_option( "-m", "--merge", dest="merge", action="store_true",
                       help="merge overlapping bed segments [default=%default]."  )

    parser.add_option( "-s", "--section", dest="section", type="choice", 
                       choices=("segments", "annotations", "workspace" ),
                       help="annotator section [default=None]."  )

    parser.add_option( "--subset", dest="subsets", type="string", action="append",
                       help="add filenames to delimit subsets within the gff files. The syntax is filename.gff,label,filename.ids [default=None]."  )

    parser.set_defaults(
        genome_file = None,
        feature = None,
        remove_random = True,
        section = "segments",
        annotations = "annotations",
        max_length = 100000,
        files = [],
        subsets = [],
        input_filename_map = None,
        merge = False,
        )

    (options, args) = E.Start( parser )

    options.files += args
    if len(options.files) == 0: options.files.append("-")
    options.files = list( itertools.chain( *[ re.split( "[,; ]+", x) for x in options.files ] ) )

    if options.subsets:
        subsets = collections.defaultdict( list )
        for s in options.subsets: 
            filename_gff,label,filename_ids = s.split( "," )
            subsets[filename_gff].append( (label,filename_ids) )
        options.subsets = subsets

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
    else:
        fasta = None

    if options.section == "segments":
        prefix = "##Segs"
    elif options.section == "annotations":
        prefix = "##Id"
    elif options.section == "workspace":
        prefix = "##Work"
    else:
        raise ValueError("unknown section %s" % options.section)

    if options.max_length:
        max_length = options.max_length
    else:
        max_length = 0

    ninput, ntracks, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0, 0

    if options.section in ("annotations"):
        contigs = set()
        it = itertools.groupby( Bed.iterator( options.stdin ), key=lambda x: x.track["name"])

        map_track2segments = {}
        for track, beds in it:
            ntracks += 1
            map_track2segments[track] = []
            first_segment = nsegments

            beds = list(beds)

            if options.merge: beds = Bed.merge( beds )

            for bed in beds:
                contig, start, end = bed.contig, bed.start, bed.end

                if options.remove_random and "random" in contig: continue
                
                if max_length > 0 and end - start > max_length:
                    ndiscarded += 1
                    continue

                contigs.add(contig)
                map_track2segments[track].append( nsegments )
                options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, start, end ) )
                nsegments += 1
                
            options.stdout.write("##Ann\t%s\t%s\n" % (track, "\t".join( ["%i" % x for x in range(first_segment, nsegments) ] ) ) )
            E.info( "track %s: annotated with %i segments" % (track, nsegments - first_segment) )

        ncontigs = len(contigs)
        E.info( "ninput=%i, ntracks=%i, ncontigs=%i, nsegments=%i, ndiscarded=%i" % (ninput, ntracks, ncontigs, nsegments, ndiscarded))

    E.Stop()
Exemple #2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: bed2annotator2tsv.py 2885 2010-04-07 08:46:50Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-f",
                      "--features",
                      dest="features",
                      type="string",
                      help="feature to collect [default=None].")

    parser.add_option("-i",
                      "--files",
                      dest="files",
                      action="append",
                      help="use multiple annotations [default=None].")

    parser.add_option(
        "-a",
        "--annotations",
        dest="annotations",
        type="string",
        help=
        "aggregate name for annotations if only single file is provided from STDIN [default=None]."
    )

    parser.add_option(
        "--input-filename-map",
        dest="input_filename_map",
        type="string",
        help="filename with a map of gene_ids to categories [default=None].")

    parser.add_option("-l",
                      "--max-length",
                      dest="max_length",
                      type="string",
                      help="maximum segment length [default=None].")

    parser.add_option(
        "-m",
        "--merge",
        dest="merge",
        action="store_true",
        help="merge overlapping bed segments [default=%default].")

    parser.add_option("-s",
                      "--section",
                      dest="section",
                      type="choice",
                      choices=("segments", "annotations", "workspace"),
                      help="annotator section [default=None].")

    parser.add_option(
        "--subset",
        dest="subsets",
        type="string",
        action="append",
        help=
        "add filenames to delimit subsets within the gff files. The syntax is filename.gff,label,filename.ids [default=None]."
    )

    parser.set_defaults(
        genome_file=None,
        feature=None,
        remove_random=True,
        section="segments",
        annotations="annotations",
        max_length=100000,
        files=[],
        subsets=[],
        input_filename_map=None,
        merge=False,
    )

    (options, args) = E.Start(parser)

    options.files += args
    if len(options.files) == 0: options.files.append("-")
    options.files = list(
        itertools.chain(*[re.split("[,; ]+", x) for x in options.files]))

    if options.subsets:
        subsets = collections.defaultdict(list)
        for s in options.subsets:
            filename_gff, label, filename_ids = s.split(",")
            subsets[filename_gff].append((label, filename_ids))
        options.subsets = subsets

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.section == "segments":
        prefix = "##Segs"
    elif options.section == "annotations":
        prefix = "##Id"
    elif options.section == "workspace":
        prefix = "##Work"
    else:
        raise ValueError("unknown section %s" % options.section)

    if options.max_length:
        max_length = options.max_length
    else:
        max_length = 0

    ninput, ntracks, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0, 0

    if options.section in ("annotations"):
        contigs = set()
        it = itertools.groupby(Bed.iterator(options.stdin),
                               key=lambda x: x.track["name"])

        map_track2segments = {}
        for track, beds in it:
            ntracks += 1
            map_track2segments[track] = []
            first_segment = nsegments

            beds = list(beds)

            if options.merge: beds = Bed.merge(beds)

            for bed in beds:
                contig, start, end = bed.contig, bed.start, bed.end

                if options.remove_random and "random" in contig: continue

                if max_length > 0 and end - start > max_length:
                    ndiscarded += 1
                    continue

                contigs.add(contig)
                map_track2segments[track].append(nsegments)
                options.stdout.write("%s\t%i\t%s\t(%i,%i)\n" %
                                     (prefix, nsegments, contig, start, end))
                nsegments += 1

            options.stdout.write("##Ann\t%s\t%s\n" % (track, "\t".join(
                ["%i" % x for x in range(first_segment, nsegments)])))
            E.info("track %s: annotated with %i segments" %
                   (track, nsegments - first_segment))

        ncontigs = len(contigs)
        E.info(
            "ninput=%i, ntracks=%i, ncontigs=%i, nsegments=%i, ndiscarded=%i" %
            (ninput, ntracks, ncontigs, nsegments, ndiscarded))

    E.Stop()
Exemple #3
0
    ninput, ntracks, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0, 0

    if options.section in ("annotations"):
        contigs = set()
        it = itertools.groupby( Bed.iterator( options.stdin ), key=lambda x: x.track["name"])

        map_track2segments = {}
        for track, beds in it:
            ntracks += 1
            map_track2segments[track] = []
            first_segment = nsegments

            beds = list(beds)

            if options.merge: beds = Bed.merge( beds )

            for bed in beds:
                contig, start, end = bed.contig, bed.start, bed.end

                if options.remove_random and "random" in contig: continue
                
                if max_length > 0 and end - start > max_length:
                    ndiscarded += 1
                    continue

                contigs.add(contig)
                map_track2segments[track].append( nsegments )
                options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, start, end ) )
                nsegments += 1