Ejemplo n.º 1
0
def _parse_filter_range(spec):
    """Parse a ``--filter-range`` specification.

    Three layouts are tried, most specific first:

    * ``contig:strand:from..to`` (``-`` is accepted instead of ``..``)
    * ``contig:from..to``        (``-`` is accepted instead of ``..``)
    * ``from..to``, ``from,to`` or ``from-to``

    As with ``re.match``, only the beginning of *spec* has to match.

    Returns a tuple ``(contig, strand, (start, end))`` in which ``contig``
    and ``strand`` are ``None`` when the specification does not name them.

    Raises ValueError if *spec* is ``None`` or matches none of the layouts.
    """
    if spec is None:
        raise ValueError("please supply --filter-range when using "
                         "--method=filter-range")

    match = re.match(r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)", spec)
    if match:
        contig, strand, start, _, end = match.groups()
        return contig, strand, (int(start), int(end))

    match = re.match(r"(\S+):(\d+)(\.\.|-)(\d+)", spec)
    if match:
        contig, start, _, end = match.groups()
        return contig, None, (int(start), int(end))

    match = re.match(r"(\d+)(\.\.|,|-)(\d+)", spec)
    if match:
        start, _, end = match.groups()
        return None, None, (int(start), int(end))

    raise ValueError("can not parse range %s" % spec)


def main(argv=None):
    """Script entry point: apply the ``--method`` transformation to GFF/GTF.

    Command line options are parsed from *argv* (``sys.argv`` when *argv*
    is None). Entries are read from ``args.stdin`` through ``GTF.iterator``
    and the transformed entries are written to ``args.stdout``, one per
    line.

    Raises ValueError when a method is missing a required input (contig
    sizes, mapping table, assembly report, genome file, filter range) or
    when such an input cannot be interpreted.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-m",
        "--method",
        dest="method",
        type=str,
        choices=("add-flank", "add-upstream-flank", "add-downstream-flank",
                 "crop", "crop-unique", "complement-groups", "combine-groups",
                 "filter-range", "join-features", "merge-features", "sanitize",
                 "to-forward-coordinates", "to-forward-strand", "rename-chr"),
        help="method to apply ")

    parser.add_argument("--ignore-strand",
                        dest="ignore_strand",
                        help="ignore strand information.",
                        action="store_true")

    parser.add_argument("--is-gtf",
                        dest="is_gtf",
                        action="store_true",
                        help="input will be treated as gtf.")

    parser.add_argument("-c",
                        "--contigs-tsv-file",
                        dest="input_filename_contigs",
                        type=str,
                        help="filename with contig lengths.")

    parser.add_argument(
        "--agp-file",
        dest="input_filename_agp",
        type=str,
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument("--crop-gff-file",
                        dest="filename_crop_gff",
                        type=str,
                        help="GFF/GTF file to crop against.")

    parser.add_argument(
        "--group-field",
        dest="group_field",
        type=str,
        help="gff field/attribute to group by such as gene_id, "
        "transcript_id, ... .")

    parser.add_argument(
        "--filter-range",
        dest="filter_range",
        type=str,
        help="extract all elements overlapping a range. A range is "
        "specified by eithor 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to' .")

    parser.add_argument("--sanitize-method",
                        dest="sanitize_method",
                        type=str,
                        choices=("ucsc", "ensembl", "genome"),
                        help="method to use for sanitizing chromosome names. "
                        ".")

    parser.add_argument(
        "--flank-method",
        dest="flank_method",
        type=str,
        choices=("add", "extend"),
        help="method to use for adding flanks. ``extend`` will "
        "extend existing features, while ``add`` will add new features. "
        ".")

    parser.add_argument("--skip-missing",
                        dest="skip_missing",
                        action="store_true",
                        help="skip entries on missing contigs. Otherwise an "
                        "exception is raised .")

    parser.add_argument(
        "--contig-pattern",
        dest="contig_pattern",
        type=str,
        help="a comma separated list of regular expressions specifying "
        "contigs to be removed when running method sanitize .")

    parser.add_argument(
        "--assembly-report",
        dest="assembly_report",
        type=str,
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize .")

    parser.add_argument(
        "--assembly-report-hasids",
        dest="assembly_report_hasIDs",
        type=int,
        help="set to 1 to build the contig mapping from the ucsc and "
        "ensembl columns of the assembly report. With any other value "
        "the report columns are not used for the mapping .")

    parser.add_argument(
        "--assembly-report-ucsccol",
        dest="assembly_report_ucsccol",
        type=int,
        help="column in the assembly report containing ucsc contig ids"
        ".")

    parser.add_argument(
        "--assembly-report-ensemblcol",
        dest="assembly_report_ensemblcol",
        type=int,
        help="column in the assembly report containing ensembl contig ids")

    parser.add_argument(
        "--assembly-extras",
        dest="assembly_extras",
        type=str,
        help="additional mismatches between gtf and fasta to fix when "
        "sanitizing the genome .")

    parser.add_argument("--extension-upstream",
                        dest="extension_upstream",
                        type=float,
                        help="extension for upstream end .")

    parser.add_argument("--extension-downstream",
                        dest="extension_downstream",
                        type=float,
                        help="extension for downstream end .")

    parser.add_argument("--min-distance",
                        dest="min_distance",
                        type=int,
                        help="minimum distance of features to merge/join .")

    parser.add_argument("--max-distance",
                        dest="max_distance",
                        type=int,
                        help="maximum distance of features to merge/join .")

    parser.add_argument("--min-features",
                        dest="min_features",
                        type=int,
                        help="minimum number of features to merge/join .")

    parser.add_argument("--max-features",
                        dest="max_features",
                        type=int,
                        help="maximum number of features to merge/join .")

    parser.add_argument(
        "--rename-chr-file",
        dest="rename_chr_file",
        type=str,
        help="mapping table between old and new chromosome names. "
        "TAB separated 2-column file.")

    parser.set_defaults(input_filename_contigs=False,
                        filename_crop_gff=None,
                        input_filename_agp=False,
                        genome_file=None,
                        rename_chr_file=None,
                        add_up_flank=None,
                        add_down_flank=None,
                        complement_groups=False,
                        crop=None,
                        crop_unique=False,
                        ignore_strand=False,
                        filter_range=None,
                        min_distance=0,
                        max_distance=0,
                        min_features=1,
                        max_features=0,
                        extension_upstream=1000,
                        extension_downstream=1000,
                        sanitize_method="ucsc",
                        flank_method="add",
                        output_format="%06i",
                        skip_missing=False,
                        is_gtf=False,
                        group_field=None,
                        contig_pattern=None,
                        assembly_report=None,
                        assembly_report_hasIDs=1,
                        assembly_report_ensemblcol=4,
                        assembly_report_ucsccol=9,
                        assembly_extras=None)

    args = E.start(parser, argv=argv)

    contigs = None
    genome_fasta = None
    chr_map = None

    if args.input_filename_contigs:
        contigs = Genomics.readContigSizes(
            iotools.open_file(args.input_filename_contigs, "r"))

    # contig sizes taken from the genome override those from the contigs file
    if args.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(args.genome_file)
        contigs = genome_fasta.getContigSizes()

    if args.rename_chr_file:
        chr_map = {}
        with open(args.rename_chr_file, 'r') as filein:
            reader = csv.reader(filein, delimiter='\t')
            for row in reader:
                if len(row) != 2:
                    raise ValueError(
                        "Mapping table must have exactly two columns")
                chr_map[row[0]] = row[1]
        if not chr_map:
            raise ValueError("Empty mapping dictionnary")

    if args.assembly_report:
        df = pd.read_csv(args.assembly_report,
                         comment="#",
                         header=None,
                         sep="\t")
        # fixes naming inconsistency in assembly report: ensembl chromosome
        # contigs found in column 0, ensembl unassigned contigs found in
        # column 4.
        if args.assembly_report_hasIDs == 1:
            ucsccol = args.assembly_report_ucsccol
            ensemblcol = args.assembly_report_ensemblcol
            df.loc[df[1] == "assembled-molecule",
                   ensemblcol] = df.loc[df[1] == "assembled-molecule", 0]
            if args.sanitize_method == "ucsc":
                assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict()
            elif args.sanitize_method == "ensembl":
                assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict()
            else:
                raise ValueError(''' When using assembly report,
                please specify sanitize method as either
                "ucsc" or "ensembl" to specify direction of conversion
                ''')
        else:
            assembly_dict = {}
        if args.assembly_extras is not None:
            # extras are given as comma separated "old-new" pairs
            for item in args.assembly_extras.split(","):
                item = item.split("-")
                assembly_dict[item[0]] = item[1]

    # These methods look up contig sizes. The names must match the
    # --method choices exactly, otherwise the guard never triggers.
    if args.method in ("to-forward-coordinates", "to-forward-strand",
                       "add-flank", "add-upstream-flank",
                       "add-downstream-flank") \
       and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if args.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(iotools.open_file(args.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(args.stdin)

    if args.method in ("add-upstream-flank", "add-downstream-flank",
                       "add-flank"):

        add_upstream_flank = "add-upstream-flank" == args.method
        add_downstream_flank = "add-downstream-flank" == args.method
        if args.method == "add-flank":
            add_upstream_flank = add_downstream_flank = True

        upstream_flank = int(args.extension_upstream)
        downstream_flank = int(args.extension_downstream)
        extend_flank = args.flank_method == "extend"

        if args.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, args.group_field)

        for chunk in iterator:
            # strand is taken from the first entry before sorting
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            chunk.sort(key=lambda x: (x.contig, x.start))
            lcontig = contigs[chunk[0].contig]

            if extend_flank:
                # grow the outermost existing features, clipped to the contig
                if add_upstream_flank:
                    if is_positive:
                        chunk[0].start = max(0,
                                             chunk[0].start - upstream_flank)
                    else:
                        chunk[-1].end = min(lcontig,
                                            chunk[-1].end + upstream_flank)
                if add_downstream_flank:
                    if is_positive:
                        chunk[-1].end = min(lcontig,
                                            chunk[-1].end + downstream_flank)
                    else:
                        chunk[0].start = max(0,
                                             chunk[0].start - downstream_flank)
            else:
                # add new flank features next to the outermost features
                if add_upstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - upstream_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + upstream_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if add_downstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + downstream_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - downstream_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                args.stdout.write(str(gff) + "\n")

    elif args.method == "complement-groups":

        iterator = GTF.joined_iterator(gffs, group_field=args.group_field)

        for chunk in iterator:
            if args.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort(key=lambda x: (x.contig, x.start))
            # emit the gap between each pair of consecutive features
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                args.stdout.write(str(x) + "\n")
                x.start = c.end

    elif args.method == "combine-groups":

        iterator = GTF.joined_iterator(gffs, group_field=args.group_field)

        for chunk in iterator:
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            args.stdout.write(str(x) + "\n")

    elif args.method in ("join-features", "merge-features"):
        # both methods share combineGFF and differ only in the merge flag
        for gff in combineGFF(gffs,
                              min_distance=args.min_distance,
                              max_distance=args.max_distance,
                              min_features=args.min_features,
                              max_features=args.max_features,
                              merge=args.method == "merge-features",
                              output_format=args.output_format):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "crop":
        for gff in cropGFF(gffs, args.filename_crop_gff):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "crop-unique":
        for gff in cropGFFUnique(gffs):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "filter-range":

        contig, strand, interval = _parse_filter_range(args.filter_range)

        E.debug("filter: contig=%s, strand=%s, interval=%s" %
                (str(contig), str(strand), str(interval)))

        for gff in GTF.iterator_filtered(gffs,
                                         contig=contig,
                                         strand=strand,
                                         interval=interval):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "sanitize":

        def assemblyReport(contig_name):
            """Translate a contig name via the assembly report mapping."""
            if contig_name in assembly_dict:
                return assembly_dict[contig_name]
            # if not in dict, the contig name is forced
            # into the desired convention, this is helpful for user
            # modified gff files that contain additional contigs
            if args.sanitize_method == "ucsc":
                if not contig_name.startswith(("contig", "chr")):
                    return "chr%s" % contig_name
            elif args.sanitize_method == "ensembl":
                if contig_name.startswith("contig"):
                    return contig_name[len("contig"):]
                elif contig_name.startswith("chr"):
                    return contig_name[len("chr"):]
            return contig_name

        if args.sanitize_method == "genome":
            if genome_fasta is None:
                raise ValueError("please specify --genome-file= when using "
                                 "--sanitize-method=genome")
            sanitize_contig = genome_fasta.getToken
        else:
            if args.assembly_report is None:
                raise ValueError(
                    "please specify --assembly-report= when using "
                    "--sanitize-method=ucsc or ensembl")
            sanitize_contig = assemblyReport

        # compile the removal patterns once instead of once per entry
        if args.contig_pattern:
            to_remove = [
                re.compile(x) for x in args.contig_pattern.split(",")
            ]
        else:
            to_remove = []

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        for gff in gffs:
            try:
                gff.contig = sanitize_contig(gff.contig)
            except KeyError:
                if args.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if any(x.search(gff.contig) for x in to_remove):
                filtered_contigs[gff.contig] += 1
                continue

            args.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(skipped_contigs), str(skipped_contigs)))

        if outofrange_contigs:
            E.warn(
                "skipped %i entries on %i contigs because they are out of range: %s"
                % (sum(outofrange_contigs.values()),
                   len(outofrange_contigs), str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(filtered_contigs), str(filtered_contigs)))

    elif args.method == "rename-chr":
        if not chr_map:
            raise ValueError("please supply mapping file")

        for gff in renameChromosomes(gffs, chr_map):
            args.stdout.write(str(gff) + "\n")

    else:

        for gff in gffs:

            # method names must match the --method choices exactly
            if args.method == "to-forward-coordinates":
                gff.invert(contigs[gff.contig])

            elif args.method == "to-forward-strand":
                gff.invert(contigs[gff.contig])
                gff.strand = "+"

            if agp:
                # note: this works only with forward coordinates
                gff.contig, gff.start, gff.end = agp.mapLocation(
                    gff.contig, gff.start, gff.end)

            args.stdout.write(str(gff) + "\n")

    E.stop()
Ejemplo n.º 2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
        """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=(
                          "geneprofile",
                          "tssprofile",
                          "utrprofile",
                          "intervalprofile",
                          "midpointprofile",
                          "geneprofilewithintrons",
                          "geneprofileabsolutedistancefromthreeprimeend",
                          "separateexonprofile",
                          "separateexonprofilewithintrons",
                      ),
                      help='counters to use. Counters describe the '
                      'meta-gene structure to use. '
                      'Note using geneprofilewithintrons, or '
                      'geneprofileabsolutedistancefromthreeprimeend will '
                      'automatically turn on the --use-base-accuracy option'
                      '[%default].')

    parser.add_option("-b",
                      "--bam-file",
                      "--bedfile",
                      "--bigwigfile",
                      dest="infiles",
                      metavar="BAM",
                      type="string",
                      action="append",
                      help="BAM/bed/bigwig files to use. Do not mix "
                      "different types [%default]")

    parser.add_option("-c",
                      "--control-bam-file",
                      dest="controlfiles",
                      metavar="BAM",
                      type="string",
                      action="append",
                      help="control/input to use. Should be of the same "
                      "type as the bam/bed/bigwig file"
                      " [%default]")

    parser.add_option("-g",
                      "--gtf-file",
                      dest="gtffile",
                      type="string",
                      metavar="GTF",
                      help="GTF file to use. "
                      "[%default]")

    parser.add_option("--normalize-transcript",
                      dest="transcript_normalization",
                      type="choice",
                      choices=("none", "max", "sum", "total-max", "total-sum"),
                      help="normalization to apply on each transcript "
                      "profile before adding to meta-gene profile. "
                      "[%default]")

    parser.add_option("--normalize-profile",
                      dest="profile_normalizations",
                      type="choice",
                      action="append",
                      choices=("all", "none", "area", "counts", "background"),
                      help="normalization to apply on meta-gene "
                      "profile normalization. "
                      "[%default]")

    parser.add_option(
        "-r",
        "--reporter",
        dest="reporter",
        type="choice",
        choices=("gene", "transcript"),
        help="report results for genes or transcripts."
        " When 'genes` is chosen, exons across all transcripts for"
        " a gene are merged. When 'transcript' is chosen, counts are"
        " computed for each transcript separately with each transcript"
        " contributing equally to the meta-gene profile."
        " [%default]")

    parser.add_option("-i",
                      "--shift-size",
                      dest="shifts",
                      type="int",
                      action="append",
                      help="shift reads in :term:`bam` formatted file "
                      "before computing densities (ChIP-Seq). "
                      "[%default]")

    parser.add_option("-a",
                      "--merge-pairs",
                      dest="merge_pairs",
                      action="store_true",
                      help="merge pairs in :term:`bam` formatted "
                      "file before computing "
                      "densities (ChIP-Seq). "
                      "[%default]")

    parser.add_option("-u",
                      "--use-base-accuracy",
                      dest="base_accuracy",
                      action="store_true",
                      help="compute densities with base accuracy. The default "
                      "is to only use the start and end of the aligned region "
                      "(RNA-Seq) "
                      "[%default]")

    parser.add_option("-e",
                      "--extend",
                      dest="extends",
                      type="int",
                      action="append",
                      help="extend reads in :term:`bam` formatted file "
                      "(ChIP-Seq). "
                      "[%default]")

    parser.add_option("--resolution-upstream",
                      dest="resolution_upstream",
                      type="int",
                      help="resolution of upstream region in bp "
                      "[%default]")

    parser.add_option("--resolution-downstream",
                      dest="resolution_downstream",
                      type="int",
                      help="resolution of downstream region in bp "
                      "[%default]")

    parser.add_option("--resolution-upstream-utr",
                      dest="resolution_upstream_utr",
                      type="int",
                      help="resolution of upstream UTR region in bp "
                      "[%default]")

    parser.add_option("--resolution-downstream-utr",
                      dest="resolution_downstream_utr",
                      type="int",
                      help="resolution of downstream UTR region in bp "
                      "[%default]")

    parser.add_option("--resolution-cds",
                      dest="resolution_cds",
                      type="int",
                      help="resolution of cds region in bp "
                      "[%default]")

    parser.add_option("--resolution-first-exon",
                      dest="resolution_first",
                      type="int",
                      help="resolution of first exon in gene, in bp"
                      "[%default]")

    parser.add_option("--resolution-last-exon",
                      dest="resolution_last",
                      type="int",
                      help="resolution of last exon in gene, in bp"
                      "[%default]")

    parser.add_option("--resolution-introns",
                      dest="resolution_introns",
                      type="int",
                      help="resolution of introns region in bp "
                      "[%default]")

    parser.add_option("--resolution-exons-absolute-distance-topolya",
                      dest="resolution_exons_absolute_distance_topolya",
                      type="int",
                      help="resolution of exons absolute distance "
                      "topolya in bp "
                      "[%default]")

    parser.add_option("--resolution-introns-absolute-distance-topolya",
                      dest="resolution_introns_absolute_distance_topolya",
                      type="int",
                      help="resolution of introns absolute distance "
                      "topolya in bp "
                      "[%default]")

    parser.add_option("--extension-exons-absolute-distance-topolya",
                      dest="extension_exons_absolute_distance_topolya",
                      type="int",
                      help="extension for exons from the absolute "
                      "distance from the topolya in bp "
                      "[%default]")

    parser.add_option(
        "--extension-introns-absolute-distance-topolya",
        dest="extension_introns_absolute_distance_topolya",
        type="int",
        help="extension for introns from the absolute distance from "
        "the topolya in bp [%default]")

    parser.add_option("--extension-upstream",
                      dest="extension_upstream",
                      type="int",
                      help="extension upstream from the first exon in bp"
                      "[%default]")

    parser.add_option("--extension-downstream",
                      dest="extension_downstream",
                      type="int",
                      help="extension downstream from the last exon in bp"
                      "[%default]")

    parser.add_option("--extension-inward",
                      dest="extension_inward",
                      type="int",
                      help="extension inward from a TSS start site in bp"
                      "[%default]")

    parser.add_option("--extension-outward",
                      dest="extension_outward",
                      type="int",
                      help="extension outward from a TSS start site in bp"
                      "[%default]")

    parser.add_option("--scale-flank-length",
                      dest="scale_flanks",
                      type="int",
                      help="scale flanks to (integer multiples of) gene length"
                      "[%default]")

    parser.add_option(
        "--control-factor",
        dest="control_factor",
        type="float",
        help="factor for normalizing control and foreground data. "
        "Computed from data if not set. "
        "[%default]")

    parser.add_option("--output-all-profiles",
                      dest="output_all_profiles",
                      action="store_true",
                      help="keep individual profiles for each "
                      "transcript and output. "
                      "[%default]")

    parser.add_option("--counts-tsv-file",
                      dest="input_filename_counts",
                      type="string",
                      help="filename with count data for each transcript. "
                      "Use this instead "
                      "of recomputing the profile. Useful for plotting the "
                      "meta-gene profile "
                      "from previously computed counts "
                      "[%default]")

    parser.add_option(
        "--background-region-bins",
        dest="background_region_bins",
        type="int",
        help="number of bins on either end of the profile "
        "to be considered for background meta-gene normalization "
        "[%default]")

    parser.add_option(
        "--output-res",
        dest="resolution_images",
        type="int",
        help="the output dpi for the figure plot - will default to "
        "[%default]")

    parser.add_option(
        "--image-format",
        dest="image_format",
        type="string",
        help="The output format for the figure plot - defaults to "
        "[%default]")

    parser.set_defaults(
        remove_rna=False,
        ignore_pairs=False,
        force_output=False,
        bin_size=10,
        extends=[],
        shifts=[],
        sort=[],
        reporter="transcript",
        resolution_cds=1000,
        resolution_introns=1000,
        # 3kb is a good balance of seeing long enough 3 prime bias and not omit
        # too many genes. Tim 31th Aug 2013
        resolution_exons_absolute_distance_topolya=3000,
        # introns are only used to assess the noise level and thus do not need
        # a long region; a long region has the side effect of omitting more
        # genes. Tim 31st Aug 2013
        resolution_introns_absolute_distance_topolya=500,
        # extension can simply just be the same as resolution
        extension_exons_absolute_distance_topolya=3000,
        extension_introns_absolute_distance_topolya=500,
        resolution_upstream_utr=1000,
        resolution_downstream_utr=1000,
        resolution_upstream=1000,
        resolution_downstream=1000,
        resolution_first=1000,
        resolution_last=1000,
        # mean length of transcripts: about 2.5 kb
        extension_upstream=2500,
        extension_downstream=2500,
        extension_inward=3000,
        extension_outward=3000,
        plot=True,
        methods=[],
        infiles=[],
        controlfiles=[],
        gtffile=None,
        profile_normalizations=[],
        transcript_normalization=None,
        scale_flanks=0,
        merge_pairs=False,
        min_insert_size=0,
        max_insert_size=1000,
        base_accuracy=False,
        matrix_format="single",
        control_factor=None,
        output_all_profiles=False,
        background_region_bins=10,
        input_filename_counts=None,
        resolution_images=None,
        image_format="png",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # Keep for backwards compatibility
    if len(args) == 2:
        infile, gtf = args
        options.infiles.append(infile)
        options.gtffile = gtf

    if not options.gtffile:
        raise ValueError("no GTF file specified")

    if options.gtffile == "-":
        options.gtffile = options.stdin
    else:
        options.gtffile = iotools.open_file(options.gtffile)

    if len(options.infiles) == 0:
        raise ValueError("no bam/wig/bed files specified")

    for methodsRequiresBaseAccuracy in [
            "geneprofilewithintrons",
            "geneprofileabsolutedistancefromthreeprimeend",
    ]:
        # If you implement a method for which spliced-out introns or exons
        # must not appear to be covered by non-existent reads, let that
        # method imply --base-accuracy by adding it to the list above.
        if methodsRequiresBaseAccuracy in options.methods:
            options.base_accuracy = True

    if options.reporter == "gene":
        gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(options.gtffile))
    elif options.reporter == "transcript":
        gtf_iterator = GTF.transcript_iterator(GTF.iterator(options.gtffile))

    # Select rangecounter based on file type
    if len(options.infiles) > 0:
        if options.infiles[0].endswith(".bam"):
            bamfiles = [pysam.AlignmentFile(x, "rb") for x in options.infiles]

            if options.controlfiles:
                controlfiles = [
                    pysam.AlignmentFile(x, "rb") for x in options.controlfiles
                ]
            else:
                controlfiles = None

            format = "bam"
            if options.merge_pairs:
                range_counter = geneprofile.RangeCounterBAM(
                    bamfiles,
                    shifts=options.shifts,
                    extends=options.extends,
                    merge_pairs=options.merge_pairs,
                    min_insert_size=options.min_insert_size,
                    max_insert_size=options.max_insert_size,
                    controfiles=controlfiles,
                    control_factor=options.control_factor)

            elif options.shifts or options.extends:
                range_counter = geneprofile.RangeCounterBAM(
                    bamfiles,
                    shifts=options.shifts,
                    extends=options.extends,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)

            elif options.base_accuracy:
                range_counter = geneprofile.RangeCounterBAMBaseAccuracy(
                    bamfiles,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)
            else:
                range_counter = geneprofile.RangeCounterBAM(
                    bamfiles,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)

        elif options.infiles[0].endswith(".bed.gz"):
            bedfiles = [pysam.Tabixfile(x) for x in options.infiles]

            if options.controlfiles:
                controlfiles = [
                    pysam.Tabixfile(x) for x in options.controlfiles
                ]
            else:
                controlfiles = None

            range_counter = geneprofile.RangeCounterBed(
                bedfiles,
                controlfiles=controlfiles,
                control_factor=options.control_factor)

        elif options.infiles[0].endswith(".bw"):
            wigfiles = [BigWigFile(file=open(x)) for x in options.infiles]
            range_counter = geneprofile.RangeCounterBigWig(wigfiles)

        else:
            raise NotImplementedError("can't determine file type for %s" %
                                      str(options.infiles))

    counters = []
    for method in options.methods:
        if method == "utrprofile":
            counters.append(
                geneprofile.UTRCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_upstream_utr,
                    options.resolution_cds,
                    options.resolution_downstream_utr,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream,
                ))

        elif method == "geneprofile":
            counters.append(
                geneprofile.GeneCounter(
                    range_counter, options.resolution_upstream,
                    options.resolution_cds, options.resolution_downstream,
                    options.extension_upstream, options.extension_downstream,
                    options.scale_flanks))

        elif method == "geneprofilewithintrons":
            counters.append(
                geneprofile.GeneCounterWithIntrons(
                    range_counter, options.resolution_upstream,
                    options.resolution_cds, options.resolution_introns,
                    options.resolution_downstream, options.extension_upstream,
                    options.extension_downstream, options.scale_flanks))

        elif method == "geneprofileabsolutedistancefromthreeprimeend":
            # options.extension_exons_absolute_distance_tostartsite,
            # options.extension_introns_absolute_distance_tostartsite,
            # Tim 31st Aug 2013: a possible future feature, if five prime
            # bias is of interest to you (you would need to create another
            # class). It is not very difficult to derive from this class,
            # but it is not implemented yet. This future feature is slightly
            # different from the TSS profile already implemented, because
            # in this future feature introns are skipped.
            counters.append(
                geneprofile.GeneCounterAbsoluteDistanceFromThreePrimeEnd(
                    range_counter, options.resolution_upstream,
                    options.resolution_downstream,
                    options.resolution_exons_absolute_distance_topolya,
                    options.resolution_introns_absolute_distance_topolya,
                    options.extension_upstream, options.extension_downstream,
                    options.extension_exons_absolute_distance_topolya,
                    options.extension_introns_absolute_distance_topolya,
                    options.scale_flanks))

        elif method == "tssprofile":
            counters.append(
                geneprofile.TSSCounter(range_counter,
                                       options.extension_outward,
                                       options.extension_inward))

        elif method == "intervalprofile":
            counters.append(
                geneprofile.RegionCounter(range_counter,
                                          options.resolution_upstream,
                                          options.resolution_cds,
                                          options.resolution_downstream,
                                          options.extension_upstream,
                                          options.extension_downstream))

        elif method == "midpointprofile":
            counters.append(
                geneprofile.MidpointCounter(range_counter,
                                            options.resolution_upstream,
                                            options.resolution_downstream,
                                            options.extension_upstream,
                                            options.extension_downstream))

        # add new method to split 1st and last exons out
        # requires a representative transcript for each gene
        # gtf should be sorted gene-position
        elif method == "separateexonprofile":
            counters.append(
                geneprofile.SeparateExonCounter(
                    range_counter, options.resolution_upstream,
                    options.resolution_first, options.resolution_last,
                    options.resolution_cds, options.resolution_downstream,
                    options.extension_upstream, options.extension_downstream))

        elif method == "separateexonprofilewithintrons":
            counters.append(
                geneprofile.SeparateExonWithIntronCounter(
                    range_counter, options.resolution_upstream,
                    options.resolution_first, options.resolution_last,
                    options.resolution_cds, options.resolution_introns,
                    options.resolution_downstream, options.extension_upstream,
                    options.extension_downstream))

    # set normalization
    for c in counters:
        c.setNormalization(options.transcript_normalization)
        if options.output_all_profiles:
            c.setOutputProfiles(
                iotools.open_file(
                    E.get_output_file(c.name) + ".profiles.tsv.gz", "w"))

    if options.input_filename_counts:
        # read counts from file
        E.info("reading counts from %s" % options.input_filename_counts)
        all_counts = pandas.read_csv(iotools.open_file(
            options.input_filename_counts),
                                     sep='\t',
                                     header=0,
                                     index_col=0)

        if len(counters) != 1:
            raise NotImplementedError(
                'counting from matrix only implemented for 1 counter.')
        # build counter based on reference counter
        counter = geneprofile.UnsegmentedCounter(counters[0])
        counters = [counter]
        geneprofile.countFromCounts(counters, all_counts)

    else:
        E.info("starting counting with %i counters" % len(counters))
        feature_names = geneprofile.countFromGTF(counters, gtf_iterator)

    # output matrices
    if not options.profile_normalizations:
        options.profile_normalizations.append("none")
    elif "all" in options.profile_normalizations:
        options.profile_normalizations = [
            "none", "area", "counts", "background"
        ]

    for method, counter in zip(options.methods, counters):
        profiles = []
        for norm in options.profile_normalizations:
            # build matrix, apply normalization
            profile = counter.getProfile(
                normalize=norm,
                background_region_bins=options.background_region_bins)
            profiles.append(profile)

        for x in range(1, len(profiles)):
            assert profiles[0].shape == profiles[x].shape

        # build a single matrix of all profiles for output
        matrix = numpy.concatenate(profiles)
        matrix.shape = len(profiles), len(profiles[0])
        matrix = matrix.transpose()

        with iotools.open_file(
                E.get_output_file(counter.name) + ".matrix.tsv.gz",
                "w") as outfile:
            outfile.write("bin\tregion\tregion_bin\t%s\n" %
                          "\t".join(options.profile_normalizations))
            fields = []
            bins = []
            for field, nbins in zip(counter.fields, counter.nbins):
                fields.extend([field] * nbins)
                bins.extend(list(range(nbins)))

            for row, cols in enumerate(zip(fields, bins, matrix)):
                outfile.write("%i\t%s\t" %
                              (row, "\t".join([str(x) for x in cols[:-1]])))
                outfile.write("%s\n" % ("\t".join([str(x) for x in cols[-1]])))

        with iotools.open_file(
                E.get_output_file(counter.name) + ".lengths.tsv.gz",
                "w") as outfile:
            counter.writeLengthStats(outfile)

        if options.output_all_profiles:
            counter.closeOutputProfiles()

    if options.plot:

        import matplotlib
        # avoid Tk or any X
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        for method, counter in zip(options.methods, counters):

            if method in ("geneprofile", "geneprofilewithintrons",
                          "geneprofileabsolutedistancefromthreeprimeend",
                          "utrprofile", "intervalprofile",
                          "separateexonprofile",
                          "separateexonprofilewithintrons"):

                plt.figure()
                plt.subplots_adjust(wspace=0.05)
                max_scale = max([max(x) for x in counter.aggregate_counts])

                for x, counts in enumerate(counter.aggregate_counts):
                    plt.subplot(6, 1, x + 1)
                    plt.plot(list(range(len(counts))), counts)
                    plt.title(counter.fields[x])
                    plt.ylim(0, max_scale)

                figname = counter.name + ".full"

                fn = E.get_output_file(figname) + "." + options.image_format
                plt.savefig(os.path.expanduser(fn),
                            format=options.image_format,
                            dpi=options.resolution_images)

                plt.figure()

                points = []
                cuts = []
                for x, counts in enumerate(counter.aggregate_counts):
                    points.extend(counts)
                    cuts.append(len(counts))

                plt.plot(list(range(len(points))), points)

                xx, xxx = 0, []
                for x in cuts:
                    xxx.append(xx + x // 2)
                    xx += x
                    plt.axvline(xx, color="r", ls="--")

                plt.xticks(xxx, counter.fields)

                figname = counter.name + ".detail"

                fn = E.get_output_file(figname) + "." + options.image_format
                plt.savefig(os.path.expanduser(fn),
                            format=options.image_format,
                            dpi=options.resolution_images)

            elif method == "tssprofile":

                plt.figure()
                plt.subplot(1, 3, 1)
                plt.plot(
                    list(
                        range(-options.extension_outward,
                              options.extension_inward)),
                    counter.aggregate_counts[0])
                plt.title(counter.fields[0])
                plt.subplot(1, 3, 2)
                plt.plot(
                    list(
                        range(-options.extension_inward,
                              options.extension_outward)),
                    counter.aggregate_counts[1])
                plt.title(counter.fields[1])
                plt.subplot(1, 3, 3)
                plt.title("combined")
                plt.plot(
                    list(
                        range(-options.extension_outward,
                              options.extension_inward)),
                    counter.aggregate_counts[0])
                plt.plot(
                    list(
                        range(-options.extension_inward,
                              options.extension_outward)),
                    counter.aggregate_counts[1])
                plt.legend(counter.fields[:2])

                fn = E.get_output_file(
                    counter.name) + "." + options.image_format
                plt.savefig(os.path.expanduser(fn),
                            format=options.image_format,
                            dpi=options.resolution_images)

            elif method == "midpointprofile":

                plt.figure()
                plt.plot(numpy.arange(-options.resolution_upstream, 0),
                         counter.aggregate_counts[0])
                plt.plot(numpy.arange(0, options.resolution_downstream),
                         counter.aggregate_counts[1])

                fn = E.get_output_file(
                    counter.name) + "." + options.image_format
                plt.savefig(os.path.expanduser(fn),
                            format=options.image_format,
                            dpi=options.resolution_images)

    # write footer and output benchmark information.
    E.stop()