Example #1
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-fastq-file",
                      dest="input_fastq_file",
                      type="string",
                      help="input fastq file. "
                      "[%default]")

    parser.add_option(
        "--output-removed-tsv",
        dest="output_removed_tsv",
        type="string",
        help="if given, sequence identifiers of removed sequences will "
        "be stored in this file [%default]")

    parser.add_option(
        "--output-stats-tsv",
        dest="output_stats_tsv",
        type="string",
        help="if given, output statistics will be written to this file. "
        "[%default]")

    parser.add_option("--output-removed-fastq",
                      dest="output_removed_fastq",
                      type="string",
                      help="if given, removed fastq records will "
                      "be stored in this file [%default]")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      action="append",
                      type="choice",
                      choices=("filter-N", "filter-identifier", "filter-ONT",
                               "offset-quality", "apply", "change-format",
                               "renumber-reads", "sample", "sort", "trim3",
                               "trim5", "unique", "reverse-complement",
                               "grep"),
                      help="methods to apply [%default]")

    parser.add_option("--set-prefix",
                      dest="set_prefix",
                      type="string",
                      help="set sequence prefix [%default]")

    parser.add_option("--input-filter-tsv",
                      dest="input_filter_tsv",
                      type="string",
                      help="list of sequence ides to filter [%default]")

    parser.add_option("--min-average-quality",
                      dest="min_average_quality",
                      type="float",
                      help="minimum average quality [%default]")

    parser.add_option("--min-sequence-length",
                      dest="min_sequence_length",
                      type="int",
                      help="minimum sequence length [%default]")

    parser.add_option("--quality-offset",
                      dest="quality_offset",
                      type="int",
                      help="offset to modify quality values with [%default]")

    parser.add_option("--target-format",
                      dest="target_format",
                      type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer',
                               'illumina-1.8'),
                      help="guess quality score format and set quality scores "
                      "to format [default=%default].")

    parser.add_option(
        "--guess-format",
        dest="guess_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option(
        "--sample-size",
        dest="sample_size",
        type="float",
        help="proportion of reads to sample. "
        "Provide a proportion of reads to sample, e.g. 0.1 for 10%, "
        "0.5 for 50%, etc [default=%default].")

    parser.add_option("--pair-fastq-file",
                      dest="pair",
                      type="string",
                      help="if data is paired, filename with second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option(
        "--map-tsv-file",
        dest="map_tsv_file",
        type="string",
        help="filename with tab-separated identifiers mapping for "
        "method apply [default=%default].")

    parser.add_option("--num-bases",
                      dest="nbases",
                      type="int",
                      help="number of bases to trim [default=%default].")

    parser.add_option(
        "--seed",
        dest="seed",
        type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--pattern-identifier",
        dest="renumber_pattern",
        type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.add_option(
        "--grep-pattern",
        dest="grep_pattern",
        type="string",
        help="subset to reads matching pattern [default=%default]")

    parser.set_defaults(
        input_fastq_file="-",
        methods=[],
        change_format=None,
        guess_format=None,
        sample_size=0.1,
        nbases=0,
        pair=None,
        apply=None,
        seed=None,
        renumber_pattern="read_%010i",
        grep_pattern=".*",
        max_percent_N=10.0,
        set_prefix=None,
        output_removed_tsv=None,
        output_removed_fastq=None,
        output_stats_tsv=None,
        input_filter_tsv=None,
        min_average_quality=0,
        min_sequence_length=0,
        quality_offset=0,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) == 1:
        options.input_fastq_file = args[0]

    if len(options.methods) == 0:
        raise ValueError("no method specified, please use --method")

    # this script combines two scripts with different functionalities
    # TODO: to be sanitized
    if options.methods[0] in [
            "apply", "change-format", "renumber-reads", "sample", "sort",
            "trim3", "trim5", "unique", "reverse-complement", "grep"
    ]:
        options.method = options.methods[0]
        counter = process_cgat(options)
    else:
        counter = process_daisy(options)

    E.info(counter)
    E.stop()
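
# Hypothetical invocation (the actual script name is not shown in this
# listing); it samples ~10% of reads using the options defined above:
#   python this_script.py --method=sample --sample-size=0.1 --seed=42 \
#       --input-fastq-file=in.fastq > sampled.fastq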
Example #2
def main(argv=None):
    """script main.

    parses command line args in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser()

    parser.add_argument("--version",
                        action='version',
                        version='%(prog)s {version}'.format(version="1.0"))

    parser.add_argument("-m",
                        "--method",
                        dest="methods",
                        type=str,
                        action="append",
                        choices=("transpose", "normalize-by-max",
                                 "normalize-by-value", "multiply-by-value",
                                 "percentile", "remove-header",
                                 "normalize-by-table", "upper-bound",
                                 "lower-bound", "kullback-leibler", "expand",
                                 "compress", "fdr", "grep", "randomize-rows"),
                        help="""actions to perform on table.""")

    parser.add_argument("-s",
                        "--scale",
                        dest="scale",
                        type=float,
                        help="factor to scale matrix by.")

    parser.add_argument("-f",
                        "--format",
                        dest="format",
                        type=str,
                        help="output number format")

    parser.add_argument("-p",
                        "--parameters",
                        dest="parameters",
                        type=str,
                        help="Parameters for various functions.")

    parser.add_argument("-t",
                        "--header-names",
                        dest="has_headers",
                        action="store_true",
                        help="matrix has row/column headers.")

    parser.add_argument("--transpose",
                        dest="transpose",
                        action="store_true",
                        help="transpose table.")

    parser.add_argument(
        "--set-transpose-field",
        dest="set_transpose_field",
        type=str,
        help="set first field (row 1 and col 1) to this value [%default].")

    parser.add_argument("--transpose-format",
                        dest="transpose_format",
                        type=str,
                        choices=(
                            "default",
                            "separated",
                        ),
                        help="input format of un-transposed table")

    parser.add_argument(
        "--expand",
        dest="expand_table",
        action="store_true",
        help="expand table - multi-value cells with be expanded over "
        "several rows.")

    parser.add_argument("--no-headers",
                        dest="has_headers",
                        action="store_false",
                        help="matrix has no row/column headers.")

    parser.add_argument("--columns",
                        dest="columns",
                        type=str,
                        help="columns to use.")

    parser.add_argument("--file",
                        dest="file",
                        type=str,
                        help="columns to test from table.",
                        metavar="FILE")

    parser.add_argument("-d",
                        "--delimiter",
                        dest="delimiter",
                        type=str,
                        help="delimiter of columns.",
                        metavar="DELIM")

    parser.add_argument("-V",
                        "--invert-match",
                        dest="invert_match",
                        action="store_true",
                        help="invert match.")

    parser.add_argument("--sort-by-rows",
                        dest="sort_rows",
                        type=str,
                        help="output order for rows.")

    parser.add_argument("-a",
                        "--value",
                        dest="value",
                        type=float,
                        help="value to use for various algorithms.")

    parser.add_argument(
        "--group",
        dest="group_column",
        type=int,
        help="group values by column. Supply an integer column ")

    parser.add_argument("--group-function",
                        dest="group_function",
                        type=str,
                        choices=("min", "max", "sum", "mean", "stats", "cat",
                                 "uniq"),
                        help="function to group values by.")

    parser.add_argument("--join-table",
                        dest="join_column",
                        type=int,
                        help="join rows in a table by columns.")

    parser.add_argument(
        "--collapse-table",
        dest="collapse_table",
        type=str,
        help="collapse a table. Value determines the missing variable ")

    parser.add_argument("--join-column-name",
                        dest="join_column_name",
                        type=int,
                        help="use this column as a prefix.")

    parser.add_argument("--flatten-table",
                        dest="flatten_table",
                        action="store_true",
                        help="flatten a table.")

    parser.add_argument("--as-column",
                        dest="as_column",
                        action="store_true",
                        help="output table as a single column.")

    parser.add_argument("--split-fields",
                        dest="split_fields",
                        action="store_true",
                        help="split fields.")

    parser.add_argument("--separator",
                        dest="separator",
                        type=str,
                        help="separator for multi-valued fields.")

    parser.add_argument(
        "--fdr-method",
        dest="fdr_method",
        type=str,
        choices=("BH", "bonferroni", "holm", "hommel", "hochberg", "BY"),
        help="method to perform multiple testing correction by controlling "
        "the fdr.")

    parser.add_argument(
        "--fdr-add-column",
        dest="fdr_add_column",
        type=str,
        help="add new column instead of replacing existing columns. "
        "The value of the option will be used as prefix if there are "
        "multiple columns")

    # IMS: add option to use a column as the row id in flatten
    parser.add_argument(
        "--id-column",
        dest="id_column",
        type=str,
        help="list of column(s) to use as the row id when flattening "
        "the table. If None, then row number is used.")

    parser.add_argument(
        "--variable-name",
        dest="variable_name",
        type=str,
        help="the column header for the 'variable' column when flattening ")

    parser.add_argument(
        "--value-name",
        dest="value_name",
        type=str,
        help="the column header for the 'value' column when flattening ")

    parser.set_defaults(
        methods=[],
        scale=1.0,
        has_headers=True,
        format=None,
        value=0.0,
        parameters="",
        columns="all",
        transpose=False,
        set_transpose_field=None,
        transpose_format="default",
        group=False,
        group_column=0,
        group_function="mean",
        missing_value="na",
        sort_rows=None,
        flatten_table=False,
        collapse_table=None,
        separator=";",
        expand=False,
        join_column=None,
        join_column_name=None,
        compute_fdr=None,
        as_column=False,
        fdr_method="BH",
        fdr_add_column=None,
        id_column=None,
        variable_name="column",
        value_name="value",
        file=None,
        delimiter="\t",
        invert_match=False,
    )

    (args, unknown) = E.start(parser, unknowns=True)

    args.parameters = args.parameters.split(",")

    if args.group_column:
        args.group = True
        args.group_column -= 1

    ######################################################################
    ######################################################################
    ######################################################################
    # if only to remove header, do this quickly
    if args.methods == ["remove-header"]:

        first = True
        for line in args.stdin:
            if line[0] == "#":
                continue
            if first:
                first = False
                continue
            args.stdout.write(line)

    elif args.transpose or "transpose" in args.methods:

        read_and_transpose_table(args.stdin, args)

    elif args.flatten_table:
        # IMS: bug fixed to make work. Also added options for keying
        # on a particular and adding custom column headings

        fields, table = CSV.readTable(args.stdin,
                                      with_header=args.has_headers,
                                      as_rows=True)

        args.columns = get_columns(fields, args.columns)

        if args.id_column:
            id_columns = [int(x) - 1 for x in args.id_column.split(",")]
            id_header = "\t".join(
                [fields[id_column] for id_column in id_columns])
            args.columns = [x for x in args.columns if x not in id_columns]
        else:
            id_header = "row"

        args.stdout.write("%s\t%s\t%s\n" %
                          (id_header, args.variable_name, args.value_name))
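
        # Illustration (hypothetical input): with header ["A", "B"], one data
        # row ["1", "2"] and no --id-column, the loop below emits
        # "0\tA\t1" and "0\tB\t2" under the header "row\tcolumn\tvalue".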

        for x, row in enumerate(table):

            if args.id_column:
                row_id = "\t".join(
                    [row[int(x) - 1] for x in args.id_column.split(",")])
            else:
                row_id = str(x)

            for y in args.columns:
                args.stdout.write("%s\t%s\t%s\n" % (row_id, fields[y], row[y]))

    elif args.as_column:

        fields, table = CSV.readTable(args.stdin,
                                      with_header=args.has_headers,
                                      as_rows=True)
        args.columns = get_columns(fields, args.columns)
        table = list(zip(*table))

        args.stdout.write("value\n")

        for column in args.columns:
            args.stdout.write("\n".join(table[column]) + "\n")

    elif args.split_fields:

        # split comma separated fields
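        # e.g. with separator ";" a (hypothetical) row ["a;b", "1"] is
        # expanded into two output rows, "a\t1" and "b\t1", by the
        # itertools.product call below.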
        fields, table = CSV.readTable(args.stdin,
                                      with_header=args.has_headers,
                                      as_rows=True)

        args.stdout.write("%s\n" % ("\t".join(fields)))

        for row in table:
            row = [x.split(args.separator) for x in row]
            for d in itertools.product(*row):
                args.stdout.write("%s\n" % "\t".join(d))

    elif args.group:
        read_and_group_table(args.stdin, args)

    elif args.join_column:
        read_and_join_table(args.stdin, args)

    elif args.expand_table:
        read_and_expand_table(args.stdin, args)

    elif args.collapse_table is not None:
        read_and_collapse_table(args.stdin, args, args.collapse_table)

    elif "randomize-rows" in args.methods:
        read_and_randomize_rows(args.stdin, args)

    elif "grep" in args.methods:

        args.columns = [int(x) - 1 for x in args.columns.split(",")]

        patterns = []

        if args.file:
            infile = iotools.open_file(args.file, "r")
            for line in infile:
                if line[0] == "#":
                    continue
                patterns.append(line[:-1].split(args.delimiter)[0])
        else:
            # fall back to patterns given as remaining command-line arguments
            patterns = unknown

        for line in args.stdin:

            data = line[:-1].split(args.delimiter)
            found = False

            for c in args.columns:

                if data[c] in patterns:
                    found = True
                    break

            if (not found
                    and args.invert_match) or (found
                                               and not args.invert_match):
                print(line[:-1])
    else:

        ######################################################################
        ######################################################################
        ######################################################################
        # Apply remainder of transformations
        fields, table = CSV.readTable(args.stdin,
                                      with_header=args.has_headers,
                                      as_rows=False)
        # convert columns to list
        table = [list(x) for x in table]

        ncols = len(fields)
        if len(table) == 0:
            raise ValueError("table is empty")

        nrows = len(table[0])

        E.info("processing table with %i rows and %i columns" % (nrows, ncols))

        args.columns = get_columns(fields, args.columns)

        # convert all values to float
        for c in args.columns:
            for r in range(nrows):
                try:
                    table[c][r] = float(table[c][r])
                except ValueError:
                    continue

        for method in args.methods:

            if method == "normalize-by-value":

                value = float(args.parameters[0])
                del args.parameters[0]

                for c in args.columns:
                    table[c] = [x / value for x in table[c]]

            elif method == "multiply-by-value":

                value = float(args.parameters[0])
                del args.parameters[0]

                for c in args.columns:
                    table[c] = [x * value for x in table[c]]

            elif method == "normalize-by-max":

                for c in args.columns:
                    m = max(table[c])
                    table[c] = [x / m for x in table[c]]

            elif method == "kullback-leibler":
                args.stdout.write("category1\tcategory2\tkl1\tkl2\tmean\n")
                format = args.format
                if format is None:
                    format = "%f"

                for x in range(0, len(args.columns) - 1):
                    for y in range(x + 1, len(args.columns)):
                        c1 = args.columns[x]
                        c2 = args.columns[y]
                        e1 = 0
                        e2 = 0
                        for z in range(nrows):
                            p = table[c1][z]
                            q = table[c2][z]
                            e1 += p * math.log(p / q)
                            e2 += q * math.log(q / p)

                        args.stdout.write("%s\t%s\t%s\t%s\t%s\n" %
                                          (fields[c1], fields[c2], format % e1,
                                           format % e2, format %
                                           ((e1 + e2) / 2)))
                E.stop()
                sys.exit(0)

            elif method == "rank":

                for c in args.columns:
                    tt = table[c]
                    t = list(zip(tt, list(range(nrows))))
                    t.sort()
                    for i, n in zip([x[1] for x in t], list(range(nrows))):
                        tt[i] = n
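
                # e.g. a (hypothetical) column [0.3, 0.1, 0.2] is replaced in
                # place by its ranks [2, 0, 1] by the sort-based loop above.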

            elif method in ("lower-bound", "upper-bound"):

                boundary = float(args.parameters[0])
                del args.parameters[0]
                new_value = float(args.parameters[0])
                del args.parameters[0]

                if method == "upper-bound":
                    for c in args.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] > boundary:
                                table[c][r] = new_value
                else:
                    for c in args.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] < boundary:
                                table[c][r] = new_value

            elif method == "fdr":
                pvalues = []
                for c in args.columns:
                    pvalues.extend(table[c])

                assert max(pvalues) <= 1.0, "pvalues > 1 in table: max=%s" % \
                    str(max(pvalues))
                assert min(pvalues) >= 0, "pvalue < 0 in table: min=%s" % \
                    str(min(pvalues))

                # convert to str to avoid test for float downstream
                qvalues = list(
                    map(str,
                        Stats.adjustPValues(pvalues, method=args.fdr_method)))

                if args.fdr_add_column is None:
                    x = 0
                    for c in args.columns:
                        table[c] = qvalues[x:x + nrows]
                        x += nrows
                else:
                    # add new column headers
                    if len(args.columns) == 1:
                        fields.append(args.fdr_add_column)
                    else:
                        for co in args.columns:
                            fields.append(args.fdr_add_column + fields[co])

                    x = 0
                    for c in args.columns:
                        # add a new column
                        table.append(qvalues[x:x + nrows])
                        x += nrows
                    ncols += len(args.columns)

            elif method == "normalize-by-table":

                other_table_name = args.parameters[0]
                del args.parameters[0]
                other_fields, other_table = CSV.readTable(
                    iotools.open_file(other_table_name, "r"),
                    with_header=args.has_headers,
                    as_rows=False)

                # convert all values to float
                for c in args.columns:
                    for r in range(nrows):
                        try:
                            other_table[c][r] = float(other_table[c][r])
                        except ValueError:
                            continue

                # set 0s to 1 in the other matrix
                for c in args.columns:
                    for r in range(nrows):
                        if isinstance(table[c][r], float) and \
                                isinstance(other_table[c][r], float) and \
                                other_table[c][r] != 0:
                            table[c][r] /= other_table[c][r]
                        else:
                            table[c][r] = args.missing_value

        # convert back
        if args.format is not None:
            for c in args.columns:
                for r in range(nrows):
                    if isinstance(table[c][r], float):
                        table[c][r] = args.format % table[c][r]

        args.stdout.write("\t".join(fields) + "\n")
        if args.sort_rows:
            old2new = {}
            for r in range(nrows):
                old2new[table[0][r]] = r
            for x in args.sort_rows.split(","):
                if x not in old2new:
                    continue
                r = old2new[x]
                args.stdout.write(
                    "\t".join(map(str, [table[c][r]
                                        for c in range(ncols)])) + "\n")
        else:
            for r in range(nrows):
                args.stdout.write(
                    "\t".join(map(str, [table[c][r]
                                        for c in range(ncols)])) + "\n")

    E.stop()
Example #3
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--summarise",
                      dest="summarise",
                      type="choice",
                      choices=("level-counts", "taxa-counts", "individual"),
                      help="summarise the taxa counts - no. phyla etc")

    parser.add_option("--output-map",
                      dest="output_map",
                      action="store_true",
                      help="ouput map of taxonomy")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if options.output_map:
        found = []
        options.stdout.write("""Domain\t \
        kingdom\t \
        phylum\t \
        class\t \
        order\t \
        family\t \
        genus\t \
        species\n""")
        # only output the mapping file - do not continue
        # summarise regardless of the specified options
        for lca in LCA.iterate(options.stdin):

            # if bacteria or archaea the kingdom will
            # be the domain
            if lca.domain == "Bacteria" or lca.domain == "Archaea":
                kingdom = lca.domain
            else:
                kingdom = lca.kingdom

            hierarchy = [
                lca.domain, kingdom, lca.phylum, lca._class, lca.order,
                lca.family, lca.genus, lca.species
            ]
            if hierarchy in found:
                continue
            else:
                found.append(hierarchy)
                options.stdout.write("\t".join(hierarchy) + "\n")
        return

    if options.summarise == "level-counts":
        level_counts = collections.defaultdict(set)
        total = 0
        nreads_domain = 0
        nreads_kingdom = 0
        nreads_kingdom_plus = 0
        nreads_phylum = 0
        nreads_phylum_plus = 0
        nreads_class = 0
        nreads_class_plus = 0
        nreads_order = 0
        nreads_order_plus = 0
        nreads_family = 0
        nreads_family_plus = 0
        nreads_genus = 0
        nreads_genus_plus = 0
        nreads_species = 0
        nreads_species_plus = 0
        nreads_subspecies = 0
        nreads_subspecies_plus = 0

        c = E.Counter()
        for lca in LCA.iterate(options.stdin):
            total += 1
            if lca.domain != "NA":
                nreads_domain += 1
                level_counts["domain"].add(lca.domain)
            else:
                c.domain_unmapped += 1

            if lca.kingdom != "NA":
                nreads_kingdom += 1
                level_counts["kingdom"].add(lca.kingdom)
            else:
                c.kingdom_unmapped += 1

            if lca.kingdom_plus != "NA":
                nreads_kingdom_plus += 1
                level_counts["kingdom+"].add(lca.kingdom_plus)
            else:
                c.kingdom_plus_unmapped += 1

            if lca.phylum != "NA":
                nreads_phylum += 1
                level_counts["phylum"].add(lca.phylum)
            else:
                c.phylum_unmapped += 1

            if lca.phylum_plus != "NA":
                nreads_phylum_plus += 1
                level_counts["phylum+"].add(lca.phylum_plus)
            else:
                c.phylum_plus_unmapped += 1

            if lca._class != "NA":
                nreads_class += 1
                level_counts["class"].add(lca._class)
            else:
                c.class_unmapped += 1

            if lca._class_plus != "NA":
                nreads_class_plus += 1
                level_counts["class+"].add(lca._class_plus)
            else:
                c.class_plus_unmapped += 1

            if lca.order != "NA":
                nreads_order += 1
                level_counts["order"].add(lca.order)
            else:
                c.order_unmapped += 1

            if lca.order_plus != "NA":
                nreads_order_plus += 1
                level_counts["order+"].add(lca.order_plus)
            else:
                c.order_plus_unmapped += 1

            if lca.family != "NA":
                nreads_family += 1
                level_counts["family"].add(lca.family)
            else:
                c.family_unmapped += 1

            if lca.family != "NA":
                nreads_family_plus == 1
                level_counts["family+"].add(lca.family_plus)
            else:
                c.family_plus_unmapped += 1

            if lca.genus != "NA":
                nreads_genus += 1
                level_counts["genus"].add(lca.genus)
            else:
                c.genus_unmapped += 1

            if lca.genus_plus != "NA":
                nreads_genus_plus += 1
                level_counts["genus+"].add(lca.genus_plus)
            else:
                c.genus_plus_unmapped += 1

            if lca.species != "NA":
                nreads_species += 1
                level_counts["species"].add(lca.species)
            else:
                c.species_unmapped += 1

            if lca.species_plus != "NA":
                nreads_species_plus += 1
                level_counts["species+"].add(lca.species_plus)
            else:
                c.species_plus_unmapped += 1

            # removed subspecies mapping for the time
            # being

            # if lca.subspecies != "NA":
            #     nreads_subspecies += 1
            #     level_counts["subspecies"].add(lca.subspecies)
            # else:
            #     c.subspecies_unmapped += 1

            # if lca.subspecies_plus != "NA":
            #     nreads_subspecies_plus += 1
            #     level_counts["subspecies+"].add(lca.subspecies_plus)
            # else:
            #     c.subspecies_plus_unmapped += 1

        options.stdout.write("\t".join([
            "ndomain", "nkingdom", "nkingdom+", "nphylum", "nphylum+",
            "nclass", "nclass+", "norder", "norder+", "nfamily", "nfamily+",
            "ngenus", "ngenus+", "nspecies", "nspecies+", "nseqkingdom",
            "nseqkingdom+", "nseqphylum", "nseqphylum+", "nseqclass",
            "nseqclass+", "nseqorder", "nseqorder+", "nseqfamily",
            "nseqfamily+", "nseqgenus", "nseqgenus+", "nseqspecies",
            "nseqspecies+"
        ]) + "\n")

        options.stdout.write("\t".join(
            map(str, [
                len(level_counts["domain"]),
                len(level_counts["kingdom"]),
                len(level_counts["kingdom+"]),
                len(level_counts["phylum"]),
                len(level_counts["phylum+"]),
                len(level_counts["class"]),
                len(level_counts["class+"]),
                len(level_counts["order"]),
                len(level_counts["order+"]),
                len(level_counts["family"]),
                len(level_counts["family+"]),
                len(level_counts["genus"]),
                len(level_counts["genus+"]),
                len(level_counts["species"]),
                len(level_counts["species+"]), nreads_domain, nreads_kingdom,
                nreads_phylum, nreads_phylum_plus, nreads_class,
                nreads_class_plus, nreads_order, nreads_order_plus,
                nreads_family, nreads_family_plus, nreads_genus,
                nreads_genus_plus, nreads_species, nreads_species_plus
            ])) + "\n")
    elif options.summarise == "taxa-counts":
        unmapped = collections.defaultdict(int)
        total = 0
        taxa_counts = {
            "domain": collections.defaultdict(int),
            "kingdom": collections.defaultdict(int),
            "kingdom+": collections.defaultdict(int),
            "phylum": collections.defaultdict(int),
            "phylum+": collections.defaultdict(int),
            "class": collections.defaultdict(int),
            "class+": collections.defaultdict(int),
            "order": collections.defaultdict(int),
            "order+": collections.defaultdict(int),
            "family": collections.defaultdict(int),
            "family+": collections.defaultdict(int),
            "genus": collections.defaultdict(int),
            "genus+": collections.defaultdict(int),
            "species": collections.defaultdict(int),
            "species+": collections.defaultdict(int)
        }

        c = E.Counter()
        for lca in LCA.iterate(options.stdin):
            total += 1
            if lca.domain != "NA":
                taxa_counts["domain"][lca.domain] += 1
            else:
                c.domain_unmapped += 1
                unmapped["domain"] += 1
            if lca.kingdom != "NA":
                taxa_counts["kingdom"][lca.kingdom] += 1
            else:
                c.kingdom_unmapped += 1
                unmapped["kingdom"] += 1
            if lca.kingdom_plus != "NA":
                taxa_counts["kingdom+"][lca.kingdom_plus] += 1
            else:
                c.kingdom_plus_unmapped += 1
                unmapped["kingdom+"] += 1
            if lca.phylum != "NA":
                taxa_counts["phylum"][lca.phylum] += 1
            else:
                c.phylum_unmapped += 1
                unmapped["phylum"] += 1
            if lca.phylum_plus != "NA":
                taxa_counts["phylum+"][lca.phylum_plus] += 1
            else:
                c.phylum_plus_unmapped += 1
                unmapped["phylum+"] += 1
            if lca._class != "NA":
                taxa_counts["class"][lca._class] += 1
            else:
                c.class_unmapped += 1
                unmapped["class"] += 1
            if lca._class_plus != "NA":
                taxa_counts["class+"][lca._class_plus] += 1
            else:
                c.class_plus_unmapped += 1
                unmapped["class+"] += 1
            if lca.order != "NA":
                taxa_counts["order"][lca.order] += 1
            else:
                c.order_unmapped += 1
                unmapped["order"] += 1
            if lca.order_plus != "NA":
                taxa_counts["order+"][lca.order_plus] += 1
            else:
                c.order_plus_unmapped += 1
                unmapped["order+"] += 1
            if lca.family != "NA":
                taxa_counts["family"][lca.family] += 1
            else:
                c.family_unmapped += 1
                unmapped["family"] += 1
            if lca.family_plus != "NA":
                taxa_counts["family+"][lca.family_plus] += 1
            else:
                c.family_plus_unmapped += 1
                unmapped["family+"] += 1
            if lca.genus != "NA":
                taxa_counts["genus"][lca.genus] += 1
            else:
                c.genus_unmapped += 1
                unmapped["genus"] += 1
            if lca.genus_plus != "NA":
                taxa_counts["genus+"][lca.genus_plus] += 1
            else:
                c.genus_plus_unmapped += 1
                unmapped["genus+"] += 1
            if lca.species != "NA":
                taxa_counts["species"][lca.species] += 1
            else:
                c.species_unmapped += 1
                unmapped["species"] += 1
            if lca.species_plus != "NA":
                taxa_counts["species+"][lca.species_plus] += 1
            else:
                c.species_plus_unmapped += 1
                unmapped["species+"] += 1

        options.stdout.write("level\ttaxa\tcount\tproportion\trpm\n")
        for level, taxa_count in sorted(taxa_counts.items()):
            total_level = total - unmapped[level]
            for taxa, count in sorted(taxa_count.items()):
                options.stdout.write("\t".join([
                    level, taxa,
                    str(count), "{:.8}".format(float(count) /
                                               total_level), "{:.8}".
                    format(float(count) / (float(total_level) / 1000000))
                ]) + "\n")

        E.info(c)

    elif options.summarise == "individual":
        # each read is output with its respective
        # taxon assignments
        options.stdout.write("\t".join([
            "id", "domain", "kingdom", "kingdom+", "phylum", "phylum+",
            "class", "class+", "order", "order+", "family", "family+", "genus",
            "genus+", "species", "species+"
        ]) + "\n")
        for lca in LCA.iterate(options.stdin):
            options.stdout.write("\t".join([
                lca.identifier, lca.domain, lca.kingdom, lca.kingdom_plus,
                lca.phylum, lca.phylum_plus, lca._class, lca._class_plus,
                lca.order, lca.order_plus, lca.family, lca.family_plus,
                lca.genus, lca.genus_plus, lca.species, lca.species_plus
            ]) + "\n")

    # write footer and output benchmark information.
    E.stop()
Example #4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument(
        "--input-filename-fasta",
        dest="input_filename_fasta",
        type=str,
        help="filename with reference sequence in fasta format ")

    parser.add_argument(
        "--counting-mode",
        dest="counting_mode",
        type=str,
        choices=("all", "pileup_defaults"),
        help="counting mode. all=all reads/bases. pileup-defaults= "
        "use default pileup thresholds. Options will be added to "
        "--mpileup-options. .")

    parser.add_argument("--mpileup-options",
                        dest="mpileup_options",
                        type=str,
                        help="pileup options to use ")

    parser.set_defaults(
        mpileup_options="",
        counting_mode="all",
        input_filename_fasta=None,
        report_step=1000000,
    )

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser,
                              argv=argv,
                              add_output_options=True,
                              unknowns=True)

    # the positional BAM filename is taken from the unparsed arguments
    bamfile = unknown[0]

    mpileup_options = args.mpileup_options

    if args.counting_mode == "all":
        mpileup_options += " -Q 0 -B -A"

    read_depth_histogram = collections.defaultdict(int)
    base_depth_histogram = collections.defaultdict(int)

    # deletions are marked by something like -2AA at the first
    # position and a '*' for subsequent positions
    rx_deletions = re.compile("([-][0-9]+|[*])")
    report_step = args.report_step
    npositions = 0

    samtools = iotools.which("samtools")

    statement = ("{samtools} mpileup "
                 "-f {reference_fasta} "
                 "{mpileup_options} "
                 "{bamfile} ".format(samtools=samtools,
                                     reference_fasta=args.input_filename_fasta,
                                     mpileup_options=mpileup_options,
                                     bamfile=os.path.abspath(bamfile)))
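
    # With counting_mode == "all" the rendered command has the form (paths
    # hypothetical): samtools mpileup -f ref.fa  -Q 0 -B -A /abs/path/in.bam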

    E.info("running the following statement: {}".format(statement))

    cmd_args = shlex.split(statement)
    proc = subprocess.Popen(cmd_args,
                            shell=False,
                            stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            cwd=os.path.abspath(os.curdir))

    for line in proc.stdout:
        line = line.decode("utf-8")
        contig, pos, base, read_depth, info, qualities = line[:-1].split("\t")
        read_depth = int(read_depth)
        pos = int(pos)

        if pos % report_step == 0:
            E.info("working on {}: {}".format(contig, pos))

        ndeletions = len(rx_deletions.findall(info))
        base_depth = read_depth - ndeletions

        read_depth_histogram[read_depth] += 1
        base_depth_histogram[base_depth] += 1

    for line in proc.stderr:
        E.warn(line)

    keys = sorted(
        set(read_depth_histogram.keys()).union(base_depth_histogram.keys()))

    args.stdout.write("depth\tread_depth_positions\tbase_depth_positions\n")
    for key in keys:
        args.stdout.write("{}\t{}\t{}\n".format(key, read_depth_histogram[key],
                                                base_depth_histogram[key]))

    E.info("positions tested: {}".format(sum(read_depth_histogram.values())))
    E.stop()
Example #5
# Fragment from a larger script: `bamfile` (a pysam.AlignmentFile), `options`
# and `E` are assumed to be set up by the surrounding code; the counters and
# the length histogram (fragment sizes 0..1000 bp) are initialised here.
all_total = 0
total = 0
lengthdist = [0.0] * 1001

for read in bamfile.fetch(until_eof=True):
    all_total += 1

    if all_total % 1000000 == 0:
        E.debug("Done %s reads; %s used" % (all_total, total))

    if read.is_unmapped:
        continue

    if read.has_tag("NH") and read.get_tag("NH") != 1:
        continue

    if read.is_read2:
        continue

    flen = read.template_length

    total += 1

    if abs(flen) <= 1000:
        lengthdist[abs(flen)] += 1.0

bamfile.close()

lengthdist = [f / total for f in lengthdist]

options.stdout.write("\t".join(map(str, lengthdist)) + "\n")
E.info("Used %s reads out of %s to built distribution. Total weight = %s " %
       (total, all_total, sum(lengthdist)))
E.stop()
Example #6
def run_workflow(options, args, pipeline=None):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    pipeline: object
        pipeline to run. If not given, all ruffus pipelines are run.

    """
    logger = logging.getLogger("cgatcore.pipeline")

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    if options.force_run:
        if options.force_run == "all":
            forcedtorun_tasks = ruffus.pipeline_get_task_names()
        else:
            forcedtorun_tasks = options.pipeline_targets
    else:
        forcedtorun_tasks = []

    # create local scratch if it does not already exist. Note that the
    # directory itself will not be deleted, while its contents should
    # be cleaned up.
    if not os.path.exists(get_params()["tmpdir"]):
        logger.warning(
            "local temporary directory {} did not exist - created".format(
                get_params()["tmpdir"]))
        try:
            os.makedirs(get_params()["tmpdir"])
        except OSError:
            # file exists
            pass

    logger.debug("temporary directory is {}".format(get_params()["tmpdir"]))

    # set multiprocess to a sensible setting if there is no cluster
    run_on_cluster = HAS_DRMAA is True and not options.without_cluster
    if options.multiprocess is None:
        if not run_on_cluster:
            options.multiprocess = int(
                math.ceil(multiprocessing.cpu_count() / 2.0))
        else:
            options.multiprocess = 40
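
    # e.g. on an 8-core host without a DRMAA-capable cluster this defaults to
    # ceil(8 / 2) = 4 concurrent local jobs; with a cluster it defaults to 40.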

    # see inputValidation function in Parameters.py
    if options.input_validation:
        input_validation(get_params(), sys.argv[0])

    elif options.pipeline_action == "debug":
        # create the session proxy
        start_session()

        method_name = options.pipeline_targets[0]
        caller = get_caller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "state", "svg", "plot",
                                     "dot", "touch", "regenerate"):

        messenger = None
        try:
            with cache_os_functions():
                if options.pipeline_action == "make":

                    if (not options.without_cluster and not HAS_DRMAA
                            and not get_params()['testing']):
                        E.critical(
                            "DRMAA API not found so cannot talk to a cluster.")
                        E.critical("Please use --local to run the pipeline"
                                   " on this host: {}".format(os.uname()[1]))
                        sys.exit(-1)

                    # get tasks to be done. This essentially replicates
                    # the state information within ruffus.
                    stream = StringIO()
                    ruffus.pipeline_printout(
                        stream,
                        options.pipeline_targets,
                        verbose=5,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                    messenger = LoggingFilterProgress(stream.getvalue())
                    logger.addFilter(messenger)

                    global task
                    if options.without_cluster:
                        # use ThreadPool to avoid taking multiple CPU for pipeline
                        # controller.
                        opts = {"multithread": options.multiprocess}
                    else:
                        # use cooperative multitasking instead of multiprocessing.
                        opts = {
                            "multiprocess": options.multiprocess,
                            "pool_manager": "gevent"
                        }
                        # create the session proxy
                        start_session()

                    logger.info("current directory is {}".format(os.getcwd()))

                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        logger=logger,
                        verbose=options.loglevel,
                        log_exceptions=options.log_exceptions,
                        exceptions_terminate_immediately=options.
                        exceptions_terminate_immediately,
                        checksum_level=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        one_second_per_job=False,
                        **opts)

                    close_session()

                elif options.pipeline_action == "show":
                    ruffus.pipeline_printout(
                        options.stdout,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "touch":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=True,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "regenerate":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        verbose=options.loglevel)

                elif options.pipeline_action == "svg":
                    ruffus.pipeline_printout_graph(
                        options.stdout.buffer,
                        options.pipeline_format,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "state":
                    ruffus.ruffus_return_dag(
                        options.stdout,
                        target_tasks=options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "plot":
                    outf, filename = tempfile.mkstemp()
                    ruffus.pipeline_printout_graph(
                        os.fdopen(outf, "wb"),
                        options.pipeline_format,
                        options.pipeline_targets,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)
                    execute("inkscape %s" % filename)
                    os.unlink(filename)

        except ruffus.ruffus_exceptions.RethrownJobError as ex:

            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(ex.args))
                for idx, e in enumerate(ex.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # this seems to be errors originating within ruffus
                        # such as a missing dependency
                        # msg then contains a RethrownJobError
                        msg = str(msg)
                    else:
                        task = re.sub("__main__.", "", task)
                        job = re.sub(r"\s", "", job)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.pipeline_logfile)

                logger.error("start of all error messages")
                logger.error(ex)
                logger.error("end of all error messages")

                raise ValueError("pipeline failed with %i errors" %
                                 len(ex.args)) from ex
            else:
                raise

    elif options.pipeline_action == "dump":
        options.stdout.write((json.dumps(get_params())) + "\n")

    elif options.pipeline_action == "printconfig":
        E.info("printing out pipeline parameters: ")
        p = get_params()
        for k in sorted(get_params()):
            print(k, "=", p[k])
        print_config_files()

    elif options.pipeline_action == "config":
        # Level needs to be 2:
        # 0th level -> cgatflow.py
        # 1st level -> Control.py
        # 2nd level -> pipeline_xyz.py
        f = sys._getframe(2)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        write_config_files(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clone_pipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.stop(logger=get_logger())
Example #7
def main(argv=None):
    """script main.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("bedgraph", "wiggle", "bigbed", "bigwig",
                               "bed"),
                      help="output format [default=%default]")

    parser.add_option("-s",
                      "--shift-size",
                      dest="shift",
                      type="int",
                      help="shift reads by a certain amount (ChIP-Seq) "
                      "[%default]")

    parser.add_option("-e",
                      "--extend",
                      dest="extend",
                      type="int",
                      help="extend reads by a certain amount "
                      "(ChIP-Seq) [%default]")

    parser.add_option("-p",
                      "--wiggle-span",
                      dest="span",
                      type="int",
                      help="span of a window in wiggle tracks "
                      "[%default]")

    parser.add_option("-m",
                      "--merge-pairs",
                      dest="merge_pairs",
                      action="store_true",
                      help="merge paired-ended reads into a single "
                      "bed interval [default=%default].")

    parser.add_option("--scale-base",
                      dest="scale_base",
                      type="float",
                      help="number of reads/pairs to scale bigwig file to. "
                      "The default is to scale to 1M reads "
                      "[default=%default]")

    parser.add_option("--scale-method",
                      dest="scale_method",
                      type="choice",
                      choices=(
                          "none",
                          "reads",
                      ),
                      help="scale bigwig output. 'reads' will normalize by "
                      "the total number reads in the bam file that are used "
                      "to construct the bigwig file. If --merge-pairs is used "
                      "the number of pairs output will be used for "
                      "normalization. 'none' will not scale the bigwig file"
                      "[default=%default]")

    parser.add_option("--max-insert-size",
                      dest="max_insert_size",
                      type="int",
                      help="only merge if insert size less that "
                      "# bases. 0 turns of this filter "
                      "[default=%default].")

    parser.add_option("--min-insert-size",
                      dest="min_insert_size",
                      type="int",
                      help="only merge paired-end reads if they are "
                      "at least # bases apart. "
                      "0 turns of this filter. [default=%default]")

    parser.set_defaults(
        samfile=None,
        output_format="wiggle",
        shift=0,
        extend=0,
        span=1,
        merge_pairs=None,
        min_insert_size=0,
        max_insert_size=0,
        scale_method='none',
        scale_base=1000000,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) >= 1:
        options.samfile = args[0]
    if len(args) == 2:
        options.output_filename_pattern = args[1]
    if not options.samfile:
        raise ValueError("please provide a bam file")

    # Read BAM file using Pysam
    samfile = pysam.AlignmentFile(options.samfile, "rb")

    # Create temporary files / folders
    tmpdir = tempfile.mkdtemp()
    E.debug("temporary files are in %s" % tmpdir)
    tmpfile_wig = os.path.join(tmpdir, "wig")
    tmpfile_sizes = os.path.join(tmpdir, "sizes")

    # Create dictionary of contig sizes
    contig_sizes = dict(list(zip(samfile.references, samfile.lengths)))
    # write contig sizes
    outfile_size = iotools.open_file(tmpfile_sizes, "w")
    for contig, size in sorted(contig_sizes.items()):
        outfile_size.write("%s\t%s\n" % (contig, size))
    outfile_size.close()

    # Shift and extend only available for bigwig format
    if options.shift or options.extend:
        if options.output_format != "bigwig":
            raise ValueError(
                "shift and extend only available for bigwig output")

    # Output filename required for bigwig / bigbed computation
    if options.output_format == "bigwig":
        if not options.output_filename_pattern:
            raise ValueError(
                "please specify an output file for bigwig computation.")

        # Define executable to use for binary conversion
        if options.output_format == "bigwig":
            executable_name = "wigToBigWig"
        else:
            raise ValueError("unknown output format `%s`" %
                             options.output_format)

        # check required executable file is in the path
        executable = iotools.which(executable_name)
        if not executable:
            raise OSError("could not find %s in path." % executable_name)

        # Open output file
        outfile = iotools.open_file(tmpfile_wig, "w")
        E.info("starting output to %s" % tmpfile_wig)
    else:
        outfile = iotools.open_file(tmpfile_wig, "w")
        E.info("starting output to stdout")

    # Set up output write functions
    if options.output_format in ("wiggle", "bigwig"):
        # wiggle is one-based, so add 1, also step-size is 1, so need
        # to output all bases
        if options.span == 1:
            outf = lambda outfile, contig, start, end, val: \
                outfile.write(
                    "".join(["%i\t%i\n" % (x, val)
                             for x in range(start + 1, end + 1)]))
        else:
            outf = SpanWriter(options.span)
    elif options.output_format == "bedgraph":
        # bed is 0-based, open-closed
        outf = lambda outfile, contig, start, end, val: \
            outfile.write("%s\t%i\t%i\t%i\n" % (contig, start, end, val))

    # initialise counters
    ninput, nskipped, ncontigs = 0, 0, 0

    # set output file name
    output_filename_pattern = options.output_filename_pattern
    if output_filename_pattern:
        output_filename = os.path.abspath(output_filename_pattern)

    # shift and extend or merge pairs. Output temporary bed file
    if options.shift > 0 or options.extend > 0 or options.merge_pairs:
        # Workflow 1: convert to bed intervals and use bedtools
        # genomecov to build a coverage file.
        # Convert to bigwig with UCSC tools bedGraph2BigWig

        if options.merge_pairs:
            # merge pairs using bam2bed
            E.info("merging pairs to temporary file")
            counter = merge_pairs(samfile,
                                  outfile,
                                  min_insert_size=options.min_insert_size,
                                  max_insert_size=options.max_insert_size,
                                  bed_format=3)
            E.info("merging results: {}".format(counter))
            if counter.output == 0:
                raise ValueError("no pairs output after merging")
        else:
            # create bed file with shifted/extended tags
            shift, extend = options.shift, options.extend
            shift_extend = shift + extend
            counter = E.Counter()

            for contig in samfile.references:
                E.debug("output for %s" % contig)
                lcontig = contig_sizes[contig]

                for read in samfile.fetch(contig):
                    pos = read.pos
                    if read.is_reverse:
                        start = max(0, read.pos + read.alen - shift_extend)
                    else:
                        start = max(0, read.pos + shift)

                    # intervals extending beyond contig are removed
                    if start >= lcontig:
                        continue

                    end = min(lcontig, start + extend)
                    outfile.write("%s\t%i\t%i\n" % (contig, start, end))
                    counter.output += 1

        outfile.close()

        if options.scale_method == "reads":
            scale_factor = float(options.scale_base) / counter.output

            E.info("scaling: method=%s scale_quantity=%i scale_factor=%f" %
                   (options.scale_method, counter.output, scale_factor))
            scale = "-scale %f" % scale_factor
        else:
            scale = ""

        # Convert bed file to coverage file (bedgraph)
        tmpfile_bed = os.path.join(tmpdir, "bed")
        E.info("computing coverage")
        # calculate coverage - format is bedgraph
        statement = """bedtools genomecov -bg -i %(tmpfile_wig)s %(scale)s
        -g %(tmpfile_sizes)s > %(tmpfile_bed)s""" % locals()
        E.run(statement)

        # Convert bedgraph to bigwig
        E.info("converting to bigwig")
        tmpfile_sorted = os.path.join(tmpdir, "sorted")
        statement = ("sort -k 1,1 -k2,2n %(tmpfile_bed)s > %(tmpfile_sorted)s;"
                     "bedGraphToBigWig %(tmpfile_sorted)s %(tmpfile_sizes)s "
                     "%(output_filename_pattern)s" % locals())
        E.run(statement)

    else:

        # Workflow 2: use pysam column iterator to build a
        # wig file. Then convert to bigwig or bedgraph file
        # with UCSC tools.
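        # column_iter collapses the per-base pileup columns from pysam into
        # (start, end, depth) runs: a new run starts whenever the position
        # jumps by more than one base or the depth changes, and the last
        # open run is yielded once the iterator is exhausted.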
        def column_iter(iterator):
            start = None
            end = 0
            n = None
            for t in iterator:
                if t.pos - end > 1 or n != t.n:
                    if start is not None:
                        yield start, end, n
                    start = t.pos
                    end = t.pos
                    n = t.n
                end = t.pos
            yield start, end, n

        if options.scale_method != "none":
            raise NotImplementedError(
                "scaling not implemented for pileup method")

        # Bedgraph track definition
        if options.output_format == "bedgraph":
            outfile.write("track type=bedGraph\n")

        for contig in samfile.references:
            # if contig != "chrX": continue
            E.debug("output for %s" % contig)
            lcontig = contig_sizes[contig]

            # Write wiggle header
            if options.output_format in ("wiggle", "bigwig"):
                outfile.write("variableStep chrom=%s span=%i\n" %
                              (contig, options.span))

            # Generate pileup per contig using pysam and iterate over columns
            for start, end, val in column_iter(samfile.pileup(contig)):
                # patch: there was a problem with bam files and reads
                # overextending at the end. These are usually Ns, but
                # need to check as otherwise wigToBigWig fails.
                if lcontig <= end:
                    E.warn("read extending beyond contig: %s: %i > %i" %
                           (contig, end, lcontig))
                    end = lcontig
                    if start >= end:
                        continue

                if val > 0:
                    outf(outfile, contig, start, end, val)
            ncontigs += 1

        # Close output file
        if isinstance(outf, SpanWriter):
            outf.flush(outfile)
        else:
            outfile.flush()

        E.info("finished output")

        # Report counters
        E.info("ninput=%i, ncontigs=%i, nskipped=%i" %
               (ninput, ncontigs, nskipped))

        # Convert to binary formats
        if options.output_format == "bigwig":
            outfile.close()

            E.info("starting %s conversion" % executable)
            try:
                retcode = subprocess.call(" ".join(
                    (executable, tmpfile_wig, tmpfile_sizes,
                     output_filename_pattern)),
                                          shell=True)
                if retcode != 0:
                    E.warn("%s terminated with signal: %i" %
                           (executable, -retcode))
                    return -retcode
            except OSError as msg:
                E.warn("Error while executing bigwig: %s" % msg)
                return 1
            E.info("finished bigwig conversion")
        else:
            with open(tmpfile_wig) as inf:
                sys.stdout.write(inf.read())

    # Cleanup temp files
    shutil.rmtree(tmpdir)

    E.stop()
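The first workflow above delegates the coverage and bigwig steps to bedtools genomecov and the UCSC bedGraphToBigWig utility. Below is a minimal standalone sketch of that same conversion chain, assuming both tools are on the PATH; the helper name bed_to_bigwig and the file names (reads.bed, genome.sizes, out.bw) are placeholders, not part of the original script.

import subprocess

def bed_to_bigwig(bed="reads.bed", sizes="genome.sizes", bigwig="out.bw",
                  scale_factor=None):
    """Convert a bed file of intervals to a bigWig coverage track."""
    # per-base coverage in bedGraph format; -scale multiplies every value
    scale = "-scale %f" % scale_factor if scale_factor is not None else ""
    subprocess.check_call(
        "bedtools genomecov -bg %s -i %s -g %s > coverage.bedgraph"
        % (scale, bed, sizes), shell=True)
    # bedGraphToBigWig requires a coordinate-sorted bedGraph, hence the sort
    subprocess.check_call(
        "sort -k1,1 -k2,2n coverage.bedgraph > coverage.sorted.bedgraph",
        shell=True)
    subprocess.check_call(
        ["bedGraphToBigWig", "coverage.sorted.bedgraph", sizes, bigwig])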
Example #8
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--input-bam", dest="input_bam_file", type="string",
        help="input bam file")

    parser.add_option(
        "-f", "--reference-bam", dest="reference_bam_file", type="string",
        help="reference BAM file [%default]")

    parser.add_option(
        "-q", "--query-name-regex", dest="query_name_regex", type="string",
        help="regular expression to apply on query name. "
        "Potentially required to match samtools sort order and should "
        "evaluate to an integer [%default]")

    parser.set_defaults(
        input_bam_file=None,
        reference_bam_file=None,
        query_name_regex=None,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) == 2:
        options.input_bam_file = args[0]
        options.reference_bam_file = args[1]

    if options.input_bam_file is None:
        raise ValueError("please supply a BAM file as input")

    if options.reference_bam_file is None:
        raise ValueError("please supply a BAM file as reference")

    # update paths to absolute
    options.input_bam_file = os.path.abspath(options.input_bam_file)
    options.reference_bam_file = os.path.abspath(options.reference_bam_file)

    if not os.path.exists(options.input_bam_file):
        raise OSError("input bam file {} does not exist".format(
            options.input_bam_file))

    if not os.path.exists(options.reference_bam_file):
        raise OSError("reference bam file {} does not exist".format(
            options.reference_bam_file))

    bam_in = pysam.AlignmentFile(options.input_bam_file)
    ref_in = pysam.AlignmentFile(options.reference_bam_file)

    outf_mapped = E.open_output_file("mapped")
    outf_mapped.write("\t".join(
        ["read",
         "length",
         "status",
         "overlap",
         "comp_contig",
         "comp_start",
         "comp_end",
         "ref_contig",
         "ref_start",
         "ref_end",
         "shared_misaligned",
         "shared_aligned",
         "shared_insertion",
         "shared_deletion",
         "comp_aligned",
         "comp_insertion",
         "comp_deletion",
         "ref_aligned",
         "ref_insertion",
         "ref_deletion"]) + "\n")

    outf_missing = E.open_output_file("missing")
    outf_missing.write("\t".join(
        ["read", "length", "status", "aligned",
         "insertion", "deletion"]) + "\n")

    counter = E.Counter()

    if options.query_name_regex:
        rx = re.compile(options.query_name_regex)

    def extract_query(x):
        return int(rx.search(x).groups()[0])

    qname_fn = None
    if options.query_name_regex:
        qname_fn = extract_query

    for reads_cmp, read_ref in group_pairs(iterate_read_pairs(
            bam_in.fetch(until_eof=True),
            ref_in.fetch(until_eof=True),
            qname_fn=qname_fn)):

        if len(reads_cmp) == 0:
            counter.missing += 1
            pairs_ref = set(read_ref.get_aligned_pairs())
            outf_missing.write("\t".join(
                map(str, (
                    read_ref.query_name,
                    read_ref.query_length,
                    "missing") +
                    count_pairs(pairs_ref))) + "\n")
            continue

        if len(reads_cmp) > 1:
            # multiple matches
            counter.multi_mapping += 1
            prefix = "multi_"
        else:
            counter.unique_mapping += 1
            prefix = "unique_"

        is_mapped = False
        for read_cmp in reads_cmp:

            counter.paired += 1

            if read_cmp.is_unmapped:
                counter.unmapped += 1
                pairs_ref = set(read_ref.get_aligned_pairs())
                outf_missing.write("\t".join(
                    map(str, (
                        read_ref.query_name,
                        read_ref.query_length,
                        "unmapped") +
                        count_pairs(pairs_ref))) + "\n")
                continue

            overlap = max(0, (min(read_cmp.reference_end,
                                  read_ref.reference_end) -
                              max(read_cmp.reference_start,
                                  read_ref.reference_start)))

            pairs_cmp = set(read_cmp.get_aligned_pairs())
            pairs_ref = set(read_ref.get_aligned_pairs())
            shared_cmp = pairs_cmp.intersection(pairs_ref)
            unique_cmp = pairs_cmp.difference(pairs_ref)
            missaligned = len([x for x, y in unique_cmp
                               if x is not None and y is not None])

            if read_cmp.reference_name != read_ref.reference_name or \
               overlap == 0:
                status = "mismapped"
            else:
                counter.overlap += 1
                status = "mapped"
                is_mapped = True

            outf_mapped.write("\t".join(
                map(str, (read_cmp.query_name,
                          read_cmp.query_length,
                          prefix + status,
                          overlap,
                          read_cmp.reference_name,
                          read_cmp.reference_start,
                          read_cmp.reference_end,
                          read_ref.reference_name,
                          read_ref.reference_start,
                          read_ref.reference_end,
                          missaligned) +
                    count_pairs(shared_cmp) +
                    count_pairs(pairs_cmp) +
                    count_pairs(pairs_ref))) + "\n")
        else:
            if is_mapped:
                status = "mapped"
            else:
                status = "mismapped"

            counter[prefix + status] += 1

    with E.open_output_file("summary") as outf:
        outf.write("category\tcounts\n")
        outf.write(counter.asTable() + "\n")

    E.stop()
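The inner loop of this example scores each alternative alignment against the reference alignment through set operations on aligned pairs. The sketch below isolates that bookkeeping, assuming both arguments are pysam AlignedSegment objects for the same read; compare_alignments is an illustrative name and the count_pairs breakdown used in the example is omitted.

def compare_alignments(read_cmp, read_ref):
    """Return (overlap, shared, misaligned) for two alignments of one read."""
    # reference overlap in bases (0 if on different contigs or disjoint)
    if read_cmp.reference_name != read_ref.reference_name:
        overlap = 0
    else:
        overlap = max(0, min(read_cmp.reference_end, read_ref.reference_end) -
                      max(read_cmp.reference_start, read_ref.reference_start))
    pairs_cmp = set(read_cmp.get_aligned_pairs())
    pairs_ref = set(read_ref.get_aligned_pairs())
    shared = pairs_cmp.intersection(pairs_ref)
    # pairs unique to the comparison alignment where both the query and the
    # reference position are set count as misaligned bases
    misaligned = sum(1 for q, r in pairs_cmp.difference(pairs_ref)
                     if q is not None and r is not None)
    return overlap, len(shared), misaligned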
Example #9
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--program",
                      dest="program",
                      type="choice",
                      choices=["plink2", "gcta", "plinkdev"],
                      help="program to execute genome-wide analysis")

    parser.add_option("--input-file-pattern",
                      dest="infile_pattern",
                      type="string",
                      help="file prefix that identifies a group of files")

    parser.add_option("--input-file-format",
                      dest="file_format",
                      type="choice",
                      choices=[
                          "plink", "plink_binary", "oxford", "oxford_binary",
                          "vcf", "GRM_binary", "GRM_gz"
                      ],
                      help="format of input files")

    parser.add_option("--phenotypes-file",
                      dest="pheno_file",
                      type="string",
                      help="text file of additional phenotypes")

    parser.add_option("--pheno",
                      dest="pheno",
                      type="string",
                      help="either phenotype file column header or number")

    parser.add_option("--covariates-file",
                      dest="covariate_file",
                      type="string",
                      help="file containing covariates")

    parser.add_option("--covariate-column",
                      dest="covar_col",
                      type="string",
                      help="column number(s) or header(s) to include in "
                      "association model")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=[
                          "ld_prune", "summary", "flag_hets",
                          "remove_relations", "check_gender", "IBD"
                      ],
                      help="method to apply to genome-wide data")

    parser.add_option("--IBD-parameter",
                      dest="ibd_param",
                      type="choice",
                      choices=["norm", "relatives", "full"],
                      help="param "
                      "to pass to IBD calculations")

    parser.add_option("--principal-components",
                      dest="num_pcs",
                      type="int",
                      help="the number of principal components to output")

    parser.add_option("--matrix-shape",
                      dest="matrix_shape",
                      type="choice",
                      choices=["triangle", "square", "square0"],
                      help="output matrix shape.",
                      default="triangle")

    parser.add_option("--matrix-compression",
                      dest="matrix_compress",
                      type="choice",
                      choices=["gz", "bin", "bin4"],
                      help="compression to apply to output matrix file",
                      default="gz")

    parser.add_option("--matrix-form",
                      dest="matrix_form",
                      type="choice",
                      choices=["distance", "grm"],
                      help="type of relationship matrix to calculate")

    parser.add_option(
        "--matrix-metric",
        dest="matrix_metric",
        type="choice",
        choices=["fhat", "cov", "ibc2", "ibc3", "ibs", "genomic", "hamming"],
        help="value to calculate for diagonal elements of the "
        "grm. Default is fhat for grm and hamming for distance.")

    parser.add_option(
        "--matrix-options",
        dest="matrix_options",
        type="string",
        help="modifiers of matrix output, see plink documentation "
        "for details")

    parser.add_option("--strand-flip-subset",
                      dest="flip_subset",
                      action="store_true",
                      help="apply strand flipping to a subset of samples")

    parser.add_option("--flip-scan-type",
                      dest="scan_param",
                      type="choice",
                      choices=["default", "window", "threshold"],
                      help="strand flipping scan to apply to SNPs")

    parser.add_option("--sort-type",
                      dest="sort_type",
                      type="choice",
                      choices=["none", "natural", "ascii", "file"],
                      help="sort type to input files")

    parser.add_option("--merge-file-format",
                      dest="merge_format",
                      type="choice",
                      choices=["plink", "binary_plink"],
                      help="format of input files to be merged")

    parser.add_option(
        "--merge-mode",
        dest="merge_mode",
        type="choice",
        choices=[
            "default", "original_missing", "new_nonmissing", "no_overwrite",
            "force", "report_all", "report_nonmissing"
        ],
        help="merge mode to apply to dealing with merge conflicts")

    parser.add_option("--duplicates-method",
                      dest="dup_method",
                      type="choice",
                      choices=["same_ref", "id_match", "suppress_first"],
                      help="method for identifying and dealing with duplicate "
                      "variants")

    parser.add_option("--summary-method",
                      dest="summary_method",
                      type="choice",
                      choices=[
                          "allele_frequency", "missing_data", "hardy_weinberg",
                          "mendel_errors", "inbreeding", "inbreeding_coef",
                          "gender_checker", "wrights_fst"
                      ],
                      help="summary statistics to calculate")

    parser.add_option("--summary-parameter",
                      dest="sum_param",
                      type="string",
                      help="optional parameters that can be passed to summary "
                      "statistics methods")

    parser.add_option(
        "--genotype-rate",
        dest="filt_genotype_rate",
        type="string",
        help="genotyping rate threshold.  SNPs below this threshold "
        "will be excluded from analysis")

    parser.add_option("--indiv-missing",
                      dest="filt_missingness",
                      type="string",
                      help="individual missingness rate.  Individuals below "
                      "this threshold will be excluded from analysis")

    parser.add_option("--hardy-weinberg",
                      dest="filt_hwe",
                      type="string",
                      help="hardy-weinberg p-value threshold for SNPs.  SNPs "
                      "with a 2df chisquared p-value below this will be "
                      "filtered out")

    parser.add_option(
        "--min-allele-frequency",
        dest="filt_min_allele_frequency",
        type="string",
        help="only include SNPs with an allele frequency equal to "
        "or above this threshold")

    parser.add_option(
        "--max-allele-frequency",
        dest="filt_max_allele_frequency",
        type="string",
        help="only include SNPs with an allele frequency equal to "
        "or below this threshold")

    parser.add_option(
        "--mendelian-error",
        dest="filt_mendelian_error",
        type="string",
        help="exclude individuals/trios with mendelian errors that "
        "exceed this value")

    parser.add_option("--min-quality-score",
                      dest="filt_min_qaul_score",
                      type="string",
                      help="reset the minimum low bound of quality scores for "
                      "variants in a VCF file.  Default is 0")

    parser.add_option(
        "--max-quality-score",
        dest="filt_max_qual_score",
        type="string",
        help="reset the maximum upper bound of quality scores for "
        "a VCCF file.  Default is Inf")

    parser.add_option("--allow-no-gender",
                      dest="filt_allow_no_sex",
                      type="string",
                      help="allow individuals with gender missing")

    parser.add_option("--enforce-gender",
                      dest="filt_enforce_sex",
                      type="string",
                      help="only include individuals with non-missing gender "
                      "information")

    parser.add_option("--keep-individuals",
                      dest="filt_keep",
                      type="string",
                      help="a file containing individuals IDs to keep, "
                      "one per row")

    parser.add_option("--remove-individuals",
                      dest="filt_remove",
                      type="string",
                      help="a file of individual IDs to remove, one per row")

    parser.add_option("--subset-filter",
                      dest="filt_subset_filter",
                      type="choice",
                      choices=[
                          "cases", "controls", "males", "females", "founders",
                          "nonfounders"
                      ],
                      help="only apply filters to the specific subset of "
                      "individuals supplied")

    parser.add_option(
        "--extract-snps",
        dest="filt_extract",
        type="string",
        help="text file of variant IDs to include in the analysis, "
        "ignoring all others")

    parser.add_option("--exclude-snps",
                      dest="filt_exclude",
                      type="string",
                      help="a file of variant IDs to exclude from analysis")

    parser.add_option("--restrict-chromosome",
                      dest="filt_chromosome",
                      type="string",
                      help="restict analysis to either a single chromosome, "
                      "or a comma-separated list of chromosomes")

    parser.add_option("--exclude-chromosomes",
                      dest="filt_exclude_chromosome",
                      type="string",
                      help="exclude all variants on these "
                      "chromosome(s)")

    parser.add_option(
        "--autosome-only",
        dest="filt_autosome",
        action="store_true",
        help="if present only autosomal variants will be analysed")

    parser.add_option(
        "--pseudo-autosome",
        dest="filt_pseudo_autosome",
        action="store_true",
        help="include on the pseudo-autosomal region of chromosome X")

    parser.add_option("--ignore-indels",
                      dest="filt_ignore_indels",
                      action="store_true",
                      help="only include bi-allelic single nucleotide "
                      "variants in analysis")

    parser.add_option(
        "--snp-range",
        dest="filt_snp_bp_range",
        type="string",
        help="comma separated list of from, to genome co-ordinates "
        "within which to include variants for analysis")

    parser.add_option("--snp-id-range",
                      dest="filt_snp_id_range",
                      type="string",
                      help="comma separate list of IDs from, to within which "
                      "to include variants for analysis.")

    parser.add_option("--snp-id",
                      dest="filt_specific_snp",
                      type="string",
                      help="include a single snp in the analysis given by "
                      "it's variant ID.")

    parser.add_option("--exclude-variant",
                      dest="filt_exclude_snp",
                      type="string",
                      help="exclude a single variant from the analysis, "
                      "given by it's variant ID")

    parser.add_option(
        "--covariate-filter",
        dest="filt_covariate_filter",
        type="string",
        help="covariate column headers or column numbers on which "
        "to filter on. Requries --covariate-file")

    parser.add_option(
        "--filter-parameter",
        dest="param",
        type="string",
        help="parameter values to be passed to filtering function")

    parser.add_option("--window-size",
                      dest="window_size",
                      type="string",
                      help="alters the behaviour of the --snp-range and "
                      "--include/exclude snp options.  variants within +/- "
                      "half * window_size (kb) are included")

    parser.add_option(
        "--range-resolution",
        dest="filt_range_resolution",
        type="choice",
        choices=["bp", "kb", "mb"],
        help="alters the (from, to) range resolution to either bp, "
        "kb or mb")

    parser.add_option(
        "--output-file-pattern",
        dest="out_pattern",
        type="string",
        help="output file pattern prefix. file suffixes are dependent "
        "on the task executed")

    parser.add_option("--threads",
                      dest="threads",
                      type="int",
                      help="the number of threads to use for multi-threaded "
                      "processes")

    parser.add_option("--use-kb",
                      dest="kb",
                      action="store_true",
                      help="if present uses a kb sized window for LD pruning")

    parser.add_option("--prune-method",
                      dest="prune_method",
                      type="choice",
                      choices=["R2", "VIF"],
                      help="type of LD pruning to "
                      "perform, pair-wise LD or variance inflation factor")

    parser.add_option("--step-size",
                      dest="step",
                      type="string",
                      help="step size to advance window by")

    parser.add_option("--threshold",
                      dest="threshold",
                      type="string",
                      help="threshold on which to filter results")

    parser.add_option("--parallel",
                      dest="parallel",
                      type="int",
                      help="number of jobs to split task into")

    parser.add_option("--memory",
                      dest="memory",
                      type="string",
                      help="amount of memory to reserve for the task")

    parser.set_defaults(sum_param=None,
                        dup_method="same_ref",
                        matrix_shape="triangle",
                        matrix_options=None,
                        matrix_compress="gz",
                        kb=False,
                        random_seed=random.randint(0, 19999),
                        memory="60G",
                        parallel=None)

    # add common options (-h/--help, ...) and parse command line
    # (defaults must be set before parsing for them to take effect)
    (options, args) = E.start(parser, argv=argv)

    if not options.infile_pattern:
        infiles = (argv[-1]).split(",")
    else:
        infiles = options.infile_pattern

    # create a new filegroup object
    geno_files = gwas.FileGroup(files=infiles,
                                file_format=options.file_format,
                                genotype_format="imputed")
    if options.pheno_file:
        geno_files.set_phenotype(pheno_file=options.pheno_file,
                                 pheno=options.pheno)
    else:
        pass

    # add FileGroup object to the gwas program object
    if options.program == "plink2":
        gwas_object = gwas.Plink2(files=geno_files)
        gwas_object.program_call(infiles=geno_files,
                                 outfile=options.out_pattern)
    elif options.program == "plinkdev":
        gwas_object = gwas.PlinkDev(files=geno_files)
        gwas_object.program_call(infiles=geno_files,
                                 outfile=options.out_pattern)

    elif options.program == "gcta":
        gwas_object = gwas.GCTA(files=geno_files)
        gwas_object.program_call(infiles=geno_files,
                                 outfile=options.out_pattern)
    else:
        pass

    # collect filtering options from options
    opt_dict = options.__dict__
    filter_keys = [fx for fx in opt_dict.keys() if re.search("filt", fx)]
    filter_dict = {k: options.__dict__[k] for k in filter_keys if opt_dict[k]}

    # iteratively add all filters to GWASProgram object
    for fkey in filter_dict:
        filt_key = fkey.replace("filt_", "")
        filter_value = filter_dict[fkey]
        gwas_object.apply_filters(filter_type=filt_key,
                                  filter_value=filter_value)

    # handle summary statistics
    if options.method == "ld_prune":
        gwas_object._qc_methods(ld_prune=options.prune_method,
                                kb=options.kb,
                                window=options.window_size,
                                step=options.step,
                                threshold=options.threshold)
    elif options.method == "IBD":
        # use sum param to pass arguments to the ibd estimate
        # these are norm, full or relatives
        gwas_object._qc_methods(ibd=options.ibd_param)
    elif options.method == "summary":
        if options.summary_method == "allele_frequency":
            gwas_object._output_statistics(allele_frequency=options.sum_param)
        elif options.summary_method == "hardy_weinberg":
            gwas_object._output_statistics(hardy_weinberg=options.sum_param)
        elif options.summary_method == "missing_data":
            gwas_object._output_statistics(missing_data=options.sum_param)
        elif options.summary_method == "mendel_errors":
            gwas_object._output_statistics(mendel_errors=options.sum_param)
        elif options.summary_method == "inbreeding":
            gwas_object._output_statistics(inbreeding=options.sum_param)
        elif options.summary_method == "inbreeding_coef":
            gwas_object._output_statistics(inbreeding_coef=options.sum_param)
        elif options.summary_method == "gender_checker":
            gwas_object._output_statistics(gender_checker=options.sum_param)
        elif options.summary_method == "wrights_fst":
            gwas_object._output_statistics(wrights_fst=options.sum_param)
        else:
            pass
    elif options.method == "remove_relations":
        gwas_object._run_tasks(remove_relations="cutoff",
                               parameter=options.threshold)
    elif options.method == "check_gender":
        gwas_object._run_tasks(check_gender="")
    else:
        pass

    gwas_object.build_statement(infiles=geno_files,
                                outfile=options.out_pattern,
                                threads=options.threads,
                                memory=options.memory,
                                parallel=options.parallel)

    # write footer and output benchmark information.
    E.stop()
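Filter handling in this example relies purely on a naming convention: every option whose dest contains "filt" is forwarded to apply_filters with the "filt_" prefix stripped. Below is a small sketch of that convention applied to a plain dictionary; collect_filters is an illustrative name and is not part of the gwas module.

import re

def collect_filters(option_values):
    """Return {filter_name: value} for every set option whose name contains 'filt'."""
    filter_keys = [k for k in option_values if re.search("filt", k)]
    return {k.replace("filt_", ""): option_values[k]
            for k in filter_keys if option_values[k]}

# e.g. collect_filters({"filt_hwe": "1e-6", "filt_keep": None, "method": "IBD"})
# returns {"hwe": "1e-6"}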
Example #10
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r",
        "--mask-bed-file",
        "--mask-gff-file",
        dest="filename_bed",
        type="string",
        metavar='GFF',
        help="gff formatted file with masking locations. The number of "
        "reads overlapping the intervals in the given file will be "
        "computed. Note that the computation currently does not take "
        "into account indels, so it is an approximate count only. "
        "[%default]")

    parser.add_option(
        "-f",
        "--ignore-masked-reads",
        dest="ignore_masked_reads",
        action="store_true",
        help="as well as counting reads in the file given by --mask-bed-file, "
        "also remove these reads for duplicate and match statistics. "
        "[%default]")

    parser.add_option(
        "-i",
        "--num-reads",
        dest="input_reads",
        type="int",
        help="the number of reads - if given, used to provide percentages "
        "[%default]")

    parser.add_option(
        "-d",
        "--output-details",
        dest="output_details",
        action="store_true",
        help="output per-read details into a separate file. Read names are "
        "md5/base64 encoded [%default]")

    parser.add_option("--output-readmap",
                      dest="output_readmap",
                      action="store_true",
                      help="output map between read name and "
                      "md5/base64 encoded short name[%default]")

    parser.add_option(
        "--add-alignment-details",
        dest="add_alignment_details",
        action="store_true",
        help=
        "add alignment details to per-read details. Implies --output-details "
        "[%default]")

    parser.add_option(
        "-q",
        "--fastq-file",
        dest="filename_fastq",
        help="filename with sequences and quality scores. This file is only "
        "used to collect sequence identifiers. Thus, for paired end data a "
        "single file is sufficient [%default]")

    parser.add_option(
        "--basic-counts",
        dest="detailed_count",
        action="store_false",
        help="perform basic counting and do not compute per read stats. "
        "This is more memory efficient and faster stats computation, "
        "but only a summary counts table is output [%default]")

    parser.set_defaults(
        filename_bed=None,
        ignore_masked_reads=False,
        input_reads=0,
        force_output=False,
        filename_fastq=None,
        detailed_count=True,
        output_details=False,
        output_readmap=False,
        add_alignment_details=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if options.filename_bed:
        bed_mask = GTF.readAndIndex(
            GTF.iterator(iotools.open_file(options.filename_bed)))
    else:
        bed_mask = None

    if options.add_alignment_details:
        options.output_details = True

    is_stdin = True
    if len(args) > 0:
        pysam_in = pysam.AlignmentFile(args[0], "rb")
        if args[0] != "-":
            is_stdin = False
    elif options.stdin == sys.stdin:
        pysam_in = pysam.AlignmentFile("-", "rb")
    else:
        pysam_in = pysam.AlignmentFile(options.stdin, "rb")
        if options.stdin != "-":
            is_stdin = False

    if options.output_details:
        outfile_details = E.open_output_file("details", "w")
    else:
        outfile_details = None

    if options.output_readmap:
        outfile_readmap = E.open_output_file("readmap", "w")
    else:
        outfile_readmap = None

    if options.filename_fastq and not os.path.exists(options.filename_fastq):
        raise IOError("file %s does not exist" % options.filename_fastq)

    (counter, flags_counts, nh_filtered, nh_all,
     nm_filtered, nm_all, mapq, mapq_all, max_hi, details_df) = \
        bam2stats_count(pysam_in,
                        bed_mask=bed_mask,
                        ignore_masked_reads=options.ignore_masked_reads,
                        is_stdin=is_stdin,
                        filename_fastq=options.filename_fastq,
                        outfile_details=outfile_details,
                        add_alignment_details=options.add_alignment_details,
                        outfile_readmap=outfile_readmap,
                        detailed_count=options.detailed_count)

    if max_hi > 0 and max_hi != max(nh_all.keys()):
        E.warn("max_hi(%i) is inconsistent with max_nh (%i) "
               "- counts will be corrected" % (max_hi, max(nh_all.keys())))

    outs = options.stdout
    outs.write("category\tcounts\tpercent\tof\n")

    def _write(outs, text, numerator, denominator, base):
        percent = iotools.pretty_percent(numerator, denominator)
        outs.write('%s\t%i\t%s\t%s\n' % (text, numerator, percent, base))

    ###############################
    ###############################
    ###############################
    # Output alignment information
    ###############################
    nalignments_unmapped = flags_counts["unmapped"]
    nalignments_mapped = counter.alignments_input - nalignments_unmapped

    _write(outs, "alignments_total", counter.alignments_input,
           counter.alignments_input, "alignments_total")

    if counter.alignments_input == 0:
        E.warn("no alignments in BAM file - no further output")
        E.stop()
        return

    _write(outs, "alignments_mapped", nalignments_mapped,
           counter.alignments_input, 'alignments_total')
    _write(outs, "alignments_unmapped", nalignments_unmapped,
           counter.alignments_input, 'alignments_total')

    if nalignments_mapped == 0:
        E.warn("no mapped alignments - no further output")
        E.stop()
        return

    for flag, counts in sorted(flags_counts.items()):
        if flag == "unmapped":
            continue
        _write(outs, 'alignments_' + flag, counts, nalignments_mapped,
               'alignments_mapped')

    if options.filename_bed:
        _write(outs, "alignments_masked", counter.alignments_masked,
               nalignments_mapped, 'alignments_mapped')
        _write(outs, "alignments_notmasked", counter.alignments_notmasked,
               nalignments_mapped, 'alignments_mapped')

    _write(outs, "alignments_filtered", counter.alignments_filtered,
           nalignments_mapped, "alignments_mapped")

    if counter.filtered == nalignments_mapped:
        normby = "alignments_mapped"
    else:
        normby = "alignments_filtered"

    if counter.filtered > 0:
        _write(outs, "alignments_duplicates", counter.alignments_duplicates,
               counter.alignments_filtered, normby)
        _write(outs, "alignments_unique",
               counter.alignments_filtered - counter.alignments_duplicates,
               counter.alignments_filtered, normby)

    ###############################
    ###############################
    ###############################
    # Output read based information
    ###############################

    # derive the number of mapped reads in file from alignment counts
    if options.filename_fastq or not is_stdin:
        nreads_total = counter.total_read
        _write(outs, "reads_total", counter.total_read, nreads_total,
               'reads_total')
        _write(outs, "reads_unmapped", counter.total_read_is_unmapped,
               nreads_total, 'reads_total')
        _write(outs, "reads_mapped", counter.total_read_is_mapped,
               nreads_total, 'reads_total')
        _write(outs, "reads_missing", counter.total_read_is_missing,
               nreads_total, 'reads_total')
        _write(outs, "reads_mapped_unique", counter.total_read_is_mapped_uniq,
               counter.total_read_is_mapped, 'reads_mapped')
        _write(outs, "reads_multimapping", counter.total_read_is_mmap,
               counter.total_read_is_mapped, 'reads_mapped')
        _write(outs, "reads_mapped_supplementary",
               counter.total_read_has_supplementary,
               counter.total_read_is_mapped, 'reads_mapped')
    else:
        E.warn('inferring read counts from alignments and NH tags')
        nreads_unmapped = flags_counts["unmapped"]
        nreads_mapped = computeMappedReadsFromAlignments(
            nalignments_mapped, nh_all, max_hi)

        nreads_missing = 0
        if options.input_reads:
            nreads_total = options.input_reads
            # unmapped reads in bam file?
            if nreads_unmapped:
                nreads_missing = nreads_total - nreads_unmapped - nreads_mapped
            else:
                nreads_unmapped = nreads_total - nreads_mapped

        elif nreads_unmapped:
            # if unmapped reads are in bam file, take those
            nreads_total = nreads_mapped + nreads_unmapped
        else:
            # otherwise normalize by mapped reads
            nreads_unmapped = 0
            nreads_total = nreads_mapped

        outs.write("reads_total\t%i\t%5.2f\treads_total\n" %
                   (nreads_total, 100.0))
        outs.write("reads_mapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_mapped, 100.0 * nreads_mapped / nreads_total))
        outs.write("reads_unmapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_unmapped, 100.0 * nreads_unmapped / nreads_total))
        outs.write("reads_missing\t%i\t%5.2f\treads_total\n" %
                   (nreads_missing, 100.0 * nreads_missing / nreads_total))

        if len(nh_all) > 1:
            outs.write("reads_unique\t%i\t%5.2f\treads_mapped\n" %
                       (nh_all[1], 100.0 * nh_all[1] / nreads_mapped))

    pysam_in.close()

    ###############################
    ###############################
    ###############################
    # Output pair information
    ###############################
    if flags_counts["read2"] > 0:
        if options.filename_fastq:
            pairs_mapped = counter.total_pair_is_mapped

            # sanity check
            assert counter.total_pair_is_mapped == \
                (counter.total_pair_is_proper_uniq +
                 counter.total_pair_is_incomplete_uniq +
                 counter.total_pair_is_incomplete_mmap +
                 counter.total_pair_is_proper_duplicate +
                 counter.total_pair_is_proper_mmap +
                 counter.total_pair_not_proper_uniq +
                 counter.total_pair_is_other)

            outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pairs,
                        100.0 * counter.total_pairs / counter.total_pairs))
            outs.write(
                "pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                (pairs_mapped, 100.0 * pairs_mapped / counter.total_pairs))
            outs.write("pairs_unmapped\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pair_is_unmapped, 100.0 *
                        counter.total_pair_is_unmapped / counter.total_pairs))
            outs.write(
                "pairs_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_uniq, 100.0 *
                 counter.total_pair_is_proper_uniq / counter.total_pairs))
            outs.write(
                "pairs_incomplete_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_uniq, 100.0 *
                 counter.total_pair_is_incomplete_uniq / counter.total_pairs))
            outs.write(
                "pairs_incomplete_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_mmap, 100.0 *
                 counter.total_pair_is_incomplete_mmap / counter.total_pairs))
            outs.write(
                "pairs_proper_duplicate\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_duplicate, 100.0 *
                 counter.total_pair_is_proper_duplicate / counter.total_pairs))
            outs.write(
                "pairs_proper_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_mmap, 100.0 *
                 counter.total_pair_is_proper_mmap / counter.total_pairs))
            outs.write(
                "pairs_not_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_not_proper_uniq, 100.0 *
                 counter.total_pair_not_proper_uniq / counter.total_pairs))
            outs.write("pairs_other\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pair_is_other, 100.0 *
                        counter.total_pair_is_other / counter.total_pairs))

            nread1_total = counter.total_read1
            _write(outs, "read1_total", counter.total_read1, nread1_total,
                   'read1_total')
            _write(outs, "read1_unmapped", counter.total_read1_is_unmapped,
                   nread1_total, 'read1_total')
            _write(outs, "read1_mapped", counter.total_read1_is_mapped,
                   nread1_total, 'read1_total')
            _write(outs, "read1_mapped_unique",
                   counter.total_read1_is_mapped_uniq,
                   counter.total_read1_is_mapped, 'read1_mapped')
            _write(outs, "reads_multimapping", counter.total_read1_is_mmap,
                   counter.total_read1_is_mapped, 'read1_mapped')
            _write(outs, "read1_missing", counter.total_read1_is_missing,
                   counter.total_read1_is_mapped, 'read1_total')

            nread2_total = counter.total_read2
            _write(outs, "read2_total", counter.total_read2, nread2_total,
                   'read2_total')
            _write(outs, "read2_unmapped", counter.total_read2_is_unmapped,
                   nread2_total, 'read2_total')
            _write(outs, "read2_mapped", counter.total_read2_is_mapped,
                   nread2_total, 'read2_total')
            _write(outs, "read2_mapped_unique",
                   counter.total_read2_is_mapped_uniq,
                   counter.total_read2_is_mapped, 'read2_mapped')
            _write(outs, "reads_multimapping", counter.total_read2_is_mmap,
                   counter.total_read2_is_mapped, 'read2_mapped')
            _write(outs, "read2_missing", counter.total_read2_is_missing,
                   counter.total_read2_is_mapped, 'read2_total')

        else:
            # approximate counts
            pairs_total = nreads_total // 2
            pairs_mapped = flags_counts["proper_pair"] // 2
            _write(outs, "pairs_total", pairs_total, pairs_total,
                   "pairs_total")
            _write(outs, "pairs_mapped", pairs_mapped, pairs_total,
                   "pairs_total")
    else:
        # no paired end data
        pairs_total = pairs_mapped = 0
        outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_total, 0.0))
        outs.write("pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_mapped, 0.0))

    outs.write("error_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.error_counts, counter.error_rate * 100.0))
    outs.write("insertion_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.insertion_counts, counter.insertion_rate * 100.0))
    outs.write("deletion_rate\t%i\t%5.2f\tmatches+deletions\n" %
               (counter.deletion_counts, counter.deletion_rate * 100.0))
    outs.write("mismatch_rate\t%i\t%5.2f\tmatches\n" %
               (counter.mismatch_counts, counter.mismatch_rate * 100.0))
    outs.write("match_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.match_counts, counter.match_rate * 100.0))

    if options.force_output or len(nm_filtered) > 0:
        outfile = E.open_output_file("nm", "w")
        outfile.write("NM\talignments\n")
        if len(nm_filtered) > 0:
            for x in range(0, max(nm_filtered.keys()) + 1):
                outfile.write("%i\t%i\n" % (x, nm_filtered[x]))
        else:
            outfile.write("0\t%i\n" % (counter.filtered))
        outfile.close()

    if options.force_output or len(nh_all) > 1:
        outfile = E.open_output_file("nh_all", "w")
        outfile.write("NH\treads\n")
        if len(nh_all) > 0:
            writeNH(outfile, nh_all, max_hi)
        else:
            # assume all are unique if NH flag not set
            outfile.write("1\t%i\n" % (counter.mapped_reads))
        outfile.close()

    if options.force_output or len(nh_filtered) > 1:
        outfile = E.open_output_file("nh", "w")
        outfile.write("NH\treads\n")
        if len(nh_filtered) > 0:
            writeNH(outfile, nh_filtered, max_hi)
        else:
            # assume all are unique if NH flag not set
            outfile.write("1\t%i\n" % (counter.filtered))
        outfile.close()

    if options.force_output or len(mapq_all) > 1:
        outfile = E.open_output_file("mapq", "w")
        outfile.write("mapq\tall_reads\tfiltered_reads\n")
        for x in range(0, max(mapq_all.keys()) + 1):
            outfile.write("%i\t%i\t%i\n" % (x, mapq_all[x], mapq[x]))
        outfile.close()

    if details_df is not None:
        with E.open_output_file("summaries", "w") as outf:
            details_df.describe().transpose().to_csv(outf,
                                                     sep="\t",
                                                     index_label="metric")
        bins = numpy.arange(0, 1.01, 0.01)
        # build one histogram column per metric (bins of width 0.01)
        histogram_df = pandas.DataFrame(
            {x: numpy.histogram(details_df[x].dropna(), bins=bins)[0]
             for x in details_df.columns})

        histogram_df.index = numpy.arange(0, 1.0, 0.01)

        row_sums = histogram_df.sum(axis=1)
        histogram_df = histogram_df[row_sums != 0]

        with E.open_output_file("histogram", "w") as outf:
            histogram_df.to_csv(outf, sep="\t", index_label="bin")

    # write footer and output benchmark information.
    E.stop()
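The closing block of this example reduces each per-read metric column to a fixed-width histogram. The sketch below shows that step in isolation, assuming details_df is a pandas DataFrame of per-read metrics scaled to [0, 1]; per_metric_histograms is an illustrative name.

import numpy
import pandas

def per_metric_histograms(details_df, binwidth=0.01):
    # one histogram column per metric, binned on [0, 1)
    bins = numpy.arange(0, 1.0 + binwidth, binwidth)
    histogram_df = pandas.DataFrame(
        {col: numpy.histogram(details_df[col].dropna(), bins=bins)[0]
         for col in details_df.columns})
    # label each row by the lower edge of its bin and drop empty bins
    histogram_df.index = bins[:-1]
    return histogram_df[histogram_df.sum(axis=1) != 0]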
Example #11
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--trna-scheme",
        dest="trna_scheme",
        type="choice",
        choices=("tDR-5'", "tRH-DA"),
        help="name of the tRNA scheme to make bed file for[default=%default]")

    parser.set_defaults(trna_scheme=None)

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)

    outfile = IOTools.open_file(options.stdout.name, "w")
    trna_options = [
        "tRH-5'", "tRH-DA", "tRH-DTA", "tRH-AT", "tRH-3'", "tRF-5'", "tRF-3'",
        "tRF-D", "tRF-DA", "tRF-A", "tRF-AT", "tRF-T"
    ]
    for trna in trna_options:
        infile = IOTools.open_file(options.stdin.name)
        iterator = FastaIterator.FastaIterator(infile)

        d = collections.OrderedDict()
        cluster_dict = dict()

        # first iterate over the fasta file

        for cur_record in iterator:

            title = cur_record.title
            m = re.match(r"(cluster\d+):chr\S+.tRNA\d+-(\S+)-\((\S+)\)", title)

            cluster = m.group(1)
            trna_group = m.group(2)
            strand = m.group(3)

            chrom = cluster + ":" + trna_group + "-"
            score = "."
            print(trna)
            if trna == "tRH-5'":
                start = "1"
                end = "33"
            elif trna == "tRH-DA":
                start = "14"
                end = "43"
            elif trna == "tRH-DTA":
                start = "17"
                end = "54"
            elif trna == "tRH-AT":
                start = "38"
                end = "69"
            elif trna == "tRH-3'":
                start = "43"
                end = "73"
            elif trna == "tRF-5'":
                start = "1"
                end = "15"
            elif trna == "tRF-3'":
                start = "58"
                end = "73"
            elif trna == "tRF-D":
                start = "8"
                end = "23"
            elif trna == "tRF-DA":
                start = "20"
                end = "35"
            elif trna == "tRF-A":
                start = "27"
                end = "42"
            elif trna == "tRF-AT":
                start = "33"
                end = "53"
            elif trna == "tRF-T":
                start = "45"
                end = "71"
            else:
                print("tRNA fragment not implemented")
                break
            outfile.write(("%s\t%s\t%s\t%s\t%s\t%s\n") %
                          (chrom, start, end, trna, score, strand))

    E.stop()
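The fragment coordinates in this example are hard-coded in a long if/elif ladder. A table-driven sketch of the same mapping is shown below; the coordinates are copied from the ladder above, and the names TRNA_FRAGMENT_COORDS and fragment_coords are illustrative rather than part of the original script.

# (start, end) positions per fragment class, copied from the ladder above
TRNA_FRAGMENT_COORDS = {
    "tRH-5'": ("1", "33"), "tRH-DA": ("14", "43"), "tRH-DTA": ("17", "54"),
    "tRH-AT": ("38", "69"), "tRH-3'": ("43", "73"),
    "tRF-5'": ("1", "15"), "tRF-3'": ("58", "73"), "tRF-D": ("8", "23"),
    "tRF-DA": ("20", "35"), "tRF-A": ("27", "42"), "tRF-AT": ("33", "53"),
    "tRF-T": ("45", "71"),
}

def fragment_coords(trna):
    # returns None for fragment classes that are not implemented
    return TRNA_FRAGMENT_COORDS.get(trna)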
Example #12
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(version="%prog version: $Id$",
                              usage=globals()["__doc__"])

    parser.add_argument("-g",
                        "--gtf-file",
                        dest="filename_gtf",
                        type=str,
                        help="filename with gene models in gtf format ")

    parser.add_argument("-m",
                        "--filename-mismapped",
                        dest="filename_mismapped",
                        type=str,
                        help="output bam file for mismapped reads ")

    parser.add_argument("-j",
                        "--junctions-bed-file",
                        dest="filename_junctions",
                        type=str,
                        help="bam file with reads mapped across junctions ")

    parser.add_argument("-r",
                        "--filename-regions",
                        dest="filename_regions",
                        type=str,
                        help="filename with regions to remove in bed format ")

    parser.add_argument("-t",
                        "--transcripts-gtf-file",
                        dest="filename_transcriptome",
                        type=str,
                        help="bam file with reads mapped against transcripts ")

    parser.add_argument("-p",
                        "--map-tsv-file",
                        dest="filename_map",
                        type=str,
                        help="filename mapping transcript numbers (used by "
                        "--filename-transciptome) to transcript names "
                        "(used by --filename-gtf) ")

    parser.add_argument("-s",
                        "--filename-stats",
                        dest="filename_stats",
                        type=str,
                        help="filename to output stats to ")

    parser.add_argument(
        "-o",
        "--colour",
        dest="colour_mismatches",
        action="store_true",
        help="mismatches will use colour differences (CM tag) ")

    parser.add_argument("-i",
                        "--ignore-mismatches",
                        dest="ignore_mismatches",
                        action="store_true",
                        help="ignore mismatches ")

    parser.add_argument("-c",
                        "--remove-contigs",
                        dest="remove_contigs",
                        type=str,
                        help="','-separated list of contigs to remove ")

    parser.add_argument("-f",
                        "--force-output",
                        dest="force",
                        action="store_true",
                        help="force overwriting of existing files ")

    parser.add_argument("-u",
                        "--unique",
                        dest="unique",
                        action="store_true",
                        help="remove reads not matching uniquely ")

    parser.add_argument("--output-sam",
                        dest="output_sam",
                        action="store_true",
                        help="output in sam format ")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, unknown) = E.start(parser, argv=argv, unknowns=True)

    if len(unknown) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = unknown[0]
    genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = iotools.read_map(iotools.open_file(options.filename_map),
                                  has_header=True)
        id_map = dict([(y, x) for x, y in id_map.items()])
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(iotools.open_file(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(iotools.open_file(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.AlignmentFile(
            options.filename_transcriptome, "rb")
    else:
        transcripts_samfile = None

    if options.output_sam:
        output_samfile = pysam.AlignmentFile("-",
                                             "wh",
                                             template=genome_samfile)
    else:
        output_samfile = pysam.AlignmentFile("-",
                                             "wb",
                                             template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.AlignmentFile(options.filename_mismapped,
                                               "wb",
                                               template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.AlignmentFile(options.filename_junctions,
                                                "rb")
    else:
        junctions_samfile = None

    c = bams2bam_filter(genome_samfile,
                        output_samfile,
                        output_mismapped,
                        transcripts_samfile,
                        junctions_samfile,
                        transcripts,
                        regions=regions_to_remove,
                        unique=options.unique,
                        remove_contigs=options.remove_contigs,
                        colour_mismatches=options.colour_mismatches,
                        ignore_mismatches=options.ignore_mismatches,
                        ignore_transcripts=transcripts_samfile is None,
                        ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        outf = iotools.open_file(options.filename_stats, "w")
        outf.write("category\tcounts\n%s\n" % c.asTable())
        outf.close()

    if options.filename_transcriptome:
        transcripts_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.stop()
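A hypothetical command line for the script above; the script name and all file names are assumptions, the options are those defined by its parser:

    python bams2bam.py \
        --gtf-file=geneset.gtf.gz \
        --transcripts-gtf-file=transcriptome.bam \
        --junctions-bed-file=junctions.bam \
        --filename-mismapped=mismapped.bam \
        --force-output \
        input.bam > filtered.bam

The filtered alignments go to stdout (BAM by default, SAM with --output-sam), while mismapped reads are written to the file given by --filename-mismapped.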
예제 #13
0
def main(argv=sys.argv):

    parser = E.ArgumentParser()

    parser.add_argument("--version",
                        action='version',
                        version='%(prog)s {version}'.format(version="1.0"))

    parser.add_argument("-t",
                        "--no-titles",
                        dest="input_has_titles",
                        action="store_false",
                        help="no titles in input.")

    parser.add_argument("--ignore-titles",
                        dest="ignore_titles",
                        action="store_true",
                        help="ignore titles in input")

    parser.add_argument("-i",
                        "--skip-titles",
                        dest="skip_titles",
                        action="store_true",
                        help="skip output of titles.")

    parser.add_argument("-m",
                        "--missing-value",
                        dest="missing_value",
                        type=str,
                        help="entry to use for missing values.")

    parser.add_argument("--header-names",
                        dest="headers",
                        type=str,
                        help="add headers for files as a ,-separated "
                        "list.")

    parser.add_argument("-c",
                        "--columns",
                        dest="columns",
                        type=str,
                        help="columns to use for joining. Multiple columns "
                        "can be specified as a comma-separated list ")

    parser.add_argument("-k",
                        "--take",
                        dest="take",
                        type=str,
                        action="append",
                        help="columns to take. If not set, all columns "
                        "except for "
                        "the join columns are taken")

    parser.add_argument("-g",
                        "--glob",
                        dest="glob",
                        type=str,
                        help="wildcard expression for table names.")

    parser.add_argument(
        "-s",
        "--sort-order",
        dest="sort",
        type=str,
        help="sort by column titles in particular given order: "
        "alphabetical|numeric|list of columns.")

    parser.add_argument("-e",
                        "--merge-overlapping",
                        dest="merge",
                        action="store_true",
                        help="simply merge tables without matching up "
                        "rows.")

    parser.add_argument("-a",
                        "--cat",
                        dest="cat",
                        type=str,
                        help="simply concatenate tables. Adds an "
                        "additional column called X with the filename ")

    parser.add_argument("--sort-keys",
                        dest="sort_keys",
                        type=str,
                        choices=("numeric", "alphabetic"),
                        help="sort key columns by value.")

    parser.add_argument("--keep-empty",
                        dest="ignore_empty",
                        action="store_false",
                        help="keep empty tables. The default is "
                        "to ignore them.")

    parser.add_argument("--ignore-empty",
                        dest="ignore_empty",
                        action="store_true",
                        help="ignore empty tables - this is "
                        "the default.")

    parser.add_argument("--add-file-prefix",
                        dest="add_file_prefix",
                        action="store_true",
                        help="add file prefix to "
                        "columns headers. Suitable for multi-column"
                        "tables")

    parser.add_argument("--use-file-prefix",
                        dest="use_file_prefix",
                        action="store_true",
                        help="use file prefix as column headers. "
                        "Suitable for two-column tables ")

    parser.add_argument("--prefixes",
                        dest="prefixes",
                        type=str,
                        help="list of prefixes to use. "
                        ", separated list of prefixes. "
                        "The number of prefixes need to correspond to the "
                        "number of input files")

    parser.add_argument("--regex-filename",
                        dest="regex_filename",
                        type=str,
                        help="pattern to apply to filename to "
                        "build prefix")

    parser.add_argument("--regex-start",
                        dest="regex_start",
                        type=str,
                        help="regular expression to start "
                        "collecting table in a file")

    parser.add_argument("--regex-end",
                        dest="regex_end",
                        type=str,
                        help="regular expression to end collecting "
                        "table in a file")

    parser.add_argument("--test",
                        dest="test",
                        type=int,
                        help="test combining tables with "
                        "first X rows")

    parser.set_defaults(
        input_has_titles=True,
        skip_titles=False,
        missing_value="na",
        headers=None,
        sort=None,
        glob=None,
        columns="1",
        sort_keys=False,
        merge=False,
        ignore_empty=True,
        regex_start=None,
        regex_end=None,
        add_file_prefix=False,
        use_file_prefix=False,
        cat=None,
        take=[],
        regex_filename="(.*)",
        prefixes=None,
        test=0,
    )

    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    if args.headers:
        if "," in args.headers:
            args.headers = args.headers.split(",")
        else:
            args.headers = re.split(r"\s+", args.headers.strip())

    if args.sort and args.sort not in ("numeric", "alphabetic"):
        if "," in args.sort:
            args.sort = args.sort.split(",")
        else:
            args.sort = re.split(r"\s+", args.sort)

    if args.merge:
        args.columns = []
    else:
        args.columns = [int(x) - 1 for x in args.columns.split(",")]

    args.filenames = []

    if args.glob:
        args.filenames += glob.glob(args.glob)

    args.filenames += unknown

    if len(args.filenames) < 1:
        raise ValueError("no tables found.")

    E.info("combining %i tables" % len(args.filenames))

    if args.cat:
        concatenate_tables(args.stdout, args, unknown)
    else:
        join_tables(args.stdout, args, unknown)

    E.stop()
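Two hypothetical command lines for the table-combining script above (script and file names are assumptions):

    # join tables on their first column (the default --columns=1)
    python combine_tables.py --columns=1 a.tsv b.tsv > joined.tsv

    # concatenate tables, adding a column named "track" that holds each filename
    python combine_tables.py --cat=track a.tsv b.tsv > concatenated.tsv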
예제 #14
0
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--inplace",
                        dest="inplace",
                        action="store_true",
                        help="update option list in place. New options will"
                        "be added to the list given by --options-tsv-file. "
                        "Options will only be added, not removed ")

    parser.add_argument("--options-tsv-file",
                        dest="tsv_file",
                        type=str,
                        help="existing table with options. Will be updated if "
                        "--in-place is set [default]")

    parser.set_defaults(inplace=False, tsv_file=None)

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv)

    old_options = None
    if args.tsv_file:
        if not os.path.exists(args.tsv_file):
            raise OSError("filename %s not found, see --options-tsv-file" %
                          args.tsv_file)
        old_options = pandas.read_csv(
            iotools.open_file(args.tsv_file),
            sep="\t",
            index_col=0,
        )
        old_options = old_options.fillna("")

    global ORIGINAL_START
    ORIGINAL_START = E.start

    all_options = collections.defaultdict(list)

    for label, expression in EXPRESSIONS:

        files = glob.glob(expression)
        files.sort()

        for f in files:

            E.debug("processing %s" % f)
            if os.path.isdir(f):
                continue
            if os.path.basename(f) in EXCLUDE:
                continue
            collected_options = collectOptionsFromScript(os.path.abspath(f))
            for o in collected_options:
                all_options[o].append(f)

    # add old options (only if an existing option table was read)
    if old_options is not None:
        for x in old_options.index:
            if x not in all_options:
                all_options[x].append("--")

    if args.inplace:
        outfile = iotools.open_file(args.tsv_file, "w")
        E.info("updating file '%s'" % args.tsv_file)
    else:
        outfile = args.stdout

    outfile.write("option\taction\tcomment\talternative\tfiles\n")
    for o, v in sorted(all_options.items()):
        try:
            action, comment, alternative, ff = old_options.xs(o)
        except (KeyError, AttributeError):
            # option not present in the old table, or no table was given
            action, comment, alternative, ff = "", "", "", ""

        if comment == "nan":
            comment = ""
        if alternative == "nan":
            alternative = ""

        outfile.write("\t".join(
            (list(map(str, (o, action, comment, alternative, ",".join(v)))))) +
                      "\n")

    if outfile != args.stdout:
        outfile.close()

    # write footer and output benchmark information.
    E.stop()
예제 #15
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-o",
                        "--min-overlap",
                        dest="min_overlap",
                        type=int,
                        help="minimum overlap")

    parser.add_argument(
        "-w",
        "--pattern-window",
        dest="pattern_window",
        type=str,
        help="regular expression to extract window coordinates from "
        "test id ")

    parser.add_argument("-i",
                        "--invert",
                        dest="invert",
                        action="store_true",
                        help="invert direction of fold change ")

    parser.set_defaults(min_overlap=10,
                        invert=False,
                        pattern_window=r"(\S+):(\d+)-(\d+)")

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv, add_output_options=True)

    outfiles = iotools.FilePool(args.output_filename_pattern)

    if args.invert:
        test_f = lambda l2fold: l2fold < 0
    else:
        test_f = lambda l2fold: l2fold > 0

    def read():

        rx_window = re.compile(args.pattern_window)
        # filter any of the DESeq/EdgeR message that end up at the top of the
        # output file

        for data in iotools.iterate(args.stdin):

            contig, start, end = rx_window.match(data.test_id).groups()
            start, end = list(map(int, (start, end)))

            yield DATA._make(
                (data.test_id, contig, start, end, data.treatment_name,
                 float(data.treatment_mean),
                 float(data.treatment_std), data.control_name,
                 float(data.control_mean), float(data.control_std),
                 float(data.pvalue), float(data.qvalue), float(data.l2fold),
                 float(data.fold), int(data.significant), data.status, 0))

    def grouper(data, distance=10):

        last = next(data)
        entries = [last]

        while 1:
            try:
                d = next(data)
            except StopIteration:
                # iterator exhausted - flush the final group below
                break
            if d.contig == last.contig and d.start < last.start:
                raise ValueError("error not sorted by start")

            if ((d.contig != last.contig) or (d.start - last.end > distance)
                    or (d.status != last.status)
                    or (d.significant != last.significant)
                    or (d.l2fold * last.l2fold < 0)):
                yield entries
                entries = []

            entries.append(d)
            last = d

        yield entries

    counter = E.Counter()

    args.stdout.write("\t".join(DATA._fields) + "\n")

    # set of all sample names - used to create empty files
    samples = set()

    # need to sort by coordinate
    all_data = list(read())
    all_data.sort(key=lambda x: (x.contig, x.start))

    group_id = 0

    for group in grouper(iter(all_data), distance=args.min_overlap):
        group_id += 1

        start, end = group[0].start, group[-1].end
        assert start < end, 'start > end: %s' % str(group)
        n = float(len(group))
        counter.input += n

        g = group[0]

        if g.l2fold < 0:
            l2fold = max([x.l2fold for x in group])
            fold = max([x.fold for x in group])
        else:
            l2fold = min([x.l2fold for x in group])
            fold = min([x.fold for x in group])

        outdata = DATA._make(
            (str(group_id), g.contig, start, end, g.treatment_name,
             sum([x.treatment_mean for x in group]) / n,
             max([x.treatment_std for x in group]), g.control_name,
             sum([x.control_mean
                  for x in group]) / n, max([x.control_std for x in group]),
             max([x.pvalue for x in group]), max([x.qvalue for x in group]),
             l2fold, fold, g.significant, g.status, int(n)))

        samples.add(g.treatment_name)
        samples.add(g.control_name)
        if g.significant:
            if test_f(g.l2fold):
                # treatment lower methylation than control
                outfiles.write(
                    g.treatment_name, "%s\t%i\t%i\t%i\t%f\n" %
                    (g.contig, g.start, g.end, group_id,
                     sum([x.treatment_mean for x in group]) / n))

            else:
                outfiles.write(
                    g.control_name, "%s\t%i\t%i\t%i\t%f\n" %
                    (g.contig, g.start, g.end, group_id,
                     sum([x.control_mean for x in group]) / n))

        args.stdout.write("\t".join(map(str, outdata)) + "\n")

        counter.output += 1

    # create empty files
    for sample in samples:
        outfiles.write(sample, "")

    outfiles.close()
    E.info("%s" % counter)

    # write footer and output benchmark information.
    E.stop()
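A hypothetical command line for the window-merging script above; it reads DESeq/edgeR-style results from stdin and writes per-sample interval files through --output-filename-pattern (script and file names are assumptions):

    zcat dmr_results.tsv.gz |
    python merge_windows.py \
        --pattern-window="(\S+):(\d+)-(\d+)" \
        --min-overlap=10 \
        --output-filename-pattern="merged_%s.bed" > merged_windows.tsv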
예제 #16
0
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-u",
                      "--use",
                      dest="use",
                      type="choice",
                      choices=("pval", "padj"),
                      help="Type of p-value to use for clade size")

    parser.add_option("-m",
                      "--taxa-map",
                      dest="taxa_map",
                      type="string",
                      help="Taxa mapping file - basically a text tree")

    parser.add_option("-l",
                      "--highest-level",
                      dest="highest_level",
                      type="string",
                      help="highest taxonomic level to visualise")

    parser.add_option(
        "-f",
        "--filter",
        dest="filter",
        action="store_true",
        help="do you want to filter? will filter based on highest-level")

    parser.add_option("-k",
                      "--keep",
                      dest="keep",
                      type="string",
                      help="keep all clades below these")

    parser.add_option(
        "--additional-labels",
        dest="additional_labels",
        type="string",
        help=
        "by default just the highest level labels are shown. Here you can add additional labels"
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # read tree
    tree = readTree(options.taxa_map, options.highest_level)

    # filter if neccessary
    new_tree = {}
    keep = set()
    if options.filter:
        assert options.keep, "must specify which clades to keep"
        to_keep = options.keep.split(",")
        for t in to_keep:
            new_tree[t] = tree[t]
    else:
        assert len(
            list(tree.keys())
        ) < 159, "not enough colours to support n = %i clades, please filter" % len(
            tree)
        new_tree = tree

    #get those to keep
    keep = set()
    for taxon, rest in new_tree.items():
        keep.add(taxon)
        for r in rest:
            keep.add(r)

    # get colours
    ncols = len(list(new_tree.keys()))
    colours = getColours(ncols)
    taxon2colour = {}
    for i in range(ncols):
        taxon2colour[list(new_tree.keys())[i]] = colours[i]

    # add in colours for all clade nodes that is
    # based on the highest node
    for h, r in new_tree.items():
        for taxon in r:
            taxon2colour[taxon] = taxon2colour[h]

    # read diff and output annotations
    result = collections.defaultdict(list)
    ps = []
    fcs = []
    taxa = []
    colours = []
    shapes = []
    sig = []
    for line in options.stdin.readlines():

        # skip header
        if "taxa" and "log2FoldChange" in line:
            continue

        data = line.strip("\n").split("\t")
        taxon = data[0]
        if taxon not in keep:
            continue

        # assign colour
        if taxon in list(taxon2colour.keys()):
            colour = taxon2colour[taxon]
        else:
            colour = "NA"
        colours.append(colour)

        # append taxon to list
        taxa.append(taxon)

        # use -log10 p-value for clade size
        if options.use == "pval":
            column = 5
        elif options.use == "padj":
            column = 6
        else:
            raise ValueError("must use pval or padj, not %s for clade size" %
                             options.use)

        # catch NA pvalues
        if data[column] == "NA":
            data[column] = 1

        p = -math.log10(float(data[column]))
        if p >= 1.3:
            shapes.append("*")
            sig.append(taxon)
        else:
            shapes.append("o")

        p = p * 100
        result[taxon].append(p)
        ps.append(p)

        # fold changes
        if data[2] == "NA":
            fc = 0
        else:
            fc = data[2]
        fcs.append(fc)

    # output annotations
    options.stdout.write("%s\t%s\n" % ("clade_separation", "0.9"))
    for t, s in zip(taxa, shapes):
        options.stdout.write("%s\t%s\t%s\n" % (t, "clade_marker_shape", s))
    for t, c in taxon2colour.items():
        options.stdout.write("%s\t%s\t%s\n" % (t, "clade_marker_color", c))
    for t, c in taxon2colour.items():
        options.stdout.write("%s\t%s\t%s\n" %
                             (t, "annotation_background_color", c))
    for t in taxon2colour:
        options.stdout.write("%s\t%s\t%s\n" % (t, "annotation_font_size", 12))

    for t, p, f in zip(taxa, ps, fcs):
        if t in sig and float(f) > 0:
            options.stdout.write("%s\t%s\t%s\n" %
                                 (t, "clade_marker_color", "r"))
            options.stdout.write("%s\t%s\t%f\n" %
                                 (t, "clade_marker_size", 200))
            options.stdout.write("%s\t%s\t%s\n" %
                                 (t, "annotation_background_color", "r"))
            options.stdout.write("%s\t%s\t%s\t%s\n" % (t, "ring_height", 1, f))
            options.stdout.write("%s\t%s\t%s\t%s\n" %
                                 (t, "ring_color", 1, "r"))
            options.stdout.write("%s\t%s\t%s\t%s\n" %
                                 (t, "ring_alpha", 1, 0.5))

        elif t in sig and float(f) < 0:
            options.stdout.write("%s\t%s\t%s\n" %
                                 (t, "clade_marker_color", "b"))
            options.stdout.write("%s\t%s\t%f\n" %
                                 (t, "clade_marker_size", 200))
            options.stdout.write("%s\t%s\t%s\n" %
                                 (t, "annotation_background_color", "b"))
            options.stdout.write("%s\t%s\t%s\t%s\n" %
                                 (t, "ring_height", 1, float(f) * -1))
            options.stdout.write("%s\t%s\t%s\t%s\n" %
                                 (t, "ring_color", 1, "b"))
            options.stdout.write("%s\t%s\t%s\t%s\n" %
                                 (t, "ring_alpha", 1, 0.5))

        elif t not in sig:
            options.stdout.write("%s\t%s\t%f\n" % (t, "clade_marker_size", p))

    # only output annotation for highest-level and
    # additional labels
    if options.additional_labels:
        additional_labels = options.additional_labels.split(",")
    else:
        additional_labels = []
    for t, p in zip(taxa, ps):
        if t in additional_labels:
            #           if "_" in t:
            #               a =  "".join(random.sample(list(string.ascii_lowercase),2)) + ":" + t.split(".")[-1]
            #           else:
            a = t.split(".")[-1]
            options.stdout.write("%s\t%s\t%s\n" % (t, "annotation", a))
            options.stdout.write("%s\t%s\t%s\n" %
                                 (t, "annotation_rotation", 90))
            options.stdout.write("%s\t%s\t%s\n" %
                                 (t, "annotation_font_size", 8))
        elif t in list(tree.keys()):
            a = t.split(".")[-1]
            options.stdout.write("%s\t%s\t%s\n" % (t, "annotation", a))
            options.stdout.write("%s\t%s\t%s\n" %
                                 (t, "annotation_font_size", 12))

    # write the tree out
    outf = open("input_tree.txt", "w")
    for x, y in new_tree.items():
        for taxon in y:
            outf.write(taxon + "\n")
    outf.close()

    # write footer and output benchmark information.
    E.stop()
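A hypothetical command line for the annotation script above; it reads a differential-abundance table from stdin and writes GraPhlAn-style annotations to stdout as well as the tree to input_tree.txt (script and file names are assumptions):

    python make_graphlan_annotations.py \
        --taxa-map=taxa_map.txt \
        --highest-level=phylum \
        --use=padj < deseq_results.tsv > annotations.txt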
예제 #17
0
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=[
                          "mutation-profile-bar-plot",
                          "depth-profile-line-plot", "manhattan-plot"
                      ],
                      help="methods to apply [%default]")

    parser.add_option("-t",
                      "--transformation",
                      dest="transformations",
                      type="choice",
                      action="append",
                      choices=["log-depth-ratio"],
                      help="dataframe transformation options [%default]")

    parser.add_option("-r",
                      "--regex-filename",
                      dest="regex_filename",
                      type="string",
                      help="[%default]")

    parser.add_option("-f",
                      "--reference-fasta-file",
                      dest="reference_fasta_file",
                      help="reference genomic sequence in fasta format. "
                      "[%default]")

    parser.add_option("--input-file-format",
                      dest="input_file_format",
                      type="choice",
                      choices=("tsv", "bcftools-query"),
                      help="input file format "
                      "[%default]")

    parser.add_option(
        "--plot-options",
        dest="plot_options",
        type="string",
        help="plot options to pass through to the plotter. The string is "
        "eval'ed, for example: --plot-options='window_size=3, ylabel=\"12\"' "
        "[%default]")

    parser.set_defaults(
        method=None,
        reference_fasta_file=None,
        input_file_format="tsv",
        plot_options=None,
        transformations=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    filenames = args

    if len(filenames) == 0:
        E.info("reading from stdin")
        filenames = [options.stdin]

    if options.plot_options is not None:
        plot_options = eval("dict({})".format(options.plot_options))
    else:
        plot_options = {}

    for index, filename in enumerate(filenames):

        E.info("working on {}".format(filename))

        try:
            if options.input_file_format == "bcftools-query":
                # for bctools query, header starts with "#".

                dataframe = pandas.read_csv(filename,
                                            sep="\t",
                                            skip_blank_lines=False,
                                            header=0,
                                            dtype={"CHROM": str})
                # names are of format [1]sample1:DP, extract sample1
                dataframe.columns = ([
                    re.search("\[\d+\]([^:]+)", x).groups()[0]
                    for x in dataframe.columns
                ])
            else:
                dataframe = pandas.read_csv(filename,
                                            sep="\t",
                                            dtype={"CHROM": str})
        except pandas.errors.EmptyDataError:
            E.warn("no data in {}, skipped".format(filename))
            continue

        E.info("read data from {}".format(filename))

        if options.regex_filename:
            section = re.search(options.regex_filename, filename).groups()[0]
        else:
            section = "{}".format(index + 1)

        for method in options.transformations:
            if method == "log-depth-ratio":
                dataframe = compute_log_depth_ratio(dataframe)

        if dataframe.empty:
            E.warn("dataframe from {} is empty - skipped".format(filename))
            continue

        if options.method == "mutation-profile-bar-plot":
            plot_mutation_profile_bar_plot(dataframe, section, **plot_options)

        elif options.method == "depth-profile-line-plot":
            plot_depth_profile_plot(dataframe, section, **plot_options)

        elif options.method == "manhattan-plot":
            plot_manhattan_plot(dataframe,
                                section,
                                filename_fasta=options.reference_fasta_file,
                                **plot_options)

    E.stop()
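A small sketch of how the --plot-options value in the script above becomes keyword arguments; the example string is taken from the option's help text, and the behaviour assumes trusted input, since the string is passed to eval:

    plot_options_string = 'window_size=3, ylabel="12"'
    plot_options = eval("dict({})".format(plot_options_string))
    # plot_options == {'window_size': 3, 'ylabel': '12'} and is forwarded
    # to the plotting functions as **plot_options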
예제 #18
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome (indexed).")

    parser.add_argument("-w",
                        "--windows-bed-file",
                        dest="filename_windows",
                        type=str,
                        help="gff file with windows to use.")

    parser.add_argument("-d",
                        "--filename-data",
                        dest="filename_data",
                        type=str,
                        help="gff file with data to use.")

    parser.add_argument("--is-gtf",
                        dest="is_gtf",
                        action="store_true",
                        help="filename-data is gtf file")

    parser.add_argument("-f",
                        "--features",
                        dest="features",
                        type=str,
                        action="append",
                        choices=("GC", ),
                        help="features to compute.")

    parser.add_argument("-c",
                        "--decorator",
                        dest="decorator",
                        type=str,
                        choices=("counts", "gc", "gc3", "mean-length",
                                 "median-length", "percent-coverage",
                                 "median-score", "mean-score", "stddev-score",
                                 "min-score", "max-score"),
                        help="decorators to use.")

    parser.add_argument("-e",
                        "--skip-empty",
                        dest="skip_empty",
                        action="store_true",
                        help="skip empty windows.")

    parser.add_argument(
        "-t",
        "--transform=",
        dest="transform",
        type=str,
        choices=("none", "overlap", "complement", "third_codon"),
        help="transform to use when mapping overlapping regions onto window.")

    parser.set_defaults(
        genome_file=None,
        filename_windows=None,
        filename_data=None,
        features=[],
        skip_empty=False,
        decorator="counts",
        transform="none",
        is_gtf=False,
    )

    (args) = E.start(parser, argv=argv)

    #    test_transform_third_codon()

    if not args.filename_windows:
        raise ValueError("please supply a gff file with window information.")

    if args.loglevel >= 1:
        args.stdlog.write("# reading windows...")
        args.stdlog.flush()

    windows = GTF.readAsIntervals(
        GTF.iterator(iotools.open_file(args.filename_windows, "r")))

    if args.loglevel >= 1:
        args.stdlog.write("done\n")
        args.stdlog.flush()

    if args.filename_data:
        if args.loglevel >= 1:
            args.stdlog.write("# reading data...")
            args.stdlog.flush()

        if args.is_gtf:
            gff_data = GTF.readFromFile(
                iotools.open_file(args.filename_data, "r"))
        else:
            gff_data = GTF.readFromFile(
                iotools.open_file(args.filename_data, "r"))

        if args.loglevel >= 1:
            args.stdlog.write("done\n")
            args.stdlog.flush()

        data_ranges = GTF.SortPerContig(gff_data)
    else:
        # use windows to compute properties
        # by supplying no data and asking for the complement = original window
        gff_data = None
        data_ranges = None
        args.transform = "complement"

    map_contig2size = {}

    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
        map_contig2size = fasta.getContigSizes()
    else:
        for contig, values in list(windows.items()):
            map_contig2size[contig] = max(x[1] for x in values)
        fasta = None

    contigs = list(map_contig2size.keys())
    contigs.sort()

    # proceed contig wise
    noutput_contigs, ncontigs_skipped_windows, ncontigs_skipped_data = 0, 0, 0

    args.stdout.write("\t".join(
        map(str, ("contig", "start", "end", "ngenes", "ntranscripts", "n1",
                  "l1", "n2", "l2", "score", "extra_info"))) + "\n")

    for contig in contigs:

        skip = False
        if contig not in windows:
            ncontigs_skipped_windows += 1
            skip = True

        if data_ranges and contig not in data_ranges:
            ncontigs_skipped_data += 1
            skip = True

        if skip:
            continue

        noutput_contigs += 1
        if data_ranges:
            annotateWindows(
                contig, windows[contig],
                gff_data[data_ranges[contig][0]:data_ranges[contig][1]], fasta,
                args)
        else:
            annotateWindows(contig, windows[contig], [], fasta, args)

    E.info(
        "ninput_windows=%i, noutput_contigs=%i, ninput_contigs=%i, nskipped_windows=%i, nskipped_data=%i"
        % (len(windows), noutput_contigs, len(contigs),
           ncontigs_skipped_windows, ncontigs_skipped_data))

    E.stop()
예제 #19
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id",
        usage=globals()["__doc__"])

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome", type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("--extend", dest="extension", type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size", dest="shift", type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size", dest="window_size", type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment",
                               "dmr", "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig-file", dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment", dest="treatment_files", type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control", dest="control_files", type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input", dest="input_files", type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip", action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata", dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --ouptut-filename-pattern [%default].")

    parser.add_option("--rdata-file", dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa", dest="bwa", action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    parser.add_option("--unique", dest="unique", type="float",
                      help="Threshold p-value to determine which read pile\
                      ups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms", dest="chroms", type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="Hsapiens.UCSC.hg19",
        genome_file=None,
        extend=0,
        shift=0,
        window_size=300,
        saturation_iterations=10,
        toolset=[],
        bigwig=False,
        treatment_files=[],
        control_files=[],
        input_files=[],
        output_rdata=False,
        input_rdata=None,
        is_medip=True,
        fdr_threshold=0.1,
        fdr_method="BH",
        bwa=False,
        unique=0.001,
        chroms=None
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin,
                                   dialect="excel-tab"):
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" % (line['chr'],
                                      int(line['start']),
                                      int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return

    if len(options.treatment_files) < 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    if options.chroms is None:
        chrstring = ""
    else:
        chroms = options.chroms.split(",")
        chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)
    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    saturation_iterations = options.saturation_iterations

    uniq = float(options.unique)

    if options.bwa is True:
        BWA = "TRUE"
    else:
        BWA = "FALSE"

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''sr = MEDIPS.saturation(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            window_size=%(window_size)i,
            uniq=%(uniq)s,
            nit = %(saturation_iterations)i,
            paired = %(paired)s,
            bwa = %(BWA)s,
            %(chrstring)s
            nrit = 1)''' % locals())

            R.png(E.get_output_file("%s_saturation.png" % fn))
            R('''MEDIPS.plotSaturation(sr)''')
            R('''dev.off()''')
            R('''write.table(sr$estimation, file ='%s', sep='\t')''' %
              E.get_output_file("%s_saturation_estimation.tsv" % fn))

            outfile = iotools.open_file(
                E.get_output_file("%s_saturation.tsv" % fn), "w")
            outfile.write("category\tvalues\n")
            outfile.write(
                "estimated_correlation\t%s\n" %
                ",".join(["%f" % x for x in R('''sr$maxEstCor''')]))
            outfile.write(
                "true_correlation\t%s\n" %
                ",".join(["%f" % x for x in R('''sr$maxTruCor''')]))
            outfile.write(
                "nreads\t%s\n" %
                ",".join(["%i" % x for x in R('''sr$numberReads''')]))
            outfile.close()

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''cr = MEDIPS.seqCoverage(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            pattern='CG',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            R.png(E.get_output_file("%s_cpg_coverage_pie.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''')
            R('''dev.off()''')

            R.png(E.get_output_file("%s_cpg_coverage_hist.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "hist", t=15)''')
            R('''dev.off()''')

            # note: this file is large
            R('''write.table(cr$cov.res, file=gzfile('%s','w'),
            sep='\t')''' %
              E.get_output_file("%s_saturation_coveredpos.tsv.gz" % fn))

    if 'enrichment' in options.toolset or do_all:
        E.info("CpG enrichment analysis")
        outfile = iotools.open_file(E.get_output_file("enrichment.tsv.gz"), "w")
        slotnames = (("regions.CG", "regions_CG", "%i"),
                     ("regions.C", "regions_C", "%s"),
                     ("regions.G", "regions_G", "%f"),
                     ("regions.relH", "regions_relH", "%i"),
                     ("regions.GoGe", "regions_GoGe", "%i"),
                     ("genome.CG", "genome_CG", "%s"),
                     ("genome.C", "genome_C", "%s"),
                     ("genome.G", "genome_G", "%i"),
                     ("genome.relH", "genome_relH", "%i"),
                     ("enrichment.score.relH", "enrichment_relH", "%s"),
                     ("enrichment.score.GoGe", "enrichment_GoGe", "%s"))

        outfile.write("\t".join(['sample'] +
                                [x[1] for x in slotnames]) + "\n")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''ce = MEDIPS.CpGenrich(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            outfile.write("%s" % fn)
            for slotname, label, pattern in slotnames:
                value = tuple(R('''ce$%s''' % slotname))
                if len(value) == 0:
                    # slot missing in the R result - leave the field empty
                    outfile.write("\t")
                    continue
                outfile.write("\t%s" % pattern % value[0])
            outfile.write("\n")
        outfile.close()

    if options.input_rdata:
        E.info("reading R session info from '%s'" % options.input_rdata)
        R('''load('%s')''' % options.input_rdata)

    else:
        if "dmr" in options.toolset or "correlation" in options.toolset \
           or do_all:
            # build four sets
            for x, fn in enumerate(options.treatment_files):
                paired = isPaired(fn)
                E.info("loading '%s'" % fn)
                R('''treatment_R%(x)i = MEDIPS.createSet(
                file='%(fn)s',
                BSgenome='%(genome_file)s',
                shift=%(shift)i,
                extend=%(extend)i,
                window_size=%(window_size)i,
                paired=%(paired)s,
                bwa=%(BWA)s,
                %(chrstring)s
                uniq=%(uniq)s)''' % locals())
            R('''treatment_set = c(%s)''' %
              ",".join(["treatment_R%i" % x
                        for x in range(len(options.treatment_files))]))

            if options.control_files:
                for x, fn in enumerate(options.control_files):
                    paired = isPaired(fn)
                    E.info("loading '%s'" % fn)
                    R('''control_R%(x)i = MEDIPS.createSet(
                    file='%(fn)s',
                    BSgenome='%(genome_file)s',
                    shift=%(shift)i,
                    extend=%(extend)i,
                    window_size=%(window_size)i,
                    paired=%(paired)s,
                    bwa=%(BWA)s,
                    %(chrstring)s
                    uniq=%(uniq)s)''' % locals())
                R('''control_set = c(%s)''' %
                  ",".join(["control_R%i" % x
                            for x in range(len(options.control_files))]))

            # build coupling vector
            R('''CS = MEDIPS.couplingVector(pattern="CG",
            refObj = treatment_set[[1]])''')

            if "correlation" in options.toolset or do_all:
                R('''cor.matrix = MEDIPS.correlation(
                c(treatment_set, control_set))''')

                R('''write.table(cor.matrix,
                file='%s',
                sep="\t")''' % E.get_output_file("correlation"))

            if "dmr" in options.toolset or do_all:
                # Data that does not fit the model causes
                # "Error in 1:max_signal_index : argument of length 0"
                # The advice is to set MeDIP=FALSE
                # See: http://comments.gmane.org/
                # gmane.science.biology.informatics.conductor/52319

                if options.is_medip:
                    medip = "TRUE"
                else:
                    medip = "FALSE"
                fdr_method = options.fdr_method

                E.info("applying test for differential methylation")
                R('''meth = MEDIPS.meth(
                MSet1 = treatment_set,
                MSet2 = control_set,
                CSet = CS,
                ISet1 = NULL,
                ISet2 = NULL,
                p.adj = "%(fdr_method)s",
                diff.method = "edgeR",
                MeDIP = %(medip)s,
                CNV = F,
                minRowSum = 1)''' % locals())

                # Note: several Gb in size
                # Output full methylation data table
                R('''write.table(meth,
                file=gzfile('%s', 'w'),
                sep="\t",
                row.names=F,
                quote=F)''' % E.get_output_file("data.tsv.gz"))

                # save R session
                if options.output_rdata:
                    R('''save.image(file='%s', safe=FALSE)''' %
                      E.get_output_file("session.RData"))

    # DMR analysis - test for windows and output
    if "dmr" in options.toolset:

        E.info("selecting differentially methylated windows")

        # test windows for differential methylation
        fdr_threshold = options.fdr_threshold
        R('''tested = MEDIPS.selectSig(meth,
        adj=T,
        ratio=NULL,
        p.value=%(fdr_threshold)f,
        bg.counts=NULL,
        CNV=F)''' % locals())

        R('''write.table(tested,
        file=gzfile('%s', 'w'),
        sep="\t",
        quote=F)''' % E.get_output_file("significant_windows.gz"))

        # select gain and merge adjacent windows
        try:
            R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),];
            gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''')
            E.info('gain output: %s, merged: %s' %
                   (str(R('''dim(gain)''')),
                    str(R('''dim(gain_merged)'''))))
            R('''of=gzfile('%s', 'w');
            write.table(gain_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=FALSE,
            col.names=FALSE); close(of)''' % E.get_output_file("gain.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute gain windows: msg=%s" % msg)
        # select loss and merge adjacent windows
        try:
            R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),];
            loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''')
            E.info('loss output: %s, merged: %s' %
                   (str(R('''dim(loss)''')),
                    str(R('''dim(loss_merged)'''))))

            R('''of=gzfile('%s', 'w');
            write.table(loss_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=F,
            col.names=F); close(of)''' % E.get_output_file("loss.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute loss windows: msg=%s" % msg)

    # if "rpm" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rpm.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = T, descr = "rpm")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # if "rms" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rms.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = F, descr = "rms")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # write footer and output benchmark information.
    E.stop()
Example #20
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-w",
        "--weights-tsv-file",
        dest="filename_weights",
        type=str,
        help="filename with codon frequencies. Multiple filenames "
        "can be separated by comma.")

    parser.add_argument("-s",
                        "--section",
                        dest="sections",
                        nargs="*",
                        type=str,
                        choices=("length", "sequence", "hid", "na", "aa",
                                 "cpg", "dn", "degeneracy", "gaps", "codons",
                                 "codon-usage", "codon-translator",
                                 "codon-bias"),
                        help="which sections to output ")

    parser.add_argument(
        "-t",
        "--sequence-type",
        dest="seqtype",
        type=str,
        choices=("na", "aa"),
        help="type of sequence: na=nucleotides, aa=amino acids.")

    parser.add_argument(
        "-e",
        "--regex-identifier",
        dest="regex_identifier",
        type=str,
        help="regular expression to extract identifier from fasta "
        "description line.")

    parser.add_argument(
        "--split-fasta-identifier",
        dest="split_id",
        action="store_true",
        help="split fasta description line (starting >) and use "
        "only text before first space")

    parser.add_argument(
        "--add-total",
        dest="add_total",
        action="store_true",
        help="add a row with column totals at the end of the table")

    parser.set_defaults(
        filename_weights=None,
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
        gap_chars='xXnN',
        split_id=False,
        add_total=False,
    )

    (args) = E.start(parser, argv=argv)

    rx = re.compile(args.regex_identifier)

    reference_codons = []
    if args.filename_weights:
        args.filename_weights = args.filename_weights.split(",")
        for filename in args.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                reference_codons.append(
                    iotools.ReadMap(iotools.open_file(filename, "r"),
                                    has_header=True,
                                    map_functions=(str, float)))

        # print codon table differences
        args.stdlog.write(
            "# Difference between supplied codon usage preferences.\n")
        for x in range(0, len(reference_codons)):
            for y in range(0, len(reference_codons)):
                if x == y:
                    continue
                # calculate KL distance
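                # d = sum over non-stop codons of b[codon] * log(b[codon] / a[codon]),
                # i.e. the Kullback-Leibler divergence of table y (b) from table x (a)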
                a = reference_codons[x]
                b = reference_codons[y]
                d = 0
                for codon, p in list(a.items()):
                    if Genomics.IsStopCodon(codon):
                        continue
                    d += b[codon] * math.log(b[codon] / p)

                args.stdlog.write(
                    "# tablediff\t%s\t%s\t%f\n" %
                    (args.filename_weights[x], args.filename_weights[y], d))

    iterator = FastaIterator.FastaIterator(args.stdin)

    def getCounter(section):
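        # return the SequenceProperties counter matching the requested
        # section for the current sequence type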

        if args.seqtype == "na":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "na":
                s = SequenceProperties.SequencePropertiesNA()
            elif section == "gaps":
                s = SequenceProperties.SequencePropertiesGaps(args.gap_chars)
            elif section == "cpg":
                s = SequenceProperties.SequencePropertiesCpg()
            elif section == "dn":
                s = SequenceProperties.SequencePropertiesDN()
            # these sections require the sequence length to be a multiple of 3
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAA()
            elif section == "degeneracy":
                s = SequenceProperties.SequencePropertiesDegeneracy()
            elif section == "codon-bias":
                s = SequenceProperties.SequencePropertiesBias(reference_codons)
            elif section == "codons":
                s = SequenceProperties.SequencePropertiesCodons()
            elif section == "codon-usage":
                s = SequenceProperties.SequencePropertiesCodonUsage()
            elif section == "codon-translator":
                s = SequenceProperties.SequencePropertiesCodonTranslator()
            else:
                raise ValueError("unknown section %s" % section)
        elif args.seqtype == "aa":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAminoAcids()
            else:
                raise ValueError("unknown section %s" % section)
        return s

    # setup totals
    totals = {}
    for section in args.sections:
        totals[section] = getCounter(section)

    args.stdout.write("id")
    for section in args.sections:
        args.stdout.write("\t" + "\t".join(totals[section].getHeaders()))

    args.stdout.write("\n")
    args.stdout.flush()

    s = getCounter("hid")
    s.loadSequence("AAAAAAAAA", "na")

    for cur_record in iterator:

        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if len(sequence) == 0:
            raise ValueError("empty sequence %s" % cur_record.title)

        id = rx.search(cur_record.title).groups()[0]

        if args.split_id is True:
            args.stdout.write("%s" % id.split()[0])
        else:
            args.stdout.write("%s" % id)
        args.stdout.flush()

        for section in args.sections:
            s = getCounter(section)
            s.loadSequence(sequence, args.seqtype)
            totals[section].addProperties(s)

            args.stdout.write("\t" + "\t".join(s.getFields()))

        args.stdout.write("\n")

    if args.add_total:
        args.stdout.write("total")
        for section in args.sections:
            args.stdout.write("\t" + "\t".join(totals[section].getFields()))
        args.stdout.write("\n")

    E.stop()
Example #21
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option("--task",
                      dest="task",
                      type="choice",
                      choices=[
                          "merge_exclusions", "flag_hets", "find_inbreds",
                          "flag_relations", "discordant_gender"
                      ],
                      help="task to execute on phenotype file(s)")

    parser.add_option("--gender-check-file",
                      dest="gender_check",
                      type="string",
                      help="output from gender checking "
                      "by Plink, suffix should be .sexcheck")

    parser.add_option("--relationship-file",
                      dest="relations",
                      type="string",
                      help="output file from IBS "
                      "calculation.  Should contain all pairwise "
                      "relationships.")

    parser.add_option("--inbreeding-coef-file",
                      dest="inbreed_file",
                      type="string",
                      help="file containing either Plink "
                      "or GCTA estimates of F, inbreeding coefficient")

    parser.add_option("--inbreeding-coefficient",
                      dest="inbred_coeff",
                      type="choice",
                      choices=["Fhat1", "Fhat2", "Fhat3", "F", "ibc"],
                      help="inbreeding coefficient "
                      "to use to identify highly inbred individuals")

    parser.add_option("--inbred-cutoff",
                      dest="inbred_cutoff",
                      type="float",
                      help="threshold above which individuals are classed "
                      "as inbred.")

    parser.add_option("--ibs-cutoff",
                      dest="ibs_cutoff",
                      type="float",
                      help="IBS threshold to flag individuals as being "
                      "closely related")

    parser.add_option("--trimmed-relationships",
                      dest="rel_cutoff",
                      type="string",
                      help="output file from Plink "
                      "--rel-cutoff with trimmed data set of unrelated "
                      "individuals.")

    parser.add_option(
        "--heterozygotes-file",
        dest="hets_file",
        type="string",
        help="file from heterozygote analysis containing observed "
        "homozygosity and F coefficients")

    parser.add_option("--auxillary-file",
                      dest="aux_file",
                      type="string",
                      help="a file of IIDs and FIDs for individuals that are "
                      "to be removed from analysis, unrelated to QC")

    parser.add_option("--plotting-path",
                      dest="plot_path",
                      type="string",
                      help="PATH to save any plots to")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if options.task == "flag_hets":
        # calculate heterozygosity rates, find and flag
        # individuals > 3 s.d. away from mean value
        # rate = (nonmissing - homs) / nonmissing
        # i.e. non-homozygote rate
        flags = gwas.flagExcessHets(options.hets_file,
                                    plot=True,
                                    plot_path=options.plot_path)
        flags.to_csv(options.stdout, index=None, sep="\t")

    elif options.task == "merge_exclusions":
        exclusions = gwas.mergeQcExclusions(hets_file=options.hets_file,
                                            inbred_file=options.inbreed_file,
                                            related_file=options.relations,
                                            gender_file=options.gender_check,
                                            mask_file=options.aux_file)
        exclusions.to_csv(options.stdout, index=None, sep="\t")
    elif options.task == "find_inbreds":
        inbreds = gwas.flagInbred(inbred_file=options.inbreed_file,
                                  inbreeding_coefficient=options.inbred_coeff,
                                  ibc_threshold=options.inbred_cutoff,
                                  plot=True,
                                  plot_path=options.plot_path)
        inbreds.to_csv(options.stdout, sep="\t", index=None)
    elif options.task == "flag_relations":
        # the input file is likely to be huge! Ergo, read the file in chunks
        # calculate any related individuals and store them, store
        # an array of IBD values for plotting, drop the rest
        relate = gwas.flagRelated(ibd_file=options.relations,
                                  chunk_size=500000,
                                  threshold=options.ibs_cutoff,
                                  plot=True,
                                  plotting_path=options.plot_path)
    elif options.task == "discordant_gender":
        sex_discord = gwas.flagGender(gender_file=options.gender_check,
                                      plot=True,
                                      plot_path=options.plot_path)
        sex_discord.to_csv(options.stdout, index=None, sep="\t")
    else:
        pass

    # write footer and output benchmark information.
    E.stop()
Example #22
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    rows = []
    labels = {}
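    # 'expressions' is assumed to be defined at module level as an iterable
    # of (label, expression) pairs handed to cgat.Style.runPep8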
    for label, expr in expressions:
        nchecked, data = cgat.Style.runPep8(expr)
        rows.append((label, nchecked, data))
        labels.update(dict([(x.code, x.description) for x in data]))

    # build table
    #
    # each row is data set and each column is a Warning/Error type
    # with some additional columns such as total and n.

    # build dictionary mapping error codes to columns
    # consistently across samples
    map_code2column = dict([(y, x + 3) for x, y in enumerate(labels.keys())])
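    # the +3 offset skips the three leading columns (label, files checked, row total)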

    # build first row containing the column labels
    results = [['code', 'n', 'total'] + list(labels.keys())]

    # build array with column totals
    column_totals = [0] * (len(map_code2column) + 3)
    for label, nchecked, data in rows:
        row = [label, nchecked, 0] + [0] * len(map_code2column)
        column_totals[1] += nchecked
        for x in data:
            c = map_code2column[x.code]
            row[c] = x.count
            row[2] += int(x.count)
            column_totals[2] += int(x.count)
            column_totals[c] += int(x.count)

        results.append(row)
    # add column totals
    column_totals[0] = 'total'
    results.append(column_totals)

    # add descriptions as last row
    results.append([
        'description', 'number of files checked',
        'total errors/warnings in set'
    ] + list(labels.values()))

    # output transposed table
    outfile = sys.stdout
    for row in zip(*results):
        outfile.write('%s\n' % ('\t'.join(map(str, row))))

    # write footer and output benchmark information.
    E.stop()
Example #23
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--guess-format",
        dest="guess_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The default behaviour of the script is to guess the quality "
        "format of the input fastq file. The user can specify the "
        "quality format of the input file using the --guess-format option. "
        "The script will use this format if the "
        "sequence qualities are ambiguous. [default=%default].")

    parser.add_option(
        "--target-format",
        dest="target_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The script will convert quality scores to the destination "
        "format [default=%default].")

    parser.set_defaults(
        target_format=None,
        guess_format=None,
        min_quality=10,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    c = E.Counter()

    if options.target_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.target_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    options.stdout.write("read\tnfailed\tnN\t%s\n" %
                         ("\t".join(Stats.Summary().getHeaders())))

    min_quality = options.min_quality

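    # for each read: count bases below the quality threshold and ambiguous
    # bases (N or .), and output summary statistics of the phred scores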
    for record in iterator:
        c.input += 1
        quals = record.toPhred()
        nfailed = len([x for x in quals if x < min_quality])
        nns = record.seq.count("N") + record.seq.count(".")
        options.stdout.write(
            "%s\t%i\t%i\t%s\n" %
            (record.identifier, nfailed, nns, str(Stats.Summary(quals))))
        c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()
Example #24
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode", dest="output_mode", type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that the sequence only "
                      "covers the segments [%default]")

    parser.add_option("--min-sequence-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at the 3' end, the 5' end, both ends, or neither. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand", dest="ignore_strand",
        action="store_false",
        help="use strand information and return reverse complement "
        "on intervals located on the negative strand. "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [fasta.getSequence(bed.contig, strand, start, end)
                        for start, end in bed.toIntervals()]
            seqs.append("".join(seg_seqs))

        elif (options.output_mode == "intervals" or
              options.output_mode == "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            l = bed.end - bed.start
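            # flanking windows have the same length as the interval:
            # left = [start - l, start), right = [end, end + l), clipped to the contig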

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.stop()
Example #25
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--is-gtf",
                        dest="is_gtf",
                        action="store_true",
                        help="input file is in gtf format")

    parser.add_argument("--set-name",
                        dest="name",
                        type=str,
                        help="field from the GFF/GTF file to use as the "
                        "name field in the BED file ",
                        choices=("gene_id", "transcript_id", "class", "family",
                                 "feature", "source", "repName",
                                 "gene_biotype"))

    parser.add_argument("--track",
                        dest="track",
                        type=str,
                        choices=("feature", "source", None),
                        help="use feature/source field to define BED tracks ")

    parser.add_argument(
        "--bed12-from-transcripts",
        dest="bed12",
        action="store_true",
        default=False,
        help="Process GTF file into Bed12 entries, with blocks as exons"
        "and thick/thin as coding/non-coding")

    parser.set_defaults(track=None, name="gene_id", is_gtf=False)

    (args) = E.start(parser, add_pipe_options=True)

    ninput, noutput = 0, 0

    iterator = GTF.iterator(args.stdin)

    if args.bed12:
        iterator = GTF.transcript_iterator(iterator)

    if args.track:
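        # group records by feature or source and write one BED 'track' line per group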
        all_input = list(iterator)

        if args.track == "feature":
            grouper = lambda x: x.feature
        elif args.track == "source":
            grouper = lambda x: x.source

        all_input.sort(key=grouper)

        bed = Bed.Bed()
        for key, vals in itertools.groupby(all_input, grouper):
            args.stdout.write("track name=%s\n" % key)
            for gff in vals:
                ninput += 1

                if args.bed12:
                    bed = transcript2bed12(gff)
                else:
                    bed.fromGTF(gff, name=args.name)

                args.stdout.write(str(bed) + "\n")
                noutput += 1

    else:
        bed = Bed.Bed()
        for gff in iterator:
            ninput += 1

            if args.bed12:
                bed = transcript2bed12(gff)
            else:
                bed.fromGTF(gff, name=args.name)

            args.stdout.write(str(bed) + "\n")

            noutput += 1

    E.info("ninput=%i, noutput=%i" % (ninput, noutput))
    E.stop()
Example #26
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m",
                      "--merge-adjacent",
                      dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e",
                      "--feature",
                      dest="feature",
                      type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f",
                      "--maskregions-bed-file",
                      dest="filename_masks",
                      type="string",
                      metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions",
                      dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length",
                      dest="min_length",
                      type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length",
                      dest="max_length",
                      type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at",
                      dest="extend_at",
                      type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--header-attributes",
                      dest="header_attr",
                      action="store_true",
                      help="add GFF entry attributes to the FASTA record"
                      " header section")

    parser.add_option("--extend-by",
                      dest="extend_by",
                      type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with",
                      dest="extend_with",
                      type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker",
                      dest="masker",
                      type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at",
                      dest="fold_at",
                      type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute",
        dest="naming_attribute",
        type="string",
        help="use attribute to name fasta entry. Currently only compatible"
        " with gff format [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        extend_with=None,
        masker=None,
        fold_at=None,
        naming_attribute=False,
        header_attr=False,
    )

    (options, args) = E.start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with iotools.open_file(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = quicksect.IntervalTree()
            for start, end in e[contig]:
                intersector.add(start, end)
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig, ichunk[0].start,
                                       ichunk[0].end, str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {
                    x.split("=")[0]: x.split("=")[1]
                    for x in chunk[0].attributes.split(";")
                }
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(
                                           quicksect.Interval(start, end))]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name, str([(x.start, x.end)
                                        for x in chunk]), masked_regions))
                    continue

        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

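        # negative strand: convert interval coordinates to the reverse strand
        # and reverse the interval order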
        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [
            fasta.getSequence(contig, strand, start, end)
            for start, end in intervals
        ]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length
                or (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with, ) * options.extend_by)

            if options.extend_at in ("5", "both"):
                s[0] = extension + s[0]  # prepend the padding to the first (5') segment
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        if options.header_attr:
            attributes = " ".join(
                [":".join([ax, ay]) for ax, ay in chunk[0].asDict().items()])
            options.stdout.write(
                ">%s %s:%s:%s feature:%s %s\n%s\n" %
                (name, contig, strand, ";".join(
                    ["%i-%i" % x
                     for x in out]), chunk[0].feature, attributes, seq))
        else:
            options.stdout.write(
                ">%s %s:%s:%s\n%s\n" %
                (name, contig, strand, ";".join(["%i-%i" % x
                                                 for x in out]), seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked,
            nskipped_length))

    E.stop()
Example #27
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d",
                      "--outputdir",
                      dest="outdir",
                      type="string",
                      help="output directory to save plots")

    parser.add_option("-f",
                      "--fasta",
                      dest="fasta_file",
                      type="string",
                      help="fasta file containing tRNA cluster fasta seqs")

    parser.set_defaults(fasta_file=None, outdir=None)

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)

    dict_trna = {}
    for record in FastaIterator.iterate(IOTools.open_file(options.fasta_file)):
        title = record.title.strip("-")
        length = len(record.sequence)
        dict_trna[title] = length

    # for each read in the BAM file, find the end position and plot it
    # against the length of the tRNA cluster
    samfile = pysam.AlignmentFile(options.stdin.name, "rb")
    refname = ""
    values = []
    n = 0
    for line in samfile:
        if line.reference_name == refname:
            if line.reference_end is None:
                pass
            else:
                end = int(line.reference_end) - int(line.reference_start)
                values.append(end)
        elif line.reference_name != refname:
            n += 1
            if n > 1:
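                # reference changed: summarise the collected end positions of
                # the previous reference as percentages per position and plot them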

                values = pd.Series(values)
                percent = values.value_counts() / values.count() * 100
                percent = percent.sort_index()
                percent = pd.DataFrame(percent)
                percent.rename(columns={0: 'Percent'}, inplace=True)

                # length of each tRNA from fasta
                length = dict_trna[refname.strip("-")] + 1

                temp_df = pd.DataFrame(0,
                                       index=range(1, length),
                                       columns=['A'])
                temp_df = pd.concat([temp_df, percent], axis=1)
                percent = temp_df.fillna(0)

                refname = options.outdir + refname.strip("-")
                outfile = refname + ".csv"
                outfig = refname + ".eps"

                percent.to_csv(outfile)

                g = sns.factorplot(x=percent.index,
                                   y="Percent",
                                   data=percent,
                                   size=8,
                                   kind="bar",
                                   palette="Blues")
                g.set_xlabels('position from 5\' end')
                g.set_xticklabels(rotation=90)
                g.savefig(outfig, format='eps')

                values = []
                refname = line.reference_name

            else:

                refname = line.reference_name

    E.stop()
Example #28
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--correct-gap-shift",
                      dest="correct_shift",
                      action="store_true",
                      help="correct gap length shifts in alignments. "
                      "Requires alignlib_lite.py [%default]")

    parser.add_option(
        "-1",
        "--pattern1",
        dest="pattern1",
        type="string",
        help="pattern to extract the identifier from identifiers1. "
        "[%default]")

    parser.add_option(
        "-2",
        "--pattern2",
        dest="pattern2",
        type="string",
        help="pattern to extract the identifier from identifiers2. "
        "[%default]")

    parser.add_option("-o",
                      "--output-section",
                      dest="output",
                      type="choice",
                      action="append",
                      choices=("diff", "missed", "seqdiff"),
                      help="what to output [%default]")

    parser.set_defaults(correct_shift=False,
                        pattern1=r"(\S+)",
                        pattern2=r"(\S+)",
                        output=[])

    (options, args) = E.start(parser)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-shift requires alignlib_lite.py_ "
                "but alignlib not found")

    seqs1 = dict([
        (x.title, x.sequence)
        for x in FastaIterator.iterate(iotools.open_file(args[0], "r"))
    ])
    seqs2 = dict([
        (x.title, x.sequence)
        for x in FastaIterator.iterate(iotools.open_file(args[1], "r"))
    ])

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    found2 = {}

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    write_diff = "diff" in options.output or write_seqdiff

    for k in sorted(seqs1):
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2[k] = 1

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
        else:
            status = "other"

            ndiff += 1

            if s1[1:] == s2[1:]:
                ndiff_first += 1
                status = "first"
            elif s1[:m] == s2[:m]:
                ndiff_prefix += 1
                status = "prefix"
            elif s1[:-1] == s2[:-1]:
                ndiff_last += 1
                status = "last"
            else:
                if len(s1) == len(s2):
                    # get all differences: the first and last residues
                    # can be different for peptide sequences when
                    # comparing my translations with ensembl peptides.
                    differences = []
                    for x in range(1, len(s1) - 1):
                        if s1[x] != s2[x]:
                            differences.append((s1[x], s2[x]))

                    l = len(differences)
                    # check for Selenocysteins
                    if len(
                        [x for x in differences
                         if x[0] == "U" or x[1] == "U"]) == l:
                        ndiff_selenocysteine += 1
                        status = "selenocysteine"

                    # check for masked residues
                    elif len([
                            x for x in differences
                            if x[0] in "NX" or x[1] in "NX"
                    ]) == l:
                        ndiff_masked += 1
                        status = "masked"

            # correct for different gap lengths
            if options.correct_shift:
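                # walk both sequences in parallel, skipping runs of N present in
                # only one of them; the while-loop's else branch runs only if no
                # mismatch forced a break, i.e. the shift could be fixed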

                map_a2b = alignlib_lite.py_makeAlignmentVector()

                a, b = 0, 0
                keep = False

                x = 0
                while x < m and not (a == len(s1) and b == len(s2)):
                    try:
                        if s1[a] != s2[b]:
                            while s1[a] == "N" and s2[b] != "N":
                                a += 1
                            while s1[a] != "N" and s2[b] == "N":
                                b += 1

                            if s1[a] != s2[b]:
                                break
                    except IndexError:
                        print(
                            "# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i"
                            % (k, x, a, b, len(s1), len(s2)))
                        break

                    a += 1
                    b += 1
                    map_a2b.addPairExplicit(a, b, 0.0)
                    # check if we have reached the end:
                else:
                    keep = True
                    nfixed += 1
                    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                    print("fix\t%s\t%s" % (k, str(f)))

                if not keep:
                    print("# warning: not fixable: %s" % k)

            if write_diff:
                options.stdout.write("---- %s ---- %s\n" % (k, status))

            if write_seqdiff:
                options.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in sorted(list(seqs2.keys())):
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    options.stdlog.write("""# Legend:
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    E.info(
        "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i"
        % (ndiff, ndiff_first, ndiff_last, ndiff_prefix, ndiff_selenocysteine,
           ndiff_masked, nfixed, ndiff - ndiff_first - ndiff_last -
           ndiff_prefix - ndiff_selenocysteine - ndiff_masked - nfixed))

    E.stop()
Example #29
def main(argv=None):

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-s", "--species", dest="species", type="string",
        help="species to use [default=%default].")

    parser.add_option(
        "-i", "--slims", dest="filename_slims", type="string",
        help="filename with GO SLIM categories "
        "[default=%default].")

    parser.add_option(
        "-g", "--genes-tsv-file", dest="filename_genes", type="string",
        help="filename with genes to analyse "
        "[default=%default].")

    parser.add_option(
        "-b", "--background-tsv-file", dest="filename_background",
        type="string",
        help="filename with background genes to analyse "
        "[default=%default].")

    parser.add_option(
        "-m", "--min-counts", dest="minimum_counts",
        type="int",
        help="minimum count - ignore all categories that have "
        "fewer than # number of genes"
        " [default=%default].")

    parser.add_option(
        "-o", "--sort-order", dest="sort_order", type="choice",
        choices=("fdr", "pvalue", "ratio"),
        help="output sort order [default=%default].")

    parser.add_option(
        "--ontology", dest="ontology", type="string",
        action="append",
        help="go ontologies to analyze. Ontologies are tested "
        "separately [default=%default].")

    parser.add_option(
        "-t", "--threshold", dest="threshold", type="float",
        help="significance threshold [>1.0 = all ]. If --fdr is set, this "
        "refers to the fdr, otherwise it is a cutoff for p-values.")

    parser.add_option(
        "--filename-dump", dest="filename_dump", type="string",
        help="dump GO category assignments into a flatfile "
        "[default=%default].")

    parser.add_option(
        "--gene2name-map-tsv-file", dest="filename_gene2name", type="string",
        help="optional filename mapping gene identifiers to gene names "
        "[default=%default].")

    parser.add_option(
        "--filename-ontology", dest="filename_ontology", type="string",
        help="filename with ontology in OBO format [default=%default].")

    parser.add_option(
        "--filename-input", dest="filename_input", type="string",
        help="read GO category assignments from a flatfile "
        "[default=%default].")

    parser.add_option(
        "--sample-size", dest="sample", type="int",
        help="do sampling (with # samples) [default=%default].")

    parser.add_option(
        "--filename-output-pattern", "--output-filename-pattern",
        dest="output_filename_pattern", type="string",
        help="pattern with output filename pattern "
        "(should contain: %(go)s and %(section)s ) [default=%default]")

    parser.add_option(
        "--fdr", dest="fdr", action="store_true",
        help="calculate and filter by FDR [default=%default].")

    parser.add_option(
        "--go2goslim", dest="go2goslim", action="store_true",
        help="convert go assignments in STDIN to goslim assignments and "
        "write to STDOUT [default=%default].")

    parser.add_option(
        "--gene-pattern", dest="gene_pattern", type="string",
        help="pattern to transform identifiers to GO gene names "
        "[default=%default].")

    parser.add_option(
        "--filename-map-slims", dest="filename_map_slims", type="string",
        help="write mapping between GO categories and GOSlims "
        "[default=%default].")

    parser.add_option(
        "--get-genes", dest="get_genes", type="string",
        help="list all genes with a certain GOID [default=%default].")

    parser.add_option(
        "--strict", dest="strict", action="store_true",
        help="require all genes in foreground to be part of background. "
        "If not set, genes in foreground will be added to the background "
        "[default=%default].")

    parser.add_option(
        "-q", "--fdr-method", dest="qvalue_method", type="choice",
        choices=("empirical", "storey", "BH"),
        help="method to perform multiple testing correction by controlling "
        "the fdr [default=%default].")

    parser.add_option(
        "--pairwise", dest="compute_pairwise", action="store_true",
        help="compute pairwise enrichment for multiple gene lists. "
        "[default=%default].")

    # parser.add_option( "--fdr-lambda", dest="qvalue_lambda", type="float",
    #                   help="fdr computation: lambda [default=%default]."  )

    # parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
    #                    choices = ("smoother", "bootstrap" ),
    # help="fdr computation: method for estimating pi0 [default=%default]."  )

    parser.set_defaults(species=None,
                        filename_genes="-",
                        filename_background=None,
                        filename_slims=None,
                        minimum_counts=0,
                        ontology=[],
                        filename_dump=None,
                        sample=0,
                        fdr=False,
                        output_filename_pattern=None,
                        threshold=0.05,
                        filename_map_slims=None,
                        gene_pattern=None,
                        sort_order="ratio",
                        get_genes=None,
                        strict=False,
                        qvalue_method="empirical",
                        pairs_min_observed_counts=3,
                        compute_pairwise=False,
                        filename_gene2name=None
                        )

    (options, args) = E.start(parser, add_database_options=True)

    if options.go2goslim:
        GO.convertGo2Goslim(options)
        E.stop()
        sys.exit(0)

    if options.fdr and options.sample == 0:
        E.warn("fdr will be computed without sampling")

    #############################################################
    # dump GO
    if options.filename_dump:
        # set the default GO ontologies
        if not options.ontology:
            options.ontology = [
                "biol_process", "mol_function", "cell_location"]

        E.info("dumping GO categories to %s" % (options.filename_dump))

        dbhandle = database.connect(url=options.database_url)

        outfile = iotools.open_file(options.filename_dump, "w", create_dir=True)
        GO.DumpGOFromDatabase(outfile,
                              dbhandle,
                              options)
        outfile.close()
        E.stop()
        sys.exit(0)

    #############################################################
    # read GO categories from file
    if options.filename_input:
        E.info("reading association of categories and genes from %s" %
               (options.filename_input))
        infile = iotools.open_file(options.filename_input)
        gene2gos, go2infos = GO.ReadGene2GOFromFile(infile)
        infile.close()

    if options.filename_gene2name:
        E.info("reading gene identifier to gene name mapping from %s" %
               options.filename_gene2name)
        infile = iotools.open_file(options.filename_gene2name)
        gene2name = iotools.read_map(infile, has_header=True)
        infile.close()
        E.info("read %i gene names for %i gene identifiers" %
               (len(set(gene2name.values())),
                len(gene2name)))
    else:
        # use identity mapping
        gene2name = dict([(x, x) for x in list(gene2gos.keys())])

    #############################################################
    # read GO ontology from file
    if options.filename_ontology:
        E.info("reading ontology from %s" % (options.filename_ontology))

        infile = iotools.open_file(options.filename_ontology)
        ontology = GO.readOntology(infile)
        infile.close()

        def _g():
            return collections.defaultdict(GO.GOInfo)
        go2infos = collections.defaultdict(_g)

        # substitute go2infos
        for go in list(ontology.values()):
            go2infos[go.mNameSpace][go.mId] = GO.GOInfo(
                go.mId,
                go_type=go.mNameSpace,
                description=go.mName)

    #############################################################
    # get foreground gene list
    input_foreground, genelists = GO.ReadGeneLists(
        options.filename_genes,
        gene_pattern=options.gene_pattern)

    E.info("read %i genes for foreground in %i gene lists" %
           (len(input_foreground), len(genelists)))

    #############################################################
    # get background
    if options.filename_background:

        # nick - bug fix: background is the first tuple element from
        # ReadGeneLists
        input_background = GO.ReadGeneLists(
            options.filename_background,
            gene_pattern=options.gene_pattern)[0]
        E.info("read %i genes for background" % len(input_background))
    else:
        input_background = None

    #############################################################
    # sort out which ontologies to test
    if not options.ontology:
        if options.filename_input:
            options.ontology = list(gene2gos.keys())

    E.info("found %i ontologies: %s" %
           (len(options.ontology), options.ontology))

    summary = []
    summary.append("\t".join((
        "genelist",
        "ontology",
        "significant",
        "threshold",
        "ngenes",
        "ncategories",
        "nmaps",
        "nforeground",
        "nforeground_mapped",
        "nbackground",
        "nbackground_mapped",
        "nsample_counts",
        "nbackground_counts",
        "psample_assignments",
        "pbackground_assignments",
        "messages")) + "\n")

    #############################################################
    # get go categories for genes
    for test_ontology in sorted(options.ontology):

        # store results for aggregate output of multiple gene lists
        all_results = []
        all_significant_results = []
        all_genelists_with_results = []

        E.info("working on ontology %s" % test_ontology)
        #############################################################
        # get/read association of GO categories to genes
        if options.filename_input:
            gene2go, go2info = gene2gos[test_ontology], go2infos[test_ontology]
        else:
            E.info("reading data from database ...")

            dbhandle.Connect(options)
            gene2go, go2info = GO.ReadGene2GOFromDatabase(
                dbhandle,
                test_ontology,
                options.database, options.species)

            E.info("finished")

        if len(go2info) == 0:
            E.warn(
                "could not find information for terms - "
                "could be mismatch between ontologies")

        ngenes, ncategories, nmaps, counts_per_category = GO.CountGO(gene2go)
        E.info("assignments found: %i genes mapped to %i categories "
               "(%i maps)" %
               (ngenes, ncategories, nmaps))

        if options.minimum_counts > 0:
            to_remove = set(
                [x for x, y in counts_per_category.items()
                 if y < options.minimum_counts])
            E.info("removing %i categories with less than %i genes" %
                   (len(to_remove), options.minimum_counts))
            GO.removeCategories(gene2go, to_remove)

            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)
            E.info("assignments after filtering: %i genes mapped "
                   "to %i categories (%i maps)" % (
                       ngenes, ncategories, nmaps))

        for genelist_name, foreground in sorted(genelists.items()):

            msgs = []
            E.info("processing %s with %i genes" %
                   (genelist_name, len(foreground)))
            ##################################################################
            ##################################################################
            ##################################################################
            # build background - reconcile with foreground
            ##################################################################
            if input_background is None:
                background = list(gene2go.keys())
            else:
                background = list(input_background)

            # nick - bug fix: background included the foreground in a tuple;
            # background is the first tuple element
            missing = foreground.difference(set(background))

            if options.strict:
                assert len(missing) == 0, \
                    "%i genes in foreground but not in background: %s" % (
                        len(missing), str(missing))
            else:
                if len(missing) != 0:
                    E.warn("%i genes in foreground that are not in "
                           "background - added to background of %i" %
                           (len(missing), len(background)))

                background.extend(missing)

            E.info("(unfiltered) foreground=%i, background=%i" %
                   (len(foreground), len(background)))

            # sort foreground and background, important for reproducibility
            # under random seed
            foreground = sorted(foreground)
            background = sorted(background)

            #############################################################
            # sanity checks:
            # are all of the foreground genes in the dataset
            # missing = set(genes).difference( set(gene2go.keys()) )
            # assert len(missing) == 0, "%i genes in foreground set without GO annotation: %s" % (len(missing), str(missing))

            #############################################################
            # read GO slims and map GO categories to GO slim categories
            if options.filename_slims:
                go_slims = GO.GetGOSlims(
                    iotools.open_file(options.filename_slims, "r"))

                if options.loglevel >= 1:
                    v = set()
                    for x in list(go_slims.values()):
                        for xx in x:
                            v.add(xx)
                    options.stdlog.write(
                        "# read go slims from %s: go=%i, slim=%i\n" %
                        (options.filename_slims,
                         len(go_slims),
                         len(v)))

                if options.filename_map_slims:
                    if options.filename_map_slims == "-":
                        outfile = options.stdout
                    else:
                        outfile = iotools.open_file(
                            options.filename_map_slims, "w")

                    outfile.write("GO\tGOSlim\n")
                    for go, go_slim in sorted(list(go_slims.items())):
                        outfile.write("%s\t%s\n" % (go, go_slim))

                    if outfile != options.stdout:
                        outfile.close()

                gene2go = GO.MapGO2Slims(gene2go, go_slims, ontology=ontology)

                if options.loglevel >= 1:
                    ngenes, ncategories, nmaps, counts_per_category = \
                        GO.CountGO(gene2go)
                    options.stdlog.write(
                        "# after go slim filtering: %i genes mapped to "
                        "%i categories (%i maps)\n" % (
                            ngenes, ncategories, nmaps))

            #############################################################
            # Just dump out the gene list
            if options.get_genes:
                fg, bg, ng = [], [], []

                for gene, vv in list(gene2go.items()):
                    for v in vv:
                        if v.mGOId == options.get_genes:
                            if gene in foreground:
                                fg.append(gene)
                            elif gene in background:
                                bg.append(gene)
                            else:
                                ng.append(gene)

                # skip to next GO class
                if not (bg or ng):
                    continue

                options.stdout.write(
                    "# genes in GO category %s\n" % options.get_genes)
                options.stdout.write("gene\tset\n")
                for x in sorted(fg):
                    options.stdout.write("%s\t%s\n" % ("fg", x))
                for x in sorted(bg):
                    options.stdout.write("%s\t%s\n" % ("bg", x))
                for x in sorted(ng):
                    options.stdout.write("%s\t%s\n" % ("ng", x))

                E.info("nfg=%i, nbg=%i, nng=%i" % (len(fg), len(bg), len(ng)))

                E.stop()
                sys.exit(0)

            #############################################################
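            # write out the reconciled foreground and background gene lists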
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='foreground',
                                     set=genelist_name)

            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(foreground))))
            if options.output_filename_pattern:
                outfile.close()

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='background',
                                     set=genelist_name)

            # Jethro bug fix - see section 'build background' for assignment
            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(background))))
            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # do the analysis
            go_results = GO.AnalyseGO(gene2go, foreground, background)

            if len(go_results.mSampleGenes) == 0:
                E.warn("%s: no genes with GO categories - analysis aborted" %
                       genelist_name)
                continue

            pairs = list(go_results.mResults.items())

            #############################################################
            # calculate fdr for each hypothesis
            if options.fdr:
                fdrs, samples, method = GO.computeFDRs(go_results,
                                                       foreground,
                                                       background,
                                                       options,
                                                       test_ontology,
                                                       gene2go,
                                                       go2info)
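                # fdrs maps each GO category id to a tuple whose first
                # element is the q-value for that category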
                for go_id, result in pairs:
                    result.mQValue = fdrs[go_id][0]
            else:
                fdrs, samples, method = {}, {}, None

            msgs.append("fdr=%s" % method)

            if options.sort_order == "fdr":
                pairs.sort(key=lambda x: x[1].mQValue)
            elif options.sort_order == "ratio":
                pairs.sort(key=lambda x: x[1].mRatio)
            elif options.sort_order == "pvalue":
                pairs.sort(key=lambda x: x[1].mPValue)

            #############################################################
            #############################################################
            #############################################################
            # output the full result
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='overall',
                                     set=genelist_name)

            GO.outputResults(
                outfile, pairs, go2info, options, fdrs=fdrs, samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            #############################################################
            #############################################################
            # filter significant results and output
            filtered_pairs = GO.selectSignificantResults(pairs, fdrs, options)

            nselected = len(filtered_pairs)
            nselected_up = len([x for x in filtered_pairs if x[1].mRatio > 1])
            nselected_down = len(
                [x for x in filtered_pairs if x[1].mRatio < 1])

            assert nselected_up + nselected_down == nselected

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='results',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             filtered_pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            #############################################################
            #############################################################
            # save results for multi-gene-list analysis
            all_results.append(pairs)
            all_significant_results.append(filtered_pairs)
            all_genelists_with_results.append(genelist_name)

            #############################################################
            #############################################################
            #############################################################
            # output parameters
            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='parameters',
                                     set=genelist_name)

            nbackground = len(background)
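            # if the background list is empty, fall back to the number of
            # background genes with GO assignments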
            if nbackground == 0:
                nbackground = len(go_results.mBackgroundGenes)

            outfile.write(
                "# input go mappings for gene list '%s' and category '%s'\n" %
                (genelist_name, test_ontology))
            outfile.write("parameter\tvalue\tdescription\n")
            outfile.write("mapped_genes\t%i\tmapped genes\n" % ngenes)
            outfile.write(
                "mapped_categories\t%i\tmapped categories\n" % ncategories)
            outfile.write("mappings\t%i\tmappings\n" % nmaps)
            outfile.write("genes_in_fg\t%i\tgenes in foreground\n" %
                          len(foreground))
            outfile.write(
                "genes_in_fg_with_assignment\t%i\tgenes in foreground with GO assignments\n" %
                (len(go_results.mSampleGenes)))
            outfile.write(
                "genes_in_bg\t%i\tinput background\n" % nbackground)
            outfile.write(
                "genes_in_bg_with_assignment\t%i\tgenes in background with GO assignments\n" % (
                    len(go_results.mBackgroundGenes)))
            outfile.write(
                "associations_in_fg\t%i\tassociations in sample\n" %
                go_results.mSampleCountsTotal)
            outfile.write(
                "associations_in_bg\t%i\tassociations in background\n" %
                go_results.mBackgroundCountsTotal)
            outfile.write(
                "percent_genes_in_fg_with_association\t%s\tpercent genes in sample with GO assignments\n" % (
                    iotools.pretty_percent(len(go_results.mSampleGenes),
                                           len(foreground), "%5.2f")))
            outfile.write(
                "percent_genes_in_bg_with_associations\t%s\tpercent genes background with GO assignments\n" % (
                    iotools.pretty_percent(len(go_results.mBackgroundGenes),
                                           nbackground, "%5.2f")))
            outfile.write(
                "significant\t%i\tsignificant results reported\n" % nselected)
            outfile.write(
                "significant_up\t%i\tsignificant up-regulated results reported\n" % nselected_up)
            outfile.write(
                "significant_down\t%i\tsignificant down-regulated results reported\n" % nselected_down)
            outfile.write(
                "threshold\t%6.4f\tsignificance threshold\n" % options.threshold)

            if options.output_filename_pattern:
                outfile.close()

            summary.append("\t".join(map(str, (
                genelist_name,
                test_ontology,
                nselected,
                options.threshold,
                ngenes,
                ncategories,
                nmaps,
                len(foreground),
                len(go_results.mSampleGenes),
                nbackground,
                len(go_results.mBackgroundGenes),
                go_results.mSampleCountsTotal,
                go_results.mBackgroundCountsTotal,
                iotools.pretty_percent(
                    len(go_results.mSampleGenes), len(foreground), "%5.2f"),
                iotools.pretty_percent(
                    len(go_results.mBackgroundGenes), nbackground, "%5.2f"),
                ",".join(msgs)))) + "\n")

            #############################################################
            #############################################################
            #############################################################
            # output the fg patterns
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='withgenes',
                                     set=genelist_name)

            GO.outputResults(outfile, pairs, go2info, options,
                             fdrs=fdrs,
                             samples=samples,
                             gene2go=gene2go,
                             foreground=foreground,
                             gene2name=gene2name)

            if options.output_filename_pattern:
                outfile.close()

        if len(genelists) > 1:

            ###################################################################
            # output various summary files
            # significant results
            GO.outputMultipleGeneListResults(all_significant_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='significant')

            # all results
            GO.outputMultipleGeneListResults(all_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='all')

            if options.compute_pairwise:
                GO.pairwiseGOEnrichment(all_results,
                                        all_genelists_with_results,
                                        test_ontology,
                                        go2info,
                                        options)

    outfile_summary = options.stdout
    outfile_summary.write("".join(summary))

    E.stop()
Example #30
0
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    # IMS: new method: extend intervals by set amount
    parser.add_argument("-m",
                        "--method",
                        dest="methods",
                        type=str,
                        action="append",
                        choices=("merge", "filter-genome", "bins", "block",
                                 "sanitize-genome", "shift", "extend",
                                 "filter-names", "rename-chr"),
                        help="method to apply")

    parser.add_argument("--num-bins",
                        dest="num_bins",
                        type=int,
                        help="number of bins into which to merge (used for "
                        "method `bins)")

    parser.add_argument("--bin-edges",
                        dest="bin_edges",
                        type=str,
                        help="bin_edges for binning method")

    parser.add_argument(
        "--binning-method",
        dest="binning_method",
        type=str,
        choices=("equal-bases", "equal-intervals", "equal-range"),
        help="method used for binning (used for method `bins` if no "
        "bin_edges is given)")

    parser.add_argument(
        "--merge-distance",
        dest="merge_distance",
        type=int,
        help="distance in bases over which to merge that are not "
        "directly adjacent")

    parser.add_argument(
        "--merge-min-intervals",
        dest="merge_min_intervals",
        type=int,
        help="only output merged intervals that are build from at least "
        "x intervals")

    parser.add_argument("--merge-by-name",
                        dest="merge_by_name",
                        action="store_true",
                        help="only merge intervals with the same name")

    parser.add_argument(
        "--merge-and-resolve-blocks",
        dest="resolve_blocks",
        action="store_true",
        help="When merging bed12 entrys, should blocks be resolved?")

    parser.add_argument("--merge-stranded",
                        dest="stranded",
                        action="store_true",
                        help="Only merge intervals on the same strand")

    parser.add_argument(
        "--remove-inconsistent-names",
        dest="remove_inconsistent_names",
        action="store_true",
        help="when merging, do not output intervals where the names of "
        "overlapping intervals do not match")

    parser.add_argument("--offset",
                        dest="offset",
                        type=int,
                        help="offset for shifting intervals")

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument("-b",
                        "--bam-file",
                        dest="bam_file",
                        type=str,
                        help="bam-formatted filename with genome.")

    parser.add_argument("--filter-names-file",
                        dest="names",
                        type=str,
                        help="list of names to keep. One per line")

    parser.add_argument(
        "--rename-chr-file",
        dest="rename_chr_file",
        type=str,
        help="mapping table between old and new chromosome names."
        "TAB separated 2-column file.")

    parser.set_defaults(methods=[],
                        merge_distance=0,
                        binning_method="equal-bases",
                        merge_by_name=False,
                        genome_file=None,
                        rename_chr_file=None,
                        bam_file=None,
                        num_bins=5,
                        merge_min_intervals=1,
                        bin_edges=None,
                        offset=10000,
                        test=None,
                        extend_distance=1000,
                        remove_inconsistent_names=False,
                        resolve_blocks=False)

    args = E.start(parser, add_pipe_options=True)

    contigs = None
    chr_map = None

    # Why provide a full indexed genome when a TSV of contig sizes would do?
    if args.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(args.genome_file)
        contigs = genome_fasta.getContigSizes()

    if args.bam_file:
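        # take contig sizes from the BAM header; references and lengths
        # are parallel lists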
        samfile = pysam.AlignmentFile(args.bam_file)
        contigs = dict(list(zip(samfile.references, samfile.lengths)))

    if args.rename_chr_file:
        chr_map = {}
        with open(args.rename_chr_file, 'r') as filein:
            reader = csv.reader(filein, delimiter='\t')
            for row in reader:
                if len(row) != 2:
                    raise ValueError(
                        "Mapping table must have exactly two columns")
                chr_map[row[0]] = row[1]
        if len(chr_map) == 0:
            raise ValueError("Empty mapping dictionary")

    processor = Bed.iterator(args.stdin)
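    # the requested methods are chained below; each wraps the previous
    # iterator, so records stream through lazily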

    for method in args.methods:
        if method == "filter-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = filterGenome(processor, contigs)
        elif method == "sanitize-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = sanitizeGenome(processor, contigs)
        elif method == "merge":
            processor = merge(
                processor,
                args.merge_distance,
                by_name=args.merge_by_name,
                min_intervals=args.merge_min_intervals,
                remove_inconsistent=args.remove_inconsistent_names,
                resolve_blocks=args.resolve_blocks,
                stranded=args.stranded)
        elif method == "bins":
            if args.bin_edges:
                bin_edges = list(map(float, args.bin_edges.split(",")))
                # IMS: check bin edges are valid
                if len(bin_edges) != args.num_bins + 1:
                    raise ValueError(
                        "Number of bin edges must be one more than "
                        "the number of bins")
            else:
                bin_edges = None
            processor, bin_edges = Bed.binIntervals(processor,
                                                    num_bins=args.num_bins,
                                                    method=args.binning_method,
                                                    bin_edges=bin_edges)
            E.info("# split bed: bin_edges=%s" % (str(bin_edges)))

        elif method == "block":
            processor = Bed.blocked_iterator(processor)
        elif method == "shift":
            # IMS: test that contig sizes are available
            if not contigs:
                raise ValueError("please supply genome file")
            processor = shiftIntervals(processor, contigs, offset=args.offset)
        # IMS: new method: extend intervals by set amount
        elif method == "extend":
            if not contigs:
                raise ValueError("please supply genome file")
            processor = extendInterval(processor, contigs, args.offset)
        elif method == "filter-names":
            if not args.names:
                raise ValueError("please supply list of names to filter")
            names = [name.strip() for name in open(args.names)]
            processor = filterNames(processor, names)
        elif method == "rename-chr":
            if not chr_map:
                raise ValueError("please supply mapping file")
            processor = renameChromosomes(processor, chr_map)

    noutput = 0
    for bed in processor:
        args.stdout.write(str(bed) + "\n")
        noutput += 1

    E.info("noutput=%i" % (noutput))

    E.stop()