Code example #1
File: wig2bed.py Project: harmeet1990/cgat-apps
def applyThreshold(infile, fasta, threshold, max_distance=0):
    '''apply a threshold to a wig file, yielding
    intervals in bed format (contig, start, end).'''

    c = E.Counter()

    for contig, size in list(
            fasta.getContigSizes(with_synonyms=False).items()):
        c.contigs += 1

        E.debug("processing %s" % contig)

        last_start, last_end = -1, 0

        for start, end, value in block_iterator(infile, contig, size):
            d = start - last_end
            if (d > 0 or value < threshold):
                if last_start >= 0:
                    yield contig, last_start, last_end
                    c.intervals += 1
                last_start = -1
            elif last_start < 0 and value >= threshold:
                last_start = start

            last_end = end

        if last_start >= 0:
            yield contig, last_start, end
            c.intervals += 1

        c.output += 1

    E.info(str(c))
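applyThreshold is a generator and writes nothing itself; a minimal driver (a sketch, with out.bed.gz as a placeholder name and infile/fasta prepared by the calling script) could stream the yielded intervals to a BED file:

# Usage sketch (assumed, not part of the project): write the yielded
# intervals in bed3 format, reusing the project's iotools helper.
with iotools.open_file("out.bed.gz", "w") as outf:
    for contig, start, end in applyThreshold(infile, fasta, threshold=1.0):
        outf.write("%s\t%i\t%i\n" % (contig, start, end))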
Code example #2
File: setup_test.py Project: harmeet1990/cgat-apps
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("script", "module"),
                      help="type of tests to create [%default].")

    parser.set_defaults(method="script")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) == 0:
        raise ValueError(
            "setup_test.py requires one or more command line arguments")

    targetdir = os.path.dirname(__file__)

    counter = E.Counter()

    for arg in args:
        counter.input += 1
        script_dirname, basename = os.path.split(arg)

        dirname = os.path.join(targetdir, basename)

        if os.path.exists(dirname):
            E.warn("%s already exists - skipping" % basename)
            counter.skipped += 1
            continue

        os.mkdir(dirname)

        with open(os.path.join(dirname, "tests.yaml"), "w") as outf:
            outf.write(YAML_TEMPLATE)

        counter.created += 1

    E.info("%s" % str(counter))

    # write footer and output benchmark information.
    E.Stop()
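As with the other cgat-apps scripts, this main is meant to be run from the command line; the conventional entry point (assumed here, as it is not part of the excerpt) is:

# Assumed entry point, following the usual cgat-apps convention.
if __name__ == "__main__":
    sys.exit(main(sys.argv))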
Code example #3
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    total_counter = E.Counter()
    table = []

    for section, map_task2runner in [("tool", map_tool_to_runner),
                                     ("metric", map_metric_to_runner),
                                     ("split", map_split_to_runner),
                                     ("collate", map_collate_to_runner)]:
        E.debug("processing section: {}".format(section))
        counter = E.Counter()

        for task, taskf in sorted(map_task2runner.items()):
            counter.ntasks += 1
            comments = []
            try:
                version = taskf().get_version()
                counter.version_ok += 1
            except Exception:
                version = ""
                comments.append("unavailable")
                counter.version_fail += 1

            comments = "; ".join(comments)
            table.append((section, task, version, comments))

        E.info("{}: {}".format(section, counter))
        total_counter += counter

    options.stdout.write("section\ttask\tversion\tcomments\n")
    for row in table:
        options.stdout.write("\t".join(map(str, row)) + "\n")

    E.info("{}: {}".format("total", counter))
    E.stop()
Code example #4
File: fastq2tsv.py Project: alphaneer/cgat-apps
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-fastq-file",
                      dest="input_fastq_file",
                      type="string",
                      help="input fastq file. "
                      "[%default]")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      action="append",
                      type="choice",
                      choices=("length", ),
                      help="methods to apply [%default]")

    parser.set_defaults(
        methods=[],
        input_fastq_file=None,
    )

    (options, args) = E.start(parser, argv)

    if len(args) == 1:
        options.input_fastq_file = args[0]

    if options.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    counter = E.Counter()

    # note: complete rewrite with Counters, currently only length
    if options.methods != ["length"]:
        raise NotImplementedError()

    with pysam.FastqFile(options.input_fastq_file) as inf:

        for read in inf:
            counter.input += 1
            options.stdout.write(
                "\t".join(map(str, (read.name, len(read.sequence)))) + "\n")

            counter.output += 1

    E.info(counter)
    E.stop()
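A hypothetical invocation, passing the fastq file as a positional argument (in.fastq.gz is a placeholder; cgatcore's E.start strips the program name from argv before parsing):

# Usage sketch: print one name/length pair per read to stdout.
main(["fastq2tsv.py", "--method=length", "in.fastq.gz"])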
Code example #5
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-n",
                      "--dry-run",
                      dest="dry_run",
                      action="store_true",
                      help="dry run, do not delete any files [%default]")

    parser.set_defaults(dry_run=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    filenames = args

    c = E.Counter()
    for filename in filenames:
        c.checked += 1
        if os.path.exists(filename + ".log"):
            if iotools.isComplete(filename + ".log"):
                c.complete += 1
                continue

        if iotools.isComplete(filename):
            c.complete += 1
            continue

        c.incomplete += 1
        E.info('deleting %s' % filename)
        if options.dry_run:
            continue
        os.unlink(filename)
        c.deleted += 1

    E.info(c)

    # write footer and output benchmark information.
    E.Stop()
Code example #6
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-i",
                        "--input-fastq-file",
                        dest="input_fastq_file",
                        type=str,
                        help="input fastq file. ")

    parser.add_argument("-m",
                        "--method",
                        dest="methods",
                        action="append",
                        type=str,
                        choices=("length", ),
                        help="methods to apply ")

    parser.set_defaults(
        methods=[],
        input_fastq_file=None,
    )

    (args, unknown) = E.start(parser, argv, unknowns=True)

    if len(unknown) == 1:
        args.input_fastq_file = unknown[0]

    if args.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    counter = E.Counter()

    # note: complete rewrite with Counters, currently only length
    if args.methods != ["length"]:
        raise NotImplementedError()

    with pysam.FastqFile(args.input_fastq_file) as inf:

        for read in inf:
            counter.input += 1
            args.stdout.write(
                "\t".join(map(str, (read.name, len(read.sequence)))) + "\n")

            counter.output += 1

    E.info(counter)
    E.stop()
Code example #7
def clean(files, logfile):
    '''clean up files given by glob expressions.

    Files are cleaned up by zapping, i.e. the files are set to size
    0. Links to files are replaced with place-holders.

    Information about the original file is written to `logfile`.

    Arguments
    ---------
    files : list
        List of glob expressions of files to clean up.
    logfile : string
        Filename of logfile.

    '''
    fields = ('st_atime', 'st_blksize', 'st_blocks', 'st_ctime', 'st_dev',
              'st_gid', 'st_ino', 'st_mode', 'st_mtime', 'st_nlink', 'st_rdev',
              'st_size', 'st_uid')

    dry_run = get_params().get("dryrun", False)

    if not dry_run:
        if not os.path.exists(logfile):
            outfile = iotools.open_file(logfile, "w")
            outfile.write("filename\tzapped\tlinkdest\t%s\n" %
                          "\t".join(fields))
        else:
            outfile = iotools.open_file(logfile, "a")

    c = E.Counter()
    for fn in files:
        c.files += 1
        if not dry_run:
            stat, linkdest = iotools.zap_file(fn)
            if stat is not None:
                c.zapped += 1
                if linkdest is not None:
                    c.links += 1
                outfile.write(
                    "%s\t%s\t%s\t%s\n" %
                    (fn, time.asctime(time.localtime(time.time())), linkdest,
                     "\t".join([str(getattr(stat, x)) for x in fields])))

    get_logger().info("zapped: %s" % (c))
    # outfile is only opened when not in dry-run mode
    if not dry_run:
        outfile.close()

    return c
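A hypothetical call, with the glob expansion done by the caller (zap.log is a placeholder name):

# Usage sketch: zap every BAM file in the working directory, logging
# the original file metadata to zap.log; returns an E.Counter.
import glob
zap_counts = clean(glob.glob("*.bam"), "zap.log")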
Code example #8
    def pair_iterator(test_vcf, truth_vcf, contig):
        counter = E.Counter()
        test_iter = test_vcf.fetch(contig)
        truth_iter = truth_vcf.fetch(contig)

        try:
            # fetch the first records inside the try block so that an
            # empty contig (StopIteration) cannot escape the generator
            test_record = next(test_iter)
            truth_record = next(truth_iter)
            while 1:
                if test_record.pos < truth_record.pos:
                    test_record = next(test_iter)
                    continue

                elif test_record.pos > truth_record.pos:
                    truth_record = next(truth_iter)
                    continue

                elif len(test_record.alts) > 1:
                    counter.skip_multiallelic_test += 1
                    test_record = next(test_iter)
                    continue

                elif len(truth_record.alts) > 1:
                    counter.skip_multiallelic_truth += 1
                    truth_record = next(truth_iter)
                    continue

                elif test_record.alts != truth_record.alts:
                    counter.skip_genotype_difference += 1
                    test_record = next(test_iter)
                    truth_record = next(truth_iter)
                    continue

                if test_record.ref != truth_record.ref:
                    # todo: deal with indels
                    raise ValueError("mismatching reference bases at position "
                                     "{}:{}".format(test_record.chrom,
                                                    test_record.pos))

                yield test_record, truth_record
                test_record = next(test_iter)
                truth_record = next(truth_iter)

        except StopIteration:
            pass

        E.debug(str(counter))
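pair_iterator is an inner function; driven directly it would look roughly like this sketch (placeholder filenames; both VCFs need a tabix index for fetch to work):

# Usage sketch: walk position-matched bi-allelic records on one contig.
test_vcf = pysam.VariantFile("test.vcf.gz")
truth_vcf = pysam.VariantFile("truth.vcf.gz")
for test_record, truth_record in pair_iterator(test_vcf, truth_vcf, "chr1"):
    print(test_record.pos, test_record.alts, truth_record.alts)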
Code example #9
def read_and_randomize_rows(infile, args):
    """read table from stdin and randomize rows, keeping header."""

    c = E.Counter()
    if args.has_headers:
        keep_header = 1
    else:
        keep_header = 0
    for x in range(keep_header):
        c.header += 1
        args.stdout.write(infile.readline())

    lines = infile.readlines()
    c.lines_input = len(lines)
    random.shuffle(lines)
    args.stdout.write("".join(lines))
    c.lines_output = len(lines)
    E.info(c)
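The function only needs has_headers and stdout on its args object, so a minimal stand-in suffices to use it outside the script (a sketch, not project code):

# Usage sketch: shuffle the data rows of a table read from stdin,
# keeping the first line in place as the header.
import argparse
import sys
ns = argparse.Namespace(has_headers=True, stdout=sys.stdout)
read_and_randomize_rows(sys.stdin, ns)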
Code example #10
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-k",
                      "--keep-header",
                      dest="keep_header",
                      type="int",
                      help="randomize, but keep header in place [%default]")

    parser.set_defaults(keep_header=0)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    inf = options.stdin
    outf = options.stdout
    c = E.Counter()
    for x in range(options.keep_header):
        c.header += 1
        outf.write(inf.readline())

    lines = inf.readlines()
    c.lines_input = len(lines)
    random.shuffle(lines)
    for line in lines:
        outf.write(line)
    c.lines_output = len(lines)

    E.info(c)

    # write footer and output benchmark information.
    E.stop()
Code example #11
def buildMRBed(infile, outfile):
    '''output bed6 file with methylated regions.

    All regions are output, even the insignificant ones.

    The score is the log fold change.
    '''

    outf = iotools.openFile(outfile, "w")
    c = E.Counter()
    for row in csv.DictReader(iotools.openFile(infile), dialect="excel-tab"):
        c.input += 1

        contig, start, end = re.match(r"(.*):(\d+)-(\d+)",
                                      row["interval_id"]).groups()
        c.output += 1
        outf.write("\t".join((contig, start, end, str(c.input),
                              row["lfold"])) + "\n")

    outf.close()

    E.info("%s" % str(c))
Code example #12
File: vcf_vs_vcf.py Project: alphaneer/cgat-apps
def read_vcf_positions_into_dataframe(filename, filters=None):

    vcf_in = pysam.VariantFile(filename)

    if filters is None:
        filters = []

    pass_filter = False
    snp_filter = False

    for f in filters:
        if f == "PASS":
            pass_filter = True
        elif f == "SNP":
            snp_filter = True

    records = []
    c = E.Counter()
    for record in vcf_in:
        c.input += 1
        f = record.filter.keys()
        if pass_filter and "PASS" not in f and "." not in f:
            c.removed_pass_filter += 1
            continue
        if snp_filter:
            is_snp = (len(record.ref) == 1 and len(record.alts) == 1
                      and len(record.alts[0]) == 1)
            if not is_snp:
                c.removed_snp_filter += 1
                continue

        c.output += 1
        records.append((record.chrom, record.pos))

    df = pandas.DataFrame.from_records(records, columns=["chrom", "pos"])

    E.info("{}: {}".format(filename, c))

    return df
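For instance, the positions of PASS bi-allelic SNPs from two call sets could be loaded and intersected (a sketch with placeholder filenames):

# Usage sketch: load filtered positions and find the shared ones.
df_test = read_vcf_positions_into_dataframe("test.vcf.gz", filters=["PASS", "SNP"])
df_truth = read_vcf_positions_into_dataframe("truth.vcf.gz", filters=["PASS", "SNP"])
shared = df_test.merge(df_truth, on=["chrom", "pos"])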
Code example #13
File: go.py Project: tw7649116/cgat-flow
def createGOFromGeneOntology(infile, outfile):
    """get GO assignments from Geneontology.org

    GO terms are mapped to ensembl gene names via uniprot identifiers.

    Configuration
    -------------
    geneontology_file
       Filename on geneontology database, e.g.,
       gene_association.goa_human.gz
    database_name
       Pipeline database name

    Arguments
    ---------
    infile : string
        Unused
    outfile : string
        Output filename
    """

    filename = os.path.join(os.path.dirname(outfile), "geneontology.goa.gz")
    if not os.path.exists(filename):
        statement = '''
        wget -O %(filename)s http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/%(go_geneontology_file)s?rev=HEAD
        '''

        P.run(statement)

    # see http://www.geneontology.org/gene-associations/readme/goa.README
    Data = collections.namedtuple(
        "Data",
        "db db_object_id db_object_symbol qualifier goid dbreference evidence "
        " with_id aspect "
        " db_object_name synonym db_object_type "
        " taxon_id date assigned_by "
        " annotation_extension"
        " gene_product_form_id")

    dbh = sqlite3.connect(PARAMS["database_name"])
    cc = dbh.cursor()
    map_uniprot2ensembl = dict(
        cc.execute("SELECT DISTINCT gene_name, gene_id FROM transcript_info").
        fetchall())
    map_goid2description = dict(
        cc.execute("SELECT DISTINCT go_id, description FROM go_assignments").
        fetchall())

    aspect2name = {
        "P": "biol_process",
        "F": "mol_function",
        "C": "cell_location"
    }

    c = E.Counter()
    found_uniprot, found_genes, notfound_uniprot = set(), set(), set()
    outf = iotools.open_file(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")
    for line in iotools.open_file(filename):
        if line.startswith("!"):
            continue
        c.input += 1
        data = Data._make(line[:-1].split("\t"))

        if data.db_object_symbol in map_uniprot2ensembl:
            gene_id = map_uniprot2ensembl[data.db_object_symbol]
            found_uniprot.add(data.db_object_symbol)
            found_genes.add(gene_id)
            outf.write(
                "%s\t%s\t%s\t%s\t%s\n" %
                (aspect2name[data.aspect], gene_id, data.goid,
                 map_goid2description.get(data.goid, ""), data.evidence))
            c.output += 1

        else:
            c.notfound += 1
            notfound_uniprot.add(data.db_object_symbol)

    c.found_genes = len(found_genes)
    c.found_uniprot = len(found_uniprot)
    c.notfound_uniprot = len(notfound_uniprot)

    E.info("%s" % str(c))
    E.info("not found=%s" % str(notfound_uniprot))
    outf.close()
Code example #14
File: go.py Project: tw7649116/cgat-flow
def imputeGO(infile_go, infile_paths, outfile):
    """impute GO accessions.

    Output a list of gene-to-GO associations for genes that includes
    ancestral terms.

    Arguments
    ---------
    infile_go : string
        Filename with gene-to-GO associations for genes
    infile_paths : string
        Filename with paths of term to ancestor (see go2fmt.pl).
    outfile : string
         Output filename

    """

    c = E.Counter()

    term2ancestors = collections.defaultdict(set)
    with iotools.open_file(infile_paths) as inf:
        for line in inf:
            parts = line[:-1].split()
            term = parts[0]
            ancestors = [parts[x] for x in range(2, len(parts), 2)]
            # there can be multiple paths
            term2ancestors[term].update(ancestors)

    goid2description = {}
    gene2goids = collections.defaultdict(list)
    goid2type = {}
    with iotools.open_file(infile_go) as inf:
        for line in inf:
            if line.startswith("go_type"):
                continue
            go_type, gene_id, goid, description, evidence = line[:-1].split(
                "\t")
            gene2goids[gene_id].append(goid)
            goid2description[goid] = description
            goid2type[goid] = go_type

    outf = iotools.open_file(outfile, "w ")
    for gene_id, in_goids in gene2goids.items():
        c.genes += 1
        out_goids = set(in_goids)
        for goid in in_goids:
            out_goids.update(term2ancestors[goid])
        if len(in_goids) != len(out_goids):
            c.increased += 1
        else:
            c.complete += 1

        for goid in out_goids:
            outf.write("\t".join((goid2type.get(goid, ""), gene_id, goid,
                                  goid2description.get(goid, ""), "NA")) +
                       "\n")
            c.associations += 1

    outf.close()

    E.info("%s" % str(c))
Code example #15
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-t",
                        "--template-bam-file",
                        dest="filename_genome_bam",
                        type=str,
                        help="input bam file for header information ")

    parser.add_argument("-s",
                        "--contigs-tsv-file",
                        dest="filename_contigs",
                        type=str,
                        help="filename with contig sizes ")

    parser.add_argument(
        "-o",
        "--colour",
        dest="colour_mismatches",
        action="store_true",
        help="mismatches will use colour differences (CM tag) ")

    parser.add_argument("-i",
                        "--ignore-mismatches",
                        dest="ignore_mismatches",
                        action="store_true",
                        help="ignore mismatches ")

    parser.add_argument("-c",
                        "--remove-contigs",
                        dest="remove_contigs",
                        type=str,
                        help="','-separated list of contigs to remove ")

    parser.add_argument("-f",
                        "--force-output",
                        dest="force",
                        action="store_true",
                        help="force overwriting of existing files ")

    parser.add_argument("-u",
                        "--unique",
                        dest="unique",
                        action="store_true",
                        help="remove reads not matching uniquely ")

    parser.set_defaults(
        filename_genome_bam=None,
        filename_gtf=None,
        filename_mismapped=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
    )

    # add common options (-h/--help, ...) and parse command line
    args = E.start(parser, argv=argv)

    genomefile, referencenames, referencelengths = None, None, None

    if args.filename_genome_bam:
        genomefile = pysam.AlignmentFile(args.filename_genome_bam, "rb")
    elif args.filename_contigs:
        contigs = iotools.ReadMap(iotools.open_file(args.filename_contigs))
        data = list(zip(*list(contigs.items())))
        referencenames, referencelengths = data[0], list(map(int, data[1]))
    else:
        raise ValueError(
            "please provide either --template-bam-file or --contigs-tsv-file")

    infile = pysam.AlignmentFile("-", "rb")
    outfile = pysam.AlignmentFile("-",
                                  "wb",
                                  template=genomefile,
                                  referencenames=referencenames,
                                  referencelengths=referencelengths)

    if args.colour_mismatches:
        tag = "CM"
    else:
        tag = "NM"

    nambiguous = 0
    ninput = 0
    nunmapped = 0
    ncigar = 0
    nfull = 0
    noutput = 0

    contig2tid = dict([(y, x) for x, y in enumerate(outfile.references)])

    for qname, readgroup in itertools.groupby(infile, lambda x: x.qname):
        ninput += 1
        reads = list(readgroup)
        if reads[0].is_unmapped:
            nunmapped += 1
            continue

        # filter for best match
        best = min([x.opt(tag) for x in reads])
        reads = [x for x in reads if x.opt(tag) == best]
        if len(reads) > 1:
            nambiguous += 1
            continue

        read = reads[0]

        # reject complicated matches (indels, etc)
        # to simplify calculations below.
        if len(read.cigar) > 1:
            ncigar += 1
            continue

        # set NH flag to latest count
        t = dict(read.tags)
        t['NH'] = 1
        read.tags = list(t.items())

        sname = infile.getrname(read.tid)

        contig, first_exon_start, middle, last_exon_end, splice, strand = sname.split(
            "|")
        first_exon_end, last_exon_start = middle.split("-")
        first_exon_start, first_exon_end, last_exon_start, last_exon_end = list(
            map(int, (first_exon_start, first_exon_end, last_exon_start,
                      last_exon_end)))
        first_exon_end += 1

        total = first_exon_end - first_exon_start + \
            last_exon_end - last_exon_start
        first_exon_length = first_exon_end - first_exon_start

        match1 = first_exon_length - read.pos
        intron_length = last_exon_start - first_exon_end
        match2 = read.qlen - match1

        # match lies fully in one exon - ignore
        if match1 <= 0 or match2 <= 0:
            nfull += 1
            continue

        # increment pos
        read.pos = first_exon_start + read.pos
        read.tid = contig2tid[contig]
        # 3 = BAM_CREF_SKIP
        read.cigar = [(0, match1), (3, intron_length), (0, match2)]

        outfile.write(read)

        noutput += 1

    outfile.close()
    if genomefile:
        genomefile.close()

    c = E.Counter()
    c.input = ninput
    c.output = noutput
    c.full = nfull
    c.cigar = ncigar
    c.ambiguous = nambiguous
    c.unmapped = nunmapped

    E.info("%s" % str(c))

    # write footer and output benchmark information.
    E.stop()
Code example #16
File: gtf2table.py Project: harmeet1990/cgat-apps
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument("-q",
                        "--quality-file",
                        dest="quality_file",
                        type=str,
                        help="filename with genomic base quality "
                        "information.")

    parser.add_argument("-b",
                        "--bam-file",
                        dest="bam_files",
                        type=str,
                        metavar="bam",
                        help="filename with read mapping information. "
                        "Multiple files can be submitted in a "
                        "comma-separated list.")

    parser.add_argument("-i",
                        "--bigwig-file",
                        dest="bigwig_file",
                        type=str,
                        metavar="bigwig",
                        help="filename with bigwig information ")

    parser.add_argument("-f",
                        "--gff-file",
                        dest="filename_gff",
                        type=str,
                        action="append",
                        metavar='bed',
                        help="filename with extra gff files. The order "
                        "is important.")

    parser.add_argument("--filename-format",
                        dest="filename_format",
                        type=str,
                        choices=("bed", "gff", "gtf"),
                        help="format of secondary stream.")

    parser.add_argument("--restrict-source",
                        dest="gff_sources",
                        type=str,
                        action="append",
                        help="restrict input to this 'source' in extra "
                        "gff file (for counter: overlap).")

    parser.add_argument("--restrict-feature",
                        dest="gff_features",
                        type=str,
                        action="append",
                        help="restrict input to this 'feature' in extra gff "
                        "file (for counter: overlap).")

    parser.add_argument("-r",
                        "--reporter",
                        dest="reporter",
                        type=str,
                        choices=("genes", "transcripts"),
                        help="report results for 'genes' or 'transcripts' ")

    parser.add_argument("-s",
                        "--section",
                        dest="sections",
                        type=str,
                        action="append",
                        choices=("exons", "introns"),
                        help="select range on which counters will operate ")

    parser.add_argument(
        "-c",
        "--counter",
        dest="counters",
        type=str,
        action="append",
        choices=("bigwig-counts", "binding-pattern", "classifier",
                 "classifier-rnaseq", "classifier-rnaseq-splicing",
                 "classifier-polii", "composition-na", "composition-cpg",
                 "coverage", "distance", "distance-genes", "distance-tss",
                 "length", 'neighbours', "overlap", "overlap-stranded",
                 "overlap-transcripts", "overrun", "position", "proximity",
                 "proximity-exclusive", "proximity-lengthmatched", "quality",
                 "read-coverage", "read-extension", "read-overlap",
                 "read-counts", "read-fullcounts", "readpair-counts",
                 "readpair-fullcounts", "splice", "splice-comparison",
                 "territories"),
        help="select counters to apply to input ")

    parser.add_argument("--add-gtf-source",
                        dest="add_gtf_source",
                        action="store_true",
                        help="add gtf field of source to output ")

    parser.add_argument("--proximal-distance",
                        dest="proximal_distance",
                        type=int,
                        help="distance to be considered proximal to "
                        "an interval.")

    parser.add_argument("--multi-mapping-method",
                        dest="multi_mapping",
                        type=str,
                        choices=('all', 'ignore', 'weight'),
                        help="how to treat multi-mapping reads in "
                        "bam-files. Requires "
                        "the NH flag to be set by the mapper ")

    parser.add_argument("--use-barcodes",
                        dest="use_barcodes",
                        action="store_true",
                        help="Use barcodes to count unique umi's. "
                        "UMI's are specified in the read identifier "
                        "as the last field, where fields are separated "
                        "by underscores, e.g. "
                        "@READ:ILLUMINA:STUFF_NAMINGSTUFF_UMI. "
                        "When true, unique counts are returned. "
                        "Currently only compatible with count-reads")

    parser.add_argument("--sample-probability",
                        dest="sample_probability",
                        type=float,
                        help="Specify the probability of whether any"
                        "given read or read pair in a file bam is counted"
                        "Currently only compatible with count-reads")

    parser.add_argument("--column-prefix",
                        dest="prefixes",
                        type=str,
                        action="append",
                        help="add prefix to column headers - prefixes "
                        "are used in the same order as the counters ")

    parser.add_argument("--library-type",
                        dest="library_type",
                        type=str,
                        choices=("unstranded", "firststrand", "secondstrand",
                                 "fr-unstranded", "fr-firststrand",
                                 "fr-secondstrand"),
                        help="library type of reads in bam file. ")

    parser.add_argument("--min-mapping-quality",
                        dest="minimum_mapping_quality",
                        type=float,
                        help="minimum mapping quality. Reads with a quality "
                        "score of less will be ignored. ")

    parser.set_defaults(genome_file=None,
                        reporter="genes",
                        with_values=True,
                        sections=[],
                        counters=[],
                        filename_gff=[],
                        filename_format=None,
                        gff_features=[],
                        gff_sources=[],
                        add_gtf_source=False,
                        proximal_distance=10000,
                        bam_files=None,
                        multi_mapping='all',
                        library_type='fr-unstranded',
                        prefixes=[],
                        minimum_mapping_quality=0,
                        use_barcodes=False,
                        sample_probability=1.0)

    if not argv:
        argv = sys.argv

    args = E.start(parser, add_output_options=True, argv=argv)

    if args.prefixes:
        if len(args.prefixes) != len(args.counters):
            raise ValueError("if any prefix is given, the number of prefixes "
                             "must be the same as the number of counters")

    # get files
    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        fasta = None

    if args.quality_file:
        quality = IndexedFasta.IndexedFasta(args.quality_file)
        quality.setTranslator(IndexedFasta.TranslatorBytes())
    else:
        quality = None

    if args.bam_files:
        bam_files = []
        for bamfile in args.bam_files.split(","):
            bam_files.append(pysam.AlignmentFile(bamfile, "rb"))
    else:
        bam_files = None

    if args.bigwig_file:
        bigwig_file = pyBigWig.open(args.bigwig_file)
    else:
        bigwig_file = None

    counters = []

    if not args.sections:
        E.info("counters will use the default section (exons)")
        args.sections.append(None)

    if not args.gff_sources:
        args.gff_sources.append(None)
    if not args.gff_features:
        args.gff_features.append(None)

    cc = E.Counter()

    for n, c in enumerate(args.counters):
        if args.prefixes:
            prefix = args.prefixes[n]
        else:
            prefix = None

        if c == "position":
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterPosition(section=section,
                                                      options=args,
                                                      prefix=prefix))
        elif c == "length":
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterLengths(section=section,
                                                     options=args,
                                                     prefix=prefix))
        elif c == "splice":
            if fasta is None:
                raise ValueError('splice requires a genomic sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSites(fasta=fasta,
                                                     prefix=prefix))
        elif c == "quality":
            if fasta is None:
                raise ValueError('quality requires a quality score sequence')
            counters.append(
                GeneModelAnalysis.CounterQuality(fasta=quality, prefix=prefix))
        elif c == "overrun":
            counters.append(
                GeneModelAnalysis.CounterOverrun(
                    filename_gff=args.filename_gff,
                    options=args,
                    prefix=prefix))
        elif c == "read-coverage":
            counters.append(
                GeneModelAnalysis.CounterReadCoverage(bam_files,
                                                      options=args,
                                                      prefix=prefix))
        elif c == "read-extension":
            counters.append(
                GeneModelAnalysis.CounterReadExtension(
                    bam_files,
                    filename_gff=args.filename_gff,
                    options=args,
                    prefix=prefix))
        elif c == "read-overlap":
            counters.append(
                GeneModelAnalysis.CounterReadOverlap(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "read-counts":
            counters.append(
                GeneModelAnalysis.CounterReadCounts(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    use_barcodes=args.use_barcodes,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "read-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadCountsFull(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "readpair-counts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCounts(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    library_type=args.library_type,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "readpair-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCountsFull(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "bigwig-counts":
            counters.append(
                GeneModelAnalysis.CounterBigwigCounts(bigwig_file,
                                                      options=args,
                                                      prefix=prefix))
        elif c == "splice-comparison":
            if fasta is None:
                raise ValueError('splice-comparison requires a genomic '
                                 'sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSiteComparison(
                    fasta=fasta,
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    options=args,
                    prefix=prefix))
        elif c == "composition-na":
            if fasta is None:
                raise ValueError('composition-na requires a genomic sequence')
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionNucleotides(
                        fasta=fasta,
                        section=section,
                        options=args,
                        prefix=prefix))
        elif c == "composition-cpg":
            if fasta is None:
                raise ValueError('composition-cpg requires a genomic sequence')
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionCpG(fasta=fasta,
                                                            section=section,
                                                            options=args,
                                                            prefix=prefix))

        elif c in ("overlap", "overlap-stranded", "overlap-transcripts",
                   "proximity", "proximity-exclusive",
                   "proximity-lengthmatched", "neighbours", "territories",
                   "distance", "distance-genes", "distance-tss",
                   "binding-pattern", "coverage"):
            if c == "overlap":
                template = GeneModelAnalysis.CounterOverlap
            if c == "overlap-stranded":
                template = GeneModelAnalysis.CounterOverlapStranded
            elif c == "overlap-transcripts":
                template = GeneModelAnalysis.CounterOverlapTranscripts
            elif c == "proximity":
                template = GeneModelAnalysis.CounterProximity
            elif c == "neighbours":
                template = GeneModelAnalysis.CounterNeighbours
            elif c == "proximity-exclusive":
                template = GeneModelAnalysis.CounterProximityExclusive
            elif c == "proximity-lengthmatched":
                template = GeneModelAnalysis.CounterProximityLengthMatched
            elif c == "territories":
                template = GeneModelAnalysis.CounterTerritories
            elif c == "distance":
                template = GeneModelAnalysis.CounterDistance
            elif c == "distance-genes":
                template = GeneModelAnalysis.CounterDistanceGenes
            elif c == "distance-tss":
                template = GeneModelAnalysis.CounterDistanceTranscriptionStartSites
            elif c == "coverage":
                template = GeneModelAnalysis.CounterCoverage
            elif c == "binding-pattern":
                template = GeneModelAnalysis.CounterBindingPattern

            for section in args.sections:
                for source in args.gff_sources:
                    for feature in args.gff_features:
                        counters.append(
                            template(filename_gff=args.filename_gff,
                                     feature=feature,
                                     source=source,
                                     fasta=fasta,
                                     section=section,
                                     options=args,
                                     prefix=prefix))

        elif c == "classifier":
            counters.append(
                GeneModelAnalysis.Classifier(filename_gff=args.filename_gff,
                                             fasta=fasta,
                                             options=args,
                                             prefix=prefix))

        elif c == "classifier-rnaseq":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeq(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "classifier-rnaseq-splicing":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeqSplicing(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "classifier-polii":
            counters.append(
                GeneModelAnalysis.ClassifierPolII(
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "binding-pattern":
            counters.append(
                GeneModelAnalysis.CounterBindingPattern(
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))

    if args.reporter == "genes":
        iterator = GTF.flat_gene_iterator
        header = ["gene_id"]
        fheader = lambda x: [x[0].gene_id]
    elif args.reporter == "transcripts":
        iterator = GTF.transcript_iterator
        header = ["transcript_id"]
        fheader = lambda x: [x[0].transcript_id]

    if args.add_gtf_source:
        header.append("source")
        ffields = lambda x: [x[0].source]
    else:
        ffields = lambda x: []

    args.stdout.write("\t".join(header + [x.getHeader()
                                          for x in counters]) + "\n")

    for gffs in iterator(GTF.iterator(args.stdin)):
        cc.input += 1

        for counter in counters:
            counter.update(gffs)

        skip = len([x for x in counters if x.skip]) == len(counters)
        if skip:
            cc.skipped += 1
            continue

        args.stdout.write("\t".join(
            fheader(gffs) + ffields(gffs) +
            [str(counter) for counter in counters]) + "\n")

        cc.output += 1

    E.info("%s" % str(cc))
    for counter in counters:
        E.info("%s\t%s" % (repr(counter), str(counter.counter)))
    E.stop()
Code example #17
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--input-bam", dest="input_bam_file", type="string",
        help="input bam file")

    parser.add_option(
        "-f", "--reference-bam", dest="reference_bam_file", type="string",
        help="reference BAM file [%default]")

    parser.add_option(
        "-q", "--query-name-regex", dest="query_name_regex", type="string",
        help="regular expression to apply on query name. "
        "Potentially required to match samtools sort order and should "
        "evaluate to an integer [%default]")

    parser.set_defaults(
        input_bam_file=None,
        reference_bam_file=None,
        query_name_regex=None,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) == 2:
        options.input_bam_file = args[0]
        options.reference_bam_file = args[1]

    if options.input_bam_file is None:
        raise ValueError("please supply a BAM file as input")

    if options.reference_bam_file is None:
        raise ValueError("please supply a BAM file as reference")

    # update paths to absolute
    options.input_bam_file = os.path.abspath(options.input_bam_file)
    options.reference_bam_file = os.path.abspath(options.reference_bam_file)

    if not os.path.exists(options.input_bam_file):
        raise OSError("input bam file {} does not exist".format(
            options.input_bam_file))

    if not os.path.exists(options.reference_bam_file):
        raise OSError("reference bam file {} does not exist".format(
            options.reference_bam_file))

    bam_in = pysam.AlignmentFile(options.input_bam_file)
    ref_in = pysam.AlignmentFile(options.reference_bam_file)

    outf_mapped = E.open_output_file("mapped")
    outf_mapped.write("\t".join(
        ["read",
         "length",
         "status",
         "overlap",
         "comp_contig",
         "comp_start",
         "comp_end",
         "ref_contig",
         "ref_start",
         "ref_end",
         "shared_misaligned",
         "shared_aligned",
         "shared_insertion",
         "shared_deletion",
         "comp_aligned",
         "comp_insertion",
         "comp_deletion",
         "ref_aligned",
         "ref_insertion",
         "ref_deletion"]) + "\n")

    outf_missing = E.open_output_file("missing")
    outf_missing.write("\t".join(
        ["read", "length", "status", "aligned",
         "insertion", "deletion"]) + "\n")

    counter = E.Counter()

    if options.query_name_regex:
        rx = re.compile(options.query_name_regex)

    def extract_query(x):
        return int(rx.search(x).groups()[0])

    qname_fn = None
    if options.query_name_regex:
        qname_fn = extract_query

    for reads_cmp, read_ref in group_pairs(iterate_read_pairs(
            bam_in.fetch(until_eof=True),
            ref_in.fetch(until_eof=True),
            qname_fn=qname_fn)):

        if len(reads_cmp) == 0:
            counter.missing += 1
            pairs_ref = set(read_ref.get_aligned_pairs())
            outf_missing.write("\t".join(
                map(str, (
                    read_ref.query_name,
                    read_ref.query_length,
                    "missing") +
                    count_pairs(pairs_ref))) + "\n")
            continue

        if len(reads_cmp) > 1:
            # multiple matches
            counter.multi_mapping += 1
            prefix = "multi_"
        else:
            counter.unique_mapping += 1
            prefix = "unique_"

        is_mapped = False
        for read_cmp in reads_cmp:

            counter.paired += 1

            if read_cmp.is_unmapped:
                counter.unmapped += 1
                pairs_ref = set(read_ref.get_aligned_pairs())
                outf_missing.write("\t".join(
                    map(str, (
                        read_ref.query_name,
                        read_ref.query_length,
                        "unmapped") +
                        count_pairs(pairs_ref))) + "\n")
                continue

            overlap = max(0, (min(read_cmp.reference_end,
                                  read_ref.reference_end) -
                              max(read_cmp.reference_start,
                                  read_ref.reference_start)))

            pairs_cmp = set(read_cmp.get_aligned_pairs())
            pairs_ref = set(read_ref.get_aligned_pairs())
            shared_cmp = pairs_cmp.intersection(pairs_ref)
            unique_cmp = pairs_cmp.difference(pairs_ref)
            misaligned = len([x for x, y in unique_cmp
                              if x is not None and y is not None])

            if read_cmp.reference_name != read_ref.reference_name or \
               overlap == 0:
                status = "mismapped"
            else:
                counter.overlap += 1
                status = "mapped"
                is_mapped = True

            outf_mapped.write("\t".join(
                map(str, (read_cmp.query_name,
                          read_cmp.query_length,
                          prefix + status,
                          overlap,
                          read_cmp.reference_name,
                          read_cmp.reference_start,
                          read_cmp.reference_end,
                          read_ref.reference_name,
                          read_ref.reference_start,
                          read_ref.reference_end,
                          misaligned) +
                    count_pairs(shared_cmp) +
                    count_pairs(pairs_cmp) +
                    count_pairs(pairs_ref))) + "\n")
        # no break occurs in the loop above, so summarize every read group
        if is_mapped:
            status = "mapped"
        else:
            status = "mismapped"

        counter[prefix + status] += 1

    with E.open_output_file("summary") as outf:
        outf.write("category\tcounts\n")
        outf.write(counter.asTable() + "\n")

    E.stop()
Code example #18
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "--input-filename-fasta", dest="input_filename_fasta", type=str,
        help="filename with reference sequence in fasta format ")

    parser.add_argument(
        "--input-filename-bam", dest="input_filename_bam", type=str,
        help="filename with aligned reads ")

    parser.add_argument(
        "--method", dest="methods", type=str, action="append",
        choices=["add-strelka-genotype",
                 "lift-over"],
        help="methods to apply ")

    parser.add_argument(
        "--input-filename-chain", dest="input_filename_chain", type=str,
        help="filename with alignment chain for lift-over ")

    parser.add_argument(
        "--normal-sample-regex", dest="normal_sample_regex", type=str,
        help="regular expression to apply to header to identify normal "
        "sample id ")

    parser.add_argument(
        "--output-filename-unmapped", dest="output_filename_unmapped", type=str,
        help="filename with variants that could not be lifted over ")

    parser.set_defaults(
        input_filename_fasta=None,
        input_filename_bam=None,
        input_filename_vcf="-",
        sample_size=0.001,
        region_size=20,
        methods=[],
        normal_sample_regex=None,
        input_filename_chain=None,
        output_filename_unmapped=None,
    )

    (args, unknown) = E.start(parser,
                              argv=argv,
                              add_output_options=True,
                              unknowns=True)

    if len(unknown) > 0:
        args.input_filename_vcf = unknown[0]

    vcf_in = pysam.VariantFile(args.input_filename_vcf)

    if "lift-over" in args.methods:
        if args.input_filename_chain is None:
            raise ValueError("--method=lift-over requires --input-filename-chain")
        if not os.path.exists(args.input_filename_chain):
            raise OSError("file {} with chain data does not exist".format(
                args.input_filename_chain))
        E.info("reading chain from {}".format(args.input_filename_chain))
        with iotools.open_file(args.input_filename_chain) as inf:
            map_chain, map_contig2length = read_liftover_chain(inf)

    if args.input_filename_fasta:
        fasta = pysam.FastaFile(args.input_filename_fasta)
    else:
        fasta = None

    if args.input_filename_bam:
        bam = pysam.AlignmentFile(args.input_filename_bam)
    else:
        bam = None

    outf = args.stdout

    c = E.Counter()

    if "add-strelka-genotype" in args.methods:
        map_nt2gt = {"ref": "0/0",
                     "het": "0/1",
                     "hom": "1/1",
                     "conflict": "."}

        map_tumour2gt = {"ref": "0/0",
                         "het": "0/1",
                         "hom": "1/1"}

        header = str(vcf_in.header).splitlines()

        header.insert(
            len(header) - 1,
            '##FORMAT=<ID=GT,Number=1,Type=String,Description='
            '"Genotypes of reference and alternative alleles, '
            'added by cgatcore vcf2vcf.">')

        header = "\n".join(header)
        if args.normal_sample_regex:
            normal_sample = re.search(r" -bam-file \S+/([^/]+)_S\d+.bam", header).groups()[0]
        else:
            normal_sample = "NORMAL"

        is_first = True

        for record in vcf_in:
            c.input += 1

            if "GT" in record.format:
                if is_first:
                    outf.write(header + "\n")
                    is_first = False
                outf.write(str(record))
                c.has_gt += 1
                continue

            gt_normal = map_nt2gt[record.info["NT"]]
            gt_tumour = record.info["SGT"]
            norm, tumour = gt_tumour.split("->")
            if gt_tumour[0] in "ACGT":
                alts = record.alts
                if alts is None:
                    c.no_alt += 1
                    continue

                if len(record.alts) > 1:
                    c.multi_allelic += 1
                    continue

                _map_tumour2gt = {
                    record.alts[0]: "1",
                    record.ref: "0"}
                try:
                    gt_tumour = "/".join(
                        sorted([_map_tumour2gt[x] for x in tumour]))
                except KeyError:
                    gt_tumour = "."
                    c.ambiguous_genotype += 1
            else:
                gt_tumour = map_tumour2gt[tumour]

            fields = str(record)[:-1].split("\t")
            # FORMAT
            fields[8] = ":".join(("GT", fields[8]))
            # SAMPLES
            # makes a few assumptions, fix!
            header_insert_normal = False
            if len(fields) == 11:
                fields[9] = ":".join((gt_normal, fields[9]))
                fields[10] = ":".join((gt_tumour, fields[10]))
            elif len(fields) == 10:
                header_insert_normal = True
                values = fields[9].split(":")
                fields.append(":".join((gt_tumour, fields[9])))
                fields[9] = ":".join([gt_normal] + ["."] * len(values))
            else:
                raise NotImplementedError()

            if is_first:
                if not header_insert_normal:
                    outf.write(header + "\n")
                else:
                    header = re.sub(r"\tFORMAT\t",
                                    "\tFORMAT\t%s\t" % normal_sample, header)
                    outf.write(header + "\n")
                is_first = False
            outf.write("\t".join(fields) + "\n")
            c.output += 1

    elif "lift-over" in args.methods:
        header = str(vcf_in.header).splitlines()

        if fasta:
            # validate contig size
            expected_lengths = dict(list(zip(fasta.references, fasta.lengths)))
        else:
            expected_lengths = map_contig2length

        # update contig names and sizes in VCF header
        header = [x for x in header if not x.startswith("##contig")]
        header[-1:-1] = ["##contig=<ID={},length={}>".format(
            contig, length) for contig, length in sorted(expected_lengths.items())]

        header.insert(
            len(header) - 1,
            '##liftover=<CHAIN={},REFERENCE={}>'.format(
                args.input_filename_chain,
                args.input_filename_fasta))
        outf.write("\n".join(header) + "\n")

        unmapped_contigs = set()
        unknown_contigs = set()

        trans_genotypes = str.maketrans("01", "10")

        if fasta:
            # validate contig size
            expected_lengths = dict(list(zip(fasta.references, fasta.lengths)))
            for contig, length in list(map_contig2length.items()):
                if contig in expected_lengths:
                    if length != expected_lengths[contig]:
                        raise ValueError(
                            "contig lengths mismatch: for contig {} the chain "
                            "file says {}, but the fasta file says {}".format(
                                contig, length, expected_lengths[contig]))
            E.info("contig sizes in chain file and fasta files correspond.")

        if args.output_filename_unmapped:
            outfile_unmapped = iotools.open_file(args.output_filename_unmapped, "w")
            outfile_unmapped.write("\n".join(header) + "\n")
        else:
            outfile_unmapped = None

        for record in vcf_in:
            c.input += 1

            try:
                mm = map_chain[record.contig]
            except KeyError:
                c.skipped_unmapped_contig += 1
                unmapped_contigs.add(record.contig)
                if outfile_unmapped:
                    outfile_unmapped.write("skipped_unmapped_contig\t{}".format(str(record)))
                continue

            try:
                m = mm.search(record.start, record.stop)
            except AttributeError:
                c.skipped_mapping_error += 1
                if outfile_unmapped:
                    outfile_unmapped.write("skipped_mapping_error\t{}".format(str(record)))
                continue

            if len(m) == 0:
                c.skipped_unmapped_position += 1
                if outfile_unmapped:
                    outfile_unmapped.write("skipped_unmapped_position\t{}".format(str(record)))
                continue
            elif len(m) > 1:
                c.skipped_multimapping_position += 1
                if outfile_unmapped:
                    outfile_unmapped.write("skipped_multimapping_position\t{}".format(str(record)))
                continue

            m = m[0]
            y_contig, y_start, y_end, y_invert = m.data

            if y_invert:
                y_pos = y_end - (record.start - m.start)
            else:
                y_pos = (record.start - m.start) + y_start

            ref_base = None
            if fasta:
                try:
                    ref_base = fasta.fetch(y_contig, y_pos, y_pos + len(record.ref)).upper()
                except KeyError:
                    c.skipped_unknown_contig += 1
                    unknown_contigs.add(y_contig)
                    ref_base = None
                    continue

            swap_alleles = False
            if ref_base:
                error = False
                if ref_base == record.ref:
                    c.matches += 1
                else:
                    if len(record.alts) == 1:
                        alt_base = record.alts[0]
                        if ref_base == alt_base:
                            swap_alleles = True
                            c.allele_swap_variant += 1
                        else:
                            c.error_mismatch_variant += 1
                            error = "mismatch"
                    else:
                        error = "multi-mismatch"
                        c.error_multi_mismatch_variant += 1

                if error:
                    if outfile_unmapped:
                        outfile_unmapped.write("{}\t{}".format(error, str(record)))
                    c.skipped_error_variant += 1
                    continue

            fields = str(record)[:-1].split("\t")
            fields[0] = y_contig
            fields[1] = str(y_pos)

            if swap_alleles:
                # swap REF and ALT (fields[3] and fields[4]); the lifted
                # reference base equals the old ALT allele
                old_ref = fields[3]
                fields[3] = alt_base
                fields[4] = old_ref
                # update genotype fields
                keep = False
                for idx in range(9, len(fields)):
                    gt, rest = fields[idx].split(":", 1)
                    keep = keep or "0" in gt
                    fields[idx] = ":".join((gt.translate(trans_genotypes), rest))

                # remove reference-only calls
                if not keep:
                    if outfile_unmapped:
                        outfile_unmapped.write("reference_call\t{}".format(str(record)))
                    c.skipped_allele_swap_reference += 1
                    continue

            c.output += 1
            outf.write("\t".join(fields) + "\n")

        c.unmapped_contigs = len(unmapped_contigs)
        c.unknown_contigs = len(unknown_contigs)

        E.info(c.asTable())
        if unknown_contigs:
            E.info("unknown contigs: {}".format(",".join(sorted(unknown_contigs))))
        if unmapped_contigs:
            E.info("unmapped contigs: {}".format(",".join(sorted(unmapped_contigs))))

    E.stop()
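
The position arithmetic in the lift-over branch is easy to get wrong, so here is a toy check of the forward and inverted mappings used above (made-up interval bounds, no chain file involved):

# Toy check of the chain-interval position mapping used above.
m_start = 1000                 # start of the matched interval (source)
y_start, y_end = 5000, 5400    # target interval bounds (m.data)
record_start = 1100            # variant position on the source contig

offset = record_start - m_start
pos_same_strand = offset + y_start   # -> 5100
pos_inverted = y_end - offset        # -> 5300 (chain maps to reverse strand)
print(pos_same_strand, pos_inverted)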
Code example #19
File: fastq2table.py  Project: alphaneer/cgat-apps
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--guess-format",
        dest="guess_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The default behaviour of the script is to guess the quality "
        "format of the input fastq file. The user can specify the "
        "quality format of the input file using the --guess-format option. "
        "The script will use this format if the "
        "sequence qualities are ambiguous.[default=%default].")

    parser.add_option(
        "--target-format",
        dest="target_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The script will convert quality scores to the destination "
        "format unless [default=%default].")

    parser.set_defaults(
        target_format=None,
        guess_format=None,
        min_quality=10,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    c = E.Counter()

    if options.target_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.target_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    options.stdout.write("read\tnfailed\tnN\t%s\n" %
                         ("\t".join(Stats.Summary().getHeaders())))

    min_quality = options.min_quality

    for record in iterator:
        c.input += 1
        quals = record.toPhred()
        nfailed = len([x for x in quals if x < min_quality])
        nns = record.seq.count("N") + record.seq.count(".")
        options.stdout.write(
            "%s\t%i\t%i\t%s\n" %
            (record.identifier, nfailed, nns, str(Stats.Summary(quals))))
        c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()
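
The two per-read metrics computed in the loop above are simple reductions; a standalone sketch with made-up values:

# Standalone sketch of the per-read metrics above (toy values).
quals = [38, 12, 2, 40, 9]    # phred scores, as from record.toPhred()
seq = "ACGN.T"
min_quality = 10

nfailed = len([q for q in quals if q < min_quality])  # bases below threshold
nns = seq.count("N") + seq.count(".")                 # ambiguous base calls
print(nfailed, nns)                                   # -> 2 2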
Code example #20
def getRefSeqFromUCSC(dbhandle, outfile, remove_duplicates=False):
    '''get refseq gene set from UCSC database and save as :term:`gtf`
    formatted file.

    Matches to ``chr_random`` are ignored (as ENSEMBL does).

    Note that this approach does not work as a gene set, as refseq
    maps are not real gene builds and unalignable parts cause
    differences that are not reconcilable.

    Arguments
    ---------
    dbhandle : object
       Database handle to UCSC mysql database
    outfile : string
       Filename of output file in :term:`gtf` format. The filename
       aims to be close to the ENSEMBL gtf format.
    remove_duplicates : bool
       If True, duplicate mappings are removed.

    '''

    duplicates = set()

    if remove_duplicates:
        cc = dbhandle.execute("""SELECT name, COUNT(*) AS c FROM refGene
        WHERE chrom NOT LIKE '%_random'
        GROUP BY name HAVING c > 1""")
        duplicates = set([x[0] for x in cc.fetchall()])
        E.info("removing %i duplicates" % len(duplicates))

    # these are forward strand coordinates
    statement = '''
    SELECT gene.name, link.geneName, link.name, gene.name2, product,
    protAcc, chrom, strand, cdsStart, cdsEnd,
    exonCount, exonStarts, exonEnds, exonFrames
    FROM refGene as gene, refLink as link
    WHERE gene.name = link.mrnaAcc
    AND chrom NOT LIKE '%_random'
    ORDER by chrom, cdsStart
    '''

    outf = iotools.open_file(outfile, "w")

    cc = dbhandle.execute(statement)

    SQLResult = collections.namedtuple(
        'Result', '''transcript_id, gene_id, gene_name, gene_id2, description,
        protein_id, contig, strand, start, end,
        nexons, starts, ends, frames''')

    counts = E.Counter()
    counts.duplicates = len(duplicates)

    for r in map(SQLResult._make, cc.fetchall()):

        if r.transcript_id in duplicates:
            continue

        starts = list(map(int, r.starts.split(",")[:-1]))
        ends = list(map(int, r.ends.split(",")[:-1]))
        frames = list(map(int, r.frames.split(",")[:-1]))

        gtf = GTF.Entry()
        gtf.contig = r.contig
        gtf.source = "protein_coding"
        gtf.strand = r.strand
        gtf.gene_id = r.gene_id
        gtf.transcript_id = r.transcript_id
        gtf.addAttribute("protein_id", r.protein_id)
        gtf.addAttribute("transcript_name", r.transcript_id)
        gtf.addAttribute("gene_name", r.gene_name)

        assert len(starts) == len(ends) == len(frames)

        if gtf.strand == "-":
            starts.reverse()
            ends.reverse()
            frames.reverse()

        counts.transcripts += 1
        i = 0
        for start, end, frame in zip(starts, ends, frames):
            gtf.feature = "exon"
            counts.exons += 1
            i += 1
            gtf.addAttribute("exon_number", i)
            # frame of utr exons is set to -1 in UCSC
            gtf.start, gtf.end, gtf.frame = start, end, "."
            outf.write("%s\n" % str(gtf))

            cds_start, cds_end = max(r.start, start), min(r.end, end)
            if cds_start >= cds_end:
                # UTR exons have no CDS
                # do not expect any in UCSC
                continue
            gtf.feature = "CDS"
            # invert the frame
            frame = (3 - frame % 3) % 3
            gtf.start, gtf.end, gtf.frame = cds_start, cds_end, frame
            outf.write("%s\n" % str(gtf))

    outf.close()

    E.info("%s" % str(counts))
Code example #21
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-o",
                        "--min-overlap",
                        dest="min_overlap",
                        type=int,
                        help="minimum overlap")

    parser.add_argument(
        "-w",
        "--pattern-window",
        dest="pattern_window",
        type=str,
        help="regular expression to extract window coordinates from "
        "test id ")

    parser.add_argument("-i",
                        "--invert",
                        dest="invert",
                        action="store_true",
                        help="invert direction of fold change ")

    parser.set_defaults(min_overlap=10,
                        invert=False,
                        pattern_window=r"(\S+):(\d+)-(\d+)")

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv, add_output_options=True)

    outfiles = iotools.FilePool(args.output_filename_pattern)

    if args.invert:
        test_f = lambda l2fold: l2fold < 0
    else:
        test_f = lambda l2fold: l2fold > 0

    def read():

        rx_window = re.compile(args.pattern_window)
        # filter any of the DESeq/EdgeR message that end up at the top of the
        # output file

        for data in iotools.iterate(args.stdin):

            contig, start, end = rx_window.match(data.test_id).groups()
            start, end = list(map(int, (start, end)))

            yield DATA._make(
                (data.test_id, contig, start, end, data.treatment_name,
                 float(data.treatment_mean),
                 float(data.treatment_std), data.control_name,
                 float(data.control_mean), float(data.control_std),
                 float(data.pvalue), float(data.qvalue), float(data.l2fold),
                 float(data.fold), int(data.significant), data.status, 0))

    def grouper(data, distance=10):

        last = next(data)
        entries = [last]

        while 1:
            try:
                d = next(data)
            except StopIteration:
                break
            if d.contig == last.contig and d.start < last.start:
                raise ValueError("error not sorted by start")

            if ((d.contig != last.contig) or (d.start - last.end > distance)
                    or (d.status != last.status)
                    or (d.significant != last.significant)
                    or (d.l2fold * last.l2fold < 0)):
                yield entries
                entries = []

            entries.append(d)
            last = d

        yield entries

    counter = E.Counter()

    args.stdout.write("\t".join(DATA._fields) + "\n")

    # set of all sample names - used to create empty files
    samples = set()

    # need to sort by coordinate
    all_data = list(read())
    all_data.sort(key=lambda x: (x.contig, x.start))

    group_id = 0

    for group in grouper(iter(all_data), distance=args.min_overlap):
        group_id += 1

        start, end = group[0].start, group[-1].end
        assert start < end, 'start >= end: %s' % str(group)
        n = float(len(group))
        counter.input += n

        g = group[0]

        if g.l2fold < 0:
            l2fold = max([x.l2fold for x in group])
            fold = max([x.fold for x in group])
        else:
            l2fold = min([x.l2fold for x in group])
            fold = min([x.fold for x in group])

        outdata = DATA._make(
            (str(group_id), g.contig, start, end, g.treatment_name,
             sum([x.treatment_mean for x in group]) / n,
             max([x.treatment_std for x in group]), g.control_name,
             sum([x.control_mean
                  for x in group]) / n, max([x.control_std for x in group]),
             max([x.pvalue for x in group]), max([x.qvalue for x in group]),
             l2fold, fold, g.significant, g.status, int(n)))

        samples.add(g.treatment_name)
        samples.add(g.control_name)
        if g.significant:
            if test_f(g.l2fold):
                # treatment lower methylation than control
                outfiles.write(
                    g.treatment_name, "%s\t%i\t%i\t%i\t%f\n" %
                    (g.contig, g.start, g.end, group_id,
                     sum([x.treatment_mean for x in group]) / n))

            else:
                outfiles.write(
                    g.control_name, "%s\t%i\t%i\t%i\t%f\n" %
                    (g.contig, g.start, g.end, group_id,
                     sum([x.control_mean for x in group]) / n))

        args.stdout.write("\t".join(map(str, outdata)) + "\n")

        counter.output += 1

    # create empty files
    for sample in samples:
        outfiles.write(sample, "")

    outfiles.close()
    E.info("%s" % counter)

    # write footer and output benchmark information.
    E.stop()
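
The grouping rule in grouper() above combines several break conditions; a stripped-down sketch of the same idea on toy windows (namedtuples standing in for the DATA records):

import collections

# Stripped-down sketch of the grouping rule above (toy windows).
Win = collections.namedtuple("Win", "contig start end l2fold")

def toy_grouper(wins, distance=10):
    group = [wins[0]]
    for w in wins[1:]:
        last = group[-1]
        # break on contig change, gap > distance or fold-change sign flip
        if (w.contig != last.contig or
                w.start - last.end > distance or
                w.l2fold * last.l2fold < 0):
            yield group
            group = []
        group.append(w)
    yield group

wins = [Win("chr1", 0, 100, 1.5), Win("chr1", 105, 200, 2.0),
        Win("chr1", 205, 300, -1.0)]
print([len(g) for g in toy_grouper(wins)])   # -> [2, 1]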
Code example #22
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode", dest="output_mode", type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that sequence only covers "
                      "the segements [%default]")

    parser.add_option("--min-sequence-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at 3', 5' or both or no ends. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand", dest="ignore_strand",
        action="store_false",
        help="use strand information and return reverse complement "
        "on intervals located on the negative strand. "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))
    else:
        raise ValueError("a genome file is required, please supply "
                         "--genome-file")

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [fasta.getSequence(bed.contig, strand, start, end)
                        for start, end in bed.toIntervals()]
            seqs.append("".join(seg_seqs))

        elif (options.output_mode == "intervals" or
              options.output_mode == "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.stop()
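
The 'leftright' mode above emits one flank of the same width on each side of the interval; the coordinate arithmetic on made-up numbers:

# Toy illustration of the 'leftright' flank coordinates above.
bed_start, bed_end, lcontig = 1000, 1400, 100000
l = bed_end - bed_start                              # interval width: 400

left = (max(0, bed_start - l), bed_end - l)          # (600, 1000)
right = (bed_start + l, min(lcontig, bed_end + l))   # (1400, 1800)
print(left, right)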
Code example #23
def main(argv=sys.argv):

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-s",
                        "--session",
                        dest="session",
                        type=str,
                        help="load session before creating plots ")

    parser.add_argument("-d",
                        "--snapshot-dir",
                        dest="snapshotdir",
                        type=str,
                        help="directory to save snapshots in ")

    parser.add_argument("-f",
                        "--format",
                        dest="format",
                        type=str,
                        choices=("png", "eps", "svg"),
                        help="output file format ")

    parser.add_argument("-o",
                        "--host",
                        dest="host",
                        type=str,
                        help="host that IGV is running on ")

    parser.add_argument("-p",
                        "--port",
                        dest="port",
                        type=int,
                        help="port that IGV listens at ")

    parser.add_argument("-e",
                        "--extend",
                        dest="extend",
                        type=int,
                        help="extend each interval by a number of bases ")

    parser.add_argument("-x",
                        "--expand",
                        dest="expand",
                        type=float,
                        help="expand each region by a certain factor ")

    parser.add_argument("--session-only",
                        dest="session_only",
                        action="store_true",
                        help="plot session after opening, "
                        "ignore intervals ")

    parser.add_argument("-n",
                        "--name",
                        dest="name",
                        type=str,
                        choices=("bed-name", "increment"),
                        help="name to use for snapshot ")

    parser.set_defaults(
        command="igv.sh",
        host='127.0.0.1',
        port=61111,
        snapshotdir=os.getcwd(),
        extend=0,
        format="png",
        expand=1.0,
        session=None,
        session_only=False,
        new_instance=False,
        keep_open=False,
        name="bed-name",
    )

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv, add_output_options=True)

    igv_process = None
    if args.new_instance:
        E.info("starting new IGV process")
        igv_process = IGV.startIGV(command=args.command, port=args.port)
        E.info("new IGV process started")

    E.info("connection to process on %s:%s" % (args.host, args.port))
    E.info("saving images in %s" % args.snapshotdir)
    igv = IGV(host=args.host,
              port=args.port,
              snapshot_dir=os.path.abspath(args.snapshotdir))

    if args.session:
        E.info('loading session from %s' % args.session)
        igv.load(args.session)
        E.info('loaded session')

    if args.session_only:
        E.info('plotting session only ignoring any intervals')
        fn = "%s.%s" % (os.path.basename(args.session), args.format)
        E.info("writing snapshot to '%s'" % os.path.join(args.snapshotdir, fn))
        igv.save(fn)

    else:
        c = E.Counter()
        for bed in pysam.tabix_iterator(args.stdin, parser=pysam.asBed()):

            c.input += 1

            # IGV can not deal with white-space in filenames
            if args.name == "bed-name":
                name = re.sub(r"\s", "_", bed.name)
            elif args.name == "increment":
                name = str(c.input)

            E.info("going to %s:%i-%i for %s" %
                   (bed.contig, bed.start, bed.end, name))

            start, end = bed.start, bed.end
            extend = args.extend
            if args.expand:
                d = end - start
                extend = max(extend, (args.expand * d - d) // 2)

            start -= extend
            end += extend

            igv.go("%s:%i-%i" % (bed.contig, start, end))

            fn = E.get_output_file("%s.%s" % (name, args.format))
            E.info("writing snapshot to '%s'" % fn)
            igv.save(fn)

            c.snapshots += 1

        E.info(c)

    if igv_process is not None and not args.keep_open:
        E.info('shutting down IGV')
        igv_process.send_signal(signal.SIGKILL)

    E.stop()
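
--expand above grows each interval symmetrically to a multiple of its width, expressed as an extra extension on both sides; the same computation on toy numbers:

# Toy illustration of the --expand / --extend interval arithmetic above.
start, end, expand, extend = 500, 700, 2.0, 0
d = end - start                              # width: 200
extend = max(extend, (expand * d - d) // 2)  # extra bases per side: 100.0
print(start - extend, end + extend)          # -> 400.0 800.0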
Code example #24
File: fastq2fastq.py  Project: alphaneer/cgat-apps
def process_daisy(options):

    filter_n = "filter-N" in options.methods

    filter_ont = "filter-ONT" in options.methods

    if "filter-identifier" in options.methods:
        if options.input_filter_tsv is None:
            raise ValueError(
                "please set --input-filter-tsv for method filter-identifier")
        with iotools.open_file(options.input_filter_tsv) as inf:
            filter_identifier = set(
                [x.split()[0].strip() for x in inf.readlines()])
    else:
        filter_identifier = False

    if options.output_removed_tsv:
        outf_removed_tsv = iotools.open_file(options.output_removed_tsv, "w")
    else:
        outf_removed_tsv = None

    if options.output_removed_fastq:
        outf_removed_fastq = iotools.open_file(options.output_removed_fastq,
                                               "w")
    else:
        outf_removed_fastq = None

    if options.set_prefix:
        prefix = "{}".format(options.set_prefix)
    else:
        prefix = None

    quality_offset = options.quality_offset
    counter = E.Counter()

    with pysam.FastxFile(options.input_fastq_file) as inf:
        for read in inf:
            counter.input += 1
            remove = False
            if filter_n:
                chars = collections.Counter(read.sequence)
                if "N" in chars and \
                   100.0 * chars["N"] / len(read.sequence) > options.max_percent_N:
                    remove = True
                    counter.filter_n += 1

            if filter_identifier:
                if read.name not in filter_identifier:
                    counter.filter_identifier += 1
                    remove = True

            if filter_ont:
                quals = read.get_quality_array()
                n = len(quals)
                if n < options.min_sequence_length or \
                        float(sum(quals)) / n < options.min_average_quality:
                    counter.remove_ont += 1
                    remove = True

            if remove:
                counter.removed += 1
                if outf_removed_tsv:
                    outf_removed_tsv.write(read.name + "\n")
                if outf_removed_fastq:
                    outf_removed_fastq.write(str(read) + "\n")
                continue

            if prefix:
                read.name = prefix + read.name[2:]

            if quality_offset:
                quals = numpy.array(read.get_quality_array())
                quals += quality_offset
                quals[quals < 0] = 0
                quals += 33
                # pysam fastq is read-only, so fudge it:
                # Note: not outputting description
                read = "@{}\n{}\n+\n{}".format(
                    read.name, read.sequence, "".join([chr(x) for x in quals]))

            counter.output += 1

            options.stdout.write(str(read) + "\n")

    if outf_removed_tsv:
        outf_removed_tsv.close()

    if outf_removed_fastq:
        outf_removed_fastq.close()

    if options.output_stats_tsv:
        with iotools.open_file(options.output_stats_tsv, "w") as outf:
            outf.write(counter.asTable(as_rows=False) + "\n")

    return counter
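
The quality-offset branch above shifts phred scores, clamps at zero and re-encodes them with the Sanger offset of 33; a minimal standalone sketch:

import numpy

# Minimal sketch of the quality re-encoding above (toy scores).
quals = numpy.array([30, 2, 15])
quality_offset = -5

quals = quals + quality_offset       # shift:          [25, -3, 10]
quals[quals < 0] = 0                 # clamp at zero:  [25,  0, 10]
encoded = "".join(chr(q + 33) for q in quals)   # Sanger (offset 33) ASCII
print(encoded)                       # -> :!+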
Code example #25
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--input-filter-tsv",
                      dest="input_filter_tsv",
                      type="string",
                      help="list with identifiers to remove. "
                      "[%default]")

    parser.add_option("--set-prefix",
                      dest="set_prefix",
                      type="string",
                      help="set sequence prefix [%default]")

    parser.add_option("--min-length",
                      dest="min_length",
                      type="int",
                      help="minimum alignment length [%default]")

    parser.add_option("--method",
                      dest="methods",
                      action="append",
                      choices=("shift-region", ),
                      help="methods to apply [%default]")

    parser.set_defaults(
        input_maf_file=None,
        input_filter_tsv=None,
        set_prefix=None,
        min_length=0,
        methods=[],
    )

    (options, args) = E.start(parser, argv)

    if options.input_filter_tsv:
        with iotools.open_file(options.input_filter_tsv) as inf:
            skip_id = set([x[:-1] for x in inf])
    else:
        skip_id = False

    counter = E.Counter()

    if options.set_prefix:
        prefix = "s {}".format(options.set_prefix)
    else:
        prefix = None

    for block in iterate_maf_blocks(options.stdin):
        counter.blocks_input += 1
        if skip_id:
            if block[2].startswith("s "):
                id = re.match(r"s (\S+)", block[2]).groups()[0]
                if id in skip_id:
                    counter.blocks_skipped_id += 1
                    continue

        if options.min_length:
            if block[2].startswith("s "):
                id, pos, length = re.match(r"s (\S+)\s+(\d+)\s+(\d+)",
                                           block[2]).groups()
                if int(length) <= options.min_length:
                    counter.blocks_skipped_length += 1
                    continue

        if prefix:
            block[2] = prefix + block[2][4:]

        if block[2].startswith("s "):
            header, ali1, ali2, qual = parse_block(block)
            if "shift-region" in options.methods:
                rows = []
                contig, start, end = parse_region_string(ali1.src)
                ali1 = ali1._replace(src=contig, start=start + ali1.start)
                rows.append(list(map(str, ali1)))
                rows.append(list(map(str, ali2)))
                if qual:
                    rows.append(list(map(str, qual)))
                lines = [header]
                lines.append(format_tabular(rows, "llrrrrl"))
                lines.append("\n")
                block = lines
        counter.blocks_output += 1
        options.stdout.write("".join(block))

    E.info(counter)
    E.stop()
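
parse_region_string() is not shown in this excerpt; assuming it splits a "contig:start-end" src field, the shift-region method above reduces to adding the region offset to the alignment start:

import re

# Assumed helper: split a "contig:start-end" src field (not shown above).
def parse_region_string(src):
    contig, start, end = re.match(r"(\S+):(\d+)-(\d+)", src).groups()
    return contig, int(start), int(end)

contig, start, end = parse_region_string("chr5:10000-20000")
ali_start = 250                    # alignment start within the region
print(contig, start + ali_start)   # -> chr5 10250 (absolute coordinate)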
Code example #26
File: lca2table.py  Project: alphaneer/cgat-apps
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--summarise",
                      dest="summarise",
                      type="choice",
                      choices=("level-counts", "taxa-counts", "individual"),
                      help="summarise the taxa counts - no. phyla etc")

    parser.add_option("--output-map",
                      dest="output_map",
                      action="store_true",
                      help="ouput map of taxonomy")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if options.output_map:
        found = []
        options.stdout.write("""Domain\t \
        kingdom\t \
        phylum\t \
        class\t \
        order\t \
        family\t \
        genus\t \
        species\n""")
        # only output the mapping file - do not continue to
        # summarise, regardless of the specified options
        for lca in LCA.iterate(options.stdin):

            # if bacteria or archaea the kingdom will
            # be the domain
            if lca.domain == "Bacteria" or lca.domain == "Archaea":
                kingdom = lca.domain
            else:
                kingdom = lca.kingdom

            hierarchy = [
                lca.domain, kingdom, lca.phylum, lca._class, lca.order,
                lca.family, lca.genus, lca.species
            ]
            if hierarchy in found:
                continue
            else:
                found.append(hierarchy)
                options.stdout.write("\t".join(hierarchy) + "\n")
        return

    if options.summarise == "level-counts":
        level_counts = collections.defaultdict(set)
        total = 0
        nreads_domain = 0
        nreads_kingdom = 0
        nreads_kingdom_plus = 0
        nreads_phylum = 0
        nreads_phylum_plus = 0
        nreads_class = 0
        nreads_class_plus = 0
        nreads_order = 0
        nreads_order_plus = 0
        nreads_family = 0
        nreads_family_plus = 0
        nreads_genus = 0
        nreads_genus_plus = 0
        nreads_species = 0
        nreads_species_plus = 0
        nreads_subspecies = 0
        nreads_subspecies_plus = 0

        c = E.Counter()
        for lca in LCA.iterate(options.stdin):
            total += 1
            if lca.domain != "NA":
                nreads_domain += 1
                level_counts["domain"].add(lca.domain)
            else:
                c.domain_unmapped += 1

            if lca.kingdom != "NA":
                nreads_kingdom += 1
                level_counts["kingdom"].add(lca.kingdom)
            else:
                c.kingdom_unmapped += 1

            if lca.kingdom_plus != "NA":
                nreads_kingdom_plus += 1
                level_counts["kingdom+"].add(lca.kingdom_plus)
            else:
                c.kingdom_plus_unmapped += 1

            if lca.phylum != "NA":
                nreads_phylum += 1
                level_counts["phylum"].add(lca.phylum)
            else:
                c.phylum_unmapped += 1

            if lca.phylum_plus != "NA":
                nreads_phylum_plus += 1
                level_counts["phylum+"].add(lca.phylum_plus)
            else:
                c.phylum_plus_unmapped += 1

            if lca._class != "NA":
                nreads_class += 1
                level_counts["class"].add(lca._class)
            else:
                c.class_unmapped += 1

            if lca._class_plus != "NA":
                nreads_class_plus += 1
                level_counts["class+"].add(lca._class_plus)
            else:
                c.class_plus_unmapped += 1

            if lca.order != "NA":
                nreads_order += 1
                level_counts["order"].add(lca.order)
            else:
                c.order_unmapped += 1

            if lca.order_plus != "NA":
                nreads_order_plus += 1
                level_counts["order+"].add(lca.order_plus)
            else:
                c.order_plus_unmapped += 1

            if lca.family != "NA":
                nreads_family += 1
                level_counts["family"].add(lca.family)
            else:
                c.family_unmapped += 1

            if lca.family != "NA":
                nreads_family_plus == 1
                level_counts["family+"].add(lca.family_plus)
            else:
                c.family_plus_unmapped += 1

            if lca.genus != "NA":
                nreads_genus += 1
                level_counts["genus"].add(lca.genus)
            else:
                c.genus_unmapped += 1

            if lca.genus_plus != "NA":
                nreads_genus_plus += 1
                level_counts["genus+"].add(lca.genus_plus)
            else:
                c.genus_plus_unmapped += 1

            if lca.species != "NA":
                nreads_species += 1
                level_counts["species"].add(lca.species)
            else:
                c.species_unmapped += 1

            if lca.species_plus != "NA":
                nreads_species_plus += 1
                level_counts["species+"].add(lca.species_plus)
            else:
                c.species_plus_unmapped += 1

            # removed subspecies mapping for the time
            # being

            # if lca.subspecies != "NA":
            #     nreads_subspecies += 1
            #     level_counts["subspecies"].add(lca.subspecies)
            # else:
            #     c.subspecies_unmapped += 1

            # if lca.subspecies_plus != "NA":
            #     nreads_subspecies_plus += 1
            #     level_counts["subspecies+"].add(lca.subspecies_plus)
            # else:
            #     c.subspecies_plus_unmapped += 1

        options.stdout.write("\t".join([
            "ndomain", "nkingdom", "nkingdom+", "nphylum", "nphylum+",
            "nclass", "nclass+", "norder", "norder+", "nfamily", "nfamily+",
            "ngenus", "ngenus+", "nspecies", "nspecies+", "nseqkingdom",
            "nseqkingdom+", "nseqphylum", "nseqphylum+", "nseqclass",
            "nseqclass+", "nseqorder", "nseqorder+", "nseqfamily",
            "nseqfamily+", "nseqgenus", "nseqgenus+", "nseqspecies",
            "nseqspecies+"
        ]) + "\n")

        options.stdout.write("\t".join(
            map(str, [
                len(level_counts["domain"]),
                len(level_counts["kingdom"]),
                len(level_counts["kingdom+"]),
                len(level_counts["phylum"]),
                len(level_counts["phylum+"]),
                len(level_counts["class"]),
                len(level_counts["class+"]),
                len(level_counts["order"]),
                len(level_counts["order+"]),
                len(level_counts["family"]),
                len(level_counts["family+"]),
                len(level_counts["genus"]),
                len(level_counts["genus+"]),
                len(level_counts["species"]),
                len(level_counts["species+"]), nreads_domain, nreads_kingdom,
                nreads_phylum, nreads_phylum_plus, nreads_class,
                nreads_class_plus, nreads_order, nreads_order_plus,
                nreads_family, nreads_family_plus, nreads_genus,
                nreads_genus_plus, nreads_species, nreads_species_plus
            ])) + "\n")
    elif options.summarise == "taxa-counts":
        unmapped = collections.defaultdict(int)
        total = 0
        taxa_counts = {
            "domain": collections.defaultdict(int),
            "kingdom": collections.defaultdict(int),
            "kingdom+": collections.defaultdict(int),
            "phylum": collections.defaultdict(int),
            "phylum+": collections.defaultdict(int),
            "class": collections.defaultdict(int),
            "class+": collections.defaultdict(int),
            "order": collections.defaultdict(int),
            "order+": collections.defaultdict(int),
            "family": collections.defaultdict(int),
            "family+": collections.defaultdict(int),
            "genus": collections.defaultdict(int),
            "genus+": collections.defaultdict(int),
            "species": collections.defaultdict(int),
            "species+": collections.defaultdict(int)
        }

        c = E.Counter()
        for lca in LCA.iterate(options.stdin):
            total += 1
            if lca.domain != "NA":
                taxa_counts["domain"][lca.domain] += 1
            else:
                c.kingdom_unmapped += 1
                unmapped["domain"] += 1
            if lca.kingdom != "NA":
                taxa_counts["kingdom"][lca.kingdom] += 1
            else:
                c.kingdom_unmapped += 1
                unmapped["kingdom"] += 1
            if lca.kingdom_plus != "NA":
                taxa_counts["kingdom+"][lca.kingdom_plus] += 1
            else:
                c.kingdom_plus_unmapped += 1
                unmapped["kingdom+"] += 1
            if lca.phylum != "NA":
                taxa_counts["phylum"][lca.phylum] += 1
            else:
                c.phylum_unmapped += 1
                unmapped["phylum"] += 1
            if lca.phylum_plus != "NA":
                taxa_counts["phylum+"][lca.phylum_plus] += 1
            else:
                c.phylum_plus_unmapped += 1
                unmapped["phylum+"] += 1
            if lca._class != "NA":
                taxa_counts["class"][lca._class] += 1
            else:
                c.class_unmapped += 1
                unmapped["class"] += 1
            if lca._class_plus != "NA":
                taxa_counts["class+"][lca._class_plus] += 1
            else:
                c.class_plus_unmapped += 1
                unmapped["class+"] += 1
            if lca.order != "NA":
                taxa_counts["order"][lca.order] += 1
            else:
                c.order_unmapped += 1
                unmapped["order"] += 1
            if lca.order_plus != "NA":
                taxa_counts["order+"][lca.order_plus] += 1
            else:
                c.order_plus_unmapped += 1
                unmapped["order+"] += 1
            if lca.family != "NA":
                taxa_counts["family"][lca.family] += 1
            else:
                c.family_unmapped += 1
                unmapped["family"] += 1
            if lca.family_plus != "NA":
                taxa_counts["family+"][lca.family_plus] += 1
            else:
                c.family_plus_unmapped += 1
                unmapped["family+"] += 1
            if lca.genus != "NA":
                taxa_counts["genus"][lca.genus] += 1
            else:
                c.genus_unmapped += 1
                unmapped["genus"] += 1
            if lca.genus_plus != "NA":
                taxa_counts["genus+"][lca.genus_plus] += 1
            else:
                c.genus_plus_unmapped += 1
                unmapped["genus+"] += 1
            if lca.species != "NA":
                taxa_counts["species"][lca.species] += 1
            else:
                c.species_unmapped += 1
                unmapped["species"] += 1
            if lca.species_plus != "NA":
                taxa_counts["species+"][lca.species_plus] += 1
            else:
                c.species_plus_unmapped += 1
                unmapped["species+"] += 1

        options.stdout.write("level\ttaxa\tcount\tproportion\trpm\n")
        for level, taxa_count in sorted(taxa_counts.items()):
            total_level = total - unmapped[level]
            for taxa, count in sorted(taxa_count.items()):
                options.stdout.write("\t".join([
                    level, taxa,
                    str(count), "{:.8}".format(float(count) /
                                               total_level), "{:.8}".
                    format(float(count) / (float(total_level) / 1000000))
                ]) + "\n")

        E.info(c)

    elif options.summarise == "individual":
        # each read is output with its respective
        # taxon assignments
        options.stdout.write("\t".join([
            "id", "domain", "kingdom", "kingdom+", "phylum", "phylum+",
            "class", "class+", "order", "order+", "family", "family+", "genus",
            "genus+", "species", "species+"
        ]) + "\n")
        for lca in LCA.iterate(options.stdin):
            options.stdout.write("\t".join([
                lca.identifier, lca.domain, lca.kingdom, lca.kingdom_plus,
                lca.phylum, lca.phylum_plus, lca._class, lca._class_plus,
                lca.order, lca.order_plus, lca.family, lca.family_plus,
                lca.genus, lca.genus_plus, lca.species, lca.species_plus
            ]) + "\n")

    # write footer and output benchmark information.
    E.stop()
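
The proportion and rpm columns in the 'taxa-counts' output above are plain ratios over the reads assigned at that level; with toy numbers:

# Toy illustration of the proportion / rpm columns above.
count, total_level = 250, 1000000   # reads for one taxon / mapped at level
proportion = float(count) / total_level
rpm = float(count) / (float(total_level) / 1000000)   # reads per million
print("{:.8}\t{:.8}".format(proportion, rpm))         # -> 0.00025  250.0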
Code example #27
def annotateGenome(iterator, fasta, options, default_code=DEFAULT_CODE):
    """annotate a genome given by the indexed *fasta* file and 
    an iterator over gtf annotations.
    """

    annotations = {}
    contig_sizes = fasta.getContigSizes(with_synonyms=False)
    E.info("allocating memory for %i contigs and %i bytes" %
           (len(contig_sizes),
            sum(contig_sizes.values()) * array.array("B").itemsize))
    # AString.AString( "a").itemsize ))

    for contig, size in list(contig_sizes.items()):
        E.debug("allocating %s: %i bases" % (contig, size))
        # annotations[contig] = AString.AString( default_code * size )
        # annotations[contig] = array.array("", default_code * size)
        # use a plain list for py3 compatibility
        annotations[contig] = [default_code] * size

    E.info("allocated memory for %i contigs" % len(fasta))

    counter = E.Counter()

    # output splice junctions
    outfile_junctions = E.open_output_file("junctions")
    outfile_junctions.write(
        "contig\tstrand\tpos1\tpos2\tframe\tgene_id\ttranscript_id\n")
    for gtfs in iterator:

        counter.input += 1

        if counter.input % options.report_step == 0:
            E.info("iteration %i" % counter.input)

        try:
            contig = fasta.getToken(gtfs[0].contig)
        except KeyError as msg:
            E.warn("contig %s not found - annotation ignored" % gtfs[0].contig)
            counter.skipped_contig += 1
            continue

        lcontig = fasta.getLength(contig)

        # make sure that exons are sorted by coordinate
        gtfs.sort(key=lambda x: x.start)

        is_positive = Genomics.IsPositiveStrand(gtfs[0].strand)
        source = gtfs[0].source

        # process non-coding data
        if source in MAP_ENSEMBL:
            code = MAP_ENSEMBL[source]

            intervals = [(x.start, x.end) for x in gtfs]
            addSegments(annotations[contig], intervals, is_positive, code)

        elif source == "protein_coding":

            # collect exons for utr
            exons = [(x.start, x.end) for x in gtfs if x.feature == "exon"]
            cds = [(x.start, x.end) for x in gtfs if x.feature == "CDS"]
            if len(cds) == 0:
                counter.skipped_transcripts += 1
                E.warn("protein-coding transcript %s without CDS - skipped" %
                       gtfs[0].transcript_id)
                continue

            exons = Intervals.truncate(exons, cds)
            start, end = cds[0][0], cds[-1][1]

            UTR5 = [x for x in exons if x[1] < start]
            UTR3 = [x for x in exons if x[0] >= end]

            if not is_positive:
                UTR5, UTR3 = UTR3, UTR5
                splice_code = "S"
            else:
                splice_code = "s"

            addSegments(annotations[contig], UTR5, is_positive, "u")

            addIntrons(annotations[contig], UTR5, is_positive,
                       options.max_frameshift_length)

            addSegments(annotations[contig], UTR3, is_positive, "v")

            addIntrons(annotations[contig], UTR3, is_positive,
                       options.max_frameshift_length)

            # output CDS according to frame
            addCDS(annotations[contig],
                   [x for x in gtfs if x.feature == "CDS"], is_positive)

            # add introns between CDS
            addIntrons(annotations[contig], cds, is_positive,
                       options.max_frameshift_length)

            # output splice junctions
            cds = [x for x in gtfs if x.feature == "CDS"]

            # apply corrections for 1-past end coordinates
            # to point between residues within CDS
            if is_positive:
                ender = lambda x: x.end - 1
                starter = lambda x: x.start
                out_positive = "+"
            else:
                ender = lambda x: lcontig - x.start - 1
                starter = lambda x: lcontig - x.end
                out_positive = "-"
                cds.reverse()

            end = ender(cds[0])
            for c in cds[1:]:
                start = starter(c)
                outfile_junctions.write("%s\t%s\t%i\t%i\t%s\t%s\t%s\n" % (
                    contig,
                    out_positive,
                    end,
                    start,
                    c.frame,
                    c.gene_id,
                    c.transcript_id,
                ))
                end = ender(c)

    E.info("finished reading genes: %s" % str(counter))

    outfile_junctions.close()

    E.info("started counting")
    outfile = E.open_output_file("counts")
    outputCounts(outfile, annotations)
    outfile.close()

    E.info("started output")
    for k in sorted(annotations.keys()):
        # options.stdout.write(">%s\n%s\n" % (k, annotations[k].tostring()))
        options.stdout.write(">%s\n%s\n" % (k, "".join(annotations[k])))
Code example #28
def buildPolyphenInput(infiles, outfile):
    '''build polyphen input file.

    SNPS across all species are aggregated into a single
    file to avoid multiple submissions for the same variant.

    Mapping to Uniprot ids was not successful - 40% of the
    SNPs would have been lost. Hence I map to ensembl protein
    identifiers. Note that the sequence file is then to be
    submitted to POLYPHEN as well.

    Note that this method outputs 1-based coordinates for polyphen,
    while the coordinates in the .map file are still 0-based.

    SNPs are assigned a snp_id and a locus_id. The snp_id refers
    to the SNP within a peptide sequence while the locus_id refers
    to the genomic location. If there are alternative
    transcripts overlapping a SNP, the same SNP will get two
    snp_ids, but the same locus_id. As the peptide background might
    be different for the same SNP depending on the transcript,
    its effect needs to be predicted twice.
    '''

    statement = '''SELECT
        transcript_id,
        cds_start,
        cds_end,
        orig_codons,
        variant_codons,
        orig_na,
        variant_na,
        contig,
        snp_position
    FROM %(table)s_cds
    WHERE variant_code = '=' AND code = 'N'
    '''

    dbhandle = connect()
    cc = dbhandle.cursor()

    infiles.sort()

    # ensembl mapping
    map_transcript2id = dict(
        cc.execute("SELECT transcript_id, protein_id FROM annotations.transcript_info WHERE protein_id IS NOT NULL").fetchall())

    total_counts = E.Counter()
    notfound, found = set(), set()

    outf_map = open(outfile + ".map", "w")
    outf_map.write(
        "snp_id\ttrack\ttranscript_id\tprotein_id\tprotein_pos\tlocus_id\tcontig\tpos\tphase\n")

    outf = open(outfile, "w")

    snps = {}
    locus_ids = {}

    for infile in infiles:

        table = P.toTable(infile)
        track = table[:-len("_effects")]
        E.debug(statement % locals())
        cc.execute(statement % locals())

        counts = E.Counter()

        snp_id = 0
        for transcript_id, cds_start, cds_end, orig_codons, variant_codons, orig_na, variant_na, contig, pos in cc:

            counts.input += 1

            if transcript_id not in map_transcript2id:
                notfound.add(transcript_id)
                counts.not_found += 1
                continue

            if "," in variant_codons:
                counts.heterozygous += 1
                continue

            for phase in range(0, 3):
                if orig_na[phase].lower() != variant_na[phase].lower():
                    break

            pid = map_transcript2id[transcript_id]
            # one-based coordinates
            peptide_pos = int(math.floor(cds_start / 3.0)) + 1
            key = "%s-%i-%s" % (pid, peptide_pos, variant_codons)

            if key in snps:
                snp_id = snps[key]
            else:
                snp_id = len(snps)
                snps[key] = snp_id
                outf.write("snp%010i\t%s\t%i\t%s\t%s\n" %
                           (snp_id,
                            pid,
                            peptide_pos,
                            orig_codons,
                            variant_codons,
                            ))
                counts.output += 1

            locus_key = "%s-%i-%s" % (contig, pos, variant_codons)
            if locus_key not in locus_ids:
                locus_ids[locus_key] = len(locus_ids)

            # use 0-based coordinates throughout, including peptide pos
            outf_map.write("snp%010i\t%s\t%s\t%s\t%i\tloc%010i\t%s\t%i\t%i\n" %
                           (snp_id,
                            track,
                            transcript_id,
                            pid,
                            peptide_pos - 1,
                            locus_ids[locus_key],
                            contig,
                            pos,
                            phase))

            found.add(transcript_id)

        total_counts += counts

        E.info("%s: %s" % (table, str(counts)))

    outf.close()
    outf_map.close()

    E.info("%s: transcripts: %s found, %i not found" % (table,
                                                        len(found),
                                                        len(notfound)))

    E.info("total=%s, snp_ids=%i, locus_ids=%i" %
           (str(total_counts), len(snps), len(locus_ids)))
    if notfound:
        E.warn("%i transcripts had SNPS that were ignored because there was no uniprot accession" %
               len(notfound))
        E.warn("notfound: %s" % ",".join(notfound))

    statement = '''sort -k2,2 -k3,3n %(outfile)s > %(outfile)s.tmp; mv %(outfile)s.tmp %(outfile)s'''

    P.run()
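
The snp_id/locus_id bookkeeping described in the docstring amounts to interning two different keys in two dictionaries; a minimal sketch with hypothetical identifiers:

# Minimal sketch of the snp_id / locus_id interning above (toy keys).
snps, locus_ids = {}, {}

def intern(table, key):
    # hand out the next integer id the first time a key is seen
    if key not in table:
        table[key] = len(table)
    return table[key]

# the same genomic locus seen via two overlapping transcripts:
print(intern(snps, "ENSP01-42-GAT"), intern(locus_ids, "chr1-1000-GAT"))  # 0 0
print(intern(snps, "ENSP02-17-GAT"), intern(locus_ids, "chr1-1000-GAT"))  # 1 0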
Code example #29
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser,
                              argv=argv,
                              add_output_options=True,
                              unknowns=True)

    # resolve the output fastq filenames
    if len(unknown) == 1:
        fastqfile1 = unknown[0]
        fastqfile2 = args.output_filename_pattern % "2"
    elif len(unknown) == 2:
        fastqfile1, fastqfile2 = unknown
    else:
        fastqfile1 = args.output_filename_pattern % "1"
        fastqfile2 = args.output_filename_pattern % "2"

    # only output compressed data
    if not fastqfile1.endswith(".gz"):
        fastqfile1 += ".gz"
    if not fastqfile2.endswith(".gz"):
        fastqfile2 += ".gz"

    if args.stdin != sys.stdin:
        samfile = pysam.AlignmentFile(args.stdin.name, "rb")
    else:
        samfile = pysam.AlignmentFile("-", "rb")

    tmpdir = tempfile.mkdtemp()

    outtemp1 = os.path.join(tmpdir, "pair1.gz")
    outtemp2 = os.path.join(tmpdir, "pair2.gz")

    outstream1 = iotools.open_file(outtemp1, "w")
    outstream2 = iotools.open_file(outtemp2, "w")

    E.info('writing fastq files to temporary directory %s' % tmpdir)

    found1, found2 = set(), set()
    read1_qlen, read2_qlen = 0, 0

    c = E.Counter()
    for read in samfile.fetch(until_eof=True):
        c.input += 1
        if not read.is_paired:
            outstream1.write("\t".join((read.qname, read.seq, read.qual)) +
                             "\n")
            found1.add(read.qname)
            if not read1_qlen:
                read1_qlen = read.qlen
            c.unpaired += 1
        elif read.is_read1:
            outstream1.write("\t".join((read.qname, read.seq, read.qual)) +
                             "\n")
            found1.add(read.qname)
            if not read1_qlen:
                read1_qlen = read.qlen
            c.output1 += 1
        elif read.is_read2:
            if read.qname not in found2:
                outstream2.write("\t".join((read.qname, read.seq, read.qual)) +
                                 "\n")
                found2.add(read.qname)
                if not read2_qlen:
                    read2_qlen = read.qlen
                c.output2 += 1

    if c.unpaired == 0 and c.output1 == 0 and c.output2 == 0:
        E.warn("no reads were found")
        # clean up before the early return
        outstream1.close()
        outstream2.close()
        shutil.rmtree(tmpdir)
        return

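    # sort the tab-separated records by read name and convert them back
    # into 4-line FASTQ blocks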
    sort_statement = '''gunzip < %s
    | sort -k1,1
    | awk '{printf("@%%s\\n%%s\\n+\\n%%s\\n", $1,$2,$3)}'
    | gzip > %s'''

    if c.output1 == 0 and c.output2 == 0:
        # single end data:
        outstream1.close()
        outstream2.close()
        E.info("sorting fastq files")
        E.run(sort_statement % (outtemp1, fastqfile1))

    else:
        # paired end data
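        # reads seen in only one file get a dummy mate of Ns with 'B'
        # placeholder qualities so that both outputs stay name-matched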
        for qname in found2.difference(found1):
            outstream1.write("\t".join((qname, "N" * read1_qlen,
                                        "B" * read1_qlen)) + "\n")
            c.extra1 += 1

        for qname in found1.difference(found2):
            outstream2.write("\t".join((qname, "N" * read2_qlen,
                                        "B" * read2_qlen)) + "\n")
            c.extra2 += 1

        E.info("%s" % str(c))

        outstream1.close()
        outstream2.close()

        E.info("sorting fastq files")
        E.run(sort_statement % (outtemp1, fastqfile1))
        E.run(sort_statement % (outtemp2, fastqfile2))

    shutil.rmtree(tmpdir)

    # write footer and output benchmark information.
    E.stop()
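
For reference, here is a minimal pure-Python sketch of what the sort_statement pipeline above does (sort the tab-separated records by read name and re-emit them as 4-line FASTQ); the function name and file paths are illustrative only, not part of the script:

import gzip

def tsv_to_fastq(tsv_path, fastq_path):
    # read the tab-separated (name, sequence, quality) records
    with gzip.open(tsv_path, "rt") as inf:
        records = [line.rstrip("\n").split("\t") for line in inf]
    # sort by read name, mirroring `sort -k1,1`
    records.sort(key=lambda r: r[0])
    # write 4-line FASTQ blocks, mirroring the awk printf
    with gzip.open(fastq_path, "wt") as outf:
        for name, seq, qual in records:
            outf.write("@%s\n%s\n+\n%s\n" % (name, seq, qual))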
Code example #30
def main(argv=None):
    """script main.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("bedgraph", "wiggle", "bigbed", "bigwig",
                               "bed"),
                      help="output format [default=%default]")

    parser.add_option("-s",
                      "--shift-size",
                      dest="shift",
                      type="int",
                      help="shift reads by a certain amount (ChIP-Seq) "
                      "[%default]")

    parser.add_option("-e",
                      "--extend",
                      dest="extend",
                      type="int",
                      help="extend reads by a certain amount "
                      "(ChIP-Seq) [%default]")

    parser.add_option("-p",
                      "--wiggle-span",
                      dest="span",
                      type="int",
                      help="span of a window in wiggle tracks "
                      "[%default]")

    parser.add_option("-m",
                      "--merge-pairs",
                      dest="merge_pairs",
                      action="store_true",
                      help="merge paired-ended reads into a single "
                      "bed interval [default=%default].")

    parser.add_option("--scale-base",
                      dest="scale_base",
                      type="float",
                      help="number of reads/pairs to scale bigwig file to. "
                      "The default is to scale to 1M reads "
                      "[default=%default]")

    parser.add_option("--scale-method",
                      dest="scale_method",
                      type="choice",
                      choices=(
                          "none",
                          "reads",
                      ),
                      help="scale bigwig output. 'reads' will normalize by "
                      "the total number reads in the bam file that are used "
                      "to construct the bigwig file. If --merge-pairs is used "
                      "the number of pairs output will be used for "
                      "normalization. 'none' will not scale the bigwig file"
                      "[default=%default]")

    parser.add_option("--max-insert-size",
                      dest="max_insert_size",
                      type="int",
                      help="only merge if insert size less that "
                      "# bases. 0 turns of this filter "
                      "[default=%default].")

    parser.add_option("--min-insert-size",
                      dest="min_insert_size",
                      type="int",
                      help="only merge paired-end reads if they are "
                      "at least # bases apart. "
                      "0 turns of this filter. [default=%default]")

    parser.set_defaults(
        samfile=None,
        output_format="wiggle",
        shift=0,
        extend=0,
        span=1,
        merge_pairs=None,
        min_insert_size=0,
        max_insert_size=0,
        scale_method='none',
        scale_base=1000000,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) >= 1:
        options.samfile = args[0]
    if len(args) == 2:
        options.output_filename_pattern = args[1]
    if not options.samfile:
        raise ValueError("please provide a bam file")

    # Read BAM file using Pysam
    samfile = pysam.AlignmentFile(options.samfile, "rb")

    # Create temporary files / folders
    tmpdir = tempfile.mkdtemp()
    E.debug("temporary files are in %s" % tmpdir)
    tmpfile_wig = os.path.join(tmpdir, "wig")
    tmpfile_sizes = os.path.join(tmpdir, "sizes")

    # Create dictionary of contig sizes
    contig_sizes = dict(list(zip(samfile.references, samfile.lengths)))
    # write contig sizes
    outfile_size = iotools.open_file(tmpfile_sizes, "w")
    for contig, size in sorted(contig_sizes.items()):
        outfile_size.write("%s\t%s\n" % (contig, size))
    outfile_size.close()

    # Shift and extend only available for bigwig format
    if options.shift or options.extend:
        if options.output_format != "bigwig":
            raise ValueError(
                "shift and extend only available for bigwig output")

    # Output filename required for bigwig / bigbed computation
    if options.output_format == "bigwig":
        if not options.output_filename_pattern:
            raise ValueError(
                "please specify an output file for bigwig computation.")

        # Define executable to use for binary conversion
        executable_name = "wigToBigWig"

        # check required executable file is in the path
        executable = iotools.which(executable_name)
        if not executable:
            raise OSError("could not find %s in path." % executable_name)

        # Open output file
        outfile = iotools.open_file(tmpfile_wig, "w")
        E.info("starting output to %s" % tmpfile_wig)
    else:
        outfile = iotools.open_file(tmpfile_wig, "w")
        E.info("starting output to %s (will be copied to stdout)" %
               tmpfile_wig)

    # Set up output write functions
    if options.output_format in ("wiggle", "bigwig"):
        # wiggle is one-based, so add 1; with a step size of 1 every
        # base has to be written out individually
        if options.span == 1:
            outf = lambda outfile, contig, start, end, val: \
                outfile.write(
                    "".join(["%i\t%i\n" % (x, val)
                             for x in range(start + 1, end + 1)]))
        else:
            outf = SpanWriter(options.span)
    elif options.output_format == "bedgraph":
        # bed is 0-based, open-closed
        outf = lambda outfile, contig, start, end, val: \
            outfile.write("%s\t%i\t%i\t%i\n" % (contig, start, end, val))

    # initialise counters
    ninput, nskipped, ncontigs = 0, 0, 0

    # set output file name
    output_filename_pattern = options.output_filename_pattern
    if output_filename_pattern:
        output_filename = os.path.abspath(output_filename_pattern)

    # shift and extend or merge pairs. Output temporary bed file
    if options.shift > 0 or options.extend > 0 or options.merge_pairs:
        # Workflow 1: convert to bed intervals and use bedtools
        # genomecov to build a coverage file.
        # Convert to bigwig with UCSC tools bedGraph2BigWig

        if options.merge_pairs:
            # merge pairs using bam2bed
            E.info("merging pairs to temporary file")
            counter = merge_pairs(samfile,
                                  outfile,
                                  min_insert_size=options.min_insert_size,
                                  max_insert_size=options.max_insert_size,
                                  bed_format=3)
            E.info("merging results: {}".format(counter))
            if counter.output == 0:
                raise ValueError("no pairs output after merging")
        else:
            # create bed file with shifted/extended tags
            shift, extend = options.shift, options.extend
            shift_extend = shift + extend
            counter = E.Counter()

            for contig in samfile.references:
                E.debug("output for %s" % contig)
                lcontig = contig_sizes[contig]

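                # shift each read towards its 3' end and emit a
                # fixed-width interval; reverse-strand reads are anchored
                # at their alignment end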
                for read in samfile.fetch(contig):
                    if read.is_reverse:
                        start = max(0, read.pos + read.alen - shift_extend)
                    else:
                        start = max(0, read.pos + shift)

                    # intervals extending beyond contig are removed
                    if start >= lcontig:
                        continue

                    end = min(lcontig, start + extend)
                    outfile.write("%s\t%i\t%i\n" % (contig, start, end))
                    counter.output += 1

        outfile.close()

        if options.scale_method == "reads":
            scale_factor = float(options.scale_base) / counter.output

            E.info("scaling: method=%s scale_quantity=%i scale_factor=%f" %
                   (options.scale_method, counter.output, scale_factor))
            scale = "-scale %f" % scale_factor
        else:
            scale = ""

        # Convert bed file to coverage file (bedgraph)
        tmpfile_bed = os.path.join(tmpdir, "bed")
        E.info("computing coverage")
        # calculate coverage - format is bedgraph
        statement = """bedtools genomecov -bg -i %(tmpfile_wig)s %(scale)s
        -g %(tmpfile_sizes)s > %(tmpfile_bed)s""" % locals()
        E.run(statement)

        # Convert bedgraph to bigwig
        E.info("converting to bigwig")
        tmpfile_sorted = os.path.join(tmpdir, "sorted")
        statement = ("sort -k 1,1 -k2,2n %(tmpfile_bed)s > %(tmpfile_sorted)s;"
                     "bedGraphToBigWig %(tmpfile_sorted)s %(tmpfile_sizes)s "
                     "%(output_filename_pattern)s" % locals())
        E.run(statement)

    else:

        # Workflow 2: use pysam column iterator to build a
        # wig file. Then convert to bigwig of bedgraph file
        # with UCSC tools.
        def column_iter(iterator):
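            # merge consecutive pileup columns that have the same depth
            # into (start, end, depth) runs; a coverage gap or a change
            # in depth closes the current run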
            start = None
            end = 0
            n = None
            for t in iterator:
                if t.pos - end > 1 or n != t.n:
                    if start is not None:
                        yield start, end, n
                    start = t.pos
                    n = t.n
                end = t.pos
            # guard against contigs without any pileup columns
            if start is not None:
                yield start, end, n

        if options.scale_method != "none":
            raise NotImplementedError(
                "scaling not implemented for pileup method")

        # Bedgraph track definition
        if options.output_format == "bedgraph":
            outfile.write("track type=bedGraph\n")

        for contig in samfile.references:
            # if contig != "chrX": continue
            E.debug("output for %s" % contig)
            lcontig = contig_sizes[contig]

            # Write wiggle header
            if options.output_format in ("wiggle", "bigwig"):
                outfile.write("variableStep chrom=%s span=%i\n" %
                              (contig, options.span))

            # Generate pileup per contig using pysam and iterate over columns
            for start, end, val in column_iter(samfile.pileup(contig)):
                # patch: some bam files contain reads that extend past the
                # end of the contig (usually runs of Ns); clamp them, as
                # wigToBigWig would otherwise fail
                if lcontig <= end:
                    E.warn("read extending beyond contig: %s: %i > %i" %
                           (contig, end, lcontig))
                    end = lcontig
                    if start >= end:
                        continue

                if val > 0:
                    outf(outfile, contig, start, end, val)
            ncontigs += 1

        # Close output file
        if isinstance(outf, SpanWriter):
            outf.flush(outfile)
        else:
            outfile.flush()

        E.info("finished output")

        # Report counters
        E.info("ninput=%i, ncontigs=%i, nskipped=%i" %
               (ninput, ncontigs, nskipped))

        # Convert to binary formats
        if options.output_format == "bigwig":
            outfile.close()

            E.info("starting %s conversion" % executable)
            try:
                retcode = subprocess.call(" ".join(
                    (executable, tmpfile_wig, tmpfile_sizes,
                     output_filename_pattern)),
                                          shell=True)
                if retcode != 0:
                    E.warn("%s exited with non-zero return code %i" %
                           (executable, retcode))
                    return retcode
            except OSError as msg:
                E.warn("error while executing %s: %s" % (executable, msg))
                return 1
            E.info("finished bigwig conversion")
        else:
            with open(tmpfile_wig) as inf:
                sys.stdout.write(inf.read())

    # Cleanup temp files
    shutil.rmtree(tmpdir)

    E.stop()
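
To illustrate the run-length compression performed by column_iter above, here is a small worked example; Col is a hypothetical stand-in for a pysam pileup column, which exposes the same pos (position) and n (depth) attributes, and column_iter is assumed to be available at module scope rather than nested inside main():

from collections import namedtuple

Col = namedtuple("Col", "pos n")  # stand-in for a pysam pileup column

cols = [Col(0, 3), Col(1, 3), Col(2, 5), Col(10, 5)]
print(list(column_iter(iter(cols))))
# positions 0-1 at depth 3 form one run, position 2 starts a new run
# because the depth changes, and position 10 starts another because
# of the coverage gap:
# [(0, 1, 3), (2, 2, 5), (10, 10, 5)]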