Example no. 1
    def __init__(self, infile, *args, **kwargs):

        self.gtf = GTF.iterator(IOTools.open_file(infile, "r"))
Example no. 2
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.

    '''

    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.is_empty(dbfile) or len(motiffiles) == 0:
        IOTools.touch_file(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError("control file %s for %s does not exist" %
                         (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.get_temp_dir(".")
    tmpfile = P.get_temp_filename(".")

    for motiffile in motiffiles:
        if IOTools.is_empty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = IOTools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

        of = IOTools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

    P.run("gzip < %(tmpfile)s > %(outfile)s")

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
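
A minimal sketch (plain Python, hypothetical file names and values) of the placeholder idiom used in the statements above: P.run() from cgatcore resolves %(name)s placeholders against the caller's local variables, roughly equivalent to the explicit interpolation below.

# hypothetical stand-ins for the local variables referenced by the statement
dbfile = "intervals.fasta"
tmpdir = "/tmp/mast_abc123"
mast_evalue = 10000.0
statement = "mast motifs.meme - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f < %(dbfile)s"
# roughly what the pipeline does before submitting the command
print(statement % locals())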
Example no. 3
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--bam-file",
                      dest="bam_file",
                      type="string",
                      help="supply input bam file name")

    parser.add_option("-g",
                      "--gtf-file",
                      dest="gtf_file",
                      type="string",
                      help="supply input gtf file name")

    parser.add_option("-o",
                      "--outfile",
                      dest="outfile",
                      type="string",
                      help="supply output file name")

    parser.add_option(
        "-G",
        "--reference-gtf-file",
        dest="reference_gtf",
        type="string",
        help=
        "supply reference gtf for context of reads not contributing to transcripts"
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    ######################################################
    ######################################################
    # for all alignments
    ######################################################
    ######################################################

    # open outfile and prepare headers
    outf = IOTools.open_file(options.outfile, "w")
    outf.write("\t".join([
        "total alignments", "aligments in transcripts",
        "percent alignments in transcripts", "total spliced alignments",
        "spliced alignments in transcripts",
        "percent spliced alignments in transcripts"
    ]) + "\n")

    # calculate coverage over transcript file - NB split reads contribute twice to the transcript
    # use BedTool object
    pybedbamfile = pybedtools.BedTool(options.bam_file)

    # count alignments
    E.info("counting total number of alignments and spliced alignments")
    total_alignments = 0
    spliced_alignments = 0

    for alignment in pybedbamfile:
        total_alignments += 1
        cigar = alignment[5]
        if "N" in cigar:  # N in the CIGAR string signifies a split read
            spliced_alignments += 1

    # merge the gtf file to avoid double counting of exons in different
    # transcripts - converts to a bed file
    gtffile = pybedtools.BedTool(options.gtf_file).merge()

    E.info("computing coverage of aligments in %s over intervals in %s" %
           (options.bam_file, options.gtf_file))
    cover = pybedbamfile.coverage(gtffile)

    # make sure that the exons aren't being counted twice - shouldn't be
    # because of merge
    E.info("counting reads contributing to transcripts")
    c = 0
    for entry in cover:
        coverage = int(entry[3])
        if coverage > 0:
            c += coverage

    # sum the coverage across exons from all transcripts
    coverage_in_transcripts = c

    ######################################################
    ######################################################
    # for spliced alignments
    ######################################################
    ######################################################

    # count total number of spliced alignments
    # requires that the CIGAR string 'N' is present

    # uses pysam to write out a bam file of the spliced reads only
    allreads = pysam.AlignmentFile(options.bam_file)
    spliced_bamname = IOTools.snip(options.bam_file,
                                   ".bam") + "_spliced_reads.bam"

    # open file for outputting spliced alignments
    splicedreads = pysam.AlignmentFile(spliced_bamname,
                                       "wb",
                                       template=allreads)

    # cigar string in pysam for spliced alignment is (3, int)
    spliced = collections.defaultdict(list)
    for read in allreads:
        for cigar_tag in read.cigar:
            if cigar_tag[0] == 3:
                spliced[read].append(cigar_tag)

    # write out spliced alignments
    for read in list(spliced.keys()):
        splicedreads.write(read)
    splicedreads.close()
    allreads.close()

    # sort and index the spliced reads bam file
    # (samtools-style arguments, as expected by current pysam)
    pysam.sort("-o", spliced_bamname, spliced_bamname)
    pysam.index(spliced_bamname)

    # read in the spliced reads as a BedTool object
    splicedbam = pybedtools.BedTool(spliced_bamname)

    # perform coverage of spliced reads over intervals - will be twice
    # as many as there should be due to counting both exons
    # overlapping
    spliced_coverage = splicedbam.coverage(gtffile)

    # avoid double counting exons
    E.info("counting spliced reads contributing to transcripts")
    spliced_exons = {}
    c = 0
    for entry in spliced_coverage:
        coverage = int(entry[3])
        if coverage > 0:
            c += coverage

    spliced_coverage_in_transcripts = c

    # NOTE: the counting of spliced alignments is not accurate

    spliced_coverage_in_transcripts = float(
        spliced_coverage_in_transcripts) / 2

    ###########################
    # write out the results
    ###########################

    outf.write(str(int(total_alignments)) + "\t")
    # remove half of the coverage assigned to spliced reads
    coverage_in_transcripts = (coverage_in_transcripts) - (
        spliced_coverage_in_transcripts)
    outf.write(
        str(
            int(coverage_in_transcripts) -
            int(spliced_coverage_in_transcripts)) + "\t")
    outf.write(
        str(int((coverage_in_transcripts / total_alignments) * 100)) + "\t")

    # write out spliced counts
    outf.write(str(int(spliced_alignments)) + "\t")
    outf.write(str(int(spliced_coverage_in_transcripts)) + "\t")
    outf.write(
        str(int((spliced_coverage_in_transcripts / spliced_alignments) * 100)))

    outf.close()

    ############################
    # contextualise those that
    # don't fall in transcripts
    ############################

    if options.reference_gtf:
        context_summary = IOTools.open_file(
            IOTools.snip(options.bam_file, ".bam") + ".excluded.context", "w")
        context_summary.write("\t".join(["Feature", "number"]) + "\n")

        # write out the read info as well
        context_file = IOTools.open_file(
            IOTools.snip(options.bam_file, ".bam") + ".excluded", "w")

        context_dict = collections.defaultdict(int)
        # intersect bam - write non-overlapping with transcripts - intersect
        # with reference - write out
        context = pybedbamfile.intersect(gtffile, v=True, bed=True).intersect(
            pybedtools.BedTool(options.reference_gtf), wb=True)
        for entry in context:
            feature = entry[8]
            context_dict[feature] += 1
            context_file.write("\t".join([e for e in entry]) + "\n")

        for feature, value in context_dict.items():
            context_summary.write("\t".join([feature, str(value)]) + "\n")

        context_file.close()
        context_summary.close()

    # write footer and output benchmark information.
    E.stop()
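
A minimal sketch, assuming a hypothetical example.bam, of the spliced-read detection the script relies on: CIGAR operation code 3 (BAM_CREF_SKIP, printed as 'N') marks a spliced alignment.

import pysam

# count reads whose CIGAR contains a reference-skip ('N') operation
with pysam.AlignmentFile("example.bam", "rb") as bam:  # hypothetical input
    n_spliced = sum(
        1 for read in bam
        if not read.is_unmapped and
        any(op == 3 for op, length in (read.cigartuples or [])))
print(n_spliced)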
Example no. 4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: data2multiple_anova.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option("-t", "--tree-nh-file", dest="filename_tree", type="string",
                      help="filename with tree(s).")
    parser.add_option("--skip-header", dest="add_header", action="store_false",
                      help="do not add header to flat format.")
    parser.add_option("--output-with-header", dest="write_header", action="store_true",
                      help="write header and exit.")
    parser.add_option("--debug", dest="debug", action="store_true",
                      help="debug mode")
    parser.add_option("--display-tree", dest="display_tree", action="store_true",
                      help="display the tree")

    parser.add_option("-m", "--method", dest="methods", type="choice", action="append",
                      choices=("contrasts", "spearman", "pearson", "compute"),
                      help="methods to perform on contrasts.")

    parser.set_defaults(
        columns="all",
        filename_tree=None,
        add_header=True,
        write_header=False,
        debug=False,
        methods=[],
        value_format="%6.4f",
        pvalue_format="%e",
        display_tree=False,
    )

    (options, args) = E.start(parser, quiet=True)

    if options.columns not in ("all", "all-but-first"):
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    data = []

    options.filenames = args

    for filename in options.filenames:

        infile = IOTools.open_file(filename, "r")
        table, headers = IOTools.readTable(
            infile, take=options.columns, headers=False)
        infile.close()

        data.append(table)

    fields = ["Df", "Sum Sq", "F value", "Pr(>F)", "Mean Sq"]

    options.stdout.write("set1\tset2")
    for field in fields:
        options.stdout.write("\t%s" % field)
    options.stdout.write("\n")

    # CODE needs to be refactored for rpy2 usage

    for x in range(len(data)):

        for y in range(x + 1, len(data)):

            rpy.set_default_mode(rpy.NO_CONVERSION)

            factors = ["x"] * len(data[x][:, 0]) + ["y"] * len(data[y][:, 0])
            values = list(data[x][:, 0]) + list(data[y][:, 0])

            linear_model = R.lm(
                R("y ~ x"), data=R.data_frame(x=factors, y=values))
            rpy.set_default_mode(rpy.BASIC_CONVERSION)
            result = R.anova(linear_model)

            options.stdout.write(
                "%s\t%s" % (options.filenames[x], options.filenames[y]))
            for field in fields:
                options.stdout.write("\t%s" % str(result[field]))
            options.stdout.write("\n")
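
A minimal sketch of the pairwise loop above, written with itertools.combinations instead of nested indices; the file names are toy values.

import itertools

filenames = ["a.tsv", "b.tsv", "c.tsv"]  # toy file names
for x, y in itertools.combinations(range(len(filenames)), 2):
    # each pair of input tables is compared exactly once
    print(filenames[x], filenames[y])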
Example no. 5
def loadGLAM2SCAN(infile, outfile):
    '''parse glam2scan output files and load into the database.

    Parse several motif runs and add them to the same
    table.
    '''
    tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
    tmpfile.write(
        "motif\tid\tnmatches\tscore\tscores\tncontrols\tmax_controls\n")

    lines = IOTools.open_file(infile).readlines()
    chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
    chunks.append(len(lines))

    for chunk in range(len(chunks) - 1):

        # use real file, as parser can not deal with a
        # list of lines

        try:
            motif = re.match(r":: motif = (\S+) ::",
                             lines[chunks[chunk]]).groups()[0]
        except AttributeError:
            raise ValueError("parsing error in line '%s'" %
                             lines[chunks[chunk]])

        if chunks[chunk] + 1 == chunks[chunk + 1]:
            L.warn("no results for motif %s - ignored" % motif)
            continue

        tmpfile2 = tempfile.NamedTemporaryFile(mode="w", delete=False)
        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()
        glam = Glam2Scan.parse(IOTools.open_file(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        # collect control data
        full_matches = collections.defaultdict(list)
        controls = collections.defaultdict(list)
        for match in glam.matches:
            m = match.id.split("_")
            track, id = m[:2]
            if len(m) == 2:
                full_matches[id].append(match)
            else:
                controls[id].append(match.score)

        for id, matches in full_matches.items():

            nmatches = len(matches)
            scores = [x.score for x in matches]
            score = max(scores)
            # move to genomic coordinates
            # contig, start, end = re.match( "(\S+):(\d+)..(\d+)", match.id).groups()
            # start, end = int(start), int(end)
            # match.start += start
            # match.end += start
            contig = ""

            if id not in controls:
                L.warn("no controls for %s - increase evalue?" % id)

            c = controls[id]
            if len(c) == 0:
                mmax = ""
            else:
                mmax = max(c)

            tmpfile.write("\t".join(
                map(str, (motif, id, nmatches, score,
                          ",".join(map(str, scores)), len(c), mmax))) + "\n")

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           options="--add-index=id "
           "--add-index=motif "
           "--add-index=id,motif "
           "--allow-empty-file "
           "--map=base_qualities:text")

    os.unlink(tmpfile.name)
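
A minimal sketch of the chunking idiom used above: the indices of the ':: motif = ... ::' separator lines delimit the per-motif blocks. The lines below are toy data.

# toy stand-in for the concatenated per-motif output
lines = [":: motif = m1 ::", "hit a", "hit b", ":: motif = m2 ::", "hit c"]
breaks = [i for i, line in enumerate(lines) if line.startswith("::")]
breaks.append(len(lines))
for start, end in zip(breaks[:-1], breaks[1:]):
    print(lines[start], "->", lines[start + 1:end])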
Example no. 6
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--input-filter-tsv",
                      dest="input_filter_tsv",
                      type="string",
                      help="list with identifiers to remove. "
                      "[%default]")

    parser.add_option("--set-prefix",
                      dest="set_prefix",
                      type="string",
                      help="set sequence prefix [%default]")

    parser.add_option("--min-length",
                      dest="min_length",
                      type="int",
                      help="minimum alignment length [%default]")

    parser.add_option("--method",
                      dest="methods",
                      action="append",
                      choices=("shift-region", ),
                      help="methods to apply [%default]")

    parser.set_defaults(
        input_maf_file=None,
        input_filter_tsv=None,
        set_prefix=None,
        min_length=0,
        methods=[],
    )

    (options, args) = E.start(parser, argv)

    if options.input_filter_tsv:
        with IOTools.open_file(options.input_filter_tsv) as inf:
            skip_id = set([x[:-1] for x in inf])
    else:
        skip_id = False

    counter = E.Counter()

    if options.set_prefix:
        prefix = "s {}".format(options.set_prefix)
    else:
        prefix = None

    for block in iterate_maf_blocks(options.stdin):
        counter.blocks_input += 1
        if skip_id:
            if block[2].startswith("s "):
                id = re.match(r"s (\S+)", block[2]).groups()[0]
                if id in skip_id:
                    counter.blocks_skipped_id += 1
                    continue

        if options.min_length:
            if block[2].startswith("s "):
                id, pos, length = re.match(r"s (\S+)\s+(\d+)\s+(\d+)",
                                           block[2]).groups()
                if int(length) <= options.min_length:
                    counter.blocks_skipped_length += 1
                    continue

        if prefix:
            block[2] = prefix + block[2][4:]

        if block[2].startswith("s "):
            header, ali1, ali2, qual = parse_block(block)
            if "shift-region" in options.methods:
                rows = []
                contig, start, end = parse_region_string(ali1.src)
                ali1 = ali1._replace(src=contig, start=start + ali1.start)
                rows.append(list(map(str, ali1)))
                rows.append(list(map(str, ali2)))
                if qual:
                    rows.append(list(map(str, qual)))
                lines = [header]
                lines.append(format_tabular(rows, "llrrrrl"))
                lines.append("\n")
                block = lines
        counter.blocks_output += 1
        options.stdout.write("".join(block))

    E.info(counter)
    E.stop()
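
A minimal sketch of the MAF 's' line parsing above: the source name, start and length are pulled out with a regular expression. The line is a toy example.

import re

line = "s hg38.chr1 1000 36 + 248956422 ACGTACGT\n"  # toy MAF alignment line
src, start, length = re.match(r"s (\S+)\s+(\d+)\s+(\d+)", line).groups()
print(src, int(start), int(length))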
Example no. 7
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("--bin",
                      dest="bin",
                      action="store_true",
                      help="output average in bins across the interval")
    parser.add_option("-n",
                      "--num-bins",
                      dest="bin_number",
                      type=int,
                      help="number of bins for coverage profile")
    parser.add_option("-o",
                      "--output-filename-prefix",
                      dest="output_filename_prefix",
                      help="pattern to write coverage bins to")

    parser.set_defaults(bin=False, bin_number=10)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    inf = options.stdin

    coverage_result = collections.defaultdict(list)
    E.info("reading in coverage data")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        contig, coverage = data[0], data[2]
        coverage_result[contig].append(coverage)
    E.info("read %i contigs" % len(list(coverage_result.keys())))

    options.stdout.write("contig\tcov_mean\tcov_sd\n")
    if options.bin:
        outf = IOTools.open_file(options.output_filename_prefix + ".binned",
                                 "w")
        outf.write(
            "%s" %
            "\t".join([str(i)
                       for i in range(1, options.bin_number + 1, 1)]) + "\n")
    for contig, coverage in coverage_result.items():
        coverage = list(map(float, coverage))
        options.stdout.write(
            "%s\t%s\t%s\n" %
            (contig, str(np.mean(coverage)), str(np.std(coverage))))
        if options.bin:
            bin_means = []
            bins = np.linspace(0, len(coverage), options.bin_number + 1)
            if len(coverage) < len(bins) - 1:
                E.warn("will not calculate coverage means for %s: too short" %
                       contig)
                continue
            for i in range(len(bins)):
                try:
                    bin_mean = np.mean(coverage[int(bins[i]):int(bins[i + 1])])
                except IndexError:
                    continue
                bin_means.append(bin_mean)
            outf.write(contig + "\t" + "\t".join(map(str, bin_means)) + "\n")
    if options.bin:
        outf.close()

    # write footer and output benchmark information.
    E.stop()
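
A minimal sketch of the binning step above: numpy.linspace yields the bin edges and each slice of the per-base coverage vector is averaged. The coverage values are toy data.

import numpy as np

coverage = np.arange(100, dtype=float)  # toy per-base coverage values
bin_number = 10
edges = np.linspace(0, len(coverage), bin_number + 1)
bin_means = [coverage[int(edges[i]):int(edges[i + 1])].mean()
             for i in range(bin_number)]
print(bin_means)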
Example no. 8
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--input-filename-fasta",
        dest="input_filename_fasta",
        type="string",
        help="filename with reference sequence in fasta format [%default]")

    parser.add_option("--input-filename-bam",
                      dest="input_filename_bam",
                      type="string",
                      help="filename with aligned reads [%default]")

    parser.add_option("--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=["add-strelka-genotype", "lift-over"],
                      help="methods to apply [%default]")

    parser.add_option(
        "--input-filename-chain",
        dest="input_filename_chain",
        type="string",
        help="filename with alignment chain for lift-over [%default]")

    parser.add_option(
        "--normal-sample-regex",
        dest="normal_sample_regex",
        type="string",
        help="regular expression to apply to header to identify normal "
        "sample id [%default]")

    parser.add_option(
        "--output-filename-unmapped",
        dest="output_filename_unmapped",
        type="string",
        help="filename with variants that could not be lifted over [%default]")

    parser.set_defaults(
        input_filename_fasta=None,
        input_filename_bam=None,
        input_filename_vcf="-",
        sample_size=0.001,
        region_size=20,
        methods=[],
        normal_sample_regex=None,
        input_filename_chain=None,
        output_filename_unmapped=None,
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) > 0:
        options.input_filename_vcf = args[0]

    vcf_in = pysam.VariantFile(options.input_filename_vcf)

    if "lift-over" in options.methods:
        if options.input_filename_chain is None:
            raise ValueError(
                "--method=lift-over requires --input-filename-chain")
        if not os.path.exists(options.input_filename_chain):
            raise OSError("file {} with chain data does not exist".format(
                options.input_filename_chain))
        E.info("reading chain from {}".format(options.input_filename_chain))
        with IOTools.open_file(options.input_filename_chain) as inf:
            map_chain, map_contig2length = read_liftover_chain(inf)

    if options.input_filename_fasta:
        fasta = pysam.FastaFile(options.input_filename_fasta)
    else:
        fasta = None

    if options.input_filename_bam:
        bam = pysam.AlignmentFile(options.input_filename_bam)
    else:
        bam = None

    outf = options.stdout

    c = E.Counter()

    if "add-strelka-genotype" in options.methods:
        map_nt2gt = {"ref": "0/0", "het": "0/1", "hom": "1/1", "conflict": "."}

        map_tumour2gt = {"ref": "0/0", "het": "0/1", "hom": "1/1"}

        header = str(vcf_in.header).splitlines()

        header.insert(
            len(header) - 1,
            '##FORMAT=<ID=GT,Number=1,Type=String,Description='
            '"Genotypes of reference and alternative alleles, '
            'added by CGATCore vcf2vcf.">')

        header = "\n".join(header)
        if options.normal_sample_regex:
            normal_sample = re.search(r" -bam-file \S+/([^/]+)_S\d+.bam",
                                      header).groups()[0]
        else:
            normal_sample = "NORMAL"

        is_first = True

        for record in vcf_in:
            c.input += 1

            if "GT" in record.format:
                if is_first:
                    outf.write(header + "\n")
                    is_first = False
                outf.write(str(record))
                c.has_gt += 1
                continue

            gt_normal = map_nt2gt[record.info["NT"]]
            gt_tumour = record.info["SGT"]
            norm, tumour = gt_tumour.split("->")
            if gt_tumour[0] in "ACGT":
                alts = record.alts
                if alts is None:
                    c.no_alt += 1
                    continue

                if len(record.alts) > 1:
                    c.multi_allelic += 1
                    continue

                _map_tumour2gt = {record.alts[0]: "1", record.ref: "0"}
                try:
                    gt_tumour = "/".join(
                        sorted([_map_tumour2gt[x] for x in tumour]))
                except KeyError:
                    gt_tumour = "."
                    c.ambiguous_genotype += 1
            else:
                gt_tumour = map_tumour2gt[tumour]

            fields = str(record)[:-1].split("\t")
            # FORMAT
            fields[8] = ":".join(("GT", fields[8]))
            # SAMPLES
            # makes a few assumptions, fix!
            header_insert_normal = False
            if len(fields) == 11:
                fields[9] = ":".join((gt_normal, fields[9]))
                fields[10] = ":".join((gt_tumour, fields[10]))
            elif len(fields) == 10:
                header_insert_normal = True
                values = fields[9].split(":")
                fields.append(":".join((gt_tumour, fields[9])))
                fields[9] = ":".join([gt_normal] + ["."] * len(values))
            else:
                raise NotImplementedError()

            if is_first:
                if not header_insert_normal:
                    outf.write(header + "\n")
                else:
                    header = re.sub(r"\tFORMAT\t",
                                    "\tFORMAT\t%s\t" % normal_sample, header)
                    outf.write(header + "\n")
                is_first = False
            outf.write("\t".join(fields) + "\n")
            c.output += 1

    elif "lift-over" in options.methods:
        header = str(vcf_in.header).splitlines()

        if fasta:
            # validate contig size
            expected_lengths = dict(list(zip(fasta.references, fasta.lengths)))
        else:
            expected_lengths = map_contig2length

        # update contig names and sizes in VCF header
        header = [x for x in header if not x.startswith("##contig")]
        header[-1:-1] = [
            "##contig=<ID={},length={}>".format(contig, length)
            for contig, length in sorted(expected_lengths.items())
        ]

        header.insert(
            len(header) - 1, '##liftover=<CHAIN={},REFERENCE={}>'.format(
                options.input_filename_chain, options.input_filename_fasta))
        outf.write("\n".join(header) + "\n")

        unmapped_contigs = set()
        unknown_contigs = set()

        trans_genotypes = str.maketrans("01", "10")

        if fasta:
            # validate contig size
            expected_lengths = dict(list(zip(fasta.references, fasta.lengths)))
            for contig, length in list(map_contig2length.items()):
                if contig in expected_lengths:
                    if length != expected_lengths[contig]:
                        raise ValueError(
                            "contig lengths mismatch. For contig {} chain files "
                            "says {}, but fasta files says {}".format(
                                contig, length, expected_lengths[contig]))
            E.info("contig sizes in chain file and fasta files correspond.")

        if options.output_filename_unmapped:
            outfile_unmapped = IOTools.open_file(
                options.output_filename_unmapped, "w")
            outfile_unmapped.write("\n".join(header) + "\n")
        else:
            outfile_unmapped = None

        for record in vcf_in:
            c.input += 1

            try:
                mm = map_chain[record.contig]
            except KeyError:
                c.skipped_unmapped_contig += 1
                unmapped_contigs.add(record.contig)
                if outfile_unmapped:
                    outfile_unmapped.write(
                        "skipped_unmapped_contig\t{}".format(str(record)))
                continue

            try:
                m = mm.search(record.start, record.stop)
            except AttributeError:
                c.skipped_mapping_error += 1
                if outfile_unmapped:
                    outfile_unmapped.write("skipped_mapping_error\t{}".format(
                        str(record)))
                continue

            if len(m) == 0:
                c.skipped_unmapped_position += 1
                if outfile_unmapped:
                    outfile_unmapped.write(
                        "skipped_unmapped_position\t{}".format(str(record)))
                continue
            elif len(m) > 1:
                c.skipped_multimapping_position += 1
                if outfile_unmapped:
                    outfile_unmapped.write(
                        "skipped_multimapping_position\t{}".format(
                            str(record)))
                continue

            m = m[0]
            y_contig, y_start, y_end, y_invert = m.data

            if y_invert:
                y_pos = y_end - (record.start - m.start)
            else:
                y_pos = (record.start - m.start) + y_start

            ref_base = None
            if fasta:
                try:
                    ref_base = fasta.fetch(y_contig, y_pos,
                                           y_pos + len(record.ref)).upper()
                except KeyError:
                    c.skipped_unknown_contig += 1
                    unknown_contigs.add(y_contig)
                    ref_base = None
                    continue

            swap_alleles = False
            if ref_base:
                error = False
                if ref_base == record.ref:
                    c.matches += 1
                else:
                    if len(record.alts) == 1:
                        alt_base = record.alts[0]
                        if ref_base == alt_base:
                            swap_alleles = True
                            c.allele_swap_variant += 1
                        else:
                            c.error_mismatch_variant += 1
                            error = "mismatch"
                    else:
                        error = "multi-mismatch"
                        c.error_multi_mismatch_variant += 1

                if error:
                    if outfile_unmapped:
                        outfile_unmapped.write("{}\t{}".format(
                            error, str(record)))
                    c.skipped_error_variant += 1
                    continue

            fields = str(record)[:-1].split("\t")
            fields[0] = y_contig
            fields[1] = str(y_pos)

            if swap_alleles:
                # swap REF and ALT columns (0-based fields: 3 = REF, 4 = ALT)
                fields[3] = alt_base
                fields[4] = record.ref
                # update genotype fields
                keep = False
                for idx in range(9, len(fields)):
                    gt, rest = fields[idx].split(":", 1)
                    keep = keep or "0" in gt
                    fields[idx] = ":".join(
                        (gt.translate(trans_genotypes), rest))

                # remove reference-only calls
                if not keep:
                    if outfile_unmapped:
                        outfile_unmapped.write("reference_call\t{}".format(
                            str(record)))
                    c.skipped_allele_swap_reference += 1
                    continue

            c.output += 1
            outf.write("\t".join(fields) + "\n")

        c.unmapped_contigs = len(unmapped_contigs)
        c.unknown_contigs = len(unknown_contigs)

        E.info(c.asTable())
        if unknown_contigs:
            E.info("unknown contigs: {}".format(",".join(
                sorted(unknown_contigs))))
        if unmapped_contigs:
            E.info("unmapped contigs: {}".format(",".join(
                sorted(unmapped_contigs))))

    E.stop()
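
A minimal sketch of the genotype flip used in the lift-over branch when REF and ALT are swapped: only the GT part of each sample column is translated with str.maketrans. The sample column is a toy value.

trans_genotypes = str.maketrans("01", "10")
gt_field = "0/1:35:12,22"  # toy sample column with GT first
gt, rest = gt_field.split(":", 1)
print(":".join((gt.translate(trans_genotypes), rest)))  # -> 1/0:35:12,22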
Example no. 9
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version",
                            usage=globals()["__doc__"])

    parser.add_option("--output-quality-format",
                      dest="q_format",
                      type="int",
                      help="sequence quality format, e.g 33 = +33/Sanger"
                      "[default=%default].")

    parser.add_option("--output-paired-end",
                      dest="paired",
                      action="store_true",
                      help="generate paired end reads [default = %default].")

    parser.add_option("--insert-length-mean",
                      dest="insert_mean",
                      type="float",
                      help="mean insert length [default = %default].")

    parser.add_option(
        "--insert-length-sd",
        dest="insert_sd",
        type="float",
        help="insert length standard deviation [default = %default].")

    parser.add_option(
        "--counts-method",
        dest="counts_method",
        type="choice",
        choices=("reads", "copies"),
        help="simulate a ground truth number of reads per entry or"
        "copies per entry [default = %default].")

    parser.add_option("--counts-min",
                      dest="counts_min",
                      type="float",
                      help="minimum number of reads/read pairs per fasta entry"
                      "or copies per entry [default = %default].")

    parser.add_option(
        "--counts-max",
        dest="counts_max",
        type="float",
        help="maximum number of reads/read pairs per fasta entry "
        "or copies per entry [default = %default].")

    parser.add_option("--output-read-length",
                      dest="read_length",
                      type="int",
                      help="read length [default = %default].")

    parser.add_option("--sequence-error-phred",
                      dest="phred",
                      type="int",
                      help="phred quality score [default = %default].")

    parser.add_option("--output-counts",
                      dest="output_counts",
                      type="string",
                      help="name for counts outfile [default=%default].")

    parser.add_option(
        "--output-fastq2",
        dest="fastq2_out",
        type="string",
        help="filename for second fastq outfile [default=%default].")

    parser.add_option("--premrna-fraction",
                      dest="premrna_fraction",
                      type="float",
                      help="the fraction of reads to simulate from pre-mRNA"
                      "[default= % default].")

    parser.add_option("--infile-premrna-fasta",
                      dest="premrna_fasta",
                      type="string",
                      help="filename for pre-mRNA fasta[default=%default].")

    parser.set_defaults(q_format=33,
                        paired=False,
                        insert_mean=0,
                        insert_sd=1,
                        counts_method="reads",
                        counts_min=1,
                        counts_max=1,
                        read_length=50,
                        fastq2_out=None,
                        output_counts=None,
                        phred=30,
                        premrna_fraction=0,
                        premrna_fasta=None)

    (options, args) = E.start(parser)

    if options.paired:
        assert options.fastq2_out, ("must specify a second fastq outfile for "
                                    "paired end (--output-fastq2)")
        outf2 = IOTools.open_file(options.fastq2_out, "w")

    if options.premrna_fraction:
        assert options.premrna_fasta, ("must specify the location of the "
                                       "fasta file for the pre-mRNA")

    # the sequence quality string will always be the same so define here
    sequence_quality = chr(options.q_format + options.phred)
    qual = "".join([sequence_quality] * options.read_length)

    if options.premrna_fraction:
        iterator = FastaIterator.iterate_together(
            options.stdin, IOTools.open_file(options.premrna_fasta))
    else:
        iterator = FastaIterator.FastaIterator(options.stdin)

    # set a cut off of twice the read/pair length for short entries
    if options.paired:
        minimum_entry_length = (
            2 * ((options.read_length * 2) + options.insert_mean))
    else:
        minimum_entry_length = 2 * options.read_length

    c = collections.Counter()
    counts = collections.Counter()
    copies = collections.Counter()

    for f_entry in iterator:

        if options.premrna_fraction:

            assert getTitle(f_entry[0]) == getTitle(
                f_entry[1]), ("entry ids do not match: %s != %s" %
                              (f_entry[0].title, f_entry[1].title))
            entry = f_entry[0]
            pre_entry = f_entry[1]

        else:
            entry = f_entry

        # reject short fasta entries
        if len(entry.sequence) < minimum_entry_length:
            E.info("skipping short transcript: %s length=%i" %
                   (entry.title, len(entry.sequence)))
            c['skipped'] += 1
            continue

        else:
            c['not_skipped'] += 1

        if options.paired:
            fragment_length = ((2 * options.read_length) + options.insert_mean)
        else:
            fragment_length = options.read_length

        reads_per_entry = float(len(entry.sequence)) / fragment_length

        if options.counts_method == "reads":
            n_reads = random.randint(options.counts_min,
                                     options.counts_max + 1)

            n_copies = float(n_reads) / reads_per_entry

            if options.premrna_fraction:
                n_reads_pre = int(round(n_reads * options.premrna_fraction))

        elif options.counts_method == "copies":

            # random float [0-1]
            rand = np.random.random_sample()
            n_copies = (options.counts_min +
                        (rand * (options.counts_max - options.counts_min)))

            n_reads = int(round(n_copies * reads_per_entry, 0))

            # as n_reads must be rounded to int, need to redefine n_copies
            n_copies = float(n_reads) / reads_per_entry

            if options.premrna_fraction:
                reads_per_pre_entry = (float(len(pre_entry.sequence)) /
                                       fragment_length)
                n_copies_pre = n_copies * options.premrna_fraction
                n_reads_pre = int(round(n_copies_pre * reads_per_pre_entry, 0))
                # as n_reads_pre must be rounded to int, need to
                # redefine n_copies_pre
                n_copies_pre = float(n_reads_pre) / reads_per_pre_entry

        entry_id = getTitle(entry)

        counts[entry_id] = n_reads
        copies[entry_id] = n_copies

        if "N" in entry.sequence.upper():
            E.warn("fasta entry %s contains unknown bases ('N')" % entry_id)

        for i in range(0, n_reads):

            read = generateRead(entry=entry.sequence.upper(),
                                read_length=options.read_length,
                                error_rate=options.phred,
                                paired=options.paired,
                                insert_mean=options.insert_mean,
                                insert_sd=options.insert_sd)

            if options.paired:
                r1, r2 = read
                h1 = "@%s_%i/1" % (entry_id, i)
                h2 = "@%s_%i/2" % (entry_id, i)

                options.stdout.write("\n".join((h1, r1, "+", qual)) + "\n")
                outf2.write("\n".join((h2, r2, "+", qual)) + "\n")

            else:
                h = "@%s_%i/1" % (entry_id, i)

                options.stdout.write("\n".join((h, read, "+", qual)) + "\n")

        if options.premrna_fraction:
            c['pre_counts'] += n_reads_pre
            c['pre_copies'] += n_copies_pre

            for i in range(0, n_reads_pre):

                read = generateRead(entry=pre_entry.sequence.upper(),
                                    read_length=options.read_length,
                                    error_rate=options.phred,
                                    paired=options.paired,
                                    insert_mean=options.insert_mean,
                                    insert_sd=options.insert_sd)

                if options.paired:
                    r1, r2 = read
                    h1 = "@%s_pre-mRNA_%i/1" % (entry_id, i)
                    h2 = "@%s_pre-mRNA_%i/2" % (entry_id, i)

                    options.stdout.write("\n".join((h1, r1, "+", qual)) + "\n")
                    outf2.write("\n".join((h2, r2, "+", qual)) + "\n")

                else:
                    h = "@%s_pre-mRNA_%i/1" % (entry_id, i)

                    options.stdout.write("\n".join((h, read, "+", qual)) +
                                         "\n")

    if options.paired:
        outf2.close()

    with IOTools.open_file(options.output_counts, "w") as counts_out:

        counts_out.write("%s\n" % "\t".join(("id", "read_count", "tpm")))

        sum_copies = sum(copies.values())
        sum_counts = sum(counts.values())

        for entry_id, count in counts.items():
            tpm = 1000000 * (float(copies[entry_id]) / sum_copies)
            counts_out.write("%s\n" %
                             "\t".join(map(str, (entry_id, count, tpm))))

    E.info("Reads simulated for %i fasta entries, %i entries skipped" %
           (c['not_skipped'], c['skipped']))

    E.info("Simulated: %i reads (%i mRNA, %i pre-mRNA), "
           "%f transcripts (%f mRNA, %f pre-mRNA)" %
           (sum_counts + c['pre_counts'], sum_counts, c['pre_counts'],
            sum_copies + c['pre_copies'], sum_copies, c['pre_copies']))

    E.stop()
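
A minimal sketch of the TPM-style normalisation written to the counts file above: each entry's copy number is divided by the total and scaled to one million. The copy numbers are toy values.

copies = {"tx1": 2.0, "tx2": 6.0}  # toy per-transcript copy numbers
sum_copies = sum(copies.values())
tpm = {name: 1000000 * (value / sum_copies) for name, value in copies.items()}
print(tpm)  # {'tx1': 250000.0, 'tx2': 750000.0}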
Example no. 10
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--template-bam-file",
                      dest="filename_genome_bam",
                      type="string",
                      help="input bam file for header information [%default]")

    parser.add_option("-s",
                      "--contigs-tsv-file",
                      dest="filename_contigs",
                      type="string",
                      help="filename with contig sizes [%default]")

    parser.add_option(
        "-o",
        "--colour",
        dest="colour_mismatches",
        action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option("-i",
                      "--ignore-mismatches",
                      dest="ignore_mismatches",
                      action="store_true",
                      help="ignore mismatches [%default]")

    parser.add_option(
        "-c",
        "--remove-contigs",
        dest="remove_contigs",
        type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option("-f",
                      "--force-output",
                      dest="force",
                      action="store_true",
                      help="force overwriting of existing files [%default]")

    parser.add_option("-u",
                      "--unique",
                      dest="unique",
                      action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.set_defaults(
        filename_genome_bam=None,
        filename_gtf=None,
        filename_mismapped=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    genomefile, referencenames, referencelengths = None, None, None

    if options.filename_genome_bam:
        genomefile = pysam.AlignmentFile(options.filename_genome_bam, "rb")
    elif options.filename_contigs:
        contigs = IOTools.ReadMap(IOTools.open_file(options.filename_contigs))
        data = list(zip(*list(contigs.items())))
        referencenames, referencelengths = data[0], list(map(int, data[1]))
    else:
        raise ValueError(
            "please provide either --template-bam-file or --contigs-tsv-file")

    infile = pysam.AlignmentFile("-", "rb")
    outfile = pysam.AlignmentFile("-",
                                  "wb",
                                  template=genomefile,
                                  referencenames=referencenames,
                                  referencelengths=referencelengths)

    if options.colour_mismatches:
        tag = "CM"
    else:
        tag = "NM"

    nambiguous = 0
    ninput = 0
    nunmapped = 0
    ncigar = 0
    nfull = 0
    noutput = 0

    contig2tid = dict([(y, x) for x, y in enumerate(outfile.references)])

    for qname, readgroup in itertools.groupby(infile, lambda x: x.qname):
        ninput += 1
        reads = list(readgroup)
        if reads[0].is_unmapped:
            nunmapped += 1
            continue

        # filter for best match
        best = min([x.opt(tag) for x in reads])
        reads = [x for x in reads if x.opt(tag) == best]
        if len(reads) > 1:
            nambiguous += 1
            continue

        read = reads[0]

        # reject complicated matches (indels, etc)
        # to simplify calculations below.
        if len(read.cigar) > 1:
            ncigar += 1
            continue

        # set NH flag to latest count
        t = dict(read.tags)
        t['NH'] = 1
        read.tags = list(t.items())

        sname = infile.getrname(read.tid)

        contig, first_exon_start, middle, last_exon_end, splice, strand = sname.split(
            "|")
        first_exon_end, last_exon_start = middle.split("-")
        first_exon_start, first_exon_end, last_exon_start, last_exon_end = list(
            map(int, (first_exon_start, first_exon_end, last_exon_start,
                      last_exon_end)))
        first_exon_end += 1

        total = first_exon_end - first_exon_start + \
            last_exon_end - last_exon_start
        first_exon_length = first_exon_end - first_exon_start

        match1 = first_exon_length - read.pos
        intron_length = last_exon_start - first_exon_end
        match2 = read.qlen - match1

        # match lies fully in one exon - ignore
        if match1 <= 0 or match2 <= 0:
            nfull += 1
            continue

        # increment pos
        read.pos = first_exon_start + read.pos
        read.tid = contig2tid[contig]
        # 3 = BAM_CREF_SKIP
        read.cigar = [(0, match1), (3, intron_length), (0, match2)]

        outfile.write(read)

        noutput += 1

    outfile.close()
    if genomefile:
        genomefile.close()

    c = E.Counter()
    c.input = ninput
    c.output = noutput
    c.full = nfull
    c.cigar = ncigar
    c.ambiguous = nambiguous
    c.unmapped = nunmapped

    E.info("%s" % str(c))

    # write footer and output benchmark information.
    E.stop()
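
A minimal sketch of the CIGAR rewrite above: a read that crosses the exon-exon junction of the transcript model is split into match / reference-skip / match operations. The coordinates are toy values.

# toy values for a read overlapping the junction
read_pos, read_qlen = 45, 36
first_exon_start, first_exon_end = 1000, 1050
last_exon_start = 2000

match1 = (first_exon_end - first_exon_start) - read_pos
intron_length = last_exon_start - first_exon_end
match2 = read_qlen - match1
# operation codes: 0 = BAM_CMATCH, 3 = BAM_CREF_SKIP
print([(0, match1), (3, intron_length), (0, match2)])  # [(0, 5), (3, 950), (0, 31)]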
Example no. 11
def loadIntervals(infile, outfile):
    '''load intervals from :term:`bed` formatted files into
    the database.

    If a :term:`bam` file is associated with a :term:`bed`
    file, re-evaluate the intervals by counting reads within
    the interval. In contrast to the initial pipeline, the
    genome is not binned.

       nprobes: number of reads in interval
       peakcenter: position with maximum number of reads in interval
       avgval: average coverage within interval
    '''

    tmpfile = P.get_temp_file(".")

    headers = ("avgval", "disttostart",
               "genelist", "length",
               "peakcenter", "peakval",
               "position", "interval_id",
               "npeaks", "nprobes",
               "contig", "start", "end", "score", "strand")

    tmpfile.write("\t".join(headers) + "\n")

    (avgval, contig, disttostart, end, genelist,
     length, peakcenter, peakval, position,
     start, interval_id, npeaks, nprobes) = \
        0, "", 0, 0, "", 0, 0, 0, 0, 0, 0, 0, 0

    track = Sample(filename=P.snip(infile, ".bed.gz"))

    bamfiles, offsets = getAssociatedBAMFiles(track)

    if bamfiles:
        E.info("%s: associated bamfiles = %s" % (track, bamfiles))
    else:
        E.info("%s: no bamfiles associated" % (track))

    # open all bamfiles
    samfiles = [pysam.Samfile(fn, "rb") for fn in bamfiles]

    c = E.Counter()

    # count tags
    for bed in Bed.iterator(IOTools.open_file(infile, "r")):

        c.input += 1

        if "name" not in bed:
            bed.name = c.input

        try:
            strand = bed["strand"]
        except IndexError:
            strand = "."
            
        # The fifth field of a bed file can be used to supply a
        # score. Our iterator returns the optional fields as a "fields"
        # array: the first of these is the interval name and the
        # second the score. Whether higher or lower scores are better
        # depends on the upstream caller.
        if len(bed.fields) > 1:
            value = bed.fields[1]
            if value != "":
                score = value
            else:
                score = 1
        else:
            score = 1

        if samfiles:
            npeaks, peakcenter, length, avgval, peakval, nprobes = \
                PipelinePeakcalling.countPeaks(
                    bed.contig,
                    bed.start,
                    bed.end,
                    samfiles,
                    offsets)
            if nprobes == 0:
                c.skipped_reads += 1

        else:
            # deal with bed12
            bed_intervals = bed.toIntervals()
            length = sum([e - s for s, e in bed_intervals])
            mid_point = length / 2
            for s, e in bed_intervals:
                peakcenter = s + mid_point
                if peakcenter >= e:
                    mid_point = peakcenter - e
                else:
                    break

            npeaks, avgval, peakval, nprobes = 1, 1, 1, 1

        c.output += 1
        tmpfile.write("\t".join(map(
            str,
            (avgval, disttostart, genelist, length,
             peakcenter, peakval, position, bed.name,
             npeaks, nprobes,
             bed.contig, bed.start, bed.end, score, strand))) + "\n")

    if c.output == 0:
        E.warn("%s - no aggregate intervals")

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           tablename=os.path.basename("%s_intervals" % track.asTable()),
           options="--allow-empty-file "
           "--add-index=interval_id")

    os.unlink(tmpfile.name)

    E.info("%s\n" % str(c))
Example no. 12
def createGOFromGeneOntology(infile, outfile):
    """get GO assignments from Geneontology.org

    GO terms are mapped to ensembl gene names via uniprot identifiers.

    Configuration
    -------------
    geneontology_file
       Filename on geneontology database, e.g.,
       gene_association.goa_human.gz
    database_name
       Pipeline database name

    Arguments
    ---------
    infile : string
        Unused
    outfile : string
        Output filename
    """

    filename = os.path.join(os.path.dirname(outfile), "geneontology.goa.gz")
    if not os.path.exists(filename):
        statement = '''
        wget -O %(filename)s http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/%(go_geneontology_file)s?rev=HEAD
        '''

        P.run(statement)

    # see http://www.geneontology.org/gene-associations/readme/goa.README
    Data = collections.namedtuple(
        "Data",
        "db db_object_id db_object_symbol qualifier goid dbreference evidence "
        " with_id aspect "
        " db_object_name synonym db_object_type "
        " taxon_id date assigned_by "
        " annotation_extension"
        " gene_product_form_id")

    dbh = sqlite3.connect(PARAMS["database_name"])
    cc = dbh.cursor()
    map_uniprot2ensembl = dict(
        cc.execute("SELECT DISTINCT gene_name, gene_id FROM transcript_info").
        fetchall())
    map_goid2description = dict(
        cc.execute("SELECT DISTINCT go_id, description FROM go_assignments").
        fetchall())

    aspect2name = {
        "P": "biol_process",
        "F": "mol_function",
        "C": "cell_location"
    }

    c = E.Counter()
    found_uniprot, found_genes, notfound_uniprot = set(), set(), set()
    outf = IOTools.open_file(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")
    for line in IOTools.open_file(filename):
        if line.startswith("!"):
            continue
        c.input += 1
        data = Data._make(line[:-1].split("\t"))

        if data.db_object_symbol in map_uniprot2ensembl:
            gene_id = map_uniprot2ensembl[data.db_object_symbol]
            found_uniprot.add(data.db_object_symbol)
            found_genes.add(gene_id)
            outf.write(
                "%s\t%s\t%s\t%s\t%s\n" %
                (aspect2name[data.aspect], gene_id, data.goid,
                 map_goid2description.get(data.goid, ""), data.evidence))
            c.output += 1

        else:
            c.notfound += 1
            notfound_uniprot.add(data.db_object_symbol)

    c.found_genes = len(found_genes)
    c.found_uniprot = len(found_uniprot)
    c.notfound_uniprot = len(notfound_uniprot)

    E.info("%s" % str(c))
    E.info("not found=%s" % str(notfound_uniprot))
    outf.close()
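
A minimal sketch of the parsing idiom used above: a tab-separated GOA line is split into named fields via namedtuple._make. The record is a toy, truncated example (the real GOA format has more columns).

import collections

Data = collections.namedtuple("Data", "db db_object_id db_object_symbol goid")
line = "UniProtKB\tP12345\tBRCA2\tGO:0006281\n"  # toy, truncated GOA record
data = Data._make(line[:-1].split("\t"))
print(data.db_object_symbol, data.goid)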
Example no. 13
def imputeGO(infile_go, infile_paths, outfile):
    """impute GO accessions.

    Output a list of gene-to-GO associations for genes that includes
    ancestral terms.

    Arguments
    ---------
    infile_go : string
        Filename with gene-to-GO associations for genes
    infile_paths : string
        Filename with paths of term to ancestor (see go2fmt.pl).
    outfile : string
         Output filename

    """

    c = E.Counter()

    term2ancestors = collections.defaultdict(set)
    with IOTools.open_file(infile_paths) as inf:
        for line in inf:
            parts = line[:-1].split()
            term = parts[0]
            ancestors = [parts[x] for x in range(2, len(parts), 2)]
            # there can be multiple paths
            term2ancestors[term].update(ancestors)

    goid2description = {}
    gene2goids = collections.defaultdict(list)
    goid2type = {}
    with IOTools.open_file(infile_go) as inf:
        for line in inf:
            if line.startswith("go_type"):
                continue
            go_type, gene_id, goid, description, evidence = line[:-1].split(
                "\t")
            gene2goids[gene_id].append(goid)
            goid2description[goid] = description
            goid2type[goid] = go_type

    outf = IOTools.open_file(outfile, "w")
    for gene_id, in_goids in gene2goids.items():
        c.genes += 1
        out_goids = set(in_goids)
        for goid in in_goids:
            out_goids.update(term2ancestors[goid])
        if len(in_goids) != len(out_goids):
            c.increased += 1
        else:
            c.complete += 1

        for goid in out_goids:
            outf.write("\t".join((goid2type.get(goid, ""), gene_id, goid,
                                  goid2description.get(goid, ""), "NA")) +
                       "\n")
            c.associations += 1

    outf.close()

    E.info("%s" % str(c))
Example no. 14
    def __init__(self, infile, *args, **kwargs):
        self.gff = pysam.tabix_iterator(IOTools.open_file(infile),
                                        parser=pysam.asGFF3())
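The constructor above wraps a GFF3 file in pysam's tabix line iterator. A
minimal stand-alone sketch of the same idiom, assuming a plain-text file
called "annotations.gff3" (hypothetical name):

import pysam

# iterate the file line by line, each line parsed as a GFF3 record
with open("annotations.gff3") as inf:
    for gff in pysam.tabix_iterator(inf, parser=pysam.asGFF3()):
        print(gff.contig, gff.feature, gff.start, gff.end)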
Example n. 15
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--library-source",
                      dest="library_source",
                      type="string",
                      help="supply help")

    parser.add_option("--library-selection",
                      dest="library_selection",
                      type="string",
                      help="supply help")

    parser.add_option("--tax-identifier",
                      dest="tax_id",
                      type="int",
                      help="supply help")

    parser.set_defaults(library_source=None,
                        library_selection=None,
                        tax_id=9606)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # tree = ET.parse('/ifs/home/andreas/ena.xml')
    # root = tree.getroot()

    # for study in root.findall("STUDY"):
    #     alias = study.attrib["alias"]
    #     center_name = study.attrib["center_name"]
    #     accession   = study.attrib["accession"]
    #     try:
    #         description = study.find("*/STUDY_DESCRIPTION").text
    #         description = description.encode('ascii', 'ignore')
    #     except AttributeError:
    #         description = ""

    #     options.stdout.write( "\t".join( (alias,
    #                                       accession,
    #                                       center_name,
    #                                       description ) ) + "\n")

    # query_url = "http://www.ebi.ac.uk/ena/data/warehouse/search?query=%22tax_eq%289606%29%20AND%20library_source=%22TRANSCRIPTOMIC%22%20AND%20%28instrument_model=%22Illumina%20Genome%20Analyzer%20II%22%20OR%20instrument_model=%22Illumina%20Genome%20Analyzer%22%20OR%20instrument_model=%22Illumina%20Genome%20Analyzer%20IIx%22%20OR%20instrument_model=%22Illumina%20HiScanSQ%22%20OR%20instrument_model=%22Illumina%20HiSeq%201000%22%20OR%20instrument_model=%22Illumina%20HiSeq%202000%22%20OR%20instrument_model=%22Illumina%20HiSeq%202500%22%29%22&domain=read&download=txt"
    # query_url = "http://www.ebi.ac.uk/ena/data/view/search?query=%22tax_eq%289606%29%20AND%20library_source=%22TRANSCRIPTOMIC%22%20AND%20%28instrument_model=%22Illumina%20Genome%20Analyzer%20II%22%20OR%20instrument_model=%22Illumina%20Genome%20Analyzer%22%20OR%20instrument_model=%22Illumina%20Genome%20Analyzer%20IIx%22%20OR%20instrument_model=%22Illumina%20HiScanSQ%22%20OR%20instrument_model=%22Illumina%20HiSeq%201000%22%20OR%20instrument_model=%22Illumina%20HiSeq%202000%22%20OR%20instrument_model=%22Illumina%20HiSeq%202500%22%29%22&domain=read&download=txt"
    # query_url = "http://www.ebi.ac.uk/ena/data/warehouse/search?query=%22(instrument_model=%22Illumina%20HiSeq%202000%22%20OR%20instrument_model=%22Illumina%20HiSeq%201000%22%20OR%20instrument_model=%22Illumina%20HiSeq%202500%22)%20AND%20library_layout=%22PAIRED%22%20AND%20library_source=%22TRANSCRIPTOMIC%22%22&domain=read"
    # query_url = "http://www.ebi.ac.uk/ena/data/view/A00145&display=xml"

    query_url = "http://www.ebi.ac.uk/ena/data/warehouse/search"
    data_url = "http://www.ebi.ac.uk/ena/data/view"

    # params = None
    # query_url = "http://www.ebi.ac.uk/ena/data/view/DRP000011&display=xml"

    fields = [
        'base_count',
        'read_count',
        'instrument_model',
        'scientific_name',
        'library_layout',
        'library_source',
        'library_strategy',
        'library_selection',
        'experiment_accession',
        'experiment_title',
        'study_accession',
        'study_title',
        'first_public',
        'submission_accession',
        'center_name',
    ]

    query = 'tax_eq(%i) AND instrument_platform="ILLUMINA"' % (options.tax_id)

    if options.library_source:
        query += ' AND library_source="%s" ' % options.library_source
    if options.library_selection:
        query += ' AND library_selection="%s" ' % options.library_selection

    # collect pre-study results
    params = urlencode({
        'query': query,
        'display': 'report',
        'fields': ",".join(fields),
        'result': 'read_run'
    })

    E.debug("?".join((query_url, params)))

    # urlopen expects POST data as bytes in python 3
    lines = urlopen(query_url, params.encode("ascii"))

    header = lines.readline()

    fields.insert(0, 'run_accession')

    DATA = collections.namedtuple("DATA", fields)

    fields.append("read_length")
    fields.append("design")

    table_study = options.stdout  # IOTools.open_file( "study.tsv", "w" )
    table_study.write("\t".join(fields) + "\n")
    # collect a list of all studies
    studies = set()

    for line in lines:
        # the response is a byte stream; line endings are \r\n for data,
        # but only \n for the header
        line = line.decode("utf-8")[:-2]

        data = DATA(*line.split("\t"))
        try:
            read_length = float(data.base_count) / float(data.read_count)
        except (ValueError, ZeroDivisionError):
            read_length = 0

        if data.library_layout == "PAIRED":
            read_length /= 2.0

        design = MAP_CODE2DESIGN.get(
            (data.library_selection, data.library_source), "other")

        table_study.write(line + "\t" + str(read_length) + "\t" + design +
                          "\n")

        studies.add(data.study_accession)

    table_studies = IOTools.open_file("studies.tsv", "w")
    studies_fields = ["study_accession", "nreferences", "pubmed_ids",
                      "ngeos", "geo_ids"]

    table_studies.write("\t".join(studies_fields) + "\n")

    # note: this early return skips the per-study annotation loop below
    return

    # params = urllib.urlencode( { 'display' : 'xml' } )
    # url =  "/".join( ( data_url, 'SRP013999') ) + "&" + params
    # print urllib2.urlopen( url ).read()

    for study_accession in studies:
        # get additional info
        params = urlencode({'display': 'xml'})
        url = "/".join((data_url, study_accession)) + "&" + params

        info_lines = urlopen(url)
        tree = ET.parse(info_lines)
        root = tree.getroot()

        pmids = []
        for link in root.findall('*//XREF_LINK'):
            db = link.find('DB').text
            if db == "pubmed":
                pmids.append(link.find('ID').text)

        # get geo
        geos = []
        for attribute in root.findall('*//STUDY_ATTRIBUTE'):
            if attribute.find('TAG').text == "GEO Accession":
                geos.append(attribute.find('VALUE').text)

        params = {
            'dbfrom': 'gds',
            'db': 'pubmed',
        }

        geo_pmids = []
        for geo in geos:
            Entrez.email = "*****@*****.**"
            handle = Entrez.esearch(db="gds", retmax=1, term=geo)
            record = Entrez.read(handle)

            uids = record['IdList']
            handle.close()

            for uid in uids:
                record = Entrez.read(
                    Entrez.elink(dbfrom="gds", dbto="pubmed", id=uid))
                linksets = record[0]["LinkSetDb"]
                if not linksets:
                    continue

                assert len(linksets) == 1
                for linksetdb in linksets:
                    geo_pmids = [x['Id'] for x in linksetdb["Link"]]

        if not pmids:
            pmids = geo_pmids

        table_studies.write("\t".join(
            map(str, (study_accession, len(pmids), ",".join(pmids), len(geos),
                      ",".join(geos)))) + "\n")

    # write footer and output benchmark information.
    E.stop()
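The GEO-to-PubMed lookup above chains Entrez.esearch on the "gds" database
with Entrez.elink into "pubmed". A hedged, stand-alone sketch of that chain
using Biopython's Bio.Entrez; the accession and e-mail address are
placeholders:

from Bio import Entrez

Entrez.email = "your.name@example.org"  # placeholder address

# look up the GDS record for a GEO accession (hypothetical accession)
handle = Entrez.esearch(db="gds", retmax=1, term="GSE00000")
record = Entrez.read(handle)
handle.close()

# follow the links from each GDS uid to PubMed, as in the script above
for uid in record["IdList"]:
    links = Entrez.read(Entrez.elink(dbfrom="gds", dbto="pubmed", id=uid))
    for linksetdb in links[0]["LinkSetDb"]:
        print(uid, [x["Id"] for x in linksetdb["Link"]])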
Example n. 16
0
def main(argv=None):

    parser = E.OptionParser(
        version=
        "%prog version: $Id: plot_histogram.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-l",
                      "--plot-legend",
                      dest="legend",
                      type="string",
                      help="legend for plot [default=%default].")
    parser.add_option("-t",
                      "--title",
                      dest="title",
                      type="string",
                      help="title for plot [default=%default].")
    parser.add_option(
        "-p",
        "--hardcopy",
        dest="hardcopy",
        type="string",
        help=
        "filename for hardcopy of plot. The extension defines the format. Known extensions are: 'emf, eps, jpeg, jpg, pdf, png, ps, raw, rgba, svg, svgz' [default=%default].",
        metavar="FILE")
    parser.add_option("",
                      "--xrange",
                      dest="xrange",
                      type="string",
                      help="x viewing range of plot [default=%default].")
    parser.add_option("",
                      "--yrange",
                      dest="yrange",
                      type="string",
                      help="y viewing range of plot[default=%default].")
    parser.add_option("-o",
                      "--logscale",
                      dest="logscale",
                      type="string",
                      help="use logscale on x, y or xy [default=%default]")
    parser.add_option("-x",
                      "--xtitle",
                      dest="xtitle",
                      type="string",
                      help="title for x axis [default=%default]")
    parser.add_option("-y",
                      "--ytitle",
                      dest="ytitle",
                      type="string",
                      help="title for y axis [default=%default]")
    parser.add_option("-d",
                      "--dpi",
                      dest="dpi",
                      type="int",
                      help="dpi of images [default=%default]")
    parser.add_option("-n",
                      "--normalize",
                      dest="normalize",
                      action="store_true",
                      help="normalize histograms [default=%default]")
    parser.add_option(
        "--cumulate",
        dest="cumulate",
        action="store_true",
        help="calculate cumulative histogram [default=%default].")
    parser.add_option(
        "--reverse-cumulate",
        dest="reverse_cumulate",
        action="store_true",
        help=
        "calculate cumulative histogram in reverse order [default=%default].")
    parser.add_option("--legend-location",
                      dest="legend_location",
                      type="choice",
                      choices=("upper left", "upper right", "lower left",
                               "lower right", "center", "center right",
                               "center left", "none"),
                      help="location of legend [default=%default]")
    parser.add_option("--backend",
                      dest="backend",
                      type="string",
                      help="backend to use [Agg|SVG|PS] [default=%default]")
    parser.add_option(
        "--symbols",
        dest="symbols",
        type="string",
        help="symbols to use for each histogram [steps|...] [default=%default]."
    )
    parser.add_option("--dump",
                      dest="dump",
                      action="store_true",
                      help="dump data for debug purposes [default=%default].")
    parser.add_option("-c",
                      "--columns",
                      dest="columns",
                      type="string",
                      help="columns to use for plotting [default=%default].")
    parser.add_option(
        "--truncate",
        dest="truncate",
        action="store_true",
        help=
        "truncate date within x range. If not set, xrange is simply a viewing range [default=%default]."
    )
    parser.add_option("--as-lines",
                      dest="as_lines",
                      action="store_true",
                      help="plot only lines, no symbols [default=%default].")
    parser.add_option(
        "--noheaders",
        dest="headers",
        action="store_false",
        help="do not take first input line as header [default=%default].")
    parser.add_option("--stacked",
                      dest="stacked",
                      action="store_true",
                      help="do a stacked plot [default=%default].")
    parser.add_option("--add-function",
                      dest="function",
                      type="string",
                      help="add a function to the plot [default=%default].")
    parser.add_option(
        "--add-error-bars",
        dest="error_bars",
        type="choice",
        choices=("interleaved", "blocked"),
        help=
        "add error bars. The input format is 'interleaved' or 'blocked'. In the interleaved format the error follows each column. I the blocked format first the data, then the errors in the same order [default=%default]."
    )

    parser.set_defaults(
        legend=None,
        title=None,
        hardcopy=None,
        logscale=None,
        xtitle=None,
        ytitle=None,
        xrange=None,
        yrange=None,
        normalize=None,
        columns="all",
        headers=True,
        legend_location="upper right",
        backend="cairo",
        symbols="g-D,b-h,r-+,c-+,m-+,y-+,k-o,g-^,b-<,r->,c-D,m-h",
        dump=False,
        truncate=False,
        cumulate=False,
        reverse_cumulate=False,
        function=None,
        add_error_bars=None,
        as_lines=False,
        stacked=False,
        dpi=80,
    )

    (options, args) = E.start(parser)

    # import matplotlib/pylab. Has to be done here
    # for batch scripts without GUI.
    import matplotlib
    if options.hardcopy:
        matplotlib.use("cairo")
    import pylab

    # put this method here (because it requires pylab)
    def doStackedPlot(data, legend):

        colors = [
            "red", "blue", "green", "cyan", "magenta", "yellow", "brown",
            "silver", "purple", "lightyellow", "black", "ivory", "pink",
            "orange", "gray", "teal"
        ]

        ax = data[:, 0]
        xvals = numpy.concatenate((ax, ax[::-1]))
        y_top = numpy.zeros(len(ax))

        min_y = min(data[:, 1:].flat)
        max_y = min_y
        new_legend, dummy_lines = [], []

        for i in range(1, len(legend)):
            new_y_top = y_top + data[:, i]
            yvals = numpy.concatenate((new_y_top, y_top[::-1]))
            p = pylab.fill(xvals, yvals, colors[i % len(colors)])

            y_top = new_y_top
            max_y = max(y_top)

            dummy_lines.append(
                pylab.plot(xvals, yvals, colors[i % len(colors)]))

            new_legend.append(legend[i])

        if not options.xrange:
            options.xrange = min(data[:, 0]), max(data[:, 0])

        if not options.yrange:
            options.yrange = 0, max_y

        return dummy_lines, new_legend

    if options.as_lines:
        options.symbols = []
        for y in ("-", ":", "--"):
            for x in "gbrcmyk":
                options.symbols.append(y + x)
    else:
        options.symbols = options.symbols.split(",")

    if options.xrange:
        options.xrange = list(map(float, options.xrange.split(",")))
    if options.yrange:
        options.yrange = list(map(float, options.yrange.split(",")))

    # Added support for (inclusive) range format: "1,3,5,7-100"  (Gerton
    # 13/12/06)
    if options.columns != "all":
        cols = []
        for d in options.columns.split(','):
            colopts = d.split('-')
            if len(colopts) == 2:
                cols += list(range(int(colopts[0]), int(colopts[1]) + 1))
            else:
                cols += [int(d) - 1]
        options.columns = cols

    if args:
        if args[0] == "-":
            infile = sys.stdin
        else:
            infile = IOTools.open_file(args[0], "r")
    else:
        infile = sys.stdin

    if options.truncate:
        xr = options.xrange
    else:
        xr = None

    data, legend = IOTools.readTable(infile,
                                     numeric_type=float,  # numpy.float is a deprecated alias of float
                                     take=options.columns,
                                     headers=options.headers,
                                     truncate=xr)

    if infile != sys.stdin:
        infile.close()
    if len(data) == 0:  # or data is None:
        E.info("empty table: no plot")
        E.stop()
        return

    nrows, ncols = data.shape

    # note: because of MA, iteration makes copy of slices
    # Solution: inplace edits.
    if options.cumulate:
        if options.add_error_bars:
            raise ValueError("can not add error bars to cumulative histogram")
        if data.mask.any():
            # cumsum does not work with masked arrays, so do it manually
            for y in range(1, ncols):
                c = 0
                for x in range(0, nrows):
                    if not data.mask[x, y]:
                        data[x, y] += c
                        c = data[x, y]
        else:
            for x in range(1, ncols):
                data[:, x] = data[:, x].cumsum()

    elif options.reverse_cumulate:
        if options.add_error_bars:
            raise ValueError("can not add error bars to cumulative histogram")
        if data.mask.any():
            l = [0] * ncols
            for x in range(nrows - 1, -1, -1):
                for y in range(1, ncols):
                    if not data.mask[x, y]:
                        data[x, y] += l[y]
                        l[y] = data[x, y]
        else:
            l = [0] * ncols
            for x in range(nrows - 1, -1, -1):
                for y in range(1, ncols):
                    data[x, y] += l[y]
                    l[y] = data[x, y]

    if options.normalize:
        if options.add_error_bars:
            raise ValueError("can not add error bars to normalized histogram")
        if data.mask.any():
            m = [0] * ncols
            for x in range(nrows):
                for y in range(1, ncols):
                    if not data.mask[x, y]:
                        m[y] = max(m[y], float(data[x, y]))

            for y in range(1, ncols):
                if m[y] == 0:
                    m[y] = 1.0

            for x in range(nrows):
                for y in range(1, ncols):
                    data[x, y] = data[x, y] / m[y]
        else:
            for x in range(1, ncols):
                m = float(data[:, x].max())
                data[:, x] /= m

    if options.legend:
        legend = options.legend.split(",")

    if options.dump:
        for d in data:
            print(d)

    if options.title:
        pylab.title(options.title)

    if options.xtitle:
        pylab.xlabel(options.xtitle)
    else:
        pylab.xlabel(legend[0])

    if options.ytitle:
        pylab.ylabel(options.ytitle)

    lines = []
    # use dummy_lines to workaround a bug in errorbars that
    # causes the line styles to be set incorrectly.
    dummy_lines = []
    new_legend = []

    if options.error_bars:
        if options.error_bars == "interleaved":
            step_size = 2
            max_size = len(legend)
        elif options.error_bars == "blocked":
            step_size = 1
            max_size = (len(legend) - 1) // 2  # integer division: used as a range() bound
    else:
        step_size = 1
        max_size = len(legend)

    if options.stacked:
        dummy_lines, new_legend = doStackedPlot(data, legend)
    else:
        nplotted = 0
        nskipped = 0
        for x in range(1, max_size, step_size):

            s = options.symbols[nplotted % len(options.symbols)]

            yvals = data[:, x]

            xvals = numpy.ma.masked_array(data[:, 0], numpy.ma.getmask(yvals))

            xvals = xvals.compressed()
            yvals = yvals.compressed()

            if len(xvals) == 0:
                E.warn("skipped empty column %i: %s" % (x, legend[x]))
                nskipped += 1
                continue

            if options.error_bars == "interleaved":
                yerr = data[:, x + 1]
                yerr = yerr.compressed()
            else:
                yerr = None

            lines.append(pylab.errorbar(xvals, yvals, yerr=yerr, fmt=s))

            dummy_lines.append(pylab.plot(xvals, yvals, s))

            new_legend.append(legend[x])

            nplotted += 1

        E.info("nplotted=%i, nskipped=%i" % (nplotted, nskipped))

    if len(lines) == 0:
        E.stop()
        return

    if options.legend_location != "none":
        pylab.figlegend(dummy_lines, new_legend, options.legend_location)

    if options.logscale:
        if "x" in options.logscale:
            pylab.gca().set_xscale('log')
        if "y" in options.logscale:
            pylab.gca().set_yscale('log')

    if options.xrange:
        pylab.xlim(options.xrange)

    if options.yrange:
        pylab.ylim(options.yrange)

    if options.function:
        xstart, xend = pylab.gca().get_xlim()
        increment = (xend - xstart) / 100.0
        # build the function in a private namespace; assigning into locals()
        # inside a function has no effect in python 3
        namespace = {}
        exec("f = lambda x: %s" % options.function, namespace)
        f = namespace["f"]
        xvals, yvals = [], []
        for x in range(0, 100):
            xvals.append(xstart)
            yvals.append(f(xstart))
            xstart += increment
        xvals.append(xstart)
        yvals.append(f(xstart))

        pylab.plot(xvals, yvals)

    if options.hardcopy:
        pylab.savefig(os.path.expanduser(options.hardcopy), dpi=options.dpi)
    else:
        pylab.show()

    E.stop()
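The --columns option above accepts an inclusive range syntax such as
"1,3,5,7-100". A small stand-alone sketch of that parsing step (the
function name is made up; the logic mirrors the block in the script,
including the fact that single columns are converted to 0-based indices
while range entries are kept as written):

def parse_columns(spec):
    """Parse a column specification such as "1,3,5,7-10"."""
    cols = []
    for d in spec.split(","):
        parts = d.split("-")
        if len(parts) == 2:
            # inclusive range
            cols += list(range(int(parts[0]), int(parts[1]) + 1))
        else:
            # single column, converted to a 0-based index
            cols.append(int(d) - 1)
    return cols

print(parse_columns("1,3,5,7-10"))  # [0, 2, 4, 7, 8, 9, 10]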
Example n. 17
0
def read_fastq_screen(infiles, track_regex, sep="-"):
    """merge fastqscreen output into dataframes.

    Arguments
    ---------
    infiles : list
        Input filenames with fastqscreen output.
    track_regex : string
        Regular expression to extract the track name from a filename.
    sep : char
        Separator for merging multiple capture groups in the regex.

    Returns
    -------
    multiple dataframes
    """

    dfs, tracks, summaries = [], [], []
    for infile in infiles:

        try:
            track = sep.join(re.search(track_regex, infile).groups())
        except AttributeError:
            raise ValueError("regex {} did not match file {}".format(
                track_regex, infile))

        with IOTools.open_file(infile) as inf:
            lines = inf.readlines()
        version, aligner, reads = re.search(
            r"#Fastq_screen version: (\S+)\t#Aligner: (\S+)\t#Reads in subset: (\d+)\n",
            lines.pop(0)).groups()
        percent_no_hit = re.search(r"%Hit_no_genomes: (\S+)\n",
                                   lines.pop(-1)).groups()[0]

        summaries.append((version, aligner, reads, percent_no_hit))

        records = [x[:-1].split("\t") for x in lines if x.strip()]
        df = pd.DataFrame.from_records(records[1:], columns=records[0])
        df = df.rename(
            columns={
                'Genome': "genome",
                '#Reads_processed': "reads_processed",
                '#Unmapped': "reads_unmapped",
                '%Unmapped': "reads_unmapped_percent",
                '#One_hit_one_genome': "one_hit_one_genome",
                '%One_hit_one_genome': "one_hit_one_genome_percent",
                '#Multiple_hits_one_genome': "multiple_hits_one_genome",
                '%Multiple_hits_one_genome':
                "multiple_hits_one_genome_percent",
                '#One_hit_multiple_genomes': "one_hit_multiple_genomes",
                '%One_hit_multiple_genomes':
                "one_hit_multiple_genomes_percent",
                'Multiple_hits_multiple_genomes':
                "multiple_hits_multiple_genomes",
                '%Multiple_hits_multiple_genomes':
                "multiple_hits_multiple_genomes_percent"
            })
        dfs.append(df)
        tracks.append(track)
    df_details = pd.concat(dfs, keys=tracks, names=["track"])
    df_details.index = df_details.index.droplevel(1)
    df_summary = pd.DataFrame.from_records(
        summaries,
        columns=["version", "aligner", "nreads", "nohit_percent"],
        index=tracks)
    df_summary.index.name = "track"
    return df_summary, df_details
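A short usage sketch for read_fastq_screen; the file names and the regular
expression are hypothetical, chosen so that the capture group becomes the
track name:

summary, details = read_fastq_screen(
    infiles=["sample1-fastq_screen.txt", "sample2-fastq_screen.txt"],
    track_regex=r"(\w+)-fastq_screen.txt")

print(summary)          # one row per input file: version, aligner, nreads, nohit_percent
print(details.head())   # per-genome mapping statistics, indexed by track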
Example n. 18
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m",
                      "--merge-adjacent",
                      dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e",
                      "--feature",
                      dest="feature",
                      type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f",
                      "--maskregions-bed-file",
                      dest="filename_masks",
                      type="string",
                      metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions",
                      dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length",
                      dest="min_length",
                      type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length",
                      dest="max_length",
                      type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at",
                      dest="extend_at",
                      type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--header-attributes",
                      dest="header_attr",
                      action="store_true",
                      help="add GFF entry attributes to the FASTA record"
                      " header section")

    parser.add_option("--extend-by",
                      dest="extend_by",
                      type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with",
                      dest="extend_with",
                      type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker",
                      dest="masker",
                      type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at",
                      dest="fold_at",
                      type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute",
        dest="naming_attribute",
        type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        extend_with=None,
        masker=None,
        fold_at=None,
        naming_attribute=False,
        header_attr=False,
    )

    (options, args) = E.start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with IOTools.open_file(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig, ichunk[0].start,
                                       ichunk[0].end, str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {
                    x.split("=")[0]: x.split("=")[1]
                    for x in chunk[0].attributes.split(";")
                }
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name, str([(x.start, x.end)
                                        for x in chunk]), masked_regions))
                    continue

        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [
            fasta.getSequence(contig, strand, start, end)
            for start, end in intervals
        ]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length
                or (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with, ) * options.extend_by)

            if options.extend_at in ("5", "both"):
                s[1] = extension + s[1]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        if options.header_attr:
            attributes = " ".join(
                [":".join([ax, ay]) for ax, ay in chunk[0].asDict().items()])
            options.stdout.write(
                ">%s %s:%s:%s feature:%s %s\n%s\n" %
                (name, contig, strand, ";".join(
                    ["%i-%i" % x
                     for x in out]), chunk[0].feature, attributes, seq))
        else:
            options.stdout.write(
                ">%s %s:%s:%s\n%s\n" %
                (name, contig, strand, ";".join(["%i-%i" % x
                                                 for x in out]), seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked,
            nskipped_length))

    E.stop()
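The --extend-at/--extend-by handling above clips any extension to the
contig boundaries. A stand-alone sketch of that step with illustrative
values (the function name is made up for the example):

def extend_intervals(intervals, lcontig, extend_at, extend_by):
    """Extend sorted (start, end) intervals at the 5' and/or 3' end,
    clamping to [0, lcontig], following the logic in the script above."""
    intervals = list(intervals)
    if extend_at == "5only":
        return [(max(0, intervals[0][0] - extend_by), intervals[0][0])]
    if extend_at == "3only":
        return [(intervals[-1][1],
                 min(lcontig, intervals[-1][1] + extend_by))]
    if extend_at in ("5", "both"):
        intervals[0] = (max(0, intervals[0][0] - extend_by), intervals[0][1])
    if extend_at in ("3", "both"):
        intervals[-1] = (intervals[-1][0],
                         min(lcontig, intervals[-1][1] + extend_by))
    return intervals

print(extend_intervals([(100, 200), (300, 400)], 450, "both", 100))
# [(0, 200), (300, 450)]
Example n. 19
0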
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--inplace",
                      dest="inplace",
                      action="store_true",
                      help="update option list in place. New options will"
                      "be added to the list given by --options-tsv-file. "
                      "Options will only be added, not removed [%default]")

    parser.add_option("--options-tsv-file",
                      dest="tsv_file",
                      type="string",
                      help="existing table with options. Will be updated if "
                      "--in-place is set [default]")

    parser.set_defaults(inplace=False, tsv_file=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    old_options = None
    if options.tsv_file:
        if not os.path.exists(options.tsv_file):
            raise OSError("filename %s not found, see --options-tsv-file" %
                          options.tsv_file)
        old_options = pandas.read_csv(
            IOTools.open_file(options.tsv_file),
            sep="\t",
            index_col=0,
        )
        old_options = old_options.fillna("")

    global ORIGINAL_START
    ORIGINAL_START = E.start

    all_options = collections.defaultdict(list)

    for label, expression in EXPRESSIONS:

        files = glob.glob(expression)
        files.sort()

        for f in files:

            E.debug("processing %s" % f)
            if os.path.isdir(f):
                continue
            if os.path.basename(f) in EXCLUDE:
                continue
            collected_options = collectOptionsFromScript(os.path.abspath(f))
            for o in collected_options:
                all_options[o].append(f)

    # add old options that were not re-discovered in any script
    if old_options is not None:
        for x in old_options.index:
            if x not in all_options:
                all_options[x].append("--")

    if options.inplace:
        outfile = IOTools.open_file(options.tsv_file, "w")
        E.info("updating file '%s'" % options.tsv_file)
    else:
        outfile = options.stdout

    outfile.write("option\taction\tcomment\talternative\tfiles\n")
    for o, v in sorted(all_options.items()):
        try:
            action, comment, alternative, ff = old_options.xs(o)
        except (KeyError, AttributeError):
            # option not present in the previous table, or no table given
            action, comment, alternative, ff = "", "", "", ""

        if comment == "nan":
            comment = ""
        if alternative == "nan":
            alternative = ""

        outfile.write("\t".join(
            (list(map(str, (o, action, comment, alternative, ",".join(v)))))) +
                      "\n")

    if outfile != options.stdout:
        outfile.close()

    # write footer and output benchmark information.
    E.stop()
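The script above merges a previously recorded option table with the freshly
collected options so that manual annotations survive an update. A small,
hedged sketch of that merge step in isolation (the file and script names
are hypothetical):

import collections

import pandas as pd

# previously recorded options, indexed by option name
old_options = pd.read_csv("options.tsv", sep="\t", index_col=0).fillna("")

# options collected from the scripts in this run
all_options = collections.defaultdict(list)
all_options["--force-output"].append("scripts/example_script.py")

# keep options that are no longer found in any script, marked with "--"
for option in old_options.index:
    if option not in all_options:
        all_options[option].append("--")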
Example n. 20
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--tag-tsv-file",
                      dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option("-d",
                      "--design-tsv-file",
                      dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("ttest", "sleuth", "edger", "deseq2", "mock",
                               "dexseq"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq2-dispersion-method",
                      dest="deseq2_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq2 [default=%default].")

    parser.add_option("--deseq2-fit-type",
                      dest="deseq2_fit_type",
                      type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq2 [default=%default].")

    parser.add_option("--edger-dispersion",
                      dest="edger_dispersion",
                      type="float",
                      help="dispersion value for edgeR if there are no "
                      "replicates [default=%default].")

    parser.add_option("-f",
                      "--fdr",
                      dest="fdr",
                      type="float",
                      help="fdr to apply [default=%default].")

    # currently not implemented
    # parser.add_option("-R", "--output-R-code", dest="save_r_environment",
    #                  type="string",
    #                  help="save R environment to loc [default=%default]")

    parser.add_option("-r",
                      "--reference-group",
                      dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=$default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--model",
                      dest="model",
                      type="string",
                      help=("model for GLM"))

    parser.add_option("--reduced-model",
                      dest="reduced_model",
                      type="string",
                      help=("reduced model for LRT"))

    parser.add_option("--contrast",
                      dest="contrast",
                      type="string",
                      help=("contrast for differential expression testing"))

    parser.add_option("--sleuth-counts-dir",
                      dest="sleuth_counts_dir",
                      type="string",
                      help=("directory containing expression estimates"
                            "from sleuth. Sleuth expects counts"
                            "files to be called abundance.h5"))

    parser.add_option("--dexseq-counts-dir",
                      dest="dexseq_counts_dir",
                      type="string",
                      help=("directory containing counts for dexseq. DEXSeq "
                            "expects counts files to be called .txt and"
                            "to be generated by the DEXSeq_counts.py script"))

    parser.add_option("--dexseq-flattened-file",
                      dest="dexseq_flattened_file",
                      type="string",
                      help=("directory containing flat gtf for dexseq. DEXSeq "
                            "expects this to be generated by the"
                            "DEXSeq_prepare_annotations.py script"))

    parser.add_option(
        "--outfile-sleuth-count",
        dest="outfile_sleuth_count",
        type="string",
        help=("outfile for full count table generated by sleuth"))

    parser.add_option("--outfile-sleuth-tpm",
                      dest="outfile_sleuth_tpm",
                      type="string",
                      help=("outfile for full tpm table generated by sleuth"))

    parser.add_option("--use-ihw",
                      dest="use_ihw",
                      action="store_true",
                      help=("use the independent hypothesis weighting method "
                            "to obtain weighted FDR"))

    parser.add_option(
        "--sleuth-genewise",
        dest="sleuth_genewise",
        action="store_true",
        help=("run genewise, rather than transcript level testing"))

    parser.add_option("--gene-biomart",
                      dest="gene_biomart",
                      type="string",
                      help=("name of ensemble gene biomart"))

    parser.add_option("--de-test",
                      dest="DEtest",
                      type="choice",
                      choices=("wald", "lrt"),
                      help=("Differential expression test"))

    parser.add_option("--Rhistory",
                      dest="Rhistory",
                      type="string",
                      help=("Outfile for R history"))

    parser.add_option("--Rimage",
                      dest="Rimage",
                      type="string",
                      help=("Outfile for R image"))

    parser.set_defaults(input_filename_tags="-",
                        input_filename_design=None,
                        output_filename=sys.stdout,
                        method="deseq2",
                        fdr=0.1,
                        deseq2_dispersion_method="pooled",
                        deseq2_fit_type="parametric",
                        edger_dispersion=0.4,
                        ref_group=False,
                        filter_min_counts_per_row=None,
                        filter_min_counts_per_sample=None,
                        filter_percentile_rowsums=None,
                        spike_foldchange_max=4.0,
                        spike_expression_max=5.0,
                        spike_expression_bin_width=0.5,
                        spike_foldchange_bin_width=0.5,
                        spike_max_counts_per_bin=50,
                        model=None,
                        contrast=None,
                        output_filename_pattern=None,
                        sleuth_counts_dir=None,
                        dexseq_counts_dir=None,
                        dexseq_flattened_file=None,
                        outfile_sleuth_count=None,
                        outfile_sleuth_tpm=None,
                        use_ihw=False,
                        sleuth_genewise=False,
                        gene_biomart=None,
                        DEtest="wald",
                        reduced_model=None,
                        Rhistory=None,
                        Rimage=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    RH = None
    if options.Rhistory or options.Rimage:
        RH = R.R_with_History()

    outfile_prefix = options.output_filename_pattern

    # Expression.py currently expects a reference group for edgeR and
    # sleuth, regardless of which test is used
    if not options.ref_group and options.method in ("edger", "sleuth"):
        raise ValueError(
            "Must provide a reference group ('--reference-group')")

    # create Design object
    design = Expression.ExperimentalDesign(
        pd.read_csv(IOTools.open_file(options.input_filename_design, "r"),
                    sep="\t",
                    index_col=0,
                    comment="#"))

    if len(set(design.table[options.contrast])) > 2:

        if options.method == "deseq2" or options.method == "sleuth":
            if options.DEtest == "wald":
                raise ValueError(
                    "Factor must have exactly two levels for Wald Test. "
                    "If you have more than two levels in your factor, "
                    "consider LRT")
        else:
            E.info("There are more than 2 levels for the contrast "
                   "specified (%s:%s). The log2 fold changes in the results "
                   "table and MA plots will be for the first two levels in "
                   "the contrast. The p-value will be the p-value for the "
                   "overall significance of the contrast. Hence, some genes "
                   "will have a significant p-value but a 0-fold change "
                   "between the first two levels" %
                   (options.contrast, set(design.table[options.contrast])))

    # Sleuth reads in data itself so we don't need to create a counts object
    if options.method == "sleuth":
        assert options.sleuth_counts_dir, (
            "need to specify the location of the abundance.h5 counts files "
            " (--sleuth-counts-dir)")

        # validate design against counts and model
        design.validate(model=options.model)

        experiment = Expression.DEExperiment_Sleuth()
        results = experiment.run(design,
                                 base_dir=options.sleuth_counts_dir,
                                 model=options.model,
                                 contrast=options.contrast,
                                 outfile_prefix=outfile_prefix,
                                 counts=options.outfile_sleuth_count,
                                 tpm=options.outfile_sleuth_tpm,
                                 fdr=options.fdr,
                                 genewise=options.sleuth_genewise,
                                 gene_biomart=options.gene_biomart,
                                 DE_test=options.DEtest,
                                 ref_group=options.ref_group,
                                 reduced_model=options.reduced_model)

    # DEXSeq reads in data itself
    elif options.method == "dexseq":
        assert options.dexseq_counts_dir, (
            "need to specify the location of the .txt counts files")

        # create Design object
        design = Expression.ExperimentalDesign(
            pd.read_csv(IOTools.open_file(options.input_filename_design, "r"),
                        sep="\t",
                        index_col=0,
                        comment="#"))

        # validate design against counts and model
        # design.validate(model=options.model)

        experiment = Expression.DEExperiment_DEXSeq()
        results = experiment.run(design,
                                 base_dir=options.dexseq_counts_dir,
                                 model=options.model,
                                 contrast=options.contrast,
                                 ref_group=options.ref_group,
                                 outfile_prefix=outfile_prefix,
                                 flattenedfile=options.dexseq_flattened_file,
                                 fdr=options.fdr)

    else:
        # create Counts object
        if options.input_filename_tags == "-":
            counts = Counts.Counts(
                pd.io.parsers.read_csv(sys.stdin,
                                       sep="\t",
                                       index_col=0,
                                       comment="#"))
        else:
            counts = Counts.Counts(
                pd.io.parsers.read_csv(IOTools.open_file(
                    options.input_filename_tags, "r"),
                                       sep="\t",
                                       index_col=0,
                                       comment="#"))

        # validate design against counts and model
        design.validate(counts, options.model)

        # restrict counts to samples in design table
        counts.restrict(design)

        # remove sample with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        # check samples are the same in counts and design following counts
        # filtering and, if not, restrict design table and re-validate
        design.revalidate(counts, options.model)

        # set up experiment and run tests
        if options.method == "ttest":
            experiment = Expression.DEExperiment_TTest()
            results = experiment.run(counts, design)

        elif options.method == "edger":
            experiment = Expression.DEExperiment_edgeR()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrast=options.contrast,
                                     outfile_prefix=outfile_prefix,
                                     ref_group=options.ref_group,
                                     fdr=options.fdr,
                                     dispersion=options.edger_dispersion)

        elif options.method == "deseq2":

            experiment = Expression.DEExperiment_DESeq2()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrast=options.contrast,
                                     outfile_prefix=outfile_prefix,
                                     fdr=options.fdr,
                                     fit_type=options.deseq2_fit_type,
                                     ref_group=options.ref_group,
                                     DEtest=options.DEtest,
                                     R=RH)

    results.getResults(fdr=options.fdr)

    if options.use_ihw:
        results.calculateIHW(alpha=options.fdr)

    for contrast in set(results.table['contrast']):
        results.plotVolcano(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotMA(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotPvalueHist(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotPvalueQQ(contrast, outfile_prefix=outfile_prefix, R=RH)

    results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False)

    results.summariseDEResults()

    # write out summary tables for each comparison/contrast
    for test_group in list(results.Summary.keys()):
        outf = IOTools.open_file(
            "_".join([outfile_prefix, test_group, "summary.tsv"]), "w")
        outf.write("category\tcounts\n%s\n" %
                   results.Summary[test_group].asTable())
        outf.close()

    if options.Rhistory:
        RH.saveHistory(options.Rhistory)
    if options.Rimage:
        RH.saveImage(options.Rimage)

    E.stop()
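The count-filtering steps above remove low-count samples and rows before
any testing. A hedged pandas sketch of the same idea on a plain counts
table (the file name and thresholds are illustrative; the script itself
delegates this to the Counts class):

import pandas as pd

counts = pd.read_csv("counts.tsv", sep="\t", index_col=0, comment="#")

# drop samples whose largest count falls below a minimum
# (cf. --filter-min-counts-per-sample)
counts = counts.loc[:, counts.max(axis=0) >= 10]

# drop rows with fewer counts in total than a minimum
# (cf. --filter-min-counts-per-row)
counts = counts[counts.sum(axis=1) >= 1]

# drop the bottom percentile of rows by total counts
# (cf. --filter-percentile-rowsums)
threshold = counts.sum(axis=1).quantile(0.10)
counts = counts[counts.sum(axis=1) > threshold]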
Example n. 21
0
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-w",
        "--weights-tsv-file",
        dest="filename_weights",
        type="string",
        help="filename with codon frequencies. Multiple filenames "
        "can be separated by comma.")

    parser.add_option("-s",
                      "--section",
                      dest="sections",
                      type="choice",
                      action="append",
                      choices=("length", "sequence", "hid", "na", "aa", "cpg",
                               "dn", "degeneracy", "gaps", "codons",
                               "codon-usage", "codon-translator",
                               "codon-bias"),
                      help="which sections to output [%default]")

    parser.add_option(
        "-t",
        "--sequence-type",
        dest="seqtype",
        type="choice",
        choices=("na", "aa"),
        help="type of sequence: na=nucleotides, aa=amino acids [%default].")

    parser.add_option(
        "-e",
        "--regex-identifier",
        dest="regex_identifier",
        type="string",
        help="regular expression to extract identifier from fasta "
        "description line.")

    parser.add_option("--split-fasta-identifier",
                      dest="split_id",
                      action="store_true",
                      help="split fasta description line (starting >) and use "
                      "only text before first space")

    parser.add_option(
        "--add-total",
        dest="add_total",
        action="store_true",
        help="add a row with column totals at the end of the table"
        "[%default]")

    parser.set_defaults(
        filename_weights=None,
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
        gap_chars='xXnN',
        split_id=False,
        add_total=False,
    )

    (options, args) = E.start(parser, argv=argv)

    rx = re.compile(options.regex_identifier)

    reference_codons = []
    if options.filename_weights:
        options.filename_weights = options.filename_weights.split(",")
        for filename in options.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                reference_codons.append(
                    IOTools.ReadMap(IOTools.open_file(filename, "r"),
                                    has_header=True,
                                    map_functions=(str, float)))

        # print codon table differences
        options.stdlog.write(
            "# Difference between supplied codon usage preferences.\n")
        for x in range(0, len(reference_codons)):
            for y in range(0, len(reference_codons)):
                if x == y:
                    continue
                # calculate KL distance
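                # D_KL(b || a) = sum over non-stop codons of
                #   b[codon] * log(b[codon] / a[codon])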
                a = reference_codons[x]
                b = reference_codons[y]
                d = 0
                for codon, p in list(a.items()):
                    if Genomics.IsStopCodon(codon):
                        continue
                    d += b[codon] * math.log(b[codon] / p)

                options.stdlog.write("# tablediff\t%s\t%s\t%f\n" %
                                     (options.filename_weights[x],
                                      options.filename_weights[y], d))

    iterator = FastaIterator.FastaIterator(options.stdin)

    def getCounter(section):

        if options.seqtype == "na":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "na":
                s = SequenceProperties.SequencePropertiesNA()
            elif section == "gaps":
                s = SequenceProperties.SequencePropertiesGaps(
                    options.gap_chars)
            elif section == "cpg":
                s = SequenceProperties.SequencePropertiesCpg()
            elif section == "dn":
                s = SequenceProperties.SequencePropertiesDN()
            # these sections require the sequence length to be a multiple of 3
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAA()
            elif section == "degeneracy":
                s = SequenceProperties.SequencePropertiesDegeneracy()
            elif section == "codon-bias":
                s = SequenceProperties.SequencePropertiesBias(reference_codons)
            elif section == "codons":
                s = SequenceProperties.SequencePropertiesCodons()
            elif section == "codon-usage":
                s = SequenceProperties.SequencePropertiesCodonUsage()
            elif section == "codon-translator":
                s = SequenceProperties.SequencePropertiesCodonTranslator()
            else:
                raise ValueError("unknown section %s" % section)
        elif options.seqtype == "aa":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAminoAcids()
            else:
                raise ValueError("unknown section %s" % section)
        return s

    # setup totals
    totals = {}
    for section in options.sections:
        totals[section] = getCounter(section)

    options.stdout.write("id")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getHeaders()))

    options.stdout.write("\n")
    options.stdout.flush()

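    # warm-up / sanity check (assumption): build a throw-away counter and load
    # a dummy sequence before processing the input records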
    s = getCounter("hid")
    s.loadSequence("AAAAAAAAA", "na")

    for cur_record in iterator:

        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if len(sequence) == 0:
            raise ValueError("empty sequence %s" % cur_record.title)

        id = rx.search(cur_record.title).groups()[0]

        if options.split_id is True:
            options.stdout.write("%s" % id.split()[0])
        else:
            options.stdout.write("%s" % id)
        options.stdout.flush()

        for section in options.sections:
            s = getCounter(section)
            s.loadSequence(sequence, options.seqtype)
            totals[section].addProperties(s)

            options.stdout.write("\t" + "\t".join(s.getFields()))

        options.stdout.write("\n")

    if options.add_total:
        options.stdout.write("total")
        for section in options.sections:
            options.stdout.write("\t" + "\t".join(totals[section].getFields()))
        options.stdout.write("\n")

    E.stop()
def __main__():

    # use argparse to ignore unknown options
    parser = argparse.ArgumentParser()

    parser.add_argument("--version", action="version", version="%(prog)s")
    parser.add_argument("--wrapper-command", dest="command", type=str)
    parser.add_argument("--wrapper-bam-file", dest="bam_file", type=str)
    parser.add_argument("--wrapper-bam-option", dest="bam_option", type=str)
    parser.add_argument("--wrapper-bai-file", dest="bai_file", type=str)
    parser.add_argument(
        "--wrapper-dry-run", dest="dry_run", action="store_true")
    parser.add_argument("--wrapper-html-dir", dest="html_dir", type=str)
    parser.add_argument("--wrapper-html-file", dest="html_file", type=str)

    options, unknown = parser.parse_known_args()

    cgat = CGATBase(options)

    option_map = []

    if options.bai_file or options.bam_file:
        if not (options.bai_file and options.bam_file):
            raise ValueError(
                "wrapper called with bam or bai file, but not both")

        if not options.bam_option:
            options.bam_option = "bam-file"

        tmp_fd, tmp_name = tempfile.mkstemp()
        tmp_bam_name = '%s.bam' % tmp_name
        tmp_bai_name = '%s.bai' % tmp_bam_name
        os.symlink(options.bam_file, tmp_bam_name)
        os.symlink(options.bai_file, tmp_bai_name)
        if options.bam_option.startswith("--"):
            # long option
            option_map.append("%s=%s" % (options.bam_option, tmp_bam_name))
        else:
            # short option
            option_map.append("%s %s" % (options.bam_option, tmp_bam_name))

    if options.html_dir:
        os.mkdir(options.html_dir)
        option_map.append("%s=%s/%%s" %
                          ("--output-filename-pattern", options.html_dir))

    statement = "python " + " ".join([options.command] + unknown + option_map)

    if options.dry_run:
        sys.stdout.write(statement + "\n")
        return

    else:
        cgat.runStatement(statement)

    if options.bai_file:
        os.unlink(tmp_bam_name)
        os.unlink(tmp_bai_name)

    if options.html_file:
        with IOTools.open_file(options.html_file, "w") as outf:
            outf.write('<h1>%s - Output</h1>' %
                       os.path.basename(options.command))
            for fn in glob.glob(os.path.join(options.html_dir, "*.*")):
                dirname, basename = os.path.split(fn)
                outf.write('''<li><a href="%s">%s</a></li>\n''' %
                           (basename, basename))
Example n. 23
0
def writeSequencesForIntervals(track,
                               filename,
                               dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               proportion=None,
                               masker=[],
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None):
    '''build a sequence set for motif discovery. Intervals are taken from
    the table <track>_intervals in the database *dbhandle* and saved to
    *filename* in :term:`fasta` format.

    If *shuffled* is set, each sequence is randomly shuffled before
    output. Note that shuffling is performed on the unmasked sequences
    and masking is applied afterwards.

    If *full* is set, the whole intervals will be output, otherwise
    only the region around the peak given by *halfwidth*

    If *maxsize* is set, the output is truncated at *maxsize* characters
    in order to avoid creating jobs that take too long.

    If proportion is set, only the top *proportion* intervals are output
    (sorted by peakval).

    If *num_sequences* is set, the first *num_sequences* will be used.

    *masker* can be a combination of
        * dust, dustmasker: apply dustmasker
        * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order) and 'max'
    (peak score, descending order).

    If *shift* is set, intervals will be shifted. ``leftright``
    creates two intervals on the left and right of the actual
    interval. The intervals will be centered around the mid-point and
    truncated the same way as the main intervals.

    '''

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    else:
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.tablequote(track)
    statement = '''SELECT contig, start, end, interval_id, peakcenter
    FROM %(tablename)s
    ''' % locals() + orderby

    cc = dbhandle.execute(statement)
    data = cc.fetchall()
    cc.close()

    if proportion:
        cutoff = int(len(data) * proportion) + 1
        if min_sequences:
            cutoff = max(cutoff, min_sequences)
    elif num_sequences:
        cutoff = num_sequences
    else:
        cutoff = len(data)

    L.info(
        "writeSequencesForIntervals %s: using at most %i sequences for pattern finding"
        % (track, cutoff))

    data = data[:cutoff]

    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))

    # modify the ranges
    if shift:
        if shift == "leftright":
            new_data = [(contig, start - (end - start), start,
                         str(interval_id) + "_left", peakcenter)
                        for contig, start, end, interval_id, peakcenter in data
                        ]
            new_data.extend([
                (contig, end, end + (end - start), str(interval_id) + "_right",
                 peakcenter)
                for contig, start, end, interval_id, peakcenter in data
            ])
            data = new_data
        else:
            raise ValueError("unknown value for shift: '%s'" % shift)

    if halfwidth:
        # center around peakcenter, add halfwidth on either side
        data = [(contig, peakcenter - halfwidth, peakcenter + halfwidth,
                 interval_id)
                for contig, start, end, interval_id, peakcenter in data]
    else:
        # remove peakcenter
        data = [(contig, start, end, interval_id)
                for contig, start, end, interval_id, peakcenter in data]

    # get the sequences - truncate once *maxsize* nucleotides have been collected
    sequences = []
    current_size, nseq = 0, 0
    new_data = []
    for contig, start, end, interval_id in data:
        lcontig = fasta.getLength(contig)
        start, end = max(0, start + offset), min(end + offset, lcontig)
        if start >= end:
            L.info(
                "writeSequencesForIntervals %s: sequence %s is empty: start=%i, end=%i, offset=%i - ignored"
                % (track, interval_id, start, end, offset))
            continue
        seq = fasta.getSequence(contig, "+", start, end)
        sequences.append(seq)
        new_data.append((start, end, interval_id, contig))
        current_size += len(seq)
        if maxsize and current_size >= maxsize:
            L.info(
                "writeSequencesForIntervals %s: maximum size (%i) reached - only %i sequences output (%i ignored)"
                % (track, maxsize, nseq, len(data) - nseq))
            break
        nseq += 1

    data = new_data

    if shuffled:
        # note that shuffling is done on the unmasked sequences
        # Otherwise N's would be interspersed with real sequence
        # messing up motif finding unfairly. Instead, masking is
        # done on the shuffled sequence.
        sequences = [list(x) for x in sequences]
        for sequence in sequences:
            random.shuffle(sequence)
        sequences = maskSequences(["".join(x) for x in sequences], masker)

    c = E.Counter()
    outs = IOTools.open_file(filename, "w")
    for mask in masker:
        if mask not in ("unmasked", "none", None):
            sequences = maskSequences(sequences, mask)

    for sequence, d in zip(sequences, data):
        c.input += 1
        if len(sequence) == 0:
            c.empty += 1
            continue
        start, end, id, contig = d
        id = "%s_%s %s:%i-%i" % (track, str(id), contig, start, end)
        outs.write(">%s\n%s\n" % (id, sequence))
        c.output += 1
    outs.close()

    E.info("%s" % c)

    return c.output
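A minimal usage sketch (hypothetical track and file names; assumes an sqlite3 connection to the pipeline database and a P.get_params() configuration pointing at the genome FASTA):

import sqlite3

# hypothetical example: write the 100 bp around each peak of track "chip1"
# to a FASTA file, dust-masking the sequences
dbhandle = sqlite3.connect("csvdb")
writeSequencesForIntervals("chip1",
                           "chip1.peaks.fasta",
                           dbhandle,
                           halfwidth=50,
                           masker=["dust"],
                           order="peakval")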
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: vcfstats_sqlite.py 0001 2011-04-13 davids $",
        usage=globals()["__doc__"])

    (options, args) = E.start(parser)

    options.filenames = args

    if len(options.filenames) < 1:
        options.stdout.write("# Error: no vcf-stats files specified/found.")
        sys.exit(1)

    E.info("Parsing %i file(s)" % len(options.filenames))

    # set up output files
    vcf_file = IOTools.open_file('vcfstats.txt', 'w')
    indel_file = IOTools.open_file('indelstats.txt', 'w')
    snp_file = IOTools.open_file('snpstats.txt', 'w')
    shared_file = IOTools.open_file('sharedstats.txt', 'w')

    for fileno, filename in enumerate(options.filenames):

        prefix = os.path.basename(filename)
        trackname = prefix.replace(".vcfstats", "")

        if os.path.exists(filename):
            lines = [x for x in IOTools.open_file(filename, "r").readlines()]
        else:
            lines = []

        if len(lines) == 0:
            options.stdout.write(
                "# Error: empty vcf-stats file found: %s\n" % filename)
            sys.exit(1)
        else:
            E.info("File %i contains %i lines" % (fileno, len(lines)))
            vcf_stats = dict(track=trackname)
            snp_stats = dict(track=trackname)
            indel_stats = dict()
            shared_stats = dict()
            all_vars = False
            indels = False
            snps = False
            shared = False
            for i, line in enumerate(lines):
                line = line.strip()
                if line.find("'all'") > -1:
                    all_vars = True
                    E.info("Found 'all'")
                    continue

                if all_vars:
                    if line.find("=>") > -1:
                        fields = line.split("=>")
                        key = fields[0].strip().replace("'",
                                                        "").replace(">", "_")
                        val = fields[1].strip().replace(",", "")
                    else:
                        key = "NA"
                        val = "NA"
                    if key == "indel" and val == "{":
                        indels = True
                        E.info("Found 'indels'")
                        continue
                    elif key == "snp" and val == "{":
                        snps = True
                        E.info("Found 'SNPs'")
                        continue
                    elif key == "shared" and val == "{":
                        shared = True
                        E.info("Found 'Shared'")
                        continue

                    if indels:
                        if line.find("}") > -1:
                            indels = False
                            E.info("Processed 'indels'")
                            continue
                        else:
                            indel_stats[key] = val
                    elif snps:
                        if line.find("}") > -1:
                            snps = False
                            E.info("Processed 'SNPs'")
                            continue
                        else:
                            snp_stats[key] = val
                    elif shared:
                        if line.find("}") > -1:
                            shared = False
                            E.info("Processed 'Shared'")
                            continue
                        else:
                            shared_stats[key] = val
                    elif key != "NA":
                        vcf_stats[key] = val

            # Ensure all keys are present
            allkeys = [
                "nalt_1", "nalt_2", "nalt_3", "nalt_4", "nalt_5", "track",
                "count", "snp_count", "indel_count"
            ]
            for k in allkeys:
                if k in vcf_stats:
                    continue
                else:
                    vcf_stats[k] = "0"

            # Write header (for first file only)
            if filename == options.filenames[0]:

                # Ensure keys are sorted
                srt = list(vcf_stats.keys())
                srt.sort()
                sep = ""
                for k in srt:
                    vcf_file.write("%s%s" % (sep, k))
                    sep = "\t"
                vcf_file.write("\n")

                indel_file.write("track\tindel_length\tindel_count\n")
                shared_file.write("track\tno_samples\tvar_count\n")

                sep = ""
                for k in snp_stats.keys():
                    snp_file.write("%s%s" % (sep, k))
                    sep = "\t"
                snp_file.write("\n")

            # Write data
            sep = ""
            srt = list(vcf_stats.keys())
            srt.sort()
            for k in srt:
                vcf_file.write("%s%s" % (sep, vcf_stats[k]))
                sep = "\t"
            vcf_file.write("\n")

            # Check all indel lengths are covered
            r = list(range(-20, 20, 1))
            for i in r:
                if str(i) in indel_stats:
                    continue
                else:
                    indel_stats[i] = "0"
            for k in indel_stats.keys():
                indel_file.write("%s\t%s\t%s\n" %
                                 (trackname, k, indel_stats[k]))

            for k in shared_stats.keys():
                shared_file.write("%s\t%s\t%s\n" %
                                  (trackname, k, shared_stats[k]))

            sep = ""
            for k in snp_stats.keys():
                snp_file.write("%s%s" % (sep, snp_stats[k]))
                sep = "\t"
            snp_file.write("\n")

    # close files
    vcf_file.close()
    indel_file.close()
    snp_file.close()
    shared_file.close()

    E.stop()
    sys.exit(0)
Example n. 25
0
def loadMAST(infile, outfile):
    '''parse mast file and load into database.

    Parse several motif runs and add them to the same
    table.

    Add columns for the control data as well.
    '''

    tablename = P.to_table(outfile)

    tmpfile = P.get_temp_file(".")

    tmpfile.write(MAST.Match().header + "\tmotif\tcontig"
                  "\tl_evalue\tl_pvalue\tl_nmatches\tl_length\tl_start\tl_end"
                  "\tr_evalue\tr_pvalue\tr_nmatches\tr_length\tr_start\tr_end"
                  "\tmin_evalue\tmin_pvalue\tmax_nmatches" + "\n")

    lines = IOTools.open_file(infile).readlines()
    chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
    chunks.append(len(lines))
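    # "chunks" holds the line indices of the ":: motif = ... ::" separator
    # lines, plus a sentinel at end-of-file, so chunk i spans
    # lines[chunks[i] + 1:chunks[i + 1]]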

    def readChunk(lines, chunk):
        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.get_temp_file(".")
        try:
            motif, part = re.match(r":: motif = (\S+) - (\S+) ::",
                                   lines[chunks[chunk]]).groups()
        except AttributeError:
            raise ValueError("parsing error in line '%s'" %
                             lines[chunks[chunk]])

        E.info("reading %s - %s" % (motif, part))

        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()

        mast = MAST.parse(IOTools.open_file(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        return motif, part, mast

    def splitId(s, mode):
        '''split background match id

        has three parts: track _ id _ pos

        track might contain '_'.
        '''
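        # for example (assumption, based on the docstring):
        #   splitId("trackA_12_l", "bg") -> ("trackA", "12", "l")
        #   splitId("trackA_12", "fg")   -> ("trackA", "12")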
        d = s.split("_")
        if mode == "bg":
            return "_".join(d[:-2]), d[-2], d[-1]
        elif mode == "fg":
            return "_".join(d[:-1]), d[-1]

    for chunk in range(0, len(chunks) - 1, 2):

        motif_fg, part, mast_fg = readChunk(lines, chunk)
        assert part == "foreground"
        motif_bg, part, mast_bg = readChunk(lines, chunk + 1)
        assert part == "background"
        assert motif_fg == motif_bg

        # index control data
        controls = collections.defaultdict(dict)
        for match in mast_bg.matches:
            track, id, pos = splitId(match.id, "bg")
            controls[id][pos] = (match.evalue, match.pvalue, match.nmotifs,
                                 match.length, match.start, match.end)

        for match in mast_fg.matches:
            # remove track and pos
            track, match.id = splitId(match.id, "fg")
            # move to genomic coordinates
            contig, start, end = re.match(r"(\S+):(\d+)\.\.(\d+)",
                                          match.description).groups()
            if match.nmotifs > 0:
                start, end = int(start), int(end)
                match.start += start
                match.end += start
                match.positions = [x + start for x in match.positions]

            id = match.id
            if id not in controls:
                P.warn("no controls for %s - increase MAST evalue" % id)

            if "l" not in controls[id]:
                controls[id]["l"] = (float(P.get_params()["mast_evalue"]), 1,
                                     0, 0, 0, 0)
            if "r" not in controls[id]:
                controls[id]["r"] = (float(P.get_params()["mast_evalue"]), 1,
                                     0, 0, 0, 0)

            min_evalue = min(controls[id]["l"][0], controls[id]["r"][0])
            min_pvalue = min(controls[id]["l"][1], controls[id]["r"][1])
            max_nmatches = max(controls[id]["l"][2], controls[id]["r"][2])

            tmpfile.write(
                str(match) + "\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                    motif_fg,
                    contig,
                    "\t".join(map(str, controls[id]["l"])),
                    "\t".join(map(str, controls[id]["r"])),
                    str(min_evalue),
                    str(min_pvalue),
                    str(max_nmatches),
                ) + "\n")

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           options="--add-index=id "
           "--add-index=motif "
           "--add-index=id,motif "
           "--allow-empty-file "
           "--map=base_qualities:text")

    os.unlink(tmpfile.name)
Example n. 26
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r",
        "--mask-bed-file",
        dest="filename_rna",
        type="string",
        metavar='GFF',
        help="gff formatted file with masking locations. The number of "
        "reads overlapping the intervals in the given file will be "
        "computed. Note that the computation currently does not take "
        "into account indels, so it is an approximate count only. "
        "[%default]")

    parser.add_option(
        "-f",
        "--ignore-masked-reads",
        dest="remove_rna",
        action="store_true",
        help="as well as counting reads in the file given by --mask-bed-file, "
        "also remove these reads for duplicate and match statistics. "
        "[%default]")

    parser.add_option(
        "-i",
        "--num-reads",
        dest="input_reads",
        type="int",
        help="the number of reads - if given, used to provide percentages "
        "[%default]")

    parser.add_option(
        "-d",
        "--output-details",
        dest="output_details",
        action="store_true",
        help="output per-read details into a separate file. Read names are "
        "md5/base64 encoded [%default]")

    parser.add_option(
        "-q",
        "--fastq-file",
        dest="filename_fastq",
        help="filename with sequences and quality scores. This file is only "
        "used to collect sequence identifiers. Thus, for paired end data a "
        "single file is sufficient [%default]")

    parser.set_defaults(
        filename_rna=None,
        remove_rna=False,
        input_reads=0,
        force_output=False,
        filename_fastq=None,
        output_details=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if options.filename_rna:
        rna = GTF.readAndIndex(
            GTF.iterator(IOTools.open_file(options.filename_rna)))
    else:
        rna = None

    if len(args) > 0:
        pysam_in = pysam.AlignmentFile(args[0], "rb")
    elif options.stdin == sys.stdin:
        pysam_in = pysam.AlignmentFile("-", "rb")
    else:
        pysam_in = pysam.AlignmentFile(options.stdin, "rb")

    if options.output_details:
        outfile_details = E.openOutputFile("details", "w")
    else:
        outfile_details = None

    if options.filename_fastq and not os.path.exists(options.filename_fastq):
        raise IOError("file %s does not exist" % options.filename_fastq)

    (counter, flags_counts, nh_filtered, nh_all,
     nm_filtered, nm_all, mapq, mapq_all, max_hi) = \
        _bam2stats.count(pysam_in,
                         options.remove_rna,
                         rna,
                         filename_fastq=options.filename_fastq,
                         outfile_details=outfile_details)

    if max_hi > 0 and max_hi != max(nh_all.keys()):
        E.warn("max_hi(%i) is inconsistent with max_nh (%i) "
               "- counts will be corrected" % (max_hi, max(nh_all.keys())))

    outs = options.stdout
    outs.write("category\tcounts\tpercent\tof\n")

    def _write(outs, text, numerator, denominator, base):
        percent = IOTools.prettyPercent(numerator, denominator)
        outs.write('%s\t%i\t%s\t%s\n' % (text, numerator, percent, base))

    ###############################
    ###############################
    ###############################
    # Output alignment information
    ###############################
    nalignments_unmapped = flags_counts["unmapped"]
    nalignments_mapped = counter.alignments_input - nalignments_unmapped

    _write(outs, "alignments_total", counter.alignments_input,
           counter.alignments_input, "alignments_total")

    if counter.alignments_input == 0:
        E.warn("no alignments in BAM file - no further output")
        E.stop()
        return

    _write(outs, "alignments_mapped", nalignments_mapped,
           counter.alignments_input, 'alignments_total')
    _write(outs, "alignments_unmapped", nalignments_unmapped,
           counter.alignments_input, 'alignments_total')

    if nalignments_mapped == 0:
        E.warn("no mapped alignments - no further output")
        E.stop()
        return

    for flag, counts in sorted(flags_counts.items()):
        if flag == "unmapped":
            continue
        _write(outs, 'alignments_' + flag, counts, nalignments_mapped,
               'alignments_mapped')

    if options.filename_rna:
        _write(outs, "alignments_rna", counter.alignments_rna,
               nalignments_mapped, 'alignments_mapped')
        _write(outs, "alignments_no_rna", counter.alignments_no_rna,
               nalignments_mapped, 'alignments_mapped')

    _write(outs, "alignments_filtered", counter.alignments_filtered,
           nalignments_mapped, "alignments_mapped")

    if counter.filtered == nalignments_mapped:
        normby = "alignments_mapped"
    else:
        normby = "alignments_filtered"

    if counter.filtered > 0:
        _write(outs, "alignments_duplicates", counter.alignments_duplicates,
               counter.alignments_filtered, normby)
        _write(outs, "alignments_unique",
               counter.alignments_filtered - counter.alignments_duplicates,
               counter.alignments_filtered, normby)

    ###############################
    ###############################
    ###############################
    # Output read based information
    ###############################

    # derive the number of mapped reads in file from alignment counts
    if options.filename_fastq:
        nreads_total = counter.total_read
        _write(outs, "reads_total", counter.total_read, nreads_total,
               'reads_total')
        _write(outs, "reads_unmapped", counter.total_read_is_unmapped,
               nreads_total, 'reads_total')
        _write(outs, "reads_mapped", counter.total_read_is_mapped,
               nreads_total, 'reads_total')
        _write(outs, "reads_missing", counter.total_read_is_missing,
               nreads_total, 'reads_total')
        _write(outs, "reads_mapped_unique", counter.total_read_is_mapped_uniq,
               counter.total_read_is_mapped, 'reads_mapped')
        _write(outs, "reads_multimapping", counter.total_read_is_mmap,
               counter.total_read_is_mapped, 'reads_mapped')
    else:
        E.warn('inferring read counts from alignments and NH tags')
        nreads_unmapped = flags_counts["unmapped"]
        nreads_mapped = computeMappedReadsFromAlignments(
            nalignments_mapped, nh_all, max_hi)
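        # (assumption) computeMappedReadsFromAlignments derives the number of
        # reads from the NH (number of hits) histogram, roughly
        #   nreads ~= sum(count / nh for nh, count in nh_all.items())
        # since a read with NH=k contributes k alignments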

        nreads_missing = 0
        if options.input_reads:
            nreads_total = options.input_reads
            # unmapped reads in bam file?
            if nreads_unmapped:
                nreads_missing = nreads_total - nreads_unmapped - nreads_mapped
            else:
                nreads_unmapped = nreads_total - nreads_mapped

        elif nreads_unmapped:
            # if unmapped reads are in bam file, take those
            nreads_total = nreads_mapped + nreads_unmapped
        else:
            # otherwise normalize by mapped reads
            nreads_unmapped = 0
            nreads_total = nreads_mapped

        outs.write("reads_total\t%i\t%5.2f\treads_total\n" %
                   (nreads_total, 100.0))
        outs.write("reads_mapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_mapped, 100.0 * nreads_mapped / nreads_total))
        outs.write("reads_unmapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_unmapped, 100.0 * nreads_unmapped / nreads_total))
        outs.write("reads_missing\t%i\t%5.2f\treads_total\n" %
                   (nreads_missing, 100.0 * nreads_missing / nreads_total))

        if len(nh_all) > 1:
            outs.write("reads_unique\t%i\t%5.2f\treads_mapped\n" %
                       (nh_all[1], 100.0 * nh_all[1] / nreads_mapped))

        # compute after filtering
        # note that these are rough guesses
        if options.filename_rna:
            nreads_norna = computeMappedReadsFromAlignments(
                counter.filtered, nh_filtered, max_hi)
            _write(outs, "reads_norna", nreads_norna, nreads_mapped,
                   "reads_mapped")
            if len(nh_filtered) > 1:
                _write(outs, "reads_norna_unique", nh_filtered[1],
                       nreads_norna, "reads_mapped")

    pysam_in.close()

    ###############################
    ###############################
    ###############################
    # Output pair information
    ###############################
    if flags_counts["read2"] > 0:
        if options.filename_fastq:
            pairs_mapped = counter.total_pair_is_mapped

            # sanity check
            assert counter.total_pair_is_mapped == \
                (counter.total_pair_is_proper_uniq +
                 counter.total_pair_is_incomplete_uniq +
                 counter.total_pair_is_incomplete_mmap +
                 counter.total_pair_is_proper_duplicate +
                 counter.total_pair_is_proper_mmap +
                 counter.total_pair_not_proper_uniq +
                 counter.total_pair_is_other)

            outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pairs,
                        100.0 * counter.total_pairs / counter.total_pairs))
            outs.write(
                "pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                (pairs_mapped, 100.0 * pairs_mapped / counter.total_pairs))
            outs.write("pairs_unmapped\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pair_is_unmapped, 100.0 *
                        counter.total_pair_is_unmapped / counter.total_pairs))
            outs.write(
                "pairs_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_uniq, 100.0 *
                 counter.total_pair_is_proper_uniq / counter.total_pairs))
            outs.write(
                "pairs_incomplete_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_uniq, 100.0 *
                 counter.total_pair_is_incomplete_uniq / counter.total_pairs))
            outs.write(
                "pairs_incomplete_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_mmap, 100.0 *
                 counter.total_pair_is_incomplete_mmap / counter.total_pairs))
            outs.write(
                "pairs_proper_duplicate\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_duplicate, 100.0 *
                 counter.total_pair_is_proper_duplicate / counter.total_pairs))
            outs.write(
                "pairs_proper_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_mmap, 100.0 *
                 counter.total_pair_is_proper_mmap / counter.total_pairs))
            outs.write(
                "pairs_not_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_not_proper_uniq, 100.0 *
                 counter.total_pair_not_proper_uniq / counter.total_pairs))
            outs.write("pairs_other\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pair_is_other, 100.0 *
                        counter.total_pair_is_other / counter.total_pairs))

            nread1_total = counter.total_read1
            _write(outs, "read1_total", counter.total_read1, nread1_total,
                   'read1_total')
            _write(outs, "read1_unmapped", counter.total_read1_is_unmapped,
                   nread1_total, 'read1_total')
            _write(outs, "read1_mapped", counter.total_read1_is_mapped,
                   nread1_total, 'read1_total')
            _write(outs, "read1_mapped_unique",
                   counter.total_read1_is_mapped_uniq,
                   counter.total_read1_is_mapped, 'read1_mapped')
            _write(outs, "reads_multimapping", counter.total_read1_is_mmap,
                   counter.total_read1_is_mapped, 'read1_mapped')
            _write(outs, "read1_missing", counter.total_read1_is_missing,
                   counter.total_read1_is_mapped, 'read1_total')

            nread2_total = counter.total_read2
            _write(outs, "read2_total", counter.total_read2, nread2_total,
                   'read2_total')
            _write(outs, "read2_unmapped", counter.total_read2_is_unmapped,
                   nread2_total, 'read2_total')
            _write(outs, "read2_mapped", counter.total_read2_is_mapped,
                   nread2_total, 'read2_total')
            _write(outs, "read2_mapped_unique",
                   counter.total_read2_is_mapped_uniq,
                   counter.total_read2_is_mapped, 'read2_mapped')
            _write(outs, "reads_multimapping", counter.total_read2_is_mmap,
                   counter.total_read2_is_mapped, 'read2_mapped')
            _write(outs, "read2_missing", counter.total_read2_is_missing,
                   counter.total_read2_is_mapped, 'read2_total')

        else:
            # approximate counts
            pairs_total = nreads_total // 2
            pairs_mapped = flags_counts["proper_pair"] // 2
            _write(outs, "pairs_total", pairs_total, pairs_total,
                   "pairs_total")
            _write(outs, "pairs_mapped", pairs_mapped, pairs_total,
                   "pairs_total")
    else:
        # no paired end data
        pairs_total = pairs_mapped = 0
        outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_total, 0.0))
        outs.write("pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_mapped, 0.0))

    if options.force_output or len(nm_filtered) > 0:
        outfile = E.openOutputFile("nm", "w")
        outfile.write("NM\talignments\n")
        if len(nm_filtered) > 0:
            for x in range(0, max(nm_filtered.keys()) + 1):
                outfile.write("%i\t%i\n" % (x, nm_filtered[x]))
        else:
            outfile.write("0\t%i\n" % (counter.filtered))
        outfile.close()

    if options.force_output or len(nh_all) > 1:
        outfile = E.openOutputFile("nh_all", "w")
        outfile.write("NH\treads\n")
        if len(nh_all) > 0:
            writeNH(outfile, nh_all, max_hi)
        else:
            # assume all are unique if NH flag not set
            outfile.write("1\t%i\n" % (counter.mapped_reads))
        outfile.close()

    if options.force_output or len(nh_filtered) > 1:
        outfile = E.openOutputFile("nh", "w")
        outfile.write("NH\treads\n")
        if len(nh_filtered) > 0:
            writeNH(outfile, nh_filtered, max_hi)
        else:
            # assume all are unique if NH flag not set
            outfile.write("1\t%i\n" % (counter.filtered))
        outfile.close()

    if options.force_output or len(mapq_all) > 1:
        outfile = E.openOutputFile("mapq", "w")
        outfile.write("mapq\tall_reads\tfiltered_reads\n")
        for x in range(0, max(mapq_all.keys()) + 1):
            outfile.write("%i\t%i\t%i\n" % (x, mapq_all[x], mapq[x]))
        outfile.close()

    # write footer and output benchmark information.
    E.stop()
Example n. 27
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      action="store",
                      choices=("hierarchy", "set-field", "set-pattern",
                               "set-none"),
                      help="Method to use for conversion")

    parser.add_option(
        "-g",
        "--gene-type",
        dest="gene_type",
        type="string",
        help="feature type to get gene_id from if possible [%default]")

    parser.add_option(
        "-t",
        "--transcript-type",
        dest="transcript_type",
        type="string",
        help="feature type to get transcript_id from if possible [%default]")

    parser.add_option(
        "-d",
        "--no-discard",
        dest="discard",
        action="store_false",
        help=
        "Do not discard feature types specified by GENE_TYPE and TRANSCRIPT_TYPE"
    )

    parser.add_option(
        "--gene-id",
        dest="gene_field_or_pattern",
        type="string",
        help="Either field or pattern for the gene_id [%default]")

    parser.add_option(
        "--transcript-id",
        dest="transcript_field_or_pattern",
        type="string",
        help="Either field or pattern for the transcript_id [%default]")

    parser.add_option(
        "--parent-field",
        dest="parent",
        type="string",
        help="field that specifies the parent relationship. Currently only"
        "if left as Parent will features with multiple parents be parsed"
        "correctly"
        "")

    parser.add_option(
        "--read-twice",
        dest="read_twice",
        action="store_true",
        help=
        "Instead of holding the whole file in memory, read once for parsing the "
        "hierarchy, and then again for actually doing the conversion. This means "
        "a real file and not a pipe must be provided.")

    parser.add_option(
        "--by-chrom",
        dest="by_chrom",
        action="store_true",
        help="Parse input file one choromosome at a time. Reduces memory usage, "
        "but input must be sorted by chromosome and features may not split accross "
        " multiple chromosomes"
        "")

    parser.add_option(
        "--fail-missing-gene",
        dest="missing_gene",
        action="store_false",
        help="Fail if no feature of type GENE_TYPE is found instead of using "
        "defaulting to highest object in hierarchy"
        "")

    parser.set_defaults(method="hierarchy",
                        gene_type="gene",
                        transcript_type="mRNA",
                        discard=True,
                        gene_field_or_pattern="ID",
                        transcript_field_or_pattern="ID",
                        read_twice=False,
                        by_chrom=False,
                        missing_gene=True,
                        parent="Parent")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    gffs = GFF3.flat_file_iterator(options.stdin)

    if options.by_chrom:
        gffs = GFF3.chrom_iterator(gffs)
    else:
        gffs = [gffs]

    # run this early so that it fails early if the configuration is wrong
    if options.read_twice:
        # Will throw IOError if options.stdin is not a normal file
        second_gff = GFF3.flat_file_iterator(
            IOTools.open_file(options.stdin.name))

        if options.by_chrom:
            second_gff = GFF3.chrom_iterator(second_gff)
        else:
            second_gff = iter([second_gff])
    else:
        second_gff = None

    for chunk in gffs:

        if options.read_twice:
            second_gff_chunk = next(second_gff)
        else:
            chunk = list(chunk)
            second_gff_chunk = chunk

        if options.method == "hierarchy":

            convert_hierarchy(chunk, second_gff_chunk, options)
        elif options.method == "set-field":
            gene_id_pattern = "%%(%s)s" % options.gene_field_or_pattern
            transcript_id_pattern = "%%(%s)s" % options.transcript_field_or_pattern
            convert_set(chunk, gene_id_pattern, transcript_id_pattern, options)
        elif options.method == "set-pattern":
            convert_set(chunk, options.gene_field_or_pattern,
                        options.transcript_field_or_pattern, options)
        elif options.method == "set-none":
            convert_set(chunk, None, None, options)

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d",
                      "--design-tsv-file",
                      dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("filter", "spike", "normalize"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this numer   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--spike-change-bin-min",
                      dest="min_cbin",
                      type="float",
                      help="minimum bin for change bins [default=%default].")

    parser.add_option("--spike-change-bin-max",
                      dest="max_cbin",
                      type="float",
                      help="maximum bin for change bins [default=%default].")

    parser.add_option("--spike-change-bin-width",
                      dest="width_cbin",
                      type="float",
                      help="bin width for change bins [default=%default].")

    parser.add_option("--spike-initial-bin-min",
                      dest="min_ibin",
                      type="float",
                      help="minimum bin for initial bins[default=%default].")

    parser.add_option("--spike-initial-bin-max",
                      dest="max_ibin",
                      type="float",
                      help="maximum bin for intitial bins[default=%default].")

    parser.add_option("--spike-initial-bin-width",
                      dest="width_ibin",
                      type="float",
                      help="bin width intitial bins[default=%default].")

    parser.add_option(
        "--spike-minimum",
        dest="min_spike",
        type="int",
        help="minimum number of spike-ins required within each bin\
                      [default=%default].")

    parser.add_option(
        "--spike-maximum",
        dest="max_spike",
        type="int",
        help="maximum number of spike-ins allowed within each bin\
                      [default=%default].")

    parser.add_option("--spike-difference-method",
                      dest="difference",
                      type="choice",
                      choices=("relative", "logfold", "abs_logfold"),
                      help="method to use for calculating difference\
                      [default=%default].")

    parser.add_option("--spike-iterations",
                      dest="iterations",
                      type="int",
                      help="number of iterations to generate spike-ins\
                      [default=%default].")

    parser.add_option("--spike-cluster-maximum-distance",
                      dest="cluster_max_distance",
                      type="int",
                      help="maximum distance between adjacent loci in cluster\
                      [default=%default].")

    parser.add_option("--spike-cluster-minimum-size",
                      dest="cluster_min_size",
                      type="int",
                      help="minimum number of loci required per cluster\
                      [default=%default].")

    parser.add_option("--spike-type",
                      dest="spike_type",
                      type="choice",
                      choices=("row", "cluster"),
                      help="spike in type [default=%default].")

    parser.add_option("--spike-subcluster-min-size",
                      dest="min_sbin",
                      type="int",
                      help="minimum size of subcluster\
                      [default=%default].")

    parser.add_option("--spike-subcluster-max-size",
                      dest="max_sbin",
                      type="int",
                      help="maximum size of subcluster\
                      [default=%default].")

    parser.add_option("--spike-subcluster-bin-width",
                      dest="width_sbin",
                      type="int",
                      help="bin width for subcluster size\
                      [default=%default].")

    parser.add_option("--spike-output-method",
                      dest="output_method",
                      type="choice",
                      choices=("append", "seperate"),
                      help="defines whether the spike-ins should be appended\
                      to the original table or seperately [default=%default].")

    parser.add_option("--spike-shuffle-column-suffix",
                      dest="shuffle_suffix",
                      type="string",
                      help="the suffix of the columns which are to be shuffled\
                      [default=%default].")

    parser.add_option("--spike-keep-column-suffix",
                      dest="keep_suffix",
                      type="string",
                      help="a list of suffixes for the columns which are to be\
                      keep along with the shuffled columns[default=%default].")

    parser.add_option("--normalization-method",
                      dest="normalization_method",
                      type="choice",
                      choices=("deseq-size-factors", "total-count",
                               "total-column", "total-row"),
                      help="normalization method to apply [%default]")

    parser.add_option("-t",
                      "--tags-tsv-file",
                      dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.set_defaults(input_filename_tags="-",
                        method="filter",
                        filter_min_counts_per_row=None,
                        filter_min_counts_per_sample=None,
                        filter_percentile_rowsums=None,
                        output_method="seperate",
                        difference="logfold",
                        spike_type="row",
                        min_cbin=0,
                        max_cbin=100,
                        width_cbin=100,
                        min_ibin=0,
                        max_ibin=100,
                        width_ibin=100,
                        max_spike=100,
                        min_spike=None,
                        iterations=1,
                        cluster_max_distance=100,
                        cluster_min_size=10,
                        min_sbin=1,
                        max_sbin=1,
                        width_sbin=1,
                        shuffle_suffix=None,
                        keep_suffix=None,
                        normalization_method="deseq-size-factors")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # load
    if options.keep_suffix:
        # if using suffix, loadTagDataPandas will throw an error as it
        # looks for column names which exactly match the design
        # "tracks" need to write function in Counts.py to handle
        # counts table and design table + suffix
        counts = pd.read_csv(options.stdin, sep="\t", comment="#")
        inf = IOTools.open_file(options.input_filename_design)
        design = pd.read_csv(inf, sep="\t", index_col=0)
        inf.close()
        design = design[design["include"] != 0]

        if options.method in ("filter", "spike"):
            if options.input_filename_design is None:
                raise ValueError("method '%s' requires a design file" %
                                 options.method)
    else:
        # create Counts object
        # TS if spike type is cluster, need to keep "contig" and "position"
        # columns out of index
        if options.spike_type == "cluster":
            index = None
        else:
            index = 0
        if options.input_filename_tags == "-":
            counts = Counts.Counts(
                pd.io.parsers.read_csv(options.stdin,
                                       sep="\t",
                                       index_col=index,
                                       comment="#"))
        else:
            counts = Counts.Counts(
                pd.read_csv(IOTools.open_file(options.input_filename_tags, "r"),
                            sep="\t",
                            index_col=index,
                            comment="#"))

        # TS normalization doesn't require a design table
        if not options.method == "normalize":

            assert options.input_filename_design and os.path.exists(
                options.input_filename_design)

            # create Design object
            design = Expression.ExperimentalDesign(
                pd.read_csv(IOTools.open_file(options.input_filename_design,
                                              "r"),
                            sep="\t",
                            index_col=0,
                            comment="#"))

    if options.method == "filter":

        assert (options.filter_min_counts_per_sample is not None or
                options.filter_min_counts_per_row is not None or
                options.filter_percentile_rowsums is not None), \
            "no filtering parameters have been suplied"

        # filter
        # remove sample with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        nobservations, nsamples = counts.table.shape

        if nobservations == 0:
            E.warn("no observations remaining after filtering- no output")
            return

        if nsamples == 0:
            E.warn("no samples remain after filtering - no output")
            return

        # write out
        counts.table.to_csv(options.stdout, sep="\t", header=True)

    elif options.method == "normalize":

        counts.normalise(method=options.normalization_method,
                         row_title="total")

        # write out
        counts.table.to_csv(options.stdout, sep="\t", header=True)

    elif options.method == "spike":
        # check parameters are sensible and set parameters where they
        # are not explicitly set
        if not options.min_spike:
            E.info("setting minimum number of spikes per bin to equal"
                   "maximum number of spikes per bin (%s)" % options.max_spike)
            options.min_spike = options.max_spike

        if options.spike_type == "cluster":

            assert options.max_sbin <= options.cluster_min_size, \
                ("max size of subscluster: %s is greater than min size of"
                 "cluster: %s" % (options.max_sbin, options.cluster_min_size))

            counts_columns = set(counts.table.columns.values.tolist())

            assert ("contig" in counts_columns and
                    "position" in counts_columns), \
                ("cluster analysis requires columns named 'contig' and"
                 "'position' in the dataframe")

            counts.sort(sort_columns=["contig", "position"], reset_index=True)

        # restrict design table to first pair only
        design.firstPairOnly()

        # get dictionaries to map group members to column names
        # use different methods depending on whether suffixes are supplied
        if options.keep_suffix:
            g_to_keep_tracks, g_to_spike_tracks = design.mapGroupsSuffix(
                options.shuffle_suffix, options.keep_suffix)
        else:
            # if no suffixes supplied, spike and keep tracks are the same
            g_to_track = design.getGroups2Samples()
            g_to_spike_tracks, g_to_keep_tracks = (g_to_track, g_to_track)

        # set up numpy arrays for change and initial values
        change_bins = np.arange(options.min_cbin, options.max_cbin,
                                options.width_cbin)
        initial_bins = np.arange(options.min_ibin, options.max_ibin,
                                 options.width_ibin)

        E.info("Column boundaries are: %s" % str(change_bins))
        E.info("Row boundaries are: %s" % str(initial_bins))

        # shuffle rows/clusters
        if options.spike_type == "cluster":
            E.info("looking for clusters...")
            # the counts table was sorted in place above, so pass the
            # Counts object itself; group names come from the design
            clusters_dict = Counts.findClusters(counts,
                                                options.cluster_max_distance,
                                                options.cluster_min_size,
                                                g_to_spike_tracks,
                                                design.groups)
            if len(clusters_dict) == 0:
                raise Exception("no clusters were found, check parameters")

            E.info("shuffling subcluster regions...")
            # collect the per-bin spike counts under the same name as the
            # row branch so that Counts.thresholdBins below sees them and
            # the Counts object is not overwritten
            output_indices, bin_counts = Counts.shuffleCluster(
                initial_bins, change_bins, g_to_spike_tracks, design.groups,
                options.difference, options.max_spike, options.iterations,
                clusters_dict, options.max_sbin, options.min_sbin,
                options.width_sbin)

        elif options.spike_type == "row":

            E.info("shuffling rows...")
            output_indices, bin_counts = counts.shuffleRows(
                options.min_cbin, options.max_cbin, options.width_cbin,
                options.min_ibin, options.max_ibin, options.width_ibin,
                g_to_spike_tracks, design.groups, options.difference,
                options.max_spike, options.iterations)

        filled_bins = Counts.thresholdBins(output_indices, bin_counts,
                                           options.min_spike)

        assert len(filled_bins) > 0, "No bins contained enough spike-ins"

        # write out
        counts.outputSpikes(filled_bins,
                            g_to_keep_tracks,
                            design.groups,
                            output_method=options.output_method,
                            spike_type=options.spike_type,
                            min_cbin=options.min_cbin,
                            width_cbin=options.width_cbin,
                            max_cbin=options.max_cbin,
                            min_ibin=options.min_ibin,
                            width_ibin=options.width_ibin,
                            max_ibin=options.max_ibin,
                            min_sbin=options.min_sbin,
                            width_sbin=options.width_sbin,
                            max_sbin=options.max_sbin)

    E.stop()
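
The filter branch above delegates to the Counts helper methods. Below is a minimal pandas sketch of the same three filtering steps, assuming a plain counts DataFrame with features as rows and samples as columns; the function name filter_counts_table is illustrative only and is not part of the CGAT Counts API.

import numpy as np
import pandas as pd

def filter_counts_table(df,
                        min_counts_per_sample=None,
                        min_counts_per_row=None,
                        percentile_rowsums=None):
    # drop samples (columns) whose total count is below the threshold
    if min_counts_per_sample is not None:
        df = df.loc[:, df.sum(axis=0) >= min_counts_per_sample]
    # drop rows in which no sample reaches the per-row minimum
    if min_counts_per_row is not None:
        df = df[(df >= min_counts_per_row).any(axis=1)]
    # drop the bottom percentile of rows ranked by row sum
    if percentile_rowsums is not None:
        cutoff = np.percentile(df.sum(axis=1), percentile_rowsums)
        df = df[df.sum(axis=1) > cutoff]
    return df

For example, filter_counts_table(counts_df, min_counts_per_sample=10, percentile_rowsums=10) roughly corresponds to running the script's filter method with the matching options.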
Example no. 29
0
def test_cmdline():
    '''test that scripts follow the standard command line
    interface conventions.
    '''

    # start script in order to build the command line parser
    global ORIGINAL_START
    if ORIGINAL_START is None:
        ORIGINAL_START = E.start

    # read the first two columns
    map_option2action = IOTools.read_map(
        IOTools.open_file(FILENAME_OPTIONLIST),
        columns=(0, 1),
        has_header=True)

    files = []
    for label, expression in EXPRESSIONS:
        f = glob.glob(expression)
        files.extend(sorted(f))

    files = filter_files(files)

    # make sure to use the current working directory as
    # the primary lookup path.
    sys.path.insert(0, ".")

    # files = [
    #    'scripts/check_db.py',
    #    'scripts/cgat_build_report_page.py']

    for f in files:
        if os.path.isdir(f):
            continue
        if os.path.basename(f) in EXCLUDE:
            continue

        script_name = os.path.abspath(f)
        pyxfile = (os.path.join(os.path.dirname(f), "_") +
                   os.path.basename(f) + "x")

        fail_.description = script_name
        # check if script contains getopt
        with IOTools.open_file(script_name) as inf:
            if "getopt" in inf.read():
                yield (fail_, "script uses getopt directly: %s" % script_name)
                continue

        module, modulename = load_script(script_name)
        if module is None:
            yield (fail_, "module could not be imported: %s\n" % script_name)
            continue
        E.start = LocalStart

        try:
            module.main(argv=["dummy", "--help"])
        except AttributeError:
            yield (fail_, "no main method in %s\n" % script_name)
            ok_(False, "no main method in %s" % script_name)
        except SystemExit:
            yield (fail_, "script does not use E.start() %s\n" % script_name)
        except DummyError:
            pass

        for option in PARSER.option_list:
            # ignore options added by optparse
            if option.dest is None:
                continue

            optstring = option.get_opt_string()
            if optstring.startswith("--"):
                optstring = optstring[2:]

            check_option.description = script_name + ":" + optstring

            yield (check_option, optstring, os.path.abspath(f),
                   map_option2action)

        # clear up
        del sys.modules[modulename]

        # scripts with pyximport need special handling.
        #
        # Multiple imports of pyximport seem to create
        # some confusion - here, clear up sys.meta_path after
        # each script
        if os.path.exists(pyxfile):
            sys.meta_path = []
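
The generator above relies on E.start being replaced by a stub that records the script's parser and then aborts execution. A minimal sketch of that pattern follows; the names LocalStart, DummyError and PARSER mirror the globals referenced above, but their definitions here are assumed for illustration rather than copied from the test module.

class DummyError(Exception):
    '''raised by the stubbed E.start to abort the script early'''


PARSER = None


def LocalStart(parser, *args, **kwargs):
    '''stand-in for E.start: keep a reference to the parser, then bail out'''
    global PARSER
    PARSER = parser
    raise DummyError("stopping after the parser has been built")

With such a stub in place, module.main(argv=["dummy", "--help"]) raises DummyError as soon as the parser exists, leaving PARSER available for the per-option checks.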
Example no. 30
0
    def buildIndex(self, filename):
        return Bed.readAndIndex(IOTools.open_file(filename, "r"))
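
Bed.readAndIndex returns a per-contig index of the intervals in the BED file. As a rough, self-contained illustration of that idea (not the actual CGAT implementation), a dictionary of sorted interval lists with a linear-scan overlap query could look like this:

from collections import defaultdict

def build_bed_index(infile):
    '''collect (start, end, name) tuples per contig from a BED stream'''
    index = defaultdict(list)
    for line in infile:
        if not line.strip() or line.startswith(("#", "track", "browser")):
            continue
        fields = line.rstrip("\n").split("\t")
        contig, start, end = fields[0], int(fields[1]), int(fields[2])
        name = fields[3] if len(fields) > 3 else None
        index[contig].append((start, end, name))
    for contig in index:
        index[contig].sort()
    return index

def overlapping(index, contig, start, end):
    '''yield intervals on contig that overlap the half-open range [start, end)'''
    for istart, iend, name in index.get(contig, []):
        if istart < end and start < iend:
            yield istart, iend, name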