Example 1
def parse_commandline(argv=None, **kwargs):
    """parse command line.

    Create option parser and parse command line.

    Arguments
    ---------
    argv : list
        List of command line options to parse. If None, use sys.argv.

    **kwargs: dict
        Additional arguments overwrite default option settings.

    Returns
    -------
    options : object
       Command line options container

    args : list
       List of command line arguments

    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--pipeline-action",
                      dest="pipeline_action",
                      type="choice",
                      choices=("make", "show", "plot", "dump", "config",
                               "clone", "check", "regenerate", "state",
                               "printconfig"),
                      help="action to take [default=%default].")

    parser.add_option("--pipeline-format",
                      dest="pipeline_format",
                      type="choice",
                      choices=("dot", "jpg", "svg", "ps", "png"),
                      help="pipeline format [default=%default].")

    parser.add_option("-n",
                      "--dry-run",
                      dest="dry_run",
                      action="store_true",
                      help="perform a dry run (do not execute any shell "
                      "commands) [default=%default].")

    parser.add_option("-c",
                      "--config-file",
                      dest="config_file",
                      help="benchmark configuration file "
                      "[default=%default].")

    parser.add_option("-f",
                      "--force-run",
                      dest="force_run",
                      type="string",
                      help="force running the pipeline even if there are "
                      "up-to-date tasks. If option is 'all', all tasks "
                      "will be rerun. Otherwise, only the tasks given as "
                      "arguments will be rerun. "
                      "[default=%default].")

    parser.add_option("-p",
                      "--multiprocess",
                      dest="multiprocess",
                      type="int",
                      help="number of parallel processes to use on "
                      "submit host "
                      "(different from number of jobs to use for "
                      "cluster jobs) "
                      "[default=%default].")

    parser.add_option("-e",
                      "--exceptions",
                      dest="log_exceptions",
                      action="store_true",
                      help="echo exceptions immediately as they occur "
                      "[default=%default].")

    parser.add_option("-i",
                      "--terminate",
                      dest="terminate",
                      action="store_true",
                      help="terminate immediately at the first exception "
                      "[default=%default].")

    parser.add_option("-d",
                      "--debug",
                      dest="debug",
                      action="store_true",
                      help="output debugging information on console, "
                      "and not the logfile "
                      "[default=%default].")

    parser.add_option("-s",
                      "--set",
                      dest="variables_to_set",
                      type="string",
                      action="append",
                      help="explicitely set paramater values "
                      "[default=%default].")

    parser.add_option("--checksums",
                      dest="ruffus_checksums_level",
                      type="int",
                      help="set the level of ruffus checksums"
                      "[default=%default].")

    parser.add_option("-t",
                      "--is-test",
                      dest="is_test",
                      action="store_true",
                      help="this is a test run"
                      "[default=%default].")

    parser.add_option("--engine",
                      dest="engine",
                      choices=("local", "arvados"),
                      help="engine to use."
                      "[default=%default].")

    parser.add_option("--always-mount",
                      dest="always_mount",
                      action="store_true",
                      help="force mounting of arvados keep [%default]")

    parser.add_option("--only-info",
                      dest="only_info",
                      action="store_true",
                      help="only update meta information, do not run "
                      "[default=%default].")

    parser.add_option(
        "--work-dir",
        dest="work_dir",
        type="string",
        help="working directory. Will be created if it does not exist "
        "[default=%default].")

    group = E.OptionGroup(parser, "Pipeline logging configuration")

    group.add_option("--pipeline-logfile",
                     dest="pipeline_logfile",
                     type="string",
                     help="primary logging destination."
                     "[default=%default].")

    group.add_option("--shell-logfile",
                     dest="shell_logfile",
                     type="string",
                     help="filename for shell debugging information. "
                     "If it is not an absolute path, "
                     "the output will be written into the current working "
                     "directory. If unset, no logging will be output. "
                     "[default=%default].")

    parser.add_option("--input-validation",
                      dest="input_validation",
                      action="store_true",
                      help="perform input validation before starting "
                      "[default=%default].")

    parser.add_option_group(group)

    parser.set_defaults(pipeline_action=None,
                        pipeline_format="svg",
                        pipeline_targets=[],
                        force_run=False,
                        multiprocess=None,
                        pipeline_logfile="pipeline.log",
                        shell_logfile=None,
                        dry_run=False,
                        log_exceptions=True,
                        engine="local",
                        exceptions_terminate_immediately=None,
                        debug=False,
                        variables_to_set=[],
                        is_test=False,
                        ruffus_checksums_level=0,
                        config_file="benchmark.yml",
                        work_dir=None,
                        always_mount=False,
                        only_info=False,
                        input_validation=False)

    parser.set_defaults(**kwargs)

    if "callback" in kwargs:
        kwargs["callback"](parser)

    logger_callback = setup_logging

    (options, args) = E.start(parser,
                              add_cluster_options=True,
                              argv=argv,
                              logger_callback=logger_callback)

    return options, args
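
A minimal, hypothetical invocation of the parser above, assuming the module-level E, USAGE and setup_logging objects that the function references; the leading argv element stands in for the program name, mirroring sys.argv:

# hypothetical call with explicit arguments instead of sys.argv
options, args = parse_commandline(
    ["benchmark", "--pipeline-action", "make", "--multiprocess", "4"])
print(options.pipeline_action, options.multiprocess, options.config_file)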
Example 2
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--input-filename-fasta", dest="input_filename_fasta", type="string",
        help="filename with reference sequence in fasta format [%default]")

    parser.add_option(
        "--input-filename-bam", dest="input_filename_bam", type="string",
        help="filename with aligned reads [%default]")

    parser.add_option(
        "--method", dest="methods", type="choice", action="append",
        choices=["add-strelka-genotype",
                 "lift-over"],
        help="methods to apply [%default]")

    parser.add_option(
        "--input-filename-chain", dest="input_filename_chain", type="string",
        help="filename with alignment chain for lift-over [%default]")

    parser.add_option(
        "--normal-sample-regex", dest="normal_sample_regex", type="string",
        help="regular expression to apply to header to identify normal "
        "sample id [%default]")

    parser.add_option(
        "--output-filename-unmapped", dest="output_filename_unmapped", type="string",
        help="filename with variants that could not be lifted over [%default]")

    parser.set_defaults(
        input_filename_fasta=None,
        input_filename_bam=None,
        input_filename_vcf="-",
        sample_size=0.001,
        region_size=20,
        methods=[],
        normal_sample_regex=None,
        input_filename_chain=None,
        output_filename_unmapped=None,
    )

    (options, args) = E.start(parser,
                              argv=argv,
                              add_output_options=True)

    if len(args) > 0:
        options.input_filename_vcf = args[0]

    vcf_in = pysam.VariantFile(options.input_filename_vcf)

    if "lift-over" in options.methods:
        if options.input_filename_chain is None:
            raise ValueError("--method=lift-over requires --input-filename-chain")
        if not os.path.exists(options.input_filename_chain):
            raise OSError("file {} with chain data does not exist".format(
                options.input_filename_chain))
        E.info("reading chain from {}".format(options.input_filename_chain))
        with IOTools.open_file(options.input_filename_chain) as inf:
            map_chain, map_contig2length = read_liftover_chain(inf)

    if options.input_filename_fasta:
        fasta = pysam.FastaFile(options.input_filename_fasta)
    else:
        fasta = None

    if options.input_filename_bam:
        bam = pysam.AlignmentFile(options.input_filename_bam)
    else:
        bam = None

    outf = options.stdout

    c = E.Counter()

    if "add-strelka-genotype" in options.methods:
        map_nt2gt = {"ref": "0/0",
                     "het": "0/1",
                     "hom": "1/1",
                     "conflict": "."}

        map_tumour2gt = {"ref": "0/0",
                         "het": "0/1",
                         "hom": "1/1"}

        header = str(vcf_in.header).splitlines()

        header.insert(
            len(header) - 1,
            '##FORMAT=<ID=GT,Number=1,Type=String,Description='
            '"Genotypes of reference and alternative alleles, '
            'added by CGATCore vcf2vcf.">')

        header = "\n".join(header)
        if options.normal_sample_regex:
            normal_sample = re.search(r" -bam-file \S+/([^/]+)_S\d+.bam", header).groups()[0]
        else:
            normal_sample = "NORMAL"

        is_first = True

        for record in vcf_in:
            c.input += 1

            if "GT" in record.format:
                if is_first:
                    outf.write(header + "\n")
                    is_first = False
                outf.write(str(record))
                c.has_gt += 1
                continue

            gt_normal = map_nt2gt[record.info["NT"]]
            gt_tumour = record.info["SGT"]
            norm, tumour = gt_tumour.split("->")
            if gt_tumour[0] in "ACGT":
                alts = record.alts
                if alts is None:
                    c.no_alt += 1
                    continue

                if len(record.alts) > 1:
                    c.multi_allelic += 1
                    continue

                _map_tumour2gt = {
                    record.alts[0]: "1",
                    record.ref: "0"}
                try:
                    gt_tumour = "/".join(
                        sorted([_map_tumour2gt[x] for x in tumour]))
                except KeyError:
                    gt_tumour = "."
                    c.ambiguous_genotype += 1
            else:
                gt_tumour = map_tumour2gt[tumour]

            fields = str(record)[:-1].split("\t")
            # FORMAT
            fields[8] = ":".join(("GT", fields[8]))
            # SAMPLES
            # makes a few assumptions, fix!
            header_insert_normal = False
            if len(fields) == 11:
                fields[9] = ":".join((gt_normal, fields[9]))
                fields[10] = ":".join((gt_tumour, fields[10]))
            elif len(fields) == 10:
                header_insert_normal = True
                values = fields[9].split(":")
                fields.append(":".join((gt_tumour, fields[9])))
                fields[9] = ":".join([gt_normal] + ["."] * len(values))
            else:
                raise NotImplementedError()

            if is_first:
                if not header_insert_normal:
                    outf.write(header + "\n")
                else:
                    header = re.sub(r"\tFORMAT\t",
                                    "\tFORMAT\t%s\t" % normal_sample, header)
                    outf.write(header + "\n")
                is_first = False
            outf.write("\t".join(fields) + "\n")
            c.output += 1

    elif "lift-over" in options.methods:
        header = str(vcf_in.header).splitlines()

        if fasta:
            # validate contig size
            expected_lengths = dict(list(zip(fasta.references, fasta.lengths)))
        else:
            expected_lengths = map_contig2length

        # update contig names and sizes in VCF header
        header = [x for x in header if not x.startswith("##contig")]
        header[-1:-1] = ["##contig=<ID={},length={}>".format(
            contig, length) for contig, length in sorted(expected_lengths.items())]

        header.insert(
            len(header) - 1,
            '##liftover=<CHAIN={},REFERENCE={}>'.format(
                options.input_filename_chain,
                options.input_filename_fasta))
        outf.write("\n".join(header) + "\n")

        unmapped_contigs = set()
        unknown_contigs = set()

        trans_genotypes = str.maketrans("01", "10")

        if fasta:
            # validate contig size
            expected_lengths = dict(list(zip(fasta.references, fasta.lengths)))
            for contig, length in list(map_contig2length.items()):
                if contig in expected_lengths:
                    if length != expected_lengths[contig]:
                        raise ValueError(
                            "contig lengths mismatch. For contig {} chain files "
                            "says {}, but fasta files says {}".format(
                                contig, length, expected_lengths[contig]))
            E.info("contig sizes in chain file and fasta files correspond.")

        if options.output_filename_unmapped:
            outfile_unmapped = IOTools.open_file(options.output_filename_unmapped, "w")
            outfile_unmapped.write("\n".join(header) + "\n")
        else:
            outfile_unmapped = None

        for record in vcf_in:
            c.input += 1

            try:
                mm = map_chain[record.contig]
            except KeyError:
                c.skipped_unmapped_contig += 1
                unmapped_contigs.add(record.contig)
                if outfile_unmapped:
                    outfile_unmapped.write("skipped_unmapped_contig\t{}".format(str(record)))
                continue

            try:
                m = mm.search(record.start, record.stop)
            except AttributeError:
                c.skipped_mapping_error += 1
                if outfile_unmapped:
                    outfile_unmapped.write("skipped_mapping_error\t{}".format(str(record)))
                continue

            if len(m) == 0:
                c.skipped_unmapped_position += 1
                if outfile_unmapped:
                    outfile_unmapped.write("skipped_unmapped_position\t{}".format(str(record)))
                continue
            elif len(m) > 1:
                c.skipped_multimapping_position += 1
                if outfile_unmapped:
                    outfile_unmapped.write("skipped_multimapping_position\t{}".format(str(record)))
                continue

            m = m[0]
            y_contig, y_start, y_end, y_invert = m.data

            if y_invert:
                y_pos = y_end - (record.start - m.start)
            else:
                y_pos = (record.start - m.start) + y_start

            # without a reference fasta the new position cannot be validated
            ref_base = None
            if fasta:
                try:
                    ref_base = fasta.fetch(y_contig, y_pos, y_pos + len(record.ref)).upper()
                except KeyError:
                    c.skipped_unknown_contig += 1
                    unknown_contigs.add(y_contig)
                    continue

            swap_alleles = False
            if ref_base:
                error = False
                if ref_base == record.ref:
                    c.matches += 1
                else:
                    if len(record.alts) == 1:
                        alt_base = record.alts[0]
                        if ref_base == alt_base:
                            swap_alleles = True
                            c.allele_swap_variant += 1
                        else:
                            c.error_mismatch_variant += 1
                            error = "mismatch"
                    else:
                        error = "multi-mismatch"
                        c.error_multi_mismatch_variant += 1

                if error:
                    if outfile_unmapped:
                        outfile_unmapped.write("{}\t{}".format(error, str(record)))
                    c.skipped_error_variant += 1
                    continue

            fields = str(record)[:-1].split("\t")
            fields[0] = y_contig
            fields[1] = str(y_pos)

            if swap_alleles:
                # REF/ALT swap: the target genome carries the former ALT
                # allele as its reference base
                fields[3] = ref_base
                fields[4] = record.ref
                # update genotype fields
                keep = False
                for idx in range(9, len(fields)):
                    gt, rest = fields[idx].split(":", 1)
                    keep = keep or "0" in gt
                    fields[idx] = ":".join((gt.translate(trans_genotypes), rest))

                # remove reference-only calls: skip records where no sample
                # retains the (new) alternative allele
                if not keep:
                    if outfile_unmapped:
                        outfile_unmapped.write("reference_call\t{}".format(str(record)))
                    c.skipped_allele_swap_reference += 1
                    continue

            c.output += 1
            outf.write("\t".join(fields) + "\n")

        c.unmapped_contigs = len(unmapped_contigs)
        c.unknown_contigs = len(unknown_contigs)

        E.info(c.asTable())
        if unknown_contigs:
            E.info("unknown contigs: {}".format(",".join(sorted(unknown_contigs))))
        if unmapped_contigs:
            E.info("unmapped contigs: {}".format(",".join(sorted(unmapped_contigs))))

    E.stop()
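
A small standalone sketch of the coordinate arithmetic used by the lift-over branch above (hypothetical numbers; block_start corresponds to m.start on the source contig, y_start/y_end to the matched block on the target):

def lifted_position(record_start, block_start, y_start, y_end, y_invert):
    # mirrors the y_pos computation in the lift-over branch above
    offset = record_start - block_start
    if y_invert:
        return y_end - offset
    return y_start + offset

# a variant 10 bp into a forward block that starts at 5000 on the target
assert lifted_position(1010, 1000, 5000, 6000, False) == 5010
# on an inverted block the same variant maps 10 bp back from the block end
assert lifted_position(1010, 1000, 5000, 6000, True) == 5990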
Example 3
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("--filter-query", dest="filename_filter_query",
                      type="string",
                      help="filename with intervals in the query "
                      "to filter (in gff format) [default=%default].")

    parser.add_option("--filter-target", dest="filename_filter_target",
                      type="string",
                      help="filename with intervals in the target to "
                      "filter (in gff format) [default=%default].")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("map", "merge",
                               "add-sequence", "complement",
                               "select-query", "test",
                               "filter-keep", "filter-remove",
                               "rename-query",
                               "sanitize",
                               "filter-fasta",
                               "remove-overlapping-query",
                               "remove-overlapping-target"),
                      help="""action to perform [default=%default].""")

    parser.add_option("--select", dest="select", type="choice",
                      choices=("most-nmatches", "least-nmatches",
                               "most-nmismatches", "least-nmismatches"),
                      help="entry to select [default=%default].")

    parser.add_option("--header-names", dest="header", type="choice",
                      choices=("none", "table", "full"),
                      help="output psl header [default=%default].")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("gff", "gtf"),
                      help="format of intervals [default=%default].")

    parser.add_option("--queries-tsv-file", dest="filename_queries",
                      type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_sbjcts",
                      type="string",
                      help="fasta filename with sbjct [default=%default].")

    parser.add_option("--id-format", dest="id_format", type="string",
                      help="format of new identifiers for the rename "
                      "function [default=%default].")

    parser.add_option("--unique", dest="unique", action="store_true",
                      help="in the rename function, make each match "
                      "unique [default=%default].")

    parser.add_option("--output-filename-map", dest="output_filename_map",
                      type="string",
                      help="filename with map of old to new labels for "
                      "rename function [default=%default].")

    parser.add_option("--complement-min-length", dest="complement_min_length",
                      type="int",
                      help="minimum length for complemented blocks "
                      "[default=%default].")

    parser.add_option("--complement-border", dest="complement_border",
                      type="int",
                      help="number of residues to exclude before alignment "
                      "at either end [default=%default].")

    parser.add_option("--complement-aligner", dest="complement_aligner",
                      type="choice",
                      choices=("clustal", "dba", "dialign", "dialign-lgs"),
                      help="aligner for complemented segments "
                      "[default=%default].")

    parser.add_option("--threshold-merge-distance",
                      dest="threshold_merge_distance", type="int",
                      help="distance in nucleotides at which two adjacent "
                      "reads shall be merged even if they are not "
                      "overlapping [%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="for debugging purposes - stop after x "
                      "iterations [default=%default].")

    parser.set_defaults(filename_filter_target=None,
                        filename_filter_query=None,
                        filename_queries=None,
                        filename_sbjcts=None,
                        threshold_merge_distance=0,
                        report_step=100000,
                        min_aligned=100,
                        methods=[],
                        format="gff",
                        select="most-nmatches",
                        id_format="%06i",
                        unique=False,
                        output_filename_map=None,
                        header=None,
                        test=None)

    (options, args) = E.start(parser, add_pipe_options=True)

    if options.filename_queries:
        query_fasta = IndexedFasta.IndexedFasta(options.filename_queries)
    else:
        query_fasta = None

    if options.filename_sbjcts:
        sbjct_fasta = IndexedFasta.IndexedFasta(options.filename_sbjcts)
    else:
        sbjct_fasta = None

    if "add-sequence" in options.methods and \
       (sbjct_fasta is None or query_fasta is None):
        raise ValueError(
            "please supply both indexed query and "
            "target/genome sequence data.")

    iterator = Blat.iterator(options.stdin)

    if options.header is not None and options.header != "none":
        if options.header == "table":
            options.stdout.write("\t".join(Blat.FIELDS) + "\n")
        elif options.header == "full":
            options.stdout.write(Blat.HEADER + "\n")

    for method in options.methods:

        if "map" == method:
            pslMap(options)
            break
        elif "filter-keep" == method:
            pslFilter(options, keep=True)
            break
        elif "filter-remove" == method:
            pslFilter(options, keep=False)
            break
        elif "merge" == method:
            pslMerge(options)
            break
        elif "add-sequence" == method:
            pslAddSequence(query_fasta, sbjct_fasta, options)
            break
        elif "complement" == method:
            pslComplement(query_fasta, sbjct_fasta, options)
            break
        elif "select-query" == method:
            pslSelectQuery(options)
            break
        elif "test" == method:
            iterator = Blat.iterator_test(iterator, options.report_step)
        elif "rename-query" == method:
            iterator = iterator_rename_query(iterator, options)
        elif "sanitize" == method:
            iterator = iterator_sanitize(
                iterator, query_fasta, sbjct_fasta, options)
        elif "filter-fasta" == method:
            iterator = iterator_filter_fasta(
                iterator, query_fasta, sbjct_fasta, options)
        elif "remove-overlapping-query" == method:
            iterator = iterator_filter_overlapping_query(iterator, options)
        elif "remove-overlapping-target" == method:
            iterator = iterator_filter_overlapping_target(iterator, options)

    for psl in iterator:
        options.stdout.write("%s\n" % str(psl))

    E.stop()
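
The method loop above mixes terminal methods, which consume the stream and break, with transforming methods that wrap the iterator before the final output loop. A hypothetical wrapper in the same style, shown only to illustrate the chaining pattern:

def iterator_head(psls, n=10):
    # hypothetical filter in the style of iterator_rename_query above:
    # pass through at most the first n alignments, then stop
    for i, psl in enumerate(psls):
        if i >= n:
            break
        yield psl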
Example 4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: snp2table.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")
    parser.add_option(
        "-a",
        "--annotations-tsv-file",
        dest="filename_annotations",
        type="string",
        help=
        "filename with base annotations (output from gtf2fasta.py) [default=%default]."
    )
    parser.add_option(
        "-f",
        "--exons-file",
        dest="filename_exons",
        type="string",
        help=
        "filename with exon information (gff formatted file)  [default=%default]."
    )
    parser.add_option(
        "-j",
        "--junctions-bed-file",
        dest="filename_junctions",
        type="string",
        help=
        "filename with junction information (filename with exon junctions)  [default=%default]."
    )
    parser.add_option("-c",
                      "--vcf-file",
                      dest="filename_vcf",
                      type="string",
                      help="vcf file to parse [default=%default].")
    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("pileup", "vcf"),
                      help="input format [default=%default].")
    parser.add_option(
        "--vcf-sample",
        dest="vcf_sample",
        type="string",
        help="sample id in vcf file to analyse [default=%default].")

    parser.set_defaults(
        genome_file=None,
        filename_annotations=None,
        filename_exons=None,
        filename_junctions=None,
        input_format="pileup",
        vcf_sample=None,
        filename_vcf=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.filename_junctions:
        junctions = readJunctions(options.filename_junctions)
    else:
        junctions = None

    # setup iterator
    if options.input_format == "pileup":
        iterator = pysam.Pileup.iterate(sys.stdin)
    elif options.input_format == "vcf":
        if not options.vcf_sample:
            raise ValueError(
                "vcf format requires sample id (--vcf-sample) to be set")
        if not options.filename_vcf:
            raise ValueError(
                "reading from vcf requires the vcf filename (--vcf-file) "
                "to be set")

        iterator = pysam.Pileup.iterate_from_vcf(options.filename_vcf,
                                                 options.vcf_sample)

    modules = []
    modules.append(BaseAnnotatorSNP())

    if options.filename_exons:
        modules.append(BaseAnnotatorExons(options.filename_exons, fasta=fasta))
    if options.filename_annotations:
        modules.append(
            BaseAnnotatorCodon(options.filename_annotations,
                               fasta=fasta,
                               junctions=junctions))
    if options.filename_junctions:
        modules.append(
            BaseAnnotatorSpliceSites(options.filename_junctions, fasta=fasta))

    options.stdout.write("\t".join([x.getHeader() for x in modules]) + "\n")

    for snp in iterator:
        ninput += 1

        # translate chromosome according to fasta
        if fasta:
            try:
                snp = snp._replace(chromosome=fasta.getToken(snp.chromosome))
            except KeyError:
                E.warn("unknown contig `%s` for snp `%s`" %
                       (snp.chromosome, str(snp)))
                nskipped += 1
                continue

        for module in modules:
            module.update(snp)

        options.stdout.write("\t".join(map(str, modules)) + "\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
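
The annotator modules used above share an implicit protocol: getHeader() supplies column names, update(snp) is called once per record, and str(module) yields the per-record output. A hypothetical stub following that protocol (the real modules are defined elsewhere in the original package):

class BaseAnnotatorStub:
    """Hypothetical minimal annotator illustrating the module protocol."""

    def getHeader(self):
        return "stub"

    def update(self, snp):
        # remember the last snp seen so that str() can report on it
        self.last = snp

    def __str__(self):
        return str(getattr(self, "last", ""))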
Example 5
def main(argv=sys.argv):

    parser = E.OptionParser(
        version=
        "%prog version: $Id: annotator_distance.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a",
        "--annotations-tsv-file",
        dest="filename_annotations",
        type="string",
        help=
        "filename mapping gene ids to annotations (a tab-separated table with two-columns) [default=%default]."
    )

    parser.add_option("-r",
                      "--resolution",
                      dest="resolution",
                      type="int",
                      help="resolution of count vector [default=%default].")

    parser.add_option(
        "-b",
        "--num-bins",
        dest="num_bins",
        type="int",
        help="number of bins in count vector [default=%default].")

    parser.add_option("-i",
                      "--num-samples",
                      dest="num_samples",
                      type="int",
                      help="sample size to compute [default=%default].")

    parser.add_option(
        "-w",
        "--workspace-bed-file",
        dest="filename_workspace",
        type="string",
        help="filename with workspace information [default=%default].")

    parser.add_option(
        "--workspace-builder",
        dest="workspace_builder",
        type="choice",
        choices=("gff", "gtf-intergenic", "gtf-intronic", "gtf-genic"),
        help="given a gff/gtf file build a workspace [default=%default].")

    parser.add_option(
        "--workspace-labels",
        dest="workspace_labels",
        type="choice",
        choices=("none", "direction", "annotation"),
        help="labels to use for the workspace workspace [default=%default].")

    parser.add_option(
        "--sampler",
        dest="sampler",
        type="choice",
        choices=("permutation", "gaps"),
        help=
        "sampler to use. The sampler determines the null model of how segments are distributed in the workspace  [default=%default]"
    )

    parser.add_option(
        "--counter",
        dest="counters",
        type="choice",
        action="append",
        choices=("transcription", "closest-distance", "all-distances"),
        help=
        "counter to use. The counter computes the quantity of interest [default=%default]"
    )

    parser.add_option("--analysis",
                      dest="analysis",
                      type="choice",
                      action="append",
                      choices=("proximity", "area-under-curve"),
                      help="analysis to perform [default=%default]")

    parser.add_option("--transform-counts",
                      dest="transform_counts",
                      type="choice",
                      choices=("raw", "cumulative"),
                      help="cumulate counts [default=%default].")

    parser.add_option(
        "-s",
        "--segments",
        dest="filename_segments",
        type="string",
        help="filename with segment information [default=%default].")

    parser.add_option("--xrange",
                      dest="xrange",
                      type="string",
                      help="xrange to plot [default=%default]")

    parser.add_option("-o",
                      "--logscale",
                      dest="logscale",
                      type="string",
                      help="use logscale on x, y or xy [default=%default]")

    parser.add_option("-p",
                      "--plot",
                      dest="plot",
                      action="store_true",
                      help="output plots [default=%default]")

    parser.add_option("--hardcopy",
                      dest="hardcopy",
                      type="string",
                      help="output hardcopies to file [default=%default]")

    parser.add_option("--no-fdr",
                      dest="do_fdr",
                      action="store_false",
                      help="do not compute FDR rates [default=%default]")

    parser.add_option("--segments-format",
                      dest="segments_format",
                      type="choice",
                      choices=("gtf", "bed"),
                      help="format of segments file [default=%default].")

    parser.add_option(
        "--truncate",
        dest="truncate",
        action="store_true",
        help="truncate segments extending beyond a workspace [default=%default]"
    )

    parser.add_option(
        "--remove-overhangs",
        dest="remove_overhangs",
        action="store_true",
        help="remove segments extending beyond a workspace[default=%default]")

    parser.add_option(
        "--keep-ambiguous",
        dest="keep_ambiguous",
        action="store_true",
        help=
        "keep segments extending to more than one workspace [default=%default]"
    )

    parser.set_defaults(
        filename_annotations=None,
        filename_workspace="workspace.gff",
        filename_segments="FastDown.gtf",
        filename_annotations_gtf="../data/tg1_territories.gff",
        workspace_builder="gff",
        workspace_labels="none",
        sampler="permutation",
        truncate=False,
        num_bins=10000,
        num_samples=10,
        resolution=100,
        plot_samples=False,
        plot_envelope=True,
        counters=[],
        transform_counts="raw",
        xrange=None,
        plot=False,
        logscale=None,
        output_all=False,
        do_test=False,
        analysis=[],
        do_fdr=True,
        hardcopy="%s.png",
        segments_format="gtf",
        remove_overhangs=False,
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    ###########################################
    # setup options
    if options.sampler == "permutation":
        sampler = SamplerPermutation
    elif options.sampler == "gaps":
        sampler = SamplerGaps

    if options.xrange:
        options.xrange = list(map(float, options.xrange.split(",")))

    if len(options.counters) == 0:
        raise ValueError("please specify at least one counter.")

    if len(options.analysis) == 0:
        raise ValueError("please specify at least one analysis.")

    if options.workspace_labels == "annotation" and not options.filename_annotations:
        raise ValueError(
            "please specify --annotations-tsv-file if "
            "--workspace-labels=annotation.")

    ###########################################
    # read data
    if options.workspace_labels == "annotation":

        def constant_factory(value):
            return itertools.repeat(value).__next__

        def dicttype():
            return collections.defaultdict(constant_factory(("unknown", )))

        map_id2annotations = IOTools.readMultiMap(open(
            options.filename_annotations, "r"),
                                                  dtype=dicttype)
    else:
        map_id2annotations = {}

    workspace = readWorkspace(open(options.filename_workspace,
                                   "r"), options.workspace_builder,
                              options.workspace_labels, map_id2annotations)

    E.info("read workspace for %i contigs" % (len(workspace)))

    indexed_workspace = indexIntervals(workspace, with_values=True)
    segments = readSegments(open(options.filename_segments, "r"),
                            indexed_workspace,
                            format=options.segments_format,
                            keep_ambiguous=options.keep_ambiguous,
                            truncate=options.truncate,
                            remove_overhangs=options.remove_overhangs)

    nsegments = 0
    for contig, vv in segments.items():
        nsegments += len(vv)

    E.info("read %i segments for %i contigs" % (nsegments, len(workspace)))
    indexed_segments = indexIntervals(segments, with_values=False)

    if nsegments == 0:
        E.warn("no segments read - no computation done.")
        E.stop()
        return

    # build labels
    labels = collections.defaultdict(int)
    for contig, vv in workspace.items():
        for start, end, v in vv:
            for l in v[0]:
                labels[l] += 1
            for l in v[1]:
                labels[l] += 1

    E.info("found %i workspace labels" % len(labels))

    ###########################################
    # setup counting containers
    counters = []
    for cc in options.counters:

        if cc == "transcription":
            counter = CounterTranscription
        elif cc == "closest-distance":
            counter = CounterClosestDistance
        elif cc == "all-distances":
            counter = CounterAllDistances

        if nsegments < 256:
            dtype = numpy.uint8
        elif nsegments < 65536:
            dtype = numpy.uint16
        elif nsegments < 4294967296:
            dtype = numpy.uint32
        else:
            dtype = numpy.int64

        E.debug("choosen dtype %s" % str(dtype))

        E.info("samples space is %i bases: %i bins at %i resolution" % (
            options.num_bins * options.resolution,
            options.num_bins,
            options.resolution,
        ))

        E.info("allocating counts: %i bytes (%i labels, %i samples, %i bins)" %
               (
                   options.num_bins * len(labels) * dtype().itemsize *
                   (options.num_samples + 1),
                   len(labels),
                   options.num_samples,
                   options.num_bins,
               ))

        c = CountingResults(labels)
        c.mObservedCounts = counter(labels,
                                    options.num_bins,
                                    options.resolution,
                                    dtype=dtype)

        simulated_counts = []
        for x in range(options.num_samples):
            simulated_counts.append(
                counter(labels,
                        options.num_bins,
                        options.resolution,
                        dtype=dtype))
        c.mSimulatedCounts = simulated_counts
        c.mName = c.mObservedCounts.mName

        counters.append(c)

        E.info("allocated memory successfully")

    segments_per_workspace = []
    segment_sizes = []
    segments_per_label = collections.defaultdict(int)
    workspaces_per_label = collections.defaultdict(int)

    ############################################
    # get observed and simulated counts
    nworkspaces, nempty_workspaces, nempty_contigs, nmiddle = 0, 0, 0, 0
    iteration2 = 0
    for contig, vv in workspace.items():

        iteration2 += 1
        E.info("counting %i/%i: %s %i segments" %
               (iteration2, len(workspace), contig, len(vv)))

        if len(vv) == 0:
            continue

        iteration1 = 0
        for work_start, work_end, v in vv:

            left_labels, right_labels = v[0], v[1]

            iteration1 += 1

            # ignore empty segments
            if contig not in indexed_segments:
                nempty_contigs += 1
                continue

            r = indexed_segments[contig].find(work_start, work_end)
            segments_per_workspace.append(len(r))

            if not r:
                nempty_workspaces += 1
                continue

            # collect segments and stats
            nworkspaces += 1
            observed = [(x.start, x.end) for x in r]
            observed.sort()
            segments_per_workspace.append(len(observed))
            segment_sizes.extend([x[1] - x[0] for x in observed])

            # collect basic counts
            for label in list(left_labels) + list(right_labels):
                workspaces_per_label[label] += 1
                segments_per_label[label] += len(observed)

            # add observed counts
            for counter in counters:
                counter.mObservedCounts.addCounts(observed, work_start,
                                                  work_end, left_labels,
                                                  right_labels)

            # create sampler
            s = sampler(observed, work_start, work_end)

            # add simulated counts
            for iteration in range(options.num_samples):
                simulated = s.sample()
                for counter in counters:
                    counter.mSimulatedCounts[iteration].addCounts(
                        simulated, work_start, work_end, left_labels,
                        right_labels)

    E.info("counting finished")
    E.info(
        "nworkspaces=%i, nmiddle=%i, nempty_workspaces=%i, nempty_contigs=%i" %
        (nworkspaces, nmiddle, nempty_workspaces, nempty_contigs))

    ######################################################
    # transform counts

    if options.transform_counts == "cumulative":
        transform = cumulative_transform
    elif options.transform_counts == "raw":
        transform = normalize_transform

    ####################################################
    # analysis

    if "proximity" in options.analysis:
        outfile_proximity = E.openOutputFile("proximity")
        outfile_proximity.write("\t".join(
            ("label", "observed", "pvalue", "expected", "CIlower", "CIupper",
             "qvalue", "segments", "workspaces")) + "\n")
    else:
        outfile_proximity = None

    if "area-under-curve" in options.analysis:
        outfile_auc = E.openOutputFile("auc")
        outfile_auc.write("label\tobserved\texpected\tCIlower\tCIupper\n")
    else:
        outfile_auc = None

    # qvalue: expected false positives at p-value
    # qvalue = expected false positives /
    if options.do_fdr:
        E.info("computing pvalues for fdr")
        for counter in counters:
            for label in labels:
                E.info("working on counter:%s label:%s" % (counter, label))

                # collect all P-Values of simulated results to compute FDR
                sim_pvalues = []
                medians = counter.getMedians(label)

                for median in medians:
                    pvalue = float(
                        scipy.stats.percentileofscore(medians, median)) / 100.0
                    sim_pvalues.append(pvalue)

        sim_pvalues.sort()
    else:
        sim_pvalues = []

    # compute observed p-values
    for counter in counters:
        counter.update()

    obs_pvalues = []
    for counter in counters:
        for label in labels:
            obs_pvalues.append(counter.mStats[label].pvalue)
        obs_pvalues.sort()

    # compute observed p-values
    if options.do_fdr:
        for counter in counters:
            counter.updateFDR(obs_pvalues, sim_pvalues)

    for counter in counters:

        outofbounds_sim, totals_sim = 0, 0
        outofbounds_obs, totals_obs = 0, 0
        for label in labels:
            for sample in range(options.num_samples):
                if counter.mSimulatedCounts[sample].mOutOfBounds[label]:
                    E.debug(
                        "out of bounds: sample %i, label %s, counts=%i" %
                        (sample, label,
                         counter.mSimulatedCounts[sample].mOutOfBounds[label]))
                    outofbounds_sim += counter.mSimulatedCounts[
                        sample].mOutOfBounds[label]
                totals_sim += counter.mSimulatedCounts[sample].mTotals[label]

            outofbounds_obs += counter.mObservedCounts.mOutOfBounds[label]
            totals_obs += counter.mObservedCounts.mTotals[label]

        E.info(
            "out of bounds observations: observed=%i/%i (%5.2f%%), simulations=%i/%i (%5.2f%%)"
            % (
                outofbounds_obs,
                totals_obs,
                100.0 * outofbounds_obs / totals_obs,
                outofbounds_sim,
                totals_sim,
                100.0 * outofbounds_sim / totals_sim,
            ))

        for label in labels:

            if outfile_auc:
                mmin, mmax, mmean = counter.getEnvelope(
                    label, transform=normalize_transform)
                obs = normalize_transform(
                    counter.mObservedCounts[label],
                    counter.mObservedCounts.mOutOfBounds[label])

                def block_iterator(a1, a2, a3, num_bins):
                    x = 0
                    while x < num_bins:
                        while x < num_bins and a1[x] <= a2[x]:
                            x += 1
                        start = x
                        while x < options.num_bins and a1[x] > a2[x]:
                            x += 1
                        end = x
                        total_a1 = a1[start:end].sum()
                        total_a3 = a3[start:end].sum()
                        if total_a1 > total_a3:
                            yield (total_a1 - total_a3, start, end, total_a1,
                                   total_a3)

                blocks = list(
                    block_iterator(obs, mmax, mmean, options.num_bins))

                if options.output_all:
                    for delta, start, end, total_obs, total_mean in blocks:
                        if end - start <= 1:
                            continue
                        outfile_auc.write(
                            "%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                            (label, start * options.resolution,
                             end * options.resolution,
                             (end - start) * options.resolution, total_obs,
                             total_mean, delta, total_obs / total_mean, 100.0 *
                             (total_obs / total_mean - 1.0)))

                # output best block
                blocks.sort()
                delta, start, end, total_obs, total_mean = blocks[-1]

                outfile_auc.write(
                    "%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                    (label, start * options.resolution,
                     end * options.resolution,
                     (end - start) * options.resolution, total_obs, total_mean,
                     delta, total_obs / total_mean, 100.0 *
                     (total_obs / total_mean - 1.0)))

            if outfile_proximity:

                # find error bars at median
                st = counter.mStats[label]
                outfile_proximity.write(
                    "%s\t%i\t%f\t%i\t%i\t%i\t%s\t%i\t%i\n" % (
                        label,
                        st.observed * options.resolution,
                        st.pvalue,
                        st.expected * options.resolution,
                        st.ci95lower * options.resolution,
                        st.ci95upper * options.resolution,
                        IOTools.val2str(st.qvalue),
                        segments_per_label[label],
                        workspaces_per_label[label],
                    ))

    if options.plot:

        for counter in counters:
            plotCounts(counter, options, transform)

        # plot summary stats
        plt.figure()
        plt.title("distribution of workspace length")
        data = []
        for contig, segs in workspace.items():
            if len(segs) == 0:
                continue
            data.extend([x[1] - x[0] for x in segs])

        vals, bins = numpy.histogram(data,
                                     bins=numpy.arange(0, max(data), 100))

        t = float(sum(vals))
        plt.plot(bins[:-1], numpy.cumsum(vals) / t)
        plt.gca().set_xscale('log')
        plt.legend()
        t = float(sum(vals))
        plt.xlabel("size of workspace")
        plt.ylabel("cumulative relative frequency")
        if options.hardcopy:
            plt.savefig(os.path.expanduser(options.hardcopy %
                                           "workspace_size"))

        plt.figure()
        plt.title("segments per block")
        vals, bins = numpy.histogram(segments_per_workspace,
                                     bins=numpy.arange(
                                         0, max(segments_per_workspace), 1))
        plt.plot(bins[:-1], vals)
        plt.xlabel("segments per block")
        plt.ylabel("absolute frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_block"))

        plt.figure()
        plt.title("workspaces per label")
        plt.barh(list(range(0, len(labels))),
                 [workspaces_per_label[x] for x in labels],
                 height=0.5)
        plt.yticks(list(range(0, len(labels))), labels)
        plt.ylabel("workspaces per label")
        plt.xlabel("absolute frequency")
        plt.gca().set_xscale('log')

        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "workspaces_per_label"))

        plt.figure()
        plt.title("segments per label")
        plt.barh(list(range(0, len(labels))),
                 [segments_per_label[x] for x in labels],
                 height=0.5)
        plt.yticks(list(range(0, len(labels))), labels)
        plt.ylabel("segments per label")
        plt.xlabel("absolute frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_label"))

        if not options.hardcopy:
            plt.show()

    E.stop()
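
The dtype selection above picks the narrowest unsigned integer type able to hold nsegments. The same choice expressed as a small helper, a sketch rather than part of the original script:

import numpy

def smallest_uint_dtype(n):
    # pick the narrowest unsigned integer type able to hold counts up to n
    for dtype in (numpy.uint8, numpy.uint16, numpy.uint32):
        if n <= numpy.iinfo(dtype).max:
            return dtype
    return numpy.uint64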
Example 6
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("bam", ),
                      help="input file format [default=%default].")

    parser.add_option("-w",
                      "--window-size",
                      dest="window_size",
                      type="int",
                      help="window size [default=%default].")

    parser.add_option("-c",
                      "--control-filename",
                      dest="control_filename",
                      type="string",
                      help="filename of input/control data in "
                      "bed format [default=%default].")

    parser.add_option("-t",
                      "--threads",
                      dest="threads",
                      type="int",
                      help="number of threads to use [default=%default].")

    parser.add_option("-q",
                      "--fdr-threshold",
                      dest="fdr_threshold",
                      type="float",
                      help="fdr threshold [default=%default].")

    parser.add_option("-z",
                      "--spp-z-threshold",
                      dest="z_threshold",
                      type="float",
                      help="z threshold [default=%default].")

    parser.add_option("--bin",
                      dest="bin",
                      type="int",
                      help="bin tags within the specified number "
                      " of basepairs to speed up calculation;"
                      " increasing bin size decreases the accuracy "
                      "of the determined parameters [default=%default]")

    parser.add_option("--spp-srange-min",
                      dest="srange_min",
                      type="float",
                      help="srange gives the possible range for the "
                      " size of the protected region;"
                      " srange should be higher than tag length; "
                      " making the upper boundary too high"
                      " will increase calculation time [%default]")

    parser.add_option("--spp-srange-max",
                      dest="srange_max",
                      type="float",
                      help="srange gives the possible range for the "
                      " size of the protected region;"
                      " srange should be higher than tag length; "
                      " making the upper boundary too high"
                      " will increase calculation time [%default]")

    parser.set_defaults(
        input_format="bam",
        threads=1,
        fdr_threshold=0.05,
        window_size=1000,
        offset=125,
        srange_min=50,
        srange_max=500,
        bin=5,
        z_threshold=3,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please specify a filename with sample data and an output file")

    filename_sample, filename_output = args[0], args[1]
    filename_control = options.control_filename

    # load the spp and snow R packages
    R.library('spp')
    R.library('snow')

    # read data
    E.info("reading data")
    R('''chip.data <- read.bam.tags('%s')''' % filename_sample)
    R('''input.data <- read.bam.tags('%s')''' % filename_control)
    R('''cluster = makeCluster( %i )''' % (options.threads))

    E.info("computing binding characteristics")
    # get binding info from cross-correlation profile

    # srange gives the possible range for the size of the protected region;
    # srange should be higher than tag length; making the upper boundary too
    # high will increase calculation time

    # bin - bin tags within the specified number of basepairs to speed
    # up calculation; increasing bin size decreases the accuracy of
    # the determined parameters
    srange_min, srange_max = options.srange_min, options.srange_max
    bin = options.bin
    R('''binding.characteristics <- get.binding.characteristics(chip.data,
    srange=c(%(srange_min)i,%(srange_max)i),
    bin=%(bin)s,
    cluster=cluster);''' % locals())
    # print out binding peak separation distance
    options.stdout.write("shift\t%i\n" %
                         R('''binding.characteristics$peak$x''')[0])

    ##################################################
    ##################################################
    ##################################################
    E.info("plot cross correlation profile")
    # plot cross-correlation profile
    R('''pdf(file="%s.crosscorrelation.pdf",width=5,height=5)''' %
      filename_output)
    R('''par(mar = c(3.5,3.5,1.0,0.5), mgp = c(2,0.65,0), cex = 0.8);''')
    R('''plot(binding.characteristics$cross.correlation,
    type='l',
    xlab="strand shift",
    ylab="cross-correlation");''')
    R('''abline(v=binding.characteristics$peak$x,lty=2,col=2)''')
    R('''dev.off();''')

    E.info("selecting informative tags based on the binding characteristics")
    # select informative tags based on the binding characteristics
    R('''chip.data <- select.informative.tags(
    chip.data,binding.characteristics);''')
    R('''input.data <- select.informative.tags(
    input.data,binding.characteristics);''')

    E.info("outputting broad peaks")
    window_size, z_threshold = options.window_size, options.z_threshold
    R('''broad.clusters <- get.broad.enrichment.clusters(chip.data,input.data,
    window.size=%(window_size)i,
    z.thr=%(z_threshold)f,
    tag.shift=round(binding.characteristics$peak$x/2))''' % locals())
    # write out in broadPeak format
    R('''write.broadpeak.info(broad.clusters,"%s.broadpeak.txt")''' %
      filename_output)

    # binding detection parameters: desired FDR. Alternatively, an
    # E-value can be supplied to the method calls below instead of the
    # fdr parameter. binding.characteristics contains the optimized
    # half-size for the binding detection window.
    R('''detection.window.halfsize <- binding.characteristics$whs;''')

    # determine binding positions using wtd method
    E.info("determining binding positions using wtd method")
    fdr = options.fdr_threshold
    R('''bp <- find.binding.positions(
    signal.data=chip.data,control.data=input.data,
    fdr=%(fdr)f,whs=detection.window.halfsize,cluster=cluster)''' % locals())
    options.stdout.write(
        "detected_peaks\t%i\n" %
        R('''sum(unlist(lapply(bp$npl,function(d) length(d$x))))''')[0])

    # output detected binding positions
    R('''output.binding.results(bp,"%s.summit.txt");''' % filename_output)

    R('''bp <- add.broad.peak.regions(chip.data,input.data,bp,
    window.size=%(window_size)i,z.thr=%(z_threshold)f)''' % locals())
    # output using narrowPeak format
    R('''write.narrowpeak.binding(bp,"%s.narrowpeak.txt")''' % filename_output)

    # write footer and output benchmark information.
    E.stop()
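
# The spp wrapper above derives the strand shift from a cross-correlation
# profile between plus- and minus-strand tag densities.  The following is a
# hedged, self-contained numpy sketch of that idea only; it is not part of
# the script above and all data here are made up.
import numpy

rng = numpy.random.default_rng(0)
plus = numpy.zeros(1000)
plus[rng.integers(0, 900, 50)] = 1        # fake plus-strand tag starts
minus = numpy.roll(plus, 150)             # minus-strand tags offset by 150 bp

shifts = range(300)
correlations = [numpy.corrcoef(plus, numpy.roll(minus, -s))[0, 1]
                for s in shifts]
best_shift = shifts[int(numpy.argmax(correlations))]
print("estimated shift:", best_shift)     # close to 150
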
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=('reconcile', ),
                      help="method to apply [default=%default].")

    parser.add_option("-c",
                      "--chop-identifier",
                      dest="chop",
                      action="store_true",
                      help="whether or not to trim last character of the  "
                      "sequence name. For example sometimes ids in the first "
                      "file in the pair will end with \1 and the second "
                      "with \2. If --chop-identifier is not specified "
                      "then the results will be wrong [default=%default].")

    parser.add_option("-u",
                      "--unpaired",
                      dest="unpaired",
                      action="store_true",
                      help="whether or not to write out unpaired reads "
                      "to a separate file")

    parser.add_option("--id-pattern-1",
                      dest="id_pattern_1",
                      help="If specified will use the first group from the"
                      "pattern to determine the ID for the first read",
                      default=None)

    parser.add_option("--id-pattern-2",
                      dest="id_pattern_2",
                      help="As above but for read 2",
                      default=None)

    parser.add_option("-o",
                      "--output-filename-pattern",
                      dest="output_pattern",
                      type="string",
                      help="pattern for output files [default=%default].")

    parser.set_defaults(
        method="reconcile",
        chop=False,
        unpaired=False,
        output_pattern="%s.fastq.gz",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply at least two fastq files on the commandline")

    fn1, fn2 = args
    c = E.Counter()

    if options.id_pattern_1:
        id1_getter = PatternGetter(options.id_pattern_1)
    else:
        id1_getter = plain_getter

    if options.id_pattern_2:
        id2_getter = PatternGetter(options.id_pattern_2)
    else:
        id2_getter = plain_getter

    if options.method == "reconcile":

        # IMS: switch to not storing the second set of read names and
        # only evaluating it lazily. Since generators do not have a size,
        # keep track of the counts separately.

        def getIds(infile, id_getter=plain_getter):
            '''return ids in infile.'''
            aread = infile.readline
            while True:
                l = [aread().rstrip("\r\n") for i in range(4)]
                if not l[0]:
                    break
                r = id_getter(l[0].split()[0])
                # decide whether to chop the read number off
                id_lengths[infile.name] += 1
                if options.chop:
                    yield r[:-1]
                else:
                    yield r

        def write(outfile,
                  infile,
                  take,
                  unpaired_file=None,
                  id_getter=plain_getter):
            '''filter fastq files with ids in take.'''
            aread = infile.readline
            while True:
                l = [aread().rstrip("\r\n") for i in range(4)]
                if not l[0]:
                    break
                r = id_getter(l[0].split()[0])
                if options.chop:
                    r = r[:-1]
                if r not in take:
                    if unpaired_file is None:
                        continue
                    else:
                        unpaired_file.write("\n".join(l) + "\n")
                else:
                    outfile.write("\n".join(l) + "\n")

        E.info("reading first in pair")
        inf1 = IOTools.open_file(fn1)
        ids1 = set(getIds(inf1, id1_getter))

        E.info("reading second in pair")
        inf2 = IOTools.open_file(fn2)
        # IMS: no longer kept as a set, but evaluated lazily into the
        # intersection. This gives a large memory saving for a large inf2,
        # particularly if inf1 is small.
        ids2 = getIds(inf2, id2_getter)
        take = ids1.intersection(ids2)

        E.info("first pair: %i reads, second pair: %i reads, "
               "shared: %i reads" %
               (id_lengths[fn1], id_lengths[fn2], len(take)))

        if options.unpaired:
            unpaired_filename = IOTools.open_file(
                options.output_pattern % "unpaired", "w")
        else:
            unpaired_filename = None

        with IOTools.open_file(options.output_pattern % "1", "w") as outf:
            inf = IOTools.open_file(fn1)
            E.info("writing first in pair")
            write(outf, inf, take, unpaired_filename, id1_getter)

        with IOTools.open_file(options.output_pattern % "2", "w") as outf:
            inf = IOTools.open_file(fn2)
            E.info("writing second in pair")
            write(outf, inf, take, unpaired_filename, id2_getter)

        if options.unpaired:
            unpaired_filename.close()

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()
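
# The "IMS" comments above describe holding only the first file's read ids
# in a set and streaming the second file's ids straight into
# set.intersection().  A minimal sketch of that pattern, with plain lists
# standing in for fastq files (illustrative only, not the script's API):
def ids_from(records):
    for name in records:                 # stand-in for reading a fastq entry
        yield name.split()[0]

reads1 = ["r1 extra", "r2 extra", "r3 extra"]
reads2 = ["r2 extra", "r3 extra", "r4 extra"]

ids1 = set(ids_from(reads1))                     # materialised once
shared = ids1.intersection(ids_from(reads2))     # second file streamed lazily
print(sorted(shared))                            # ['r2', 'r3']
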
Example no. 8
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d",
                      "--dest",
                      dest="destination",
                      type="string",
                      help="destination directory.")

    parser.add_option(
        "-n",
        "--name",
        "--set-name",
        dest="name",
        type="string",
        help="name of this pipeline. 'pipeline_' will be prefixed.")

    parser.add_option("-f",
                      "--force-output",
                      dest="force",
                      action="store_true",
                      help="overwrite existing files.")

    parser.add_option("-t",
                      "--pipeline-type",
                      dest="pipeline_type",
                      type="choice",
                      choices=("full", "minimal"),
                      help="type of pipeline to output. "
                      "full=a complete pipeline for the CGAT environment "
                      "minimum=minimum pipeline "
                      "[%default]")

    parser.set_defaults(
        destination=".",
        name=None,
        force=False,
        pipeline_type="full",
    )

    (options, args) = E.start(parser)

    if not options.name:
        raise ValueError("please provide a pipeline name")

    destination_dir = os.path.abspath(options.destination)
    reportdir = os.path.join(destination_dir, "src", "pipeline_docs",
                             "pipeline_%s" % options.name)
    confdir = os.path.join(destination_dir, "src",
                           "pipeline_%s" % (options.name))

    # create directories
    for d in ("", "src", "work", "src/pipeline_docs",
              "src/pipeline_%s" % options.name, reportdir,
              "%s/_templates" % reportdir, "%s/pipeline" % reportdir,
              "%s/trackers" % reportdir):

        dd = os.path.join(destination_dir, d)
        if not os.path.exists(dd):
            os.makedirs(dd)

    # copy files
    # replaces all instances of template with options.name within
    # filenames and inside files.
    rx_file = re.compile("template")
    rx_type = re.compile("_%s" % options.pipeline_type)
    rx_template = re.compile("@template@")
    rx_reportdir = re.compile("@reportdir@")

    srcdir = os.path.dirname(__file__)

    def copy(src, dst, name):

        # remove "template" and the pipeline type from file/directory
        # names.
        fn_dest = os.path.join(destination_dir, dst,
                               rx_type.sub("", rx_file.sub(name, src)))

        fn_src = os.path.join(srcdir, "pipeline_template_data", src)

        E.debug("fn_src=%s, fn_dest=%s, src=%s, dest=%s" %
                (fn_src, fn_dest, src, dst))

        if os.path.exists(fn_dest) and not options.force:
            raise OSError("file %s already exists - not overwriting." %
                          fn_dest)

        if fn_src.endswith(".png"):
            shutil.copyfile(fn_src, fn_dest)
        else:
            with IOTools.open_file(fn_dest, "w") as outfile:
                with IOTools.open_file(fn_src) as infile:
                    for line in infile:
                        outfile.write(
                            rx_reportdir.sub(reportdir,
                                             rx_template.sub(name, line)))

    def copytree(src, dst, name):

        fn_dest = os.path.join(destination_dir, dst, rx_file.sub(name, src))
        fn_src = os.path.join(srcdir, "pipeline_template_data", src)

        if os.path.exists(fn_dest) and not options.force:
            raise OSError("file %s already exists - not overwriting." %
                          fn_dest)

        shutil.copytree(fn_src, fn_dest)

    for f in ("pipeline.yml", ):
        copy(f, 'src/pipeline_%s' % options.name, name=options.name)

    # copy the script
    copy("pipeline_template_%s.py" % options.pipeline_type,
         'src',
         name=options.name)

    # create links
    for src, dest in (("pipeline.yml", "pipeline.yml"), ):
        d = os.path.join(destination_dir, "work", dest)
        if os.path.exists(d) and options.force:
            os.unlink(d)
        os.symlink(os.path.join(confdir, src), d)

    for f in ("cgat_logo.png", ):
        copy(f, "%s/_templates" % reportdir, name=options.name)

    for f in ("themes", ):
        copytree(f, "src/pipeline_docs", name=options.name)

    for f in ("contents.rst", "pipeline.rst", "__init__.py"):
        copy(f, reportdir, name=options.name)

    for f in ("Dummy.rst", "Methods.rst"):
        copy(f, "%s/pipeline" % reportdir, name=options.name)

    for f in ("TemplateReport.py", ):
        copy(f, "%s/trackers" % reportdir, name=options.name)

    absdest = os.path.abspath(destination_dir)

    name = options.name

    print("""
Welcome to your new %(name)s CGAT pipeline.

All files have been successfully copied to `%(destination_dir)s`. In
order to start the pipeline, go to `%(destination_dir)s/work`

   cd %(destination_dir)s/work

You can start the pipeline by typing:

   cgatflow %(name)s -v 5 -p 5 make full

The source code for the pipeline is in %(destination_dir)s/src.

""" % locals())

    E.stop()
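
# The copy() helper above renames template files (dropping the pipeline-type
# suffix and substituting the pipeline name) and rewrites @template@ and
# @reportdir@ placeholders inside them.  A small stand-alone illustration of
# the filename part, with made-up values:
import re

name = "mypipeline"
rx_file = re.compile("template")
rx_type = re.compile("_full")

src = "pipeline_template_full.py"
dest = rx_type.sub("", rx_file.sub(name, src))
print(dest)   # pipeline_mypipeline.py
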
Example no. 9
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-p", "--pattern-identifier", dest="pattern", type="string",
        help="jobs matching `pattern` in their job "
        "description will be killed [default=%default].")

    parser.add_option("-n", "--dry-run", dest="dry_run", action="store_true",
                      help="do dry run, do not kill [default=%default].")

    parser.set_defaults(
        pattern=None,
        dry_run=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # qstat -xml returns bytes; decode and wrap in a text buffer for parsing
    import io

    output = io.StringIO(
        subprocess.Popen(["qstat", "-xml"],
                         stdout=subprocess.PIPE).communicate()[0].decode())

    tree = xml.etree.ElementTree.ElementTree(file=output)

    ntested = 0
    to_kill = set()

    if options.pattern:
        pattern = re.compile(options.pattern)
    else:
        pattern = None

    for x in tree.iter("job_list"):
        ntested += 1
        id = x.find("JB_job_number").text
        name = x.find("JB_name").text
        if pattern and pattern.search(name):
            to_kill.add(id)

    nkilled = len(to_kill)
    if not options.dry_run:
        p = subprocess.Popen(
            ["qdel", ",".join(to_kill)], stdout=subprocess.PIPE)
        stdout, stderr = p.communicate()

    E.info("ntested=%i, nkilled=%i" % (ntested, nkilled))

    # write footer and output benchmark information.
    E.stop()
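
# Stand-alone illustration of the qstat -xml parsing above, run against a
# small made-up XML fragment that only mimics the JB_job_number/JB_name
# structure the code relies on (the real qstat schema is richer).
import re
import xml.etree.ElementTree as ET

example_xml = """
<job_info>
  <queue_info>
    <job_list state="running">
      <JB_job_number>101</JB_job_number><JB_name>map_sample1</JB_name>
    </job_list>
    <job_list state="running">
      <JB_job_number>102</JB_job_number><JB_name>qc_sample1</JB_name>
    </job_list>
  </queue_info>
</job_info>
"""

pattern = re.compile("map_")
to_kill = set()
for job in ET.fromstring(example_xml).iter("job_list"):
    if pattern.search(job.find("JB_name").text):
        to_kill.add(job.find("JB_job_number").text)
print(sorted(to_kill))   # ['101']
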
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff_compare.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--output-full",
                      dest="write_full",
                      help="write full gff entries.",
                      action="store_true")
    parser.add_option("-e",
                      "--output-matched-exons",
                      dest="write_matched_exons",
                      help="write matched exons.",
                      action="store_true")
    parser.add_option("-o",
                      "--output-missed-exons",
                      dest="write_missed_exons",
                      action="store_true",
                      help="write missed exons.")
    parser.add_option("-g",
                      "--output-missed-genes",
                      dest="write_missed_genes",
                      action="store_true",
                      help="write missed genes.")
    parser.add_option(
        "-r",
        "--regex-reference",
        dest="regex_reference",
        type="string",
        help="regular expression mapping exon to transcript in reference.")
    parser.add_option(
        "-t",
        "--regex-target",
        dest="regex_target",
        type="string",
        help="regular expression mapping exon to transcript in target.")
    parser.add_option("--no-nucleotides",
                      dest="do_nucleotides",
                      action="store_false",
                      help="skip nucleotide benchmark.")
    parser.add_option("--no-exons",
                      dest="do_exons",
                      action="store_false",
                      help="skip exon benchmark.")
    parser.add_option("--no-genes",
                      dest="do_genes",
                      action="store_false",
                      help="skip gene benchmark.")
    parser.add_option(
        "--output-filename-pattern",
        dest="outfile_pattern",
        type="string",
        help="output filename pattern for extra info "
        "(%s will be substituted with reference,target).")

    parser.set_defaults(
        remove_redundancy=False,
        max_exon_slippage=9,
        write_missed_exons=False,
        write_matched_exons=False,
        write_missed_genes=False,
        write_wrong_exons=False,
        write_wrong_genes=False,
        do_nucleotides=True,
        do_exons=True,
        do_genes=True,
        regex_reference=None,
        regex_target=None,
        outfile_pattern="%s.info",
    )

    (options, args) = E.start(parser)

    if len(args) != 2:
        print(USAGE)
        print("two arguments required")
        sys.exit(1)

    input_filename_target, input_filename_reference = args

    if options.loglevel >= 1:
        print("# target entries from %s" % input_filename_target)
        print("# reading target entries ...", end=' ')
        sys.stdout.flush()

    gff_targets = GTF.readFromFile(open(input_filename_target, "r"))

    if options.loglevel >= 1:
        print("finished: %i" % (len(gff_targets)))
        sys.stdout.flush()

    if options.loglevel >= 1:
        print("# reference entries from %s" % input_filename_reference)
        print("# reading reference entries ...", end=' ')
        sys.stdout.flush()

    gff_references = GTF.readFromFile(open(input_filename_reference, "r"))

    if options.loglevel >= 1:
        print("finished: %i" % (len(gff_references)))
        sys.stdout.flush()

    if options.remove_redundancy:
        gff_targets = GTF.CombineOverlaps(gff_targets)
        gff_references = GTF.CombineOverlaps(gff_references)

        if options.loglevel >= 1:
            print("# after filtering: targets=%i, references=%i" %
                  (len(gff_targets), len(gff_references)))

    ##########################################################################
    # sort exons
    if options.loglevel >= 1:
        print("# sorting exons ...", end=' ')
        sys.stdout.flush()

    gff_targets.sort(key=lambda x: (x.mName, x.strand, x.start, x.end))

    gff_references.sort(key=lambda x: (x.mName, x.strand, x.start, x.end))

    ntargets = len(gff_targets)
    nreferences = len(gff_references)

    if options.loglevel >= 1:
        print("finished")
        sys.stdout.flush()

    ##########################################################################
    # get nucleotide level accuracy
    # process each fragment separately
    if options.do_nucleotides:
        print(
            """############################################################""")

        headers = ("contig", "strand", "tp", "fp", "tn", "fn", "sp", "sn",
                   "cc")

        print("\t".join(headers))

        first_r, first_t = 0, 0
        r, t = 0, 0

        ttp, tfp, ttn, tfn = 0, 0, 0, 0

        # this only works if all contigs in the reference are present in
        # the target.
        while r < nreferences and t < ntargets:

            this_name = gff_references[r].mName
            this_strand = gff_references[r].strand

            # get all in references
            while r < nreferences and \
                    gff_references[r].mName == this_name and \
                    gff_references[r].strand == this_strand:
                r += 1

            # skip over extra contigs in target
            while t < ntargets and \
                (gff_targets[t].mName != this_name or
                 gff_targets[t].strand != this_strand):
                t += 1
            first_t = t

            # get all in targets
            while t < ntargets and \
                    gff_targets[t].mName == this_name and \
                    gff_targets[t].strand == this_strand:
                t += 1

            tp, fp, tn, fn = AnalyseOverlaps(gff_references[first_r:r],
                                             gff_targets[first_t:t])

            spec, sens = CalculateSpecificitySensitivity(tp, fp, tn, fn)
            cc = CalculateCorrelationCoefficient(tp, fp, tn, fn)
            print("%s\t%s\t%i\t%i\t%i\t%i\t%5.2f\t%5.2f\t%5.2f" %
                  (this_name, this_strand, tp, fp, tn, fn, spec, sens, cc))

            ttp += tp
            tfp += fp
            ttn += tn
            tfn += fn
            first_r, first_t = r, t

        spec, sens = CalculateSpecificitySensitivity(ttp, tfp, ttn, tfn)
        cc = CalculateCorrelationCoefficient(ttp, tfp, ttn, tfn)
        print("%s\t%s\t%i\t%i\t%i\t%i\t%5.2f\t%5.2f\t%5.2f" %
              ("all", "all", ttp, tfp, ttn, tfn, spec, sens, cc))

        sys.stdout.flush()

    ##########################################################################
    if options.do_exons or options.do_genes:

        print(
            """############################################################""")

        headers = ("category", "contig", "strand", "tp", "fp", "tn", "fn",
                   "sp", "sn", "cc", "me", "we", "me", "we")

        print("\t".join(headers))

        r, t = 0, 0
        next_r, next_t = r, t

        # strict false positives/negatives
        tp, fp, tn, fn = 0, 0, 0, 0
        ttp, tfp, ttn, tfn = 0, 0, 0, 0
        # partial false positives/negatives
        ptp, pfp, ptn, pfn = 0, 0, 0, 0
        tptp, tpfp, tptn, tpfn = 0, 0, 0, 0

        # missed and wrong exons
        missed_exons, wrong_exons = 0, 0
        tmissed_exons, twrong_exons = 0, 0

        # Flag set, if partial overlap in previous pair
        last_partial_overlap = False
        # Flag set, if partial overlap and reference was last increased
        last_increased_ref = False

        while r < nreferences and t < ntargets:

            this_name = gff_references[r].mName
            this_strand = gff_references[r].strand

            # get overlap segments
            if next_r == r:
                ref_overlaps, next_r, ref_start, ref_end = GetFirstOverlaps(
                    gff_references, r)
            if next_t == t:
                target_overlaps, next_t, target_start, target_end = GetFirstOverlaps(
                    gff_targets, t)

            if options.loglevel >= 3:
                print(
                    "########################################################")
                for x in ref_overlaps:
                    print("#", str(x))
                for x in target_overlaps:
                    print("#", str(x))

            do_summary = False
            # check strand switch in reference
            if next_r < nreferences and \
                (this_name != gff_references[next_r].mName or
                    this_strand != gff_references[next_r].strand):
                if options.loglevel >= 3:
                    print("# target advance")
                do_summary = True

                last_increased_ref = False
                last_partial_overlap = False

                # advance in target until next name is found
                next_name = gff_references[next_r].mName
                next_strand = gff_references[next_r].strand
                while next_t < ntargets and \
                        (next_name != gff_targets[next_t].mName or
                         next_strand != gff_targets[next_t].strand):
                    fp += 1
                    pfp += 1
                    target_overlaps, next_t, target_start, target_end = GetFirstOverlaps(
                        gff_targets, next_t)

                for x in gff_targets[t:next_t]:
                    x.mStatus = "extra"
                for x in gff_references[r:next_r]:
                    x.mStatus = "extra"

                r, t = next_r, next_t
            # check strand switch in target
            elif next_t < ntargets and \
                (this_name != gff_targets[next_t].mName or
                 this_strand != gff_targets[next_t].strand):
                # advance in reference until next name is found
                if options.loglevel >= 3:
                    print("# reference advance")
                do_summary = True

                last_increased_ref = False
                last_partial_overlap = False

                next_name = gff_targets[next_t].mName
                next_strand = gff_targets[next_t].strand
                while next_r < nreferences and \
                        (next_name != gff_references[next_r].mName or
                         next_strand != gff_references[next_r].strand):
                    fn += 1
                    pfn += 1
                    reference_overlaps, next_r, references_start, references_end = GetFirstOverlaps(
                        gff_references, next_r)

                for x in gff_targets[t:next_t]:
                    x.mStatus = "extra"
                for x in gff_references[r:next_r]:
                    x.mStatus = "extra"

                r, t = next_r, next_t
            # otherwise
            else:

                ref_status, target_status = None, None

                if options.loglevel >= 3:
                    print("# same chromosome")

                # overlap between segments
                if min(ref_end, target_end) - max(ref_start, target_start) > 0:

                    # clear flags
                    last_increased_ref = False
                    last_partial_overlap = False
                    found = False

                    for rr in ref_overlaps:
                        xfound = False
                        for tt in target_overlaps:
                            if GTF.Identity(
                                    rr, tt,
                                    max_slippage=options.max_exon_slippage):
                                xfound = True
                                break
                        if xfound:
                            found = True
                            break

                    if found:
                        ref_status = "match"
                        target_status = "match"
                        tp += 1
                        ptp += 1
                        if options.write_matched_exons:
                            print(
                                "############# matching exons ###########################"
                            )
                            for x in ref_overlaps:
                                print("#", str(x))
                            for x in target_overlaps:
                                print("#", str(x))
                    else:
                        fn += 1

                        # check for one-sided matches
                        for rr in ref_overlaps:
                            xfound = False
                            for tt in target_overlaps:
                                if GTF.HalfIdentity(
                                        rr, tt,
                                        max_slippage=options.max_exon_slippage):
                                    xfound = True
                                    break
                            if xfound:
                                found = True
                                break

                        if found:
                            ptp += 1
                            code = "partial"
                            ref_status = "partial"
                            target_status = "partial"
                        else:
                            pfn += 1
                            code = "complete"
                            ref_status = "mismatch"
                            target_status = "mismatch"

                        if options.write_missed_exons:
                            print(
                                "############# %s non-overlapping exons ###########################"
                                % code)
                            for x in ref_overlaps:
                                print("#", str(x))
                            for x in target_overlaps:
                                print("#", str(x))

                    ###########################################################
                    # r, t = next_r, next_t
                    if ref_end == target_end:
                        r, t = next_r, next_t
                    elif ref_end < target_end:
                        r = next_r
                        last_increased_ref = True
                        last_partial_overlap = True
                    else:
                        t = next_t
                        last_increased_ref = False
                        last_partial_overlap = True

                # non-overlap between segments
                else:

                    if ref_end < target_start:

                        # for non-overlap, check whether there was partial overlap before
                        # and reference was not increased.
                        # if there was, just increment reference, but do not
                        # count.

                        if not (last_partial_overlap
                                and not last_increased_ref):

                            if options.write_missed_exons:
                                print(
                                    "############# missed exon ###########################"
                                )
                                for x in ref_overlaps:
                                    print("#", str(x))
                            missed_exons += 1
                            fn += 1
                            pfn += 1
                            ref_status = "extra"

                        r = next_r

                    else:

                        # for non-overlap, check whether there was partial overlap before
                        # and target was not increased.
                        # if there was, just increment target, but do not
                        # count.

                        if not (last_partial_overlap and last_increased_ref):
                            if options.write_wrong_exons:
                                print(
                                    "############# wrong exon ###########################"
                                )
                                for x in target_overlaps:
                                    print("#", str(x))

                            wrong_exons += 1
                            fp += 1
                            pfp += 1
                            target_status = "extra"

                        t = next_t

                    last_partial_overlap = False

                if options.loglevel >= 3:
                    print("# ref_status=%s, target_status=%s" %
                          (ref_status, target_status))

                if ref_status:
                    for rr in ref_overlaps:
                        rr.mStatus = ref_status

                    if ref_status in ("match", "partial") and options.do_genes:
                        for rr in ref_overlaps:
                            rr.mMatches = target_overlaps

                if target_status:
                    for tt in target_overlaps:
                        tt.mStatus = target_status

                    if target_status in ("match",
                                         "partial") and options.do_genes:
                        for tt in target_overlaps:
                            tt.mMatches = ref_overlaps

            if do_summary or r >= nreferences or t >= ntargets:
                ttp += tp
                tfp += fp
                ttn += tn
                tfn += fn

                tptp += ptp
                tpfp += pfp
                tptn += ptn
                tpfn += pfn

                tmissed_exons += missed_exons
                twrong_exons += wrong_exons

                if tp + fn != 0:
                    pmissed_exons = "%5.2f" % (float(missed_exons) / (tp + fn))
                else:
                    pmissed_exons = "0"

                if tp + fp != 0:
                    pwrong_exons = "%5.2f" % (float(wrong_exons) / (tp + fp))
                else:
                    pwrong_exons = "na"

                spec, sens = CalculateSpecificitySensitivity(tp, fp, tn, fn)
                cc = (spec + sens) / 2.0
                print(
                    "full\t%s\t%s\t%i\t%i\t%i\t%i\t%5.2f\t%5.2f\t%5.2f\t%i\t%i\t%s\t%s"
                    % (this_name, this_strand, tp, fp, tn, fn, spec, sens, cc,
                       missed_exons, wrong_exons, pmissed_exons, pwrong_exons))

                spec, sens = CalculateSpecificitySensitivity(
                    ptp, pfp, ptn, pfn)
                cc = (spec + sens) / 2.0
                print(
                    "half\t%s\t%s\t%i\t%i\t%i\t%i\t%5.2f\t%5.2f\t%5.2f\t%i\t%i\t%s\t%s"
                    % (this_name, this_strand, ptp, pfp, ptn, pfn, spec, sens,
                       cc, missed_exons, wrong_exons, pmissed_exons,
                       pwrong_exons))

                tp, fp, tn, fn = 0, 0, 0, 0
                ptp, pfp, ptn, pfn = 0, 0, 0, 0
                missed_exons, wrong_exons = 0, 0

        if t < ntargets:
            for x in gff_targets[t:ntargets]:
                x.mStatus = "extra"
        if r < nreferences:
            for x in gff_references[r:nreferences]:
                x.mStatus = "extra"

        spec, sens = CalculateSpecificitySensitivity(ttp, tfp, ttn, tfn)
        cc = (spec + sens) / 2.0
        print(
            "full\t%s\t%s\t%i\t%i\t%i\t%i\t%5.2f\t%5.2f\t%5.2f\t%i\t%i\t%5.2f\t%5.2f"
            % ("all", "all", ttp, tfp, ttn, tfn, spec, sens, cc, tmissed_exons,
               twrong_exons, float(tmissed_exons) /
               (ttp + tfn), float(twrong_exons) / (ttp + tfp)))

        spec, sens = CalculateSpecificitySensitivity(tptp, tpfp, tptn, tpfn)
        cc = (spec + sens) / 2.0
        print(
            "half\t%s\t%s\t%i\t%i\t%i\t%i\t%5.2f\t%5.2f\t%5.2f\t%i\t%i\t%5.2f\t%5.2f"
            % ("all", "all", tptp, tpfp, tptn, tpfn, spec, sens, cc,
               tmissed_exons, twrong_exons, float(tmissed_exons) /
               (ttp + tfn), float(twrong_exons) / (ttp + tfp)))

    if options.do_genes and \
            options.regex_reference and \
            options.regex_target:

        print(
            """###############################################################"""
        )

        out_options = []
        if options.write_missed_genes:
            out_options.append("missed")

        if options.loglevel >= 2:
            print("# counting matches for reference.")
            sys.stdout.flush()

        (ref_total, ref_match, ref_partial, ref_extra) =\
            CountMatchesPerGene(gff_references,
                                re.compile(options.regex_reference),
                                re.compile(options.regex_target),
                                write=out_options,
                                outfile=open(options.outfile_pattern % "reference", "w"))

        if options.loglevel >= 2:
            print("# counting matches for target.")
            sys.stdout.flush()

        (target_total, target_match, target_partial, target_extra) =\
            CountMatchesPerGene(gff_targets,
                                re.compile(options.regex_target),
                                re.compile(
                                    options.regex_reference),
                                write=out_options,
                                outfile=open(options.outfile_pattern % "target", "w"))

        if options.loglevel >= 1:
            print(
                "# reference: genes=%6i, matches=%6i, partial=%6i, extra=%6i" %
                (ref_total, ref_match, ref_partial, ref_extra))
            print(
                "# target   : genes=%6i, matches=%6i, partial=%6i, extra=%6i" %
                (target_total, target_match, target_partial, target_extra))

        headers = ("category", "tp", "fp", "tn", "fn", "sp", "sn", "cc", "mg",
                   "wg", "mg", "wg")
        print("\t".join(headers))

        tp = ref_match
        fp = target_extra
        tn = 0
        fn = ref_total - ref_match
        wrong_genes = target_extra
        missed_genes = ref_extra

        spec, sens = CalculateSpecificitySensitivity(tp, fp, tn, fn)
        cc = (spec + sens) / 2.0

        if tp + fp == 0:
            fp = nreferences

        print(
            "full\t%i\t%i\t%i\t%i\t%5.2f\t%5.2f\t%5.2f\t%i\t%i\t%5.2f\t%5.2f" %
            (tp, fp, tn, fn, spec, sens, cc, missed_genes, wrong_genes,
             float(missed_genes) / (tp + fn), float(wrong_genes) / (tp + fp)))

        tp = ref_match + ref_partial
        fp = target_extra
        tn = 0
        fn = ref_total - ref_match - ref_partial
        wrong_genes = target_extra
        missed_genes = ref_extra

        spec, sens = CalculateSpecificitySensitivity(tp, fp, tn, fn)
        cc = (spec + sens) / 2.0
        print(
            "half\t%i\t%i\t%i\t%i\t%5.2f\t%5.2f\t%5.2f\t%i\t%i\t%5.2f\t%5.2f" %
            (tp, fp, tn, fn, spec, sens, cc, missed_genes, wrong_genes,
             float(missed_genes) / (tp + fn), float(wrong_genes) / (tp + fp)))

    E.stop()
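
# The per-contig statistics printed above come from helper functions that are
# not shown in this example.  As a hedged sketch, the usual definitions of
# specificity, sensitivity and the Matthews correlation coefficient would be:
import math

def specificity_sensitivity(tp, fp, tn, fn):
    spec = float(tp) / (tp + fp) if tp + fp else 0.0
    sens = float(tp) / (tp + fn) if tp + fn else 0.0
    return spec, sens

def matthews_cc(tp, fp, tn, fn):
    denom = math.sqrt(float(tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return (tp * tn - fp * fn) / denom if denom else 0.0

tp, fp, tn, fn = 900, 100, 800, 200
spec, sens = specificity_sensitivity(tp, fp, tn, fn)
print("%5.2f\t%5.2f\t%5.2f" % (spec, sens, matthews_cc(tp, fp, tn, fn)))
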
Example no. 11
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e",
        "--input-bed-file",
        dest="input_bed_file",
        type="string",
        help="input file with intervals. Tab-delimited file of intervals "
        "in bed format to restrict analysis to. [%default]")

    parser.add_option(
        "-m",
        "--merge-intervals",
        dest="merge_intervals",
        action="store_true",
        help="merge intervals in bed file. Useful if you have a site bed-file "
        "[%default]")

    parser.add_option("-f",
                      "--reference-fasta-file",
                      dest="reference_fasta_file",
                      help="reference genomic sequence in fasta format. "
                      "[%default]")

    parser.add_option(
        "-c",
        "--barcode-fasta-file",
        dest="barcode_fasta_file",
        help="barcode sequence in fasta format. Variable positions "
        "should be marked by N "
        "[%default]")

    parser.set_defaults(
        reference_fasta_file=None,
        barcode_fasta_file=None,
        merge_intervals=False,
        input_bed_file=None,
        anchor=5,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if options.stdin != sys.stdin:
        bamfile = options.stdin.name
    elif args:
        if len(args) > 1:
            raise ValueError("multiple bam files provided in arguments")
        bamfile = args[0]
    else:
        bamfile = "-"

    if options.barcode_fasta_file:
        with pysam.FastxFile(options.barcode_fasta_file) as inf:
            barcode_sequence = next(inf).sequence
    else:
        barcode_sequence = None

    if not os.path.exists(options.reference_fasta_file):
        raise OSError("reference fasta file {} does not exist".format(
            options.reference_fasta_file))

    if not os.path.exists(options.input_bed_file):
        raise OSError("input bed file {} does not exist".format(
            options.input_bed_file))

    bed_in = pysam.TabixFile(options.input_bed_file)
    pysam_in = pysam.AlignmentFile(bamfile)
    anchor = options.anchor

    for region_idx, vals in enumerate(
            iterate_bed(bed_in, options.merge_intervals)):

        if region_idx > 0:
            raise NotImplementedError(
                "output for multiple regions not yet implemented")

        contig, region_start, region_end = vals
        upstream_anchors, downstream_anchors = [], []
        counter = E.Counter()

        unaligned_fn = E.get_output_file(
            "unaligned_{}.fasta".format(region_idx))
        with IOTools.open_file(unaligned_fn, "w") as outf:
            for read in pysam_in.fetch(contig, region_start, region_end):
                counter.overlapping_reads += 1
                try:
                    pairs = read.get_aligned_pairs(with_seq=True)
                except ValueError:
                    counter.no_md_tag += 1
                    continue

                map_ref2read_pos = dict(
                    (x[1], x[0]) for x in pairs if x[0] is not None)
                map_ref2ref_base = dict(
                    (x[1], x[2]) for x in pairs if x[0] is not None)

                upstream_anchor = "".join(
                    map_ref2ref_base.get(x, "")
                    for x in range(region_start - anchor, region_start))

                downstream_anchor = "".join(
                    map_ref2ref_base.get(x, "")
                    for x in range(region_end, region_end + anchor))

                # check if at least one anchor is aligned
                upstream_matches = sum([x.isupper() for x in upstream_anchor])
                downstream_matches = sum(
                    [x.isupper() for x in downstream_anchor])

                if upstream_matches < anchor and downstream_matches < anchor:
                    counter.no_anchor += 1
                    continue
                seq = read.query_alignment_sequence

                # collect full length anchors
                upstream_anchor_start, upstream_anchor_end = region_start - anchor, region_start
                downstream_anchor_start, downstream_anchor_end = region_end, region_end + anchor

                if upstream_anchor_start in map_ref2read_pos and upstream_anchor_end in map_ref2read_pos:
                    upstream_anchors.append(
                        seq[map_ref2read_pos[upstream_anchor_start]:
                            map_ref2read_pos[upstream_anchor_end]])
                if downstream_anchor_start in map_ref2read_pos and downstream_anchor_end in map_ref2read_pos:
                    downstream_anchors.append(
                        seq[map_ref2read_pos[downstream_anchor_start]:
                            map_ref2read_pos[downstream_anchor_end]])

                # get region to align
                read_start = min(
                    (map_ref2read_pos.get(x, len(seq))
                     for x in range(region_start - anchor, region_start)))
                if read_start == len(seq):
                    read_start = 0
                read_end = max(
                    (map_ref2read_pos.get(x, 0) + 1
                     for x in range(region_end, region_end + anchor)))
                if read_end == 1:
                    read_end = len(seq)
                counter.collected_reads += 1
                outf.write(">{}/{}-{}\n{}\n".format(read.query_name,
                                                    read_start, read_end,
                                                    seq[read_start:read_end]))
        counter.downstream_anchors = len(downstream_anchors)
        counter.upstream_anchors = len(upstream_anchors)

        E.info(counter)

        if counter.overlapping_reads == 0:
            E.warn("no sequences overlapping region")
            continue

        if counter.downstream_anchors == 0 or counter.upstream_anchors == 0:
            E.warn("at least one anchor undefined")
            continue

        if counter.collected_reads == 1:
            E.warn("only single sequence, multiple aligment skipped")
            with IOTools.open_file(unaligned_fn) as inf:
                stdout = inf.read()
        else:
            # G-INS-i -> global alignment algorithm
            E.info("starting mafft multiple alignment")
            stdout = E.run(
                "mafft --globalpair --maxiterate 100 --quiet --op 2 --ep 0.5 {}"
                .format(unaligned_fn),
                return_stdout=True)

        aligned_fn = E.get_output_file("aligned_{}.fasta".format(region_idx))
        with IOTools.open_file(aligned_fn, "w") as outf:
            outf.write(stdout)

        mali = stdout.splitlines()
        identifiers = [mali[x] for x in range(0, len(mali), 2)]
        sequences = [mali[x].upper() for x in range(1, len(mali), 2)]
        consensus = get_consensus(sequences)

        E.info("after alignment: consensus={}".format(consensus))

        # gap filtering -> remove highly gappy columns
        consensus = get_consensus(sequences, min_gap_proportion=0.9)

        E.info("after anchor trimming: consensus={}".format(consensus))

        take = [idx for idx, x in enumerate(consensus) if x != "-"]
        sequences = ["".join([s[x] for x in take]) for s in sequences]
        consensus = get_consensus(sequences, min_gap_proportion=0.9)

        E.info("after gap filtering: consensus={}".format(consensus))

        # get anchor consensus and chop it off
        consensus = get_consensus(sequences, ignore_gaps=True)
        upstream_anchor = get_anchor_consensus(upstream_anchors)
        downstream_anchor = get_anchor_consensus(downstream_anchors)

        upstream_anchor_start = consensus.find(upstream_anchor)
        downstream_anchor_start = consensus.rfind(downstream_anchor)

        E.info(
            "anchor consensus (no gaps)={}, upstream={}, downstream={}, upstream_idx={}, downstream_idx={}"
            .format(consensus, upstream_anchor, downstream_anchor,
                    upstream_anchor_start, downstream_anchor_start))

        if upstream_anchor_start < 0 or downstream_anchor_start < 0:
            E.warn("can't locate anchor, no output produced")
            continue

        upstream_anchor_end = upstream_anchor_start + len(upstream_anchor)
        if upstream_anchor_end >= downstream_anchor_start:
            E.warn("anchor not in correct order, no output produced")
            continue

        sequences = [
            x[upstream_anchor_end:downstream_anchor_start] for x in sequences
        ]
        consensus = get_consensus(sequences)

        E.info("after anchor trimming: consensus={}".format(consensus))

        truncated_fn = E.get_output_file(
            "aligned_truncated_{}.fasta".format(region_idx))
        with IOTools.open_file(truncated_fn, "w") as outf:
            outf.write("\n".join("{}\n{}\n".format(x, y)
                                 for x, y in zip(identifiers, sequences)))

        positions = list(zip(*sequences))
        bases = ["A", "C", "G", "T"]
        df = pandas.DataFrame([collections.Counter(x)
                               for x in positions]).fillna(0)
        for missing_base in [x for x in bases if x not in df.columns]:
            df[missing_base] = 0
        df["gapped_depth"] = df.sum(axis=1)
        df["depth"] = df[bases].sum(axis=1)
        df["consensus"] = df[bases].idxmax(axis=1)
        df["consensus_counts"] = df.lookup(df.index, df.consensus)
        df["consensus_support"] = df.consensus_counts / df.depth
        df["offconsensus_counts"] = df.depth - df.consensus_counts
        df.loc[df.consensus_counts == 0, "consensus"] = "N"
        df["region_id"] = region_idx

        # replace "gap" consensus positions with + character
        alignment = global_align(re.sub("-", "+", consensus), barcode_sequence)
        E.info("alignment: consensus {}".format(alignment[0]))
        E.info("alignment: barcode   {}".format(alignment[1]))

        barcode_idx = 0
        deleted_barcode_bases = []
        rows = []
        for c, b in zip(*alignment):
            if c == "-":
                deleted_barcode_bases.append(barcode_idx)
                barcode_idx += 1
            elif b == "N":
                rows.append((barcode_idx, "variable"))
                barcode_idx += 1
            elif b == "-":
                rows.append(("", "insertion"))
            elif b == c:
                rows.append((barcode_idx, "fixed-match"))
                barcode_idx += 1
            else:
                rows.append((barcode_idx, "fixed-mismatch"))
                barcode_idx += 1

        alignment_df = pandas.DataFrame.from_records(
            rows, columns=["barcode_pos", "barcode_class"])

        assert len(alignment_df) == len(df)
        df = pandas.concat([df, alignment_df], axis=1)
        with E.open_output_file("pileup") as outf:
            df.to_csv(outf, sep="\t", index=True, index_label="position")

        observed_barcode_sequence = "".join(
            df[df.barcode_class == "variable"].consensus)
        headers = df.consensus_support.describe().index
        eval_df = df.loc[df.barcode_class.isin(
            ("variable", "fixed-match", "fixed-mismatch")), ]
        median_consensus_depth = eval_df.consensus_counts.median()
        # discard deleted barcode bases if the consensus depth is too low
        if median_consensus_depth <= 2:
            deleted_barcode_bases = []

        outf = options.stdout
        # modules to recover partial bar-codes
        outf.write("\t".join(
            map(str, [
                "barcode", "ndeleted_barcode_bases", "deleted_barcode_bases"
            ] + ["support_{}".format(x)
                 for x in headers] + ["counts_{}".format(x) for x in headers] +
                ["offcounts_{}".format(x) for x in headers])) + "\n")

        outf.write("\t".join(
            map(str, [
                observed_barcode_sequence,
                len(deleted_barcode_bases), ",".join(
                    map(str, deleted_barcode_bases))
            ] + eval_df.consensus_support.describe().tolist() +
                eval_df.consensus_counts.describe().tolist() +
                eval_df.offconsensus_counts.describe().tolist())) + "\n")

    E.stop()
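
# A compact, stand-alone version of the per-column consensus table built
# above from the alignment columns.  It uses the same Counter/DataFrame idea
# but computes consensus support with max() rather than DataFrame.lookup();
# the sequences here are toy data.
import collections
import pandas

sequences = ["ACGT-A", "ACGA-A", "ACGTTA"]
positions = list(zip(*sequences))             # alignment columns
bases = ["A", "C", "G", "T"]

df = pandas.DataFrame([collections.Counter(x) for x in positions]).fillna(0)
for missing_base in [x for x in bases if x not in df.columns]:
    df[missing_base] = 0
df["depth"] = df[bases].sum(axis=1)
df["consensus"] = df[bases].idxmax(axis=1)
df["consensus_support"] = df[bases].max(axis=1) / df["depth"]
print(df[["depth", "consensus", "consensus_support"]])
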
Example no. 12
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option("--time",
                      dest="timepoints",
                      type="string",
                      help="a comma-separated list of time points measured")

    parser.add_option("--replicates",
                      dest="reps",
                      type="string",
                      help="a comma-separated list of replicate IDs")

    parser.add_option("--condition",
                      dest="condition",
                      type="string",
                      help="experimental condition")

    parser.add_option("--resamples",
                      dest="resamples",
                      type="string",
                      help="number of times to resample replicates to"
                      " generate pseudo datasets")

    parser.add_option("--input-gtf",
                      dest="gtf_file",
                      type="string",
                      help="reference gtf file")

    parser.add_option("--output-file-directory",
                      dest="output_dir",
                      type="string",
                      help="directory to output"
                      " resampled files to")

    # add common options (-h/--help, ...) and parse command line

    (options, args) = E.start(parser, argv=argv)

    try:
        infile = IOTools.open_file(argv[-1], "r")
    except IOError:
        infile = options.stdin

    data_frame = pd.read_table(infile, sep="\t", index_col=0, header=0)
    time_str = options.timepoints.split(",")
    time_points = [int(x) for x in time_str]
    replicates = options.reps.split(",")
    reps = int(options.resamples)

    its = [time_str, replicates]
    midx = pd.MultiIndex.from_product(its, names=['times', 'replicates'])

    TS.genResampleData(data_frame=data_frame,
                       multiple_index=midx,
                       replicates=reps,
                       sample_reps=replicates,
                       times=time_points,
                       condition=options.condition,
                       ref_gtf=options.gtf_file,
                       out_dir=options.output_dir,
                       seed=int(options.random_seed))

    # Write footer and output benchmark information.
    E.stop()
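
# Quick illustration of the (time, replicate) MultiIndex constructed above
# with pandas.MultiIndex.from_product, using made-up values:
import pandas as pd

time_str = ["0", "15", "30"]
replicates = ["R1", "R2"]
midx = pd.MultiIndex.from_product([time_str, replicates],
                                  names=["times", "replicates"])
print(list(midx))
# [('0', 'R1'), ('0', 'R2'), ('15', 'R1'), ('15', 'R2'),
#  ('30', 'R1'), ('30', 'R2')]
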
Example no. 13
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option(
        "--task",
        dest="task",
        type="choice",
        choices=["mafs", "penetrance", "detect_duplicates", "allele_diff"],
        help="task to perform")

    parser.add_option("--ped-file",
                      dest="ped_file",
                      type="string",
                      help="plink format .ped file")

    parser.add_option("--map-file",
                      dest="map_file",
                      type="string",
                      help="plink format .map file")

    parser.add_option("--freq-file",
                      dest="mafs",
                      type="string",
                      help="text file containing populations minor "
                      "allele frequencies of variants.  One row per "
                      "variant with ID MAF")

    parser.add_option("--groups-file",
                      dest="group_file",
                      type="string",
                      help="file containing group labels for individuals "
                      "in the provided ped file")

    parser.add_option("--ref-label",
                      dest="ref_label",
                      type="string",
                      help="group label to be used as the reference case")

    parser.add_option("--test-label",
                      dest="test_label",
                      type="string",
                      help="group label to be used as the test case")

    parser.add_option("--subset",
                      dest="subset",
                      type="choice",
                      choices=["cases", "gender"],
                      help="subset the "
                      "data by either case/control or gender")

    parser.add_option("--take-last",
                      dest="take",
                      action="store_true",
                      help="if use duplicates will take the last variant, "
                      "default behaviour is to take the first")

    parser.add_option("--outfile-pattern",
                      dest="out_pattern",
                      type="string",
                      help="outfile pattern to use for finding duplicates "
                      "and triallelic variants")

    parser.add_option("--snp-set",
                      dest="snp_subset",
                      type="string",
                      help="list of SNPs to include")

    parser.set_defaults(mafs=None, subset=None, take=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if options.task == "mafs":
        mafs = gwas.countByVariantAllele(options.ped_file, options.map_file)

        mafs.to_csv(options.stdout, index=False, sep="\t")

    elif options.task == "penetrance":
        summary, pens = gwas.calcPenetrance(options.ped_file,
                                            options.map_file,
                                            subset=options.subset,
                                            mafs=options.mafs,
                                            snpset=options.snp_subset)

        pens.to_csv(options.stdout, sep="\t", index_label="SNP")
        summary.to_csv("/".join([os.getcwd(), "penetrance_summary.txt"]),
                       sep="\t",
                       index_label="SNP")

    elif options.task == "allele_diff":
        allele_diffs = gwas.calcMaxAlleleFreqDiff(
            ped_file=options.ped_file,
            map_file=options.map_file,
            group_file=options.group_file,
            test=options.test_label,
            ref=options.ref_label)

        allele_diffs.to_csv(options.stdout, sep="\t")

    elif options.task == "detect_duplicates":
        # find variants with duplicated position and shared reference
        # allele indicative of triallelic variants - also same ID
        # ouput to a filter list
        infile = argv[-1]
        dups, tris, oves = gwas.findDuplicateVariants(bim_file=infile,
                                                      take_last=options.take)
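        # write each class of problematic variant to its own filter list,
        # resolving the output pattern to an absolute path where necessary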

        if os.path.isabs(options.out_pattern):
            with open(options.out_pattern + ".triallelic", "w") as otfile:
                for tvar in tris:
                    otfile.write("%s\n" % tvar)

            with open(options.out_pattern + ".duplicates", "w") as odfile:
                for dvar in dups:
                    odfile.write("%s\n" % dvar)

            with open(options.out_pattern + ".overlapping", "w") as ovfile:
                for ovar in oves:
                    ovfile.write("%s\n" % ovar)
        else:
            outpattern = os.path.abspath(options.out_pattern)
            with open(outpattern + ".triallelic", "w") as otfile:
                for tvar in tris:
                    otfile.write("%s\n" % tvar)

            with open(outpattern + ".duplicates", "w") as odfile:
                for dvar in dups:
                    odfile.write("%s\n" % dvar)

            with open(outpattern + ".overlapping", "w") as ovfile:
                for ovar in oves:
                    ovfile.write("%s\n" % ovar)

    # write footer and output benchmark information.
    E.stop()
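Example no. 14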
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--target-format",
                      dest="change_format",
                      type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer'),
                      help="set quality scores to format "
                      "[default=%default].")

    parser.add_option(
        "--guess-format",
        dest="guess_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option("--pattern-identifier",
                      dest="pattern",
                      type="string",
                      help="filename prefix [default=%default].")

    parser.set_defaults(change_format=None, guess_format=None, pattern="%s.gz")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    c = E.Counter()

    outfile_seq = IOTools.open_file(options.pattern % "csfasta", "w")
    outfile_qual = IOTools.open_file(options.pattern % "qual", "w")
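    # options.pattern is a filename template (default "%s.gz"); substituting
    # "csfasta" and "qual" names the two output files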

    if options.change_format:
        fastq_iterator = Fastq.iterate_convert(options.stdin,
                                               format=options.change_format,
                                               guess=options.guess_format)
    else:
        fastq_iterator = Fastq.iterate(options.stdin)

    for record in fastq_iterator:
        c.input += 1
        outfile_seq.write(">%s\n%s\n" % (record.identifier, record.seq))
        outfile_qual.write(">%s\n%s\n" % (record.identifier, record.quals))
        c.output += 1

    outfile_seq.close()
    outfile_qual.close()

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()
Example no. 15
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-u",
                      "--ucsc-genome",
                      dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("--extend",
                      dest="extension",
                      type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size",
                      dest="shift",
                      type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size",
                      dest="window_size",
                      type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations",
                      type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t",
                      "--toolset",
                      dest="toolset",
                      type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment", "dmr",
                               "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w",
                      "--bigwig-file",
                      dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment",
                      dest="treatment_files",
                      type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control",
                      dest="control_files",
                      type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input",
                      dest="input_files",
                      type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip",
                      action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata",
                      dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --ouptut-filename-pattern [%default].")

    parser.add_option("--rdata-file",
                      dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold",
                      dest="fdr_threshold",
                      type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method",
                      dest="fdr_method",
                      type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa",
                      dest="bwa",
                      action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    parser.add_option("--unique",
                      dest="unique",
                      type="float",
                      help="Threshold p-value to determine which read pile\
                      ups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms",
                      dest="chroms",
                      type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(input_format="bam",
                        ucsc_genome="Hsapiens.UCSC.hg19",
                        genome_file=None,
                        extend=0,
                        shift=0,
                        window_size=300,
                        saturation_iterations=10,
                        toolset=[],
                        bigwig=False,
                        treatment_files=[],
                        control_files=[],
                        input_files=[],
                        output_rdata=False,
                        input_rdata=None,
                        is_medip=True,
                        fdr_threshold=0.1,
                        fdr_method="BH",
                        bwa=False,
                        unique=0.001,
                        chroms=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin, dialect="excel-tab"):
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" %
                        (line['chr'], int(line['start']), int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return

    if len(options.treatment_files) < 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    if options.chroms is None:
        chrstring = ""
    else:
        chroms = options.chroms.split(",")
        chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)
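    # chrstring is interpolated verbatim into each MEDIPS call below, e.g.
    # ' chr.select=c("chr1","chr2"), ' restricts analysis to those contigs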
    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    saturation_iterations = options.saturation_iterations

    uniq = float(options.unique)

    if options.bwa is True:
        BWA = "TRUE"
    else:
        BWA = "FALSE"

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''sr = MEDIPS.saturation(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            window_size=%(window_size)i,
            uniq=%(uniq)s,
            nit = %(saturation_iterations)i,
            paired = %(paired)s,
            bwa = %(BWA)s,
            %(chrstring)s
            nrit = 1)''' % locals())

            R.png(E.get_output_file("%s_saturation.png" % fn))
            R('''MEDIPS.plotSaturation(sr)''')
            R('''dev.off()''')
            R('''write.table(sr$estimation, file ='%s', sep='\t')''' %
              E.get_output_file("%s_saturation_estimation.tsv" % fn))

            outfile = IOTools.open_file(
                E.get_output_file("%s_saturation.tsv" % fn), "w")
            outfile.write("category\tvalues\n")
            outfile.write("estimated_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxEstCor''')]))
            outfile.write("true_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxTruCor''')]))
            outfile.write("nreads\t%s\n" %
                          ",".join(["%i" % x
                                    for x in R('''sr$numberReads''')]))
            outfile.close()

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''cr = MEDIPS.seqCoverage(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            pattern='CG',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            R.png(E.get_output_file("%s_cpg_coverage_pie.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''')
            R('''dev.off()''')

            R.png(E.get_output_file("%s_cpg_coverage_hist.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "hist", t=15)''')
            R('''dev.off()''')

            # note: this file is large
            R('''write.table(cr$cov.res, file=gzfile('%s','w'),
            sep='\t')''' %
              E.get_output_file("%s_saturation_coveredpos.tsv.gz" % fn))

    if 'enrichment' in options.toolset or do_all:
        E.info("CpG enrichment analysis")
        outfile = IOTools.open_file(E.get_output_file("enrichment.tsv.gz"),
                                    "w")
        slotnames = (("regions.CG", "regions_CG", "%i"),
                     ("regions.C", "regions_C", "%s"),
                     ("regions.G", "regions_G", "%f"),
                     ("regions.relH", "regions_relH", "%i"),
                     ("regions.GoGe", "regions_GoGe", "%i"),
                     ("genome.CG", "genome_CG", "%s"),
                     ("genome.C", "genome_C", "%s"),
                     ("genome.G", "genome_G", "%i"),
                     ("genome.relH", "genome_relH", "%i"),
                     ("enrichment.score.relH", "enrichment_relH", "%s"),
                     ("enrichment.score.GoGe", "enrichment_GoGe", "%s"))

        outfile.write("\t".join(['sample'] + [x[1] for x in slotnames]) + "\n")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''ce = MEDIPS.CpGenrich(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            outfile.write("%s" % fn)
            for slotname, label, pattern in slotnames:
                value = tuple(R('''ce$%s''' % slotname))
                if len(value) == 0:
                    outfile.write("\t")
                else:
                    outfile.write("\t%s" % (pattern % value[0]))
            outfile.write("\n")
        outfile.close()

    if options.input_rdata:
        E.info("reading R session info from '%s'" % options.input_rdata)
        R('''load('%s')''' % options.input_rdata)

    else:
        if "dmr" in options.toolset or "correlation" in options.toolset \
           or do_all:
            # build four sets
            for x, fn in enumerate(options.treatment_files):
                paired = isPaired(fn)
                E.info("loading '%s'" % fn)
                R('''treatment_R%(x)i = MEDIPS.createSet(
                file='%(fn)s',
                BSgenome='%(genome_file)s',
                shift=%(shift)i,
                extend=%(extend)i,
                window_size=%(window_size)i,
                paired=%(paired)s,
                bwa=%(BWA)s,
                %(chrstring)s
                uniq=%(uniq)s)''' % locals())
            R('''treatment_set = c(%s)''' % ",".join([
                "treatment_R%i" % x
                for x in range(len(options.treatment_files))
            ]))

            if options.control_files:
                for x, fn in enumerate(options.control_files):
                    paired = isPaired(fn)
                    E.info("loading '%s'" % fn)
                    R('''control_R%(x)i = MEDIPS.createSet(
                    file='%(fn)s',
                    BSgenome='%(genome_file)s',
                    shift=%(shift)i,
                    extend=%(extend)i,
                    window_size=%(window_size)i,
                    paired=%(paired)s,
                    bwa=%(BWA)s,
                    %(chrstring)s
                    uniq=%(uniq)s)''' % locals())
                R('''control_set = c(%s)''' % ",".join([
                    "control_R%i" % x
                    for x in range(len(options.control_files))
                ]))

            # build coupling vector
            R('''CS = MEDIPS.couplingVector(pattern="CG",
            refObj = treatment_set[[1]])''')

            if "correlation" in options.toolset or do_all:
                R('''cor.matrix = MEDIPS.correlation(
                c(treatment_set, control_set))''')

                R('''write.table(cor.matrix,
                file='%s',
                sep="\t")''' % E.get_output_file("correlation"))

            if "dmr" in options.toolset or do_all:
                # Data that does not fit the model causes
                # "Error in 1:max_signal_index : argument of length 0"
                # The advice is to set MeDIP=FALSE
                # See: http://comments.gmane.org/
                # gmane.science.biology.informatics.conductor/52319

                if options.is_medip:
                    medip = "TRUE"
                else:
                    medip = "FALSE"
                fdr_method = options.fdr_method

                E.info("applying test for differential methylation")
                R('''meth = MEDIPS.meth(
                MSet1 = treatment_set,
                MSet2 = control_set,
                CSet = CS,
                ISet1 = NULL,
                ISet2 = NULL,
                p.adj = "%(fdr_method)s",
                diff.method = "edgeR",
                MeDIP = %(medip)s,
                CNV = F,
                minRowSum = 1)''' % locals())

                # Note: several Gb in size
                # Output full methylation data table
                R('''write.table(meth,
                file=gzfile('%s', 'w'),
                sep="\t",
                row.names=F,
                quote=F)''' % E.get_output_file("data.tsv.gz"))

                # save R session
                if options.output_rdata:
                    R('''save.image(file='%s', safe=FALSE)''' %
                      E.get_output_file("session.RData"))

    # DMR analysis - test for windows and output
    if "dmr" in options.toolset:

        E.info("selecting differentially methylated windows")

        # test windows for differential methylation
        fdr_threshold = options.fdr_threshold
        R('''tested = MEDIPS.selectSig(meth,
        adj=T,
        ratio=NULL,
        p.value=%(fdr_threshold)f,
        bg.counts=NULL,
        CNV=F)''' % locals())

        R('''write.table(tested,
        file=gzfile('%s', 'w'),
        sep="\t",
        quote=F)''' % E.get_output_file("significant_windows.gz"))

        # select gain and merge adjacent windows
        try:
            R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),];
            gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''')
            E.info('gain output: %s, merged: %s' %
                   (str(R('''dim(gain)''')), str(R('''dim(gain_merged)'''))))
            R('''of=gzfile('%s', 'w');
            write.table(gain_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=FALSE,
            col.names=FALSE); close(of)''' % E.get_output_file("gain.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute gain windows: msg=%s" % msg)
        # select loss and merge adjacent windows
        try:
            R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),];
            loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''')
            E.info('loss output: %s, merged: %s' %
                   (str(R('''dim(loss)''')), str(R('''dim(loss_merged)'''))))

            R('''of=gzfile('%s', 'w');
            write.table(loss_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=F,
            col.names=F); close(of)''' % E.get_output_file("loss.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute loss windows: msg=%s" % msg)

    # if "rpm" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rpm.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = T, descr = "rpm")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # if "rms" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rms.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = F, descr = "rms")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # write footer and output benchmark information.
    E.stop()
Example no. 16
def buildOptionParser(argv):

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("bam", "bigwig"),
                      help="format of genomic input files for densities "
                      "[%default]")

    parser.add_option(
        "-o",
        "--use-interval",
        dest="use_interval",
        action="store_true",
        help="only count tags that are in interval given "
        "in bed file. Otherwise, use a fixed width window (see --window-size) "
        "around peak [%default]")

    parser.add_option(
        "-w",
        "--window-size",
        dest="window_size",
        type="int",
        help="window size in bp on either side of a peak used for getting "
        "read densities. If ``--window-size`` is 1000, the actual window size"
        "will be 2kb, 1kb on either side of the peak in an interval"
        "[%default]")

    parser.add_option(
        "-b",
        "--bin-size",
        dest="bin_size",
        type="int",
        help="bin-size in bp for computing read densities. "
        "If ``--window-size`` is set to 1000 and ``--bin-size`` to 10, "
        "there will be 100 bins on either side of a peak. "
        "[%default]")

    parser.add_option(
        "--smooth-method",
        dest="smooth_method",
        type="choice",
        choices=("none", "sum", "sg"),
        help="smooting method to apply to density data before sampling "
        "according to ``bin-size``. sg=SavitzkyGolay, sum=sum density in bin, "
        "none=no smoothing "
        "[%default]")

    parser.add_option("-s",
                      "--sort-order",
                      dest="sort_orders",
                      type="choice",
                      action="append",
                      choices=("peak-height", "peak-width", "unsorted",
                               "interval-width", "interval-score"),
                      help="output sort order for matrices. "
                      "[%default]")

    parser.add_option(
        "-c",
        "--control-bam-file",
        "--control-bigwig-file",
        action="append",
        dest="control_files",
        type="string",
        help="control file. If given, two peakshapes are computed, "
        "one for the primary data and one for the control data. "
        "The control file is centered around the same "
        "base as the primary file and output in the same "
        "sort order as the primary profile to all side-by-side. "
        "comparisons. Multiple control files can be given. The "
        "control files should have the same format as the "
        "principal input file "
        "[%default]")

    parser.add_option(
        "-r",
        "--random-shift",
        dest="random_shift",
        action="store_true",
        help="shift intervals in random direction up/downstream of interval "
        "[%default]")

    parser.add_option("-e",
                      "--centring-method",
                      dest="centring_method",
                      type="choice",
                      choices=("reads", "middle"),
                      help="centring method. Available are: "
                      "reads=use density to determine peak, "
                      "middle=use middle of interval "
                      "[%default]")

    parser.add_option("-n",
                      "--normalize-matrix",
                      dest="normalization",
                      type="choice",
                      choices=("none", "sum"),
                      help="matrix normalisation to perform. "
                      "[%default]")

    parser.add_option(
        "--use-strand",
        dest="strand_specific",
        action="store_true",
        help="use strand information in intervals. Intervals on the "
        "negative strand are flipped "
        "[%default]")

    parser.add_option(
        "-i",
        "--shift-size",
        dest="shift",
        type="int",
        help="shift for reads. When processing bam files, "
        "reads will be shifted upstream/downstream by this amount. "
        "[%default]")

    parser.set_defaults(
        bin_size=10,
        shift=0,
        window_size=1000,
        sort_orders=[],
        centring_method="reads",
        control_files=[],
        random_shift=False,
        strand_specific=False,
        format="bam",
        report_step=100,
        use_interval=False,
        smooth_method=None,
    )

    return parser
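Example no. 17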
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--score-method",
                      dest="method",
                      type="choice",
                      choices=[
                          "PICS", "LDscore", "ABF", "R2_rank", "get_eigen",
                          "calc_prior", "credible_set", "summarise"
                      ],
                      help="SNP scoring/prioritisation method to apply.")

    parser.add_option("--database",
                      dest="database",
                      type="string",
                      help="SQL database containing LD information "
                      "in table format. Expects columns SNP_A, "
                      "SNP_B, R2, BP_A and BP_B (Plink --r2 output)")

    parser.add_option("--ld-directory",
                      dest="ld_dir",
                      type="string",
                      help="directory containing tabix-index BGZIP "
                      "LD files.  Assumes Plink used to calculate LD")

    parser.add_option("--table-name",
                      dest="table",
                      type="string",
                      help="name of the SQL table containing the LD"
                      "values")

    parser.add_option("--chromosome",
                      dest="chromosome",
                      type="string",
                      help="chromosome to subset the association results "
                      "file on")

    parser.add_option("--ld-threshold",
                      dest="ld_threshold",
                      type="float",
                      help="the threshold of LD above which variants will "
                      "be taken forward.")

    parser.add_option("--rank-threshold",
                      dest="rank_threshold",
                      type="float",
                      help="the threshold in terms of the top n% SNPs to "
                      "output based on the ranking metric. e.g. "
                      "--rank-threshold=0.01 is the top 1% SNPs")

    parser.add_option("--credible-interval",
                      dest="interval",
                      type="float",
                      help="The credible set interval size to generate the "
                      "credible set of SNPs")

    parser.add_option("--prior-variance",
                      dest="prior_var",
                      type="float",
                      help="the prior variance used to weight the SNP "
                      "variance")

    parser.add_option("--fine-map-window",
                      dest="map_window",
                      type="int",
                      help="the region size to included around the index "
                      "SNP as the fine-mapping region.")

    parser.add_option("--eigen-score-directory",
                      dest="eigen_dir",
                      type="string",
                      help="PATH to directory containing tabix indexed "
                      "eigen score files")

    parser.add_option("--flat-prior",
                      dest="flat_prior",
                      action="store_true",
                      help="Ignore functional annotation information and "
                      "use an uninformative prior on each SNP")

    parser.add_option("--snp-set",
                      dest="snp_set",
                      type="string",
                      help="Pre-defined SNP set as a list of SNP IDs."
                      "If used to calculate priors contains column of scores.")

    parser.add_option(
        "--distribution",
        dest="dist",
        type="choice",
        choices=["normal", "t", "gamma", "lognormal", "exponential"],
        help="distribution from which to draw prior "
        "probabilities")

    parser.add_option("--distribution-parameters",
                      dest="dist_params",
                      type="string",
                      help="distribution parameters as a comma-separated list")

    parser.add_option("--lead-snp-id",
                      dest="lead_snp",
                      type="int",
                      help="0-based item number in filename")

    parser.add_option("--filename-separator",
                      dest="separator",
                      type="string",
                      help="filename separator to extract information")

    parser.add_option("--snp-column",
                      dest="snp_col",
                      type="int",
                      help="0-based index of SNP ID column number")

    parser.add_option("--probability-column",
                      dest="prob_col",
                      type="int",
                      help="0-based index of posterior probabilities column"
                      " number")

    parser.set_defaults(
        ld_dir=None,
        dist="normal",
        dist_params=None,
        snp_set=None,
        prior_var=0.04,
        interval=0.99,
        eigen_dir=None,
        map_window=100000,
        ld_threshold=0.5,
        database=None,
        table=None,
        flat_prior=False,
        lead_snp=2,
        separator="_",
        snp_col=0,
        prob_col=1,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    infile = argv[-1]

    if len(infile.split(",")) > 1:
        pass
    else:
        peek = pd.read_table(infile, nrows=5, sep="\s*", header=0)
        try:
            if len(peek["TEST"] != "ADD"):
                clean = False
            else:
                clean = True
        except KeyError:
            clean = True

    if options.method == "LDscore":
        snpscores = gwas.snpPriorityScore(gwas_results=infile,
                                          database=options.database,
                                          table_name=options.table,
                                          chromosome=options.chromosome,
                                          ld_dir=options.ld_dir,
                                          clean=clean)
        # take top 1%, all SNPs doesn't achieve anything useful
        ranks = int(len(snpscores.index) * 0.01)
        snpscores = snpscores.iloc[:ranks]

    elif options.method == "PICS":
        snp_list = {}
        if options.snp_set and not options.flat_prior:
            with IOTools.open_file(options.snp_set, "r") as sfile:
                for line in sfile.readlines():
                    snp = line.split("\t")[0]
                    try:
                        score = float(line.split("\t")[-1].rstrip("\n"))
                    except ValueError:
                        score = 0
                    snp_list[snp] = float(score)

            # get the parameter estimates for the distribution
            # if they have not been provided
            if not options.dist_params:
                dist_params = gwas.estimateDistributionParameters(
                    data=snp_list.values(), distribution=options.dist)
            else:
                dist_params = tuple(
                    [float(fx) for fx in options.dist_params.split(",")])

            E.info("Calculating priors on SNPs")
            priors = gwas.calcPriorsOnSnps(snp_list=snp_list,
                                           distribution=options.dist,
                                           params=dist_params)

        elif options.snp_set and options.flat_prior:
            with IOTools.open_file(options.snp_set, "r") as sfile:
                for line in sfile.readlines():
                    snp = line.split("\t")[0]
                    snp_list[snp] = 1.0

            priors = snp_list

        else:
            # allow for no priors or scores to be set,
            # use of priors will be ignored,
            # i.e. when prior and likelihood are not from
            # conjugate distributions
            priors = None

        # PICS scores expects the gwas results file to
        # only contain the region of interest, which
        # represents an independent association signal
        # if a SNP has not been genotyped,
        # but it is in strong LD, it will cause problems
        # downstream <- only allow SNPs that
        # are present in the analysis
        snpscores = gwas.PICSscore(gwas_results=infile,
                                   database=options.database,
                                   table_name=options.table,
                                   chromosome=options.chromosome,
                                   priors=priors,
                                   clean=clean,
                                   ld_dir=options.ld_dir,
                                   ld_threshold=options.ld_threshold)

        snpscores.columns = ["SNP", "PICS"]
        posterior_sum = 0
        snpscores.sort_values(by="PICS", ascending=False, inplace=True)
        post_snps = []
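        # keep SNPs in decreasing score order until the cumulative posterior
        # reaches 99 (the hard-coded credible-set threshold used here)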
        for snp in snpscores.index:
            if posterior_sum < 99.0:
                posterior_sum += snpscores.loc[snp]
                post_snps.append(snp)
            else:
                break

        snpscores = snpscores.loc[post_snps]

        snpscores.drop_duplicates(inplace=True)

    elif options.method == "R2_rank":
        # rank SNPs based on their LD with the lead
        # SNP, take the top n% SNPs
        snpscores = gwas.LdRank(gwas_results=infile,
                                database=options.database,
                                table_name=options.table,
                                ld_dir=options.ld_dir,
                                chromosome=options.chromosome,
                                ld_threshold=options.ld_threshold,
                                top_snps=options.rank_threshold,
                                clean=clean)

    elif options.method == "ABF":
        snpscores = gwas.ABFScore(gwas_results=infile,
                                  region_size=options.map_window,
                                  chromosome=options.chromosome,
                                  prior_variance=options.prior_var,
                                  clean=clean)
    elif options.method == "get_eigen":
        E.info("Fetching Eigen scores")
        snpscores = gwas.getEigenScores(eigen_dir=options.eigen_dir,
                                        bim_file=infile,
                                        snp_file=options.snp_set)
        snpscores = pd.DataFrame(snpscores).T

    elif options.method == "credible_set":
        E.info("Creating credible set")

        snpscores = gwas.makeCredibleSet(probs_file=infile,
                                         credible_set=options.interval,
                                         lead_snp_indx=options.lead_snp,
                                         filename_sep=options.separator,
                                         snp_column=options.snp_col,
                                         probs_column=options.prob_col)

    elif options.method == "summarise":
        E.info("Collating SNP prioritisation resuslts")
        file_list = infile.split(",")
        snpscores = gwas.summariseResults(file_list=file_list)

    snpscores.to_csv(options.stdout, index_label="SNP", sep="\t")

    # write footer and output benchmark information.
    E.stop()
Example no. 18
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--R-scripts",
                      dest="scripts_r",
                      type="string",
                      help="PATH to R scripts and functions")

    parser.add_option("--trait1",
                      dest="trait1",
                      type="string",
                      help="name/column header of trait 1 in the "
                      "input data table")

    parser.add_option("--trait2",
                      dest="trait2",
                      type="string",
                      help="name/column header of trait 2 in the "
                      "input data table")

    parser.add_option("--snp-list",
                      dest="snp_list",
                      type="string",
                      help="optional list of snps on which to "
                      "restrict analysis.")

    parser.add_option("--covariates",
                      dest="covars",
                      type="string",
                      help="column headers that refer to covariates "
                      "to adjust primary traits for")

    parser.add_option("--resamples",
                      dest="resample",
                      type="int",
                      help="number of resamples with replacement "
                      "to use for bootstrapping")

    parser.add_option("--trait1-model",
                      dest="trait1_mod",
                      type="choice",
                      choices=["logistic", "linear"],
                      help="model to use to fit covariates and trait")

    parser.add_option("--trait2-model",
                      dest="trait2_mod",
                      type="choice",
                      choices=["logistic", "linear"],
                      help="model to use to fit covariates and trait")

    parser.set_defaults(resample=999, )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    infile = argv[-1]

    # snp headers are assumed to start with 'rs'
    # read the dataframe with pandas then push
    # it into R

    df = pd.read_table(infile, sep="\t", header=0, index_col=None)

    E.info("Parsing SNP IDs")
    if options.snp_list:
        snp_set = set()
        with open(options.snp_list, "r") as sfile:
            snp_list = [s.rstrip("\n") for s in sfile.readlines()]
            snp_list = set(snp_list)

        for snp in snp_list:
            snp_re = re.compile(snp)
            snp_set.update([sx for sx in df.columns if re.search(snp_re, sx)])
        snps = [st for st in snp_set]
    else:
        snp_re = re.compile("^rs")
        snps = [sx for sx in df.columns if re.search(snp_re, sx)]

    E.info("{} SNPs found in data table".format(len(snps)))

    out_df = pythonWrapper4Pet(dataframe=df,
                               snps=snps,
                               covars=options.covars,
                               scriptsdir=options.scripts_r,
                               trait1=options.trait1,
                               trait2=options.trait2,
                               model1=options.trait1_mod,
                               model2=options.trait2_mod,
                               resamples=options.resample)

    out_df.to_csv(options.stdout, sep="\t", index_label="SNP")

    # write footer and output benchmark information.
    E.stop()
Example no. 19
Type::

   python merge_tables.py --help

for command line help.
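
For example, a hypothetical invocation (the option definitions below are
truncated, so treat the exact flags as an assumption) might be::

   python merge_tables.py --table=first.tsv --table=second.tsv > merged.tsv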

Command line options
--------------------

'''
import sys
import string
import CGATCore.Experiment as E

parser = E.OptionParser(version="%prog version: $Id$")


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser.add_option("-t",
                      "--table",
                      dest="tables",
                      type="string",
Example no. 20
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: fasta2nj.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option(
        "-m",
        "--map",
        dest="filename_map",
        type="string",
        help="filename with mapping of species ids to swissprot species ids.")

    parser.set_defaults(
        separator="|",
        filename_map=None,
    )

    (options, args) = E.start(parser)

    if options.filename_map:
        map_species2sp = IOTools.ReadMap(open(options.filename_map, "r"))
    else:
        map_species2sp = {}

    ninput, noutput, nerrors = 0, 0, 0
    for line in sys.stdin:
        if line[0] == ">":
            ninput += 1

            id = re.match(">([^/ \t]+)", line[:-1]).groups()[0]
            data = id.split(options.separator)

            species = data[0]

            if len(data) == 2:
                gene = data[1]
                transcript = None
            elif len(data) >= 3:
                gene = data[2]
                transcript = data[1]

            if map_species2sp:
                try:
                    species = map_species2sp[species]
                except KeyError:
                    nerrors += 1
                    if options.loglevel >= 1:
                        options.stdlog.write("# could not map species %s\n" %
                                             species)
            if transcript:
                options.stdout.write(">%s_%s GENEID=%s\n" %
                                     (transcript, species, gene))
            else:
                options.stdout.write(">%s_%s\n" % (species, gene))
            noutput += 1
        else:
            options.stdout.write(line)

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nerrors=%i\n" %
                             (ninput, noutput, nerrors))
    E.stop()
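Example no. 21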
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: calculate_histogram_2D.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-t",
        "--titles",
        dest="titles",
        action="store_true",
        help="input data has title in first row [default=%default].")

    parser.add_option(
        "--no-titles",
        dest="titles",
        action="store_false",
        help="input data has no title in first row [default=%default].")

    parser.add_option("-1",
                      "--column1",
                      dest="column1",
                      type="int",
                      help="first column to use [default=%default].")

    parser.add_option("-2",
                      "--column2",
                      dest="column2",
                      type="int",
                      help="second column to use [default=%default].")

    parser.add_option("--bin-size1",
                      dest="bin_size1",
                      type="float",
                      help="bin size for first column [default=%default].")

    parser.add_option("--bin-size2",
                      dest="bin_size2",
                      type="float",
                      help="bin size for second column [default=%default].")

    parser.set_defaults(column1=1,
                        column2=2,
                        bin_size1=1.0,
                        bin_size2=1.0,
                        titles=True)

    (options, args) = E.start(parser)
    options.column1 -= 1
    options.column2 -= 1

    histograms = []

    vals = []

    # retrieve histogram
    lines = [x for x in sys.stdin.readlines() if x[0] != "#"]

    if options.titles:
        data = lines[0][:-1].split("\t")
        print("\t".join(
            (data[options.column1], data[options.column2], "counts")))
        del lines[0]

    ninput, noutput, nskipped = 0, 0, 0

    for l in lines:
        ninput += 1
        data = l[:-1].split("\t")

        try:
            val = list(
                map(float,
                    (data[options.column1], data[options.column2])))
        except IndexError:
            nskipped += 1
            continue
        except ValueError:
            nskipped += 1
            continue

        vals.append(val)
        noutput += 1

    lines = None

    h = Histogram2D.Calculate(
        vals,
        bin_function=lambda x:
        (int(x[0] / options.bin_size1), int(x[1] / options.bin_size2)))

    Histogram2D.Print(
        h,
        bin_function=lambda x:
        (x[0] * options.bin_size1, x[1] * options.bin_size2, x[2]))

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.stop()
Example no. 22
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--region", dest="region", type="string",
        help="region to restrict analysis to [%default]")

    parser.add_option(
        "--window-size", dest="window_size", type="int",
        help="window size to use [%default]")

    parser.add_option(
        "--output-all-windows", dest="output_all_windows", action="store_true",
        help="output all windows. By default, windows without reads are skipped "
        "[%default]")

    parser.add_option(
        "--reference-fasta", "--input-filename-fasta",
        dest="input_filename_fasta", type="string",
        help="filename with reference sequence. If given, used to "
        "compute G+C content in windows [%default]")

    parser.set_defaults(
        force_output=False,
        region=None,
        output_all_windows=False,
        window_size=500,
        input_filename_fasta=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    is_stdin = True
    if len(args) > 0:
        pysam_in = pysam.AlignmentFile(args[0], "rb")
        if args[0] != "-":
            is_stdin = False
    elif options.stdin == sys.stdin:
        pysam_in = pysam.AlignmentFile("-", "rb")
    else:
        pysam_in = pysam.AlignmentFile(options.stdin, "rb")
        if options.stdin != "-":
            is_stdin = False

    if options.input_filename_fasta:
        fasta = pysam.FastaFile(options.input_filename_fasta)
    else:
        fasta = None

    counts_df = bam2stats_window_count(
        pysam_in,
        region=options.region,
        window_size=options.window_size,
        fasta=fasta)

    if not options.output_all_windows:
        counts_df = counts_df[counts_df.alignments > 0]

    # add G+C content
    if fasta:
        counts_df["percent_gc"] = 100.0 * counts_df.bases_gc / (counts_df.bases_gc + counts_df.bases_at)
        counts_df.fillna(0, inplace=True)

    counts_df.to_csv(
        options.stdout,
        sep="\t")

    E.stop()
Example no. 23
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--summarise",
                      dest="summarise",
                      type="choice",
                      choices=("level-counts", "taxa-counts", "individual"),
                      help="summarise the taxa counts - no. phyla etc")

    parser.add_option("--output-map",
                      dest="output_map",
                      action="store_true",
                      help="ouput map of taxonomy")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if options.output_map:
        found = []
        options.stdout.write("""Domain\t \
        kingdom\t \
        phylum\t \
        class\t \
        order\t \
        family\t \
        genus\t \
        species\n""")
        # only output the mapping file - do not continue
        # summarise regardless of the specified options
        for lca in LCA.iterate(options.stdin):

            # if bacteria or archaea the kingdom will
            # be the domain
            if lca.domain == "Bacteria" or lca.domain == "Archaea":
                kingdom = lca.domain
            else:
                kingdom = lca.kingdom

            hierarchy = [
                lca.domain, kingdom, lca.phylum, lca._class, lca.order,
                lca.family, lca.genus, lca.species
            ]
            if hierarchy in found:
                continue
            else:
                found.append(hierarchy)
                options.stdout.write("\t".join(hierarchy) + "\n")
        return

    if options.summarise == "level-counts":
        level_counts = collections.defaultdict(set)
        total = 0
        nreads_domain = 0
        nreads_kingdom = 0
        nreads_kingdom_plus = 0
        nreads_phylum = 0
        nreads_phylum_plus = 0
        nreads_class = 0
        nreads_class_plus = 0
        nreads_order = 0
        nreads_order_plus = 0
        nreads_family = 0
        nreads_family_plus = 0
        nreads_genus = 0
        nreads_genus_plus = 0
        nreads_species = 0
        nreads_species_plus = 0
        nreads_subspecies = 0
        nreads_subspecies_plus = 0

        c = E.Counter()
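        # level_counts records the distinct taxa observed at each level, the
        # nreads_* counters track reads assigned at that level, and c counts
        # reads left unassigned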
        for lca in LCA.iterate(options.stdin):
            total += 1
            if lca.domain != "NA":
                nreads_domain += 1
                level_counts["domain"].add(lca.domain)
            else:
                c.kingdom_unmapped += 1

            if lca.kingdom != "NA":
                nreads_kingdom += 1
                level_counts["kingdom"].add(lca.kingdom)
            else:
                c.kingdom_unmapped += 1

            if lca.kingdom_plus != "NA":
                nreads_kingdom_plus += 1
                level_counts["kingdom+"].add(lca.kingdom_plus)
            else:
                c.kingdom_plus_unmapped += 1

            if lca.phylum != "NA":
                nreads_phylum += 1
                level_counts["phylum"].add(lca.phylum)
            else:
                c.phylum_unmapped += 1

            if lca.phylum_plus != "NA":
                nreads_phylum_plus += 1
                level_counts["phylum+"].add(lca.phylum_plus)
            else:
                c.phylum_plus_unmapped += 1

            if lca._class != "NA":
                nreads_class += 1
                level_counts["class"].add(lca._class)
            else:
                c.class_unmapped += 1

            if lca._class_plus != "NA":
                nreads_class_plus += 1
                level_counts["class+"].add(lca._class_plus)
            else:
                c.class_plus_unmapped += 1

            if lca.order != "NA":
                nreads_order += 1
                level_counts["order"].add(lca.order)
            else:
                c.order_unmapped += 1

            if lca.order_plus != "NA":
                nreads_order_plus += 1
                level_counts["order+"].add(lca.order_plus)
            else:
                c.order_plus_unmapped += 1

            if lca.family != "NA":
                nreads_family += 1
                level_counts["family"].add(lca.family)
            else:
                c.family_unmapped += 1

            if lca.family != "NA":
                nreads_family_plus == 1
                level_counts["family+"].add(lca.family_plus)
            else:
                c.family_plus_unmapped += 1

            if lca.genus != "NA":
                nreads_genus += 1
                level_counts["genus"].add(lca.genus)
            else:
                c.genus_unmapped += 1

            if lca.genus_plus != "NA":
                nreads_genus_plus += 1
                level_counts["genus+"].add(lca.genus_plus)
            else:
                c.genus_plus_unmapped += 1

            if lca.species != "NA":
                nreads_species += 1
                level_counts["species"].add(lca.species)
            else:
                c.species_unmapped += 1

            if lca.species_plus != "NA":
                nreads_species_plus += 1
                level_counts["species+"].add(lca.species_plus)
            else:
                c.species_plus_unmapped += 1

            # removed subspecies mapping for the time
            # being

            # if lca.subspecies != "NA":
            #     nreads_subspecies += 1
            #     level_counts["subspecies"].add(lca.subspecies)
            # else:
            #     c.subspecies_unmapped += 1

            # if lca.subspecies_plus != "NA":
            #     nreads_subspecies_plus += 1
            #     level_counts["subspecies+"].add(lca.subspecies_plus)
            # else:
            #     c.subspecies_plus_unmapped += 1

        options.stdout.write("\t".join([
            "ndomain", "nkingdom", "nkingdom+", "nphylum", "nphylum+",
            "nclass", "nclass+", "norder", "norder+", "nfamily", "nfamily+",
            "ngenus", "ngenus+", "nspecies", "nspecies+", "nseqkingdom",
            "nseqkingdom+", "nseqphylum", "nseqphylum+", "nseqclass",
            "nseqclass+", "nseqorder", "nseqorder+", "nseqfamily",
            "nseqfamily+", "nseqgenus", "nseqgenus+", "nseqspecies",
            "nseqspecies+"
        ]) + "\n")

        options.stdout.write("\t".join(
            map(str, [
                len(level_counts["domain"]),
                len(level_counts["kingdom"]),
                len(level_counts["kingdom+"]),
                len(level_counts["phylum"]),
                len(level_counts["phylum+"]),
                len(level_counts["class"]),
                len(level_counts["class+"]),
                len(level_counts["order"]),
                len(level_counts["order+"]),
                len(level_counts["family"]),
                len(level_counts["family+"]),
                len(level_counts["genus"]),
                len(level_counts["genus+"]),
                len(level_counts["species"]),
                len(level_counts["species+"]), nreads_domain, nreads_kingdom,
                nreads_phylum, nreads_phylum_plus, nreads_class,
                nreads_class_plus, nreads_order, nreads_order_plus,
                nreads_family, nreads_family_plus, nreads_genus,
                nreads_genus_plus, nreads_species, nreads_species_plus
            ])) + "\n")
    elif options.summarise == "taxa-counts":
        unmapped = collections.defaultdict(int)
        total = 0
        taxa_counts = {
            "domain": collections.defaultdict(int),
            "kingdom": collections.defaultdict(int),
            "kingdom+": collections.defaultdict(int),
            "phylum": collections.defaultdict(int),
            "phylum+": collections.defaultdict(int),
            "class": collections.defaultdict(int),
            "class+": collections.defaultdict(int),
            "order": collections.defaultdict(int),
            "order+": collections.defaultdict(int),
            "family": collections.defaultdict(int),
            "family+": collections.defaultdict(int),
            "genus": collections.defaultdict(int),
            "genus+": collections.defaultdict(int),
            "species": collections.defaultdict(int),
            "species+": collections.defaultdict(int)
        }

        c = E.Counter()
        for lca in LCA.iterate(options.stdin):
            total += 1
            if lca.domain != "NA":
                taxa_counts["domain"][lca.domain] += 1
            else:
                c.domain_unmapped += 1
                unmapped["domain"] += 1
            if lca.kingdom != "NA":
                taxa_counts["kingdom"][lca.kingdom] += 1
            else:
                c.kingdom_unmapped += 1
                unmapped["kingdom"] += 1
            if lca.kingdom_plus != "NA":
                taxa_counts["kingdom+"][lca.kingdom_plus] += 1
            else:
                c.kingdom_plus_unmapped += 1
                unmapped["kingdom+"] += 1
            if lca.phylum != "NA":
                taxa_counts["phylum"][lca.phylum] += 1
            else:
                c.phylum_unmapped += 1
                unmapped["phylum"] += 1
            if lca.phylum_plus != "NA":
                taxa_counts["phylum+"][lca.phylum_plus] += 1
            else:
                c.phylum_plus_unmapped += 1
                unmapped["phylum+"] += 1
            if lca._class != "NA":
                taxa_counts["class"][lca._class] += 1
            else:
                c.class_unmapped += 1
                unmapped["class"] += 1
            if lca._class_plus != "NA":
                taxa_counts["class+"][lca._class_plus] += 1
            else:
                c.class_plus_unmapped += 1
                unmapped["class+"] += 1
            if lca.order != "NA":
                taxa_counts["order"][lca.order] += 1
            else:
                c.order_unmapped += 1
                unmapped["order"] += 1
            if lca.order_plus != "NA":
                taxa_counts["order+"][lca.order_plus] += 1
            else:
                c.order_plus_unmapped += 1
                unmapped["order+"] += 1
            if lca.family != "NA":
                taxa_counts["family"][lca.family] += 1
            else:
                c.family_unmapped += 1
                unmapped["family"] += 1
            if lca.family_plus != "NA":
                taxa_counts["family+"][lca.family_plus] += 1
            else:
                c.family_plus_unmapped += 1
                unmapped["family+"] += 1
            if lca.genus != "NA":
                taxa_counts["genus"][lca.genus] += 1
            else:
                c.genus_unmapped += 1
                unmapped["genus"] += 1
            if lca.genus_plus != "NA":
                taxa_counts["genus+"][lca.genus_plus] += 1
            else:
                c.genus_plus_unmapped += 1
                unmapped["genus+"] += 1
            if lca.species != "NA":
                taxa_counts["species"][lca.species] += 1
            else:
                c.species_unmapped += 1
                unmapped["species"] += 1
            if lca.species_plus != "NA":
                taxa_counts["species+"][lca.species_plus] += 1
            else:
                c.species_plus_unmapped += 1
                unmapped["species+"] += 1

        options.stdout.write("level\ttaxa\tcount\tproportion\trpm\n")
        for level, taxa_count in sorted(taxa_counts.items()):
            total_level = total - unmapped[level]
            for taxa, count in sorted(taxa_count.items()):
                options.stdout.write("\t".join([
                    level, taxa,
                    str(count), "{:.8}".format(float(count) /
                                               total_level), "{:.8}".
                    format(float(count) / (float(total_level) / 1000000))
                ]) + "\n")

        E.info(c)

    elif options.summarise == "individual":
        # each read is output with its respective
        # taxon assignments
        options.stdout.write("\t".join([
            "id", "domain", "kingdom", "kingdom+", "phylum", "phylum+",
            "class", "class+", "order", "order+", "family", "family+", "genus",
            "genus+", "species", "species+"
        ]) + "\n")
        for lca in LCA.iterate(options.stdin):
            options.stdout.write("\t".join([
                lca.identifier, lca.domain, lca.kingdom, lca.kingdom_plus,
                lca.phylum, lca.phylum_plus, lca._class, lca._class_plus,
                lca.order, lca.order_plus, lca.family, lca.family_plus,
                lca.genus, lca.genus_plus, lca.species, lca.species_plus
            ]) + "\n")

    # write footer and output benchmark information.
    E.stop()
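
# A minimal, hypothetical sketch (not part of the script above) of the
# proportion and reads-per-million arithmetic used in the "taxa-counts"
# summary: count / mapped_total and count / (mapped_total / 1e6). The helper
# name summarise_taxa is made up for illustration.
import collections


def summarise_taxa(assignments):
    """assignments: iterable of (level, taxon) pairs, with "NA" marking unmapped."""
    counts = collections.defaultdict(lambda: collections.defaultdict(int))
    for level, taxon in assignments:
        if taxon != "NA":
            counts[level][taxon] += 1
    rows = []
    for level, taxon_counts in sorted(counts.items()):
        mapped_total = sum(taxon_counts.values())
        for taxon, n in sorted(taxon_counts.items()):
            proportion = float(n) / mapped_total
            rpm = float(n) / (mapped_total / 1000000.0)
            rows.append((level, taxon, n, proportion, rpm))
    return rows


# two reads assigned to Bacteria and one to Archaea at the domain level
print(summarise_taxa([("domain", "Bacteria"),
                      ("domain", "Bacteria"),
                      ("domain", "Archaea")]))
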
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-fastq-file",
                      dest="input_fastq_file",
                      type="string",
                      help="input fastq file. "
                      "[%default]")

    parser.add_option(
        "--output-removed-tsv",
        dest="output_removed_tsv",
        type="string",
        help="if given, sequence identifiers of removed sequences will "
        "be stored in this file [%default]")

    parser.add_option(
        "--output-stats-tsv",
        dest="output_stats_tsv",
        type="string",
        help="if given, output statistics will be written to this file. "
        "[%default]")

    parser.add_option("--output-removed-fastq",
                      dest="output_removed_fastq",
                      type="string",
                      help="if given, removed fastq records will "
                      "be stored in this file [%default]")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      action="append",
                      type="choice",
                      choices=("filter-N", "filter-identifier", "filter-ONT",
                               "offset-quality"),
                      help="methods to apply [%default]")

    parser.add_option("--set-prefix",
                      dest="set_prefix",
                      type="string",
                      help="set sequence prefix [%default]")

    parser.add_option("--input-filter-tsv",
                      dest="input_filter_tsv",
                      type="string",
                      help="list of sequence ides to filter [%default]")

    parser.add_option("--min-average-quality",
                      dest="min_average_quality",
                      type="float",
                      help="minimum average quality [%default]")

    parser.add_option("--min-length",
                      dest="min_length",
                      type="int",
                      help="minimum length [%default]")

    parser.add_option("--quality-offset",
                      dest="quality_offset",
                      type="int",
                      help="offset to modify quality values with [%default]")

    parser.set_defaults(
        methods=[],
        max_percent_N=10.0,
        input_fastq_file=None,
        set_prefix=None,
        output_removed_tsv=None,
        output_removed_fastq=None,
        output_stats_tsv=None,
        input_filter_tsv=None,
        min_average_quality=0,
        min_length=0,
        quality_offset=0,
    )

    (options, args) = E.start(parser, argv)

    if len(args) == 1:
        options.input_fastq_file = args[0]

    if options.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    counter = E.Counter()

    filter_n = "filter-N" in options.methods

    filter_ont = "filter-ONT" in options.methods

    if "filter-identifier" in options.methods:
        if options.input_filter_tsv is None:
            raise ValueError(
                "please set --input-filter-tsv for method filter-identifier")
        with IOTools.open_file(options.input_filter_tsv) as inf:
            filter_identifier = set(
                [x.split()[0].strip() for x in inf.readlines()])
    else:
        filter_identifier = False

    if options.output_removed_tsv:
        outf_removed_tsv = IOTools.open_file(options.output_removed_tsv, "w")
    else:
        outf_removed_tsv = None

    if options.output_removed_fastq:
        outf_removed_fastq = IOTools.open_file(options.output_removed_fastq,
                                               "w")
    else:
        outf_removed_fastq = None

    if options.set_prefix:
        prefix = "{}".format(options.set_prefix)
    else:
        prefix = None

    quality_offset = options.quality_offset

    with pysam.FastxFile(options.input_fastq_file) as inf:
        for read in inf:
            counter.input += 1
            remove = False
            if filter_n:
                chars = collections.Counter(read.sequence)
                if "N" in chars and \
                   100.0 * chars["N"] / len(read.sequence) > options.max_percent_N:
                    remove = True
                    counter.filter_n += 1

            if filter_identifier:
                if read.name not in filter_identifier:
                    counter.filter_identifier += 1
                    remove = True

            if filter_ont:
                quals = read.get_quality_array()
                n = len(quals)
                if n < options.min_length or \
                        float(sum(quals)) / n < options.min_average_quality:
                    counter.remove_ont += 1
                    remove = True

            if remove:
                counter.removed += 1
                if outf_removed_tsv:
                    outf_removed_tsv.write(read.name + "\n")
                if outf_removed_fastq:
                    outf_removed_fastq.write(str(read) + "\n")
                continue

            if prefix:
                read.name = prefix + read.name[2:]

            if quality_offset:
                quals = numpy.array(read.get_quality_array())
                quals += quality_offset
                quals[quals < 0] = 0
                quals += 33
                # pysam fastq is read-only, so fudge it:
                # Note: not outputting description
                read = "@{}\n{}\n+\n{}".format(
                    read.name, read.sequence, "".join([chr(x) for x in quals]))

            counter.output += 1

            options.stdout.write(str(read) + "\n")

    if outf_removed_tsv:
        outf_removed_tsv.close()

    if outf_removed_fastq:
        outf_removed_fastq.close()

    if options.output_stats_tsv:
        with IOTools.open_file(options.output_stats_tsv, "w") as outf:
            outf.write(counter.asTable(as_rows=False) + "\n")

    E.info(counter)
    E.stop()
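
# A small, hypothetical sketch (not part of the script above) of the quality
# re-encoding applied for --quality-offset: shift each numeric Phred score,
# clamp at zero, and re-encode the string as Phred+33. The helper name
# shift_quality_string is made up for illustration.
def shift_quality_string(qual_string, offset):
    """Return a new Phred+33 quality string with `offset` added to each score."""
    shifted = []
    for ch in qual_string:
        q = ord(ch) - 33          # decode Phred+33
        q = max(q + offset, 0)    # apply offset, clamp negative values at 0
        shifted.append(chr(q + 33))
    return "".join(shifted)


# lower every quality score by 5
print(shift_quality_string("IIIII#", -5))
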
Esempio n. 25
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--filename",
                      dest="filename",
                      type="string",
                      help="bamfile")

    parser.add_option("-a",
                      "--aligner",
                      dest="aligner",
                      type="string",
                      help="bamfile",
                      default="bwa")

    parser.add_option("-r",
                      "--output-report",
                      type="string",
                      dest="report",
                      help="bamfile",
                      default="")

    parser.add_option("-o",
                      "--outfile",
                      dest="outfile",
                      type="string",
                      help="bamfile",
                      default="")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # Check the aligner is supported
    if options.aligner != "bwa":
        raise ValueError(
            "Currently only bwa is supported as aligner specific flags are used"
        )

    # Check that either a report or outfile name has been specified
    if options.report == "" and options.outfile == "":
        raise ValueError("Nothing to do")

    # Analyse the bamfile
    samfile = pysam.AlignmentFile(options.filename, "rb")
    uniq_map, best_map, uORb_map = {}, {}, {}
    properly_paired = 0

    for read in samfile.fetch():

        if read.is_proper_pair:
            tagd = dict(read.tags)
            u, b, key = False, False, read.qname

            if tagd["XT"] == "U":
                u = True
                uniq_map[key] = 1

            if "X0" in tagd:
                if tagd["X0"] == 1:
                    b = True
                    best_map[key] = 1

            if u is True or b is True:
                uORb_map[key] = 1

            properly_paired += 1

    samfile.close()

    npp = properly_paired // 2

    E.info("Number of proper pairs: %s" % npp)

    # Write a tabular report if report name given
    if options.report != "":

        E.info("Writing report on no. proper pairs with unique/best reads")

        def _row(x, npp=npp):
            name, d = x
            n = len(list(d.keys()))
            pc = float(n) / npp * 100
            line = "%s\t%i\t%.2f" % (name, n, pc)
            return (line)

        header = "\t".join(
            ["pair_criteria", "n_proper_pairs", "percent_proper_pairs"])

        with IOTools.open_file(options.report, "w") as report:
            report.write(header + "\n")
            for x in [("unique", uniq_map), ("best", best_map),
                      ("unique_or_best", uORb_map)]:
                report.write(_row(x) + "\n")

    # Create new bam containing uniquely mapping read pairs
    # if outfile specified
    if options.outfile != "":

        E.info("Writing proper pairs with unique or best read to %s" %
               options.outfile)

        samfile = pysam.AlignmentFile(options.filename, "rb")
        outbam = pysam.AlignmentFile(options.outfile, "wb", template=samfile)

        for read in samfile.fetch():
            if read.is_proper_pair:
                if read.qname in uORb_map:
                    outbam.write(read)
        samfile.close()
        outbam.close()
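
# A hypothetical sketch (not the script above) of the BWA tag logic it relies
# on: a read counts as "unique" if its XT tag equals "U" and as "best" if its
# X0 tag equals 1; a pair is kept when either holds. The helper name
# classify_read is made up for illustration.
def classify_read(tags):
    """tags: dict of BAM tag name -> value for a single read."""
    unique = tags.get("XT") == "U"
    best = tags.get("X0") == 1
    return unique, best, unique or best


print(classify_read({"XT": "U", "X0": 1}))   # (True, True, True)
print(classify_read({"XT": "R", "X0": 3}))   # (False, False, False)
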
Esempio n. 26
0
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $1.0$",
                            usage=globals()["__doc__"])

    parser.add_option("-d",
                      "--database",
                      dest="database",
                      type="string",
                      help="Supply database name")

    parser.add_option("-u",
                      "--indivfile",
                      dest="indivfile",
                      type="string",
                      help="Supply input bed file name for individual utrons")

    parser.add_option("-p",
                      "--partfile",
                      dest="partfile",
                      type="string",
                      help="Supply input bed file name for partnered utrons")

    parser.add_option("-n",
                      "--novelfile",
                      dest="novelfile",
                      type="string",
                      help="Supply input bed file name for novel utrons")

    parser.add_option("-t",
                      "--targetfile",
                      dest="targetfile",
                      type="string",
                      help="Supply input bed file name for miRNA TSs")

    parser.add_option("-o",
                      "--outfile",
                      dest="outfile",
                      type="string",
                      help="Supply output csv file name")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    global db
    db = options.database

    # get expression values from the database
    expressions = PUtils.fetch_DataFrame(
        "SELECT track, match_gene_id, transfrag_id, fpkm FROM agg_agg_agg_cuffcompare_transcripts CROSS JOIN agg_agg_agg_class WHERE transfrag_id = agg_agg_agg_class.transcript_id AND fpkm > 0",
        options.database)
    expressions = expressions.set_index(
        ["track", "match_gene_id", "transfrag_id"])

    grouped_expression = expressions["fpkm"].groupby(
        level=["track", "match_gene_id"])
    ex_fracts = grouped_expression.apply(lambda x: x / x.sum())
    ex_fracts.to_csv("pruned_expressionfractions.csv")
    ex_sums = grouped_expression.apply(lambda x: x.sum())
    ex_sums.to_csv("pruned_expressionsums.csv")

    ex_sums = pd.read_csv("pruned_expressionsums.csv",
                          names=['track', 'match_gene_id', 'exp_sum'])
    ex_sums = ex_sums.set_index(['match_gene_id', 'track'])
    ex_fracts = pd.read_csv(
        "pruned_expressionfractions.csv",
        names=['track', 'match_gene_id', 'transfrag_id', 'exp_fract'])
    ex_fracts = ex_fracts.set_index(['track', 'match_gene_id', 'transfrag_id'])
    fpkm_ex_fracts = ex_fracts.join(expressions, how='inner')
    fpkm_ex_fracts = fpkm_ex_fracts.reset_index()
    fpkm_ex_fracts = fpkm_ex_fracts.set_index(['match_gene_id', 'track'])
    ex_all = fpkm_ex_fracts.join(ex_sums, how='inner')
    ex_all = ex_all.reset_index()
    ex_all.to_csv("pruned_expression_all.csv")
    ex_all = pd.read_csv("pruned_expression_all.csv")
    ex_all = ex_all.set_index('transfrag_id')

    #stop distances
    ind_utrons = pd.read_table(
        options.indivfile,
        header=0,
        sep='\t',
        names=["chrom", "start", "end", "name", "score", "strand", "stop"],
        usecols=["start", "end", "name", "strand", "stop"],
        compression='gzip')

    ind_utrons['dist'] = ind_utrons.apply(lambda row: getStopDistdf(row),
                                          axis=1)
    ind_utrons = ind_utrons.set_index('name')
    grouped_stopdist = ind_utrons.groupby(level='name')
    transcript_dist = grouped_stopdist.apply(lambda group: group['dist'].max())
    transcript_dist.name = 'dist'
    transcript_over_under_50 = transcript_dist.apply(
        lambda row: getOverUnder50(row))
    transcript_over_under_50.name = 'over_under_50'

    ex_all_dist = ex_all.join(transcript_over_under_50, how='left')
    ex_all_dist = ex_all_dist.join(transcript_dist, how='left')
    ex_all_dist['utron'] = ex_all_dist.apply(lambda row: isUtron(row), axis=1)

    #novel utrons

    novel_utrons = pd.read_table(options.novelfile,
                                 header=0,
                                 sep='\t',
                                 names=[
                                     "chrom", "start", "end", "name", "score",
                                     "strand", "a", "b", "c", "d", "e", "f"
                                 ],
                                 usecols=["start", "end", "name"],
                                 compression='gzip')
    novel_utrons = novel_utrons.set_index(novel_utrons["name"])
    novel_utrons = novel_utrons.drop_duplicates(
        subset="name"
    )  #excludes entries with different start/end utron coordinates in same transcript

    novel_utrons['novel_utron'] = novel_utrons.apply(
        lambda row: insertYesCol(row), axis=1)
    novel_utrons = novel_utrons.drop(['start', 'end', 'name'], axis=1)
    ex_all_dist_nov = ex_all_dist.join(novel_utrons, how='left')

    #TSs

    utron_TSs = pd.read_table(
        options.targetfile,
        header=0,
        sep='\t',
        names=["chrom", "start", "end", "name", "score", "strand", "stop"],
        usecols=["start", "end", "name", "strand", "stop"],
        compression='gzip')
    utron_TSs['miRNA_TS'] = utron_TSs.apply(lambda row: insertYesCol(row),
                                            axis=1)
    utron_TSs = utron_TSs.drop(["start", "end", "strand", "stop"],
                               axis=1).drop_duplicates()
    utron_TSs = utron_TSs.set_index(["name"])
    ex_all_dist_nov_TS = ex_all_dist_nov.join(utron_TSs, how='left')

    #extra utrons

    tcons_ens = pd.read_table(options.partfile,
                              header=0,
                              sep='\t',
                              names=[
                                  "chrom", "start", "end", "name", "score",
                                  "strand", "a", 'b', 'c', 'd', 'e', 'f'
                              ],
                              usecols=["start", "end", "name", "strand"],
                              compression='gzip')
    tcons_ens['TCONS_id'] = tcons_ens.apply(lambda row: get_tcons(row), axis=1)
    tcons_ens['partner_id'] = tcons_ens.apply(lambda row: get_enst(row),
                                              axis=1)
    tcons_ens = tcons_ens.set_index('TCONS_id')

    tcons_ens['partner_id_TCONS'] = tcons_ens.apply(
        lambda row: get_tcons_from_ens(row), axis=1)
    tcons_ens = tcons_ens.drop_duplicates()
    tcons_ens['extra_utron'] = tcons_ens.apply(lambda row: insertYesCol(row),
                                               axis=1)

    partners = tcons_ens[['name', 'partner_id_TCONS']]
    partners = partners[partners['partner_id_TCONS'] != 'No_id']
    partners = partners.set_index('partner_id_TCONS')
    utrons_and_partners = tcons_ens.append(partners)
    utrons_and_partners = utrons_and_partners.join(ex_all_dist_nov,
                                                   how='inner')
    utrons_and_partners = utrons_and_partners.reset_index().drop_duplicates(
        subset=['match_gene_id', 'track', 'index'])
    utrons_and_partners = utrons_and_partners.set_index(
        ['match_gene_id', 'track'])
    groups = utrons_and_partners.groupby(level=['match_gene_id', 'track'])
    sums = groups.apply(lambda group: sum(group['fpkm']))
    utrons_and_partners['partner_exp_sum'] = sums
    utrons_and_partners['partner_exp_fract'] = utrons_and_partners.apply(
        lambda row: row['fpkm'] / row['partner_exp_sum'], axis=1)
    only_utrons = utrons_and_partners[utrons_and_partners['extra_utron'] ==
                                      'Yes']
    only_utrons = only_utrons[[
        'index', 'extra_utron', 'partner_exp_sum', 'partner_exp_fract',
        'partner_id_TCONS', 'partner_id'
    ]]
    only_utrons = only_utrons.reset_index()
    only_utrons = only_utrons.dropna(
        subset=['match_gene_id', 'track', 'index'])
    only_utrons = only_utrons.set_index(['match_gene_id', 'track', 'index'])

    ex_all_dist_nov_TS = ex_all_dist_nov_TS.reset_index()
    ex_all_dist_nov_TS = ex_all_dist_nov_TS.set_index(
        ['match_gene_id', 'track', 'index'])
    ex_all_dist_nov_TS_ext = ex_all_dist_nov_TS.join(only_utrons, how='left')

    #patients and treatment
    final = ex_all_dist_nov_TS_ext.reset_index()
    final['treatment'] = final.apply(lambda row: label_treatment(row), axis=1)
    final['patient'] = final.apply(lambda row: label_patient(row), axis=1)

    final.to_csv(options.outfile)

    # write footer and output benchmark information.
    E.Stop()
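
# A hypothetical pandas sketch (not the script above) of the per-gene
# expression fraction computed from the database query: each transcript's fpkm
# divided by the summed fpkm of all transcripts of the same gene in the same
# track. The toy DataFrame below is invented for illustration.
import pandas as pd

toy = pd.DataFrame({
    "track": ["s1", "s1", "s1"],
    "match_gene_id": ["geneA", "geneA", "geneB"],
    "transfrag_id": ["t1", "t2", "t3"],
    "fpkm": [3.0, 1.0, 5.0],
})

grouped = toy.groupby(["track", "match_gene_id"])["fpkm"]
toy["exp_sum"] = grouped.transform("sum")
toy["exp_fract"] = toy["fpkm"] / toy["exp_sum"]
print(toy)
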
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--R-script",
                      dest="scripts_r",
                      type="string",
                      help="PATH to location of R scripts and functions")

    parser.add_option("--trait1-results",
                      dest="trait1_res",
                      type="string",
                      help="summary statistics for trait 1")

    parser.add_option("--trait2-results",
                      dest="trait2_res",
                      type="string",
                      help="summary statistics for trait 2")

    parser.add_option("--maf-table",
                      dest="maf_table",
                      type="string",
                      help="Table containing allele frequency info for "
                      "all SNPs")

    parser.add_option("--maf-snp-column",
                      dest="maf_snpcol",
                      type="string",
                      help="column header containing SNP IDs")

    parser.add_option("--trait1-snplist",
                      dest="trait1_snplist",
                      type="string",
                      help="restrict the analysis to this set of SNPs "
                      "for trait1")

    parser.add_option("--trait2-snplist",
                      dest="trait2_snplist",
                      type="string",
                      help="restrict the analysis to this set of SNPs "
                      "for trait2")

    parser.add_option("--gene-list",
                      dest="gene_list",
                      type="string",
                      help="list of genes to test eQTL-trait overlap with. "
                      "Either trait1 or trait2 must contain a GENE column.")

    parser.add_option("--trait1-type",
                      dest="trait1_type",
                      type="choice",
                      choices=["quant", "cc"],
                      help="Trait 1 type, either "
                      "quantitative (quant) or binary (cc)")

    parser.add_option("--trait2-type",
                      dest="trait2_type",
                      type="choice",
                      choices=["quant", "cc"],
                      help="Trait 2 type, either "
                      "quantitative (quant) or binary (cc)")

    parser.add_option("--trait1-size",
                      dest="trait1_size",
                      type="int",
                      help="sample size for trait1 analysis, only use this "
                      "if the NMISS column is missing")

    parser.add_option("--trait2-size",
                      dest="trait2_size",
                      type="int",
                      help="sample size for trait2 analysis, only use this "
                      "if the NMISS column is missing")

    parser.add_option("--trait1-p-column",
                      dest="trait1_pcol",
                      type="string",
                      help="Column header for P-value column in trait 1 "
                      "results file, if not `P`")

    parser.add_option("--trait2-p-column",
                      dest="trait2_pcol",
                      type="string",
                      help="Column header for P-value column in trait 2 "
                      "results file, if not `P`")

    parser.add_option("--trait1-prevalence",
                      dest="trait1_prev",
                      type="float",
                      help="Prevalence of trait 1 in the population.  Only "
                      "relevant for binary traits")

    parser.add_option("--trait2-prevalence",
                      dest="trait2_prev",
                      type="float",
                      help="Prevalence of trait 2 in the population. Only "
                      "relevant for binary traits")

    parser.add_option("--chromosome",
                      dest="chrome",
                      type="string",
                      help="Restrict analysis to this chromosome.")

    parser.add_option("--restrict-from",
                      dest="restrict_from",
                      type="int",
                      help="start co-ordinate to restrict analysis.  Must "
                      "provide `--chromosome` when restricting region")

    parser.add_option("--restrict-to",
                      dest="restrict_to",
                      type="int",
                      help="end co-ordinate to restrict analysis.  Must "
                      "provide `--chromosome` when restricting region")

    parser.set_defaults(chrome=None,
                        restrict_from=None,
                        restrict_to=None,
                        trait1_prev=None,
                        trait2_prev=None,
                        trait1_pcol="P",
                        trait2_pcol="P",
                        maf_snpcol="SNP")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # check all files contain necessary fields
    if options.trait1_res.endswith(".gz"):
        trait1_comp = "gzip"
    else:
        trait1_comp = None

    t1_nsize = False
    t2_nsize = False

    E.info("Parsing trait 1 file: {}".format(options.trait1_res))
    try:
        trait1_peek = pd.read_table(options.trait1_res,
                                    nrows=5,
                                    sep="\s*",
                                    header=0,
                                    index_col=None,
                                    compression=trait1_comp,
                                    engine='python')
        try:
            len_cols = len(
                set(trait1_peek.columns).intersection(
                    ["SNP", "NMISS", "{}".format(options.trait1_pcol)]))
            assert len_cols == 3
            trait1_sep = "\s*"
        except AssertionError:
            if options.trait1_size:
                t1_nsize = True
                trait1_sep = "\s*"
                E.warn("NMISS column is not present, "
                       "using input sample size n={}".format(
                           options.trait1_size))
            else:
                raise IOError("Trait-1 input file does not contain "
                              "SNP, NMISS or P columns")
    except StopIteration:
        trait1_peek = pd.read_table(options.trait1_res,
                                    nrows=5,
                                    sep="\t",
                                    header=0,
                                    compression=trait1_comp,
                                    index_col=None)
        try:
            len_cols = len(
                set(trait1_peek.columns).intersection(
                    ["SNP", "NMISS", "{}".format(options.trait1_pcol)]))
            assert len_cols == 3
            trait1_sep = "\t"
        except AssertionError:
            if options.trait1_size:
                t1_nsize = True
                trait1_sep = "\t"
                E.warn("NMISS column is not present, "
                       "using input sample size n={}".format(
                           options.trait1_size))
            else:
                raise IOError("Trait-1 input file does not contain "
                              "SNP, NMISS or P columns")

    if options.trait2_res.endswith(".gz"):
        trait2_comp = "gzip"
    else:
        trait2_comp = None

    E.info("Parsing trait 2 file: {}".format(options.trait2_res))
    try:
        trait2_peek = pd.read_table(options.trait2_res,
                                    nrows=5,
                                    sep="\s*",
                                    header=0,
                                    index_col=None,
                                    compression=trait2_comp,
                                    engine='python')
        try:
            len_cols = len(
                set(trait2_peek.columns).intersection(
                    ["SNP", "NMISS", "{}".format(options.trait2_pcol)]))
            assert len_cols == 3
            trait2_sep = "\s*"
        except AssertionError:
            if options.trait2_size:
                t2_nsize = True
                trait2_sep = "\s*"
                E.warn("NMISS column is not present, "
                       "using input sample size n={}".format(
                           options.trait2_size))
            else:
                raise IOError("Trait-2 input file does not contain "
                              "SNP, NMISS or P columns")
    except StopIteration:
        trait2_peek = pd.read_table(options.trait2_res,
                                    nrows=5,
                                    sep="\t",
                                    header=0,
                                    compression=trait2_comp,
                                    index_col=None)
        try:
            len_cols = len(
                set(trait2_peek.columns).intersection(
                    ["SNP"
                     "NMISS", "{}".format(options.trait2_pcol)]))
            assert len_cols == 3
            trait2_sep = "\t"
        except AssertionError:
            if options.trait2_size:
                t2_nsize = True
                trait2_sep = "\t"
                E.warn("NMISS column is not present, "
                       "using input sample size n={}".format(
                           options.trait2_size))
            else:
                raise IOError("Trait-2 input file does not contain "
                              "SNP, NMISS or P columns")

    E.info("Parsing MAF table file: {}".format(options.maf_table))
    if options.maf_table.endswith(".gz"):
        maf_comp = "gzip"
    else:
        maf_comp = None
    try:
        maf_peek = pd.read_table(options.maf_table,
                                 nrows=5,
                                 sep="\s*",
                                 header=0,
                                 index_col=None,
                                 compression=maf_comp,
                                 engine='python')
    except StopIteration:
        maf_peek = pd.read_table(options.maf_table,
                                 nrows=5,
                                 sep="\t",
                                 header=0,
                                 compression=maf_comp,
                                 index_col=None)
    try:
        len_cols = len(
            set(maf_peek.columns).intersection(
                ["{}".format(options.maf_snpcol), "MAF"]))
        assert len_cols == 2
        maf_sep = "\s*"
    except AssertionError:
        raise IOError("Frequency table does not contain " "SNP or MAF columns")

    trait1_results = pd.read_table(options.trait1_res,
                                   sep=trait1_sep,
                                   header=0,
                                   compression=trait1_comp,
                                   index_col=None)

    trait2_results = pd.read_table(options.trait2_res,
                                   sep=trait2_sep,
                                   header=0,
                                   compression=trait2_comp,
                                   index_col=None)
    if options.trait1_pcol != "P":
        trait1_results.loc[:, "P"] = trait1_results[:, options.trait1_pcol]
    else:
        pass

    if options.trait2_pcol != "P":
        trait2_results.loc[:, "P"] = trait2_results.loc[:, options.trait2_pcol]
    else:
        pass

    if t1_nsize:
        trait1_results.loc[:, "NMISS"] = options.trait1_size
    else:
        pass

    if t2_nsize:
        trait2_results.loc[:, "NMISS"] = options.trait2_size
    else:
        pass

    maf_table = pd.read_table(options.maf_table,
                              sep=maf_sep,
                              header=0,
                              compression=maf_comp,
                              index_col=None)

    if options.maf_snpcol != "SNP":
        maf_table.loc[:, "SNP"] = maf_table.loc[:, options.maf_snpcol]
    else:
        pass

    if options.gene_list:
        gene_list = set()
        with open(options.gene_list, "r") as gfile:
            for gene in gfile.readlines():
                gene_list.add(gene.rstrip("\n"))
    else:
        gene_list = None

    # restrict analysis to a specific set of SNP
    # good for picking just SNPs part of independent association
    # signals
    if options.trait1_snplist:
        t1_snplist = set()
        with open(options.trait1_snplist, "r") as t1_sfile:
            for t1snp in t1_sfile.readlines():
                t1_snplist.add(t1snp.rstrip("\n"))

        trait1_results = trait1_results.loc[trait1_results["SNP"].isin(
            t1_snplist)]
    else:
        pass

    if options.trait2_snplist:
        t2_snplist = set()
        with open(options.trait2_snplist, "r") as t2_sfile:
            for t2snp in t2_sfile.readlines():
                t2_snplist.add(t2snp.rstrip("\n"))

        trait2_results = trait2_results.loc[trait2_results["SNP"].isin(
            t2_snplist)]
    else:
        pass

    out_df = testColoc(trait1=trait1_results,
                       trait2=trait2_results,
                       trait1_type=options.trait1_type,
                       trait2_type=options.trait2_type,
                       scriptsdir=options.scripts_r,
                       gene_list=gene_list,
                       maf_table=maf_table,
                       trait1_prev=options.trait1_prev,
                       trait2_prev=options.trait2_prev,
                       chromosome=options.chrome,
                       start=options.restrict_from,
                       end=options.restrict_to)

    out_df.to_csv(options.stdout, index_label="Trait", sep="\t")

    # write footer and output benchmark information.
    E.stop()
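
# A hypothetical sketch (not the script above) of the column check it performs
# on each summary-statistics file: peek at the first few rows and verify that
# the SNP, NMISS and P-value columns are present. The helper name
# has_required_columns is made up for illustration.
import io

import pandas as pd


def has_required_columns(handle, pcol="P", sep=r"\s+"):
    peek = pd.read_csv(handle, nrows=5, sep=sep, engine="python")
    return {"SNP", "NMISS", pcol}.issubset(peek.columns)


example = io.StringIO("SNP NMISS P\nrs1 100 0.01\nrs2 100 0.5\n")
print(has_required_columns(example))  # True
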
Esempio n. 28
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--path", dest="path", type="string",
                      help="path to scan for files [%default]")

    parser.add_option("-d", "--destination", dest="destination", type="string",
                      help="path to deposit files into [%defaul]")

    parser.set_defaults(path='/ifs/projects/sftp',
                        url='http://www.cgat.org/downloads/',
                        dest='/ifs/projects/overview')

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    statement = "find %s -name 'index.html'" % options.path

    process = subprocess.Popen(statement,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               universal_newlines=True)

    stdout, stderr = process.communicate()

    files = stdout.split('\n')
    files.sort()

    outfile = IOTools.openFile(os.path.join(options.dest, "index.html"), "w")

    outfile.write('''
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html>
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <title>CGAT project reports</title>
    <link rel="stylesheet" href="cgat.css" type="text/css" />
    <link rel="stylesheet" href="pygments.css" type="text/css" />
    <link rel="shortcut icon" href="http://cgatwiki.anat.ox.ac.uk/favicon.ico">
    <script type="text/javascript" src="sorttable.js"></script>
</head>

  <body>
    <div class="related">
      <h3>Navigation</h3>
      <ul>
        <li><a href="index.html">CGAT Projects Overview</a> &raquo;</li>
      </ul>
    </div>

    <div class="document">
      <div class="documentwrapper">
        <div class="bodywrapper">
          <div class="body">
 <div class="section" id="cgat-pipelines">
<H1>CGAT exported project pages</H1>

<p> 
This page is for internal use only. Do not distribute outside of
CGAT and do not make this page available on the world wide web.
</p>

<table class="sortable">\n''')

    outfile.write(
        '''<tr><th>Project</th><th>Report</th><th>Title</th></tr>\n''')

    for f in files:
        if f == '':
            continue

        proj = re.search(r'(proj\d+)', f).groups()[0]
        relpath = re.sub(r'.*proj\d+/', '', f)
        report = re.sub('^[^/]*/', '', os.path.dirname(relpath))

        lines = IOTools.openFile(f).readlines()
        titles = [x for x in lines if "<title>" in x]
        if titles:
            title = re.search("<title>(.*)</title>", titles[0]).groups()[0]
        else:
            title = "NA"

        if title.endswith("documentation"):
            title = title[:-len("documentation")]

        url = os.path.join(options.url, relpath)
        outfile.write(
            '<tr><td>%(proj)s</td><td><a HREF="%(url)s">%(report)s</td><td>%(title)s</td></tr>\n' % locals())

    outfile.write('''
</table>

</div>
</div>


          </div>
        </div>
      </div>
      <div class="sphinxsidebar">
        <div class="sphinxsidebarwrapper">
            <p class="logo"><a href="contents.html">
              <img class="logo" src="cgat_logo.png" alt="Logo"/>
            </a></p>





</body>
</html>\n''')

    outfile.close()

    E.info('created output file %s' % outfile.name)
    # write footer and output benchmark information.
    E.Stop()
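
# A hypothetical sketch (not the script above) of the report-title extraction
# it performs on each index.html: take the first <title> element and drop a
# trailing "documentation" suffix. The helper name extract_title is made up
# for illustration.
import re


def extract_title(lines):
    for line in lines:
        match = re.search(r"<title>(.*)</title>", line)
        if match:
            title = match.group(1)
            if title.endswith("documentation"):
                title = title[:-len("documentation")].rstrip()
            return title
    return "NA"


print(extract_title(["<html>", "<title>proj001 pipeline documentation</title>"]))
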
Esempio n. 29
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped", type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions", type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions", type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--filename-transciptome) to transcript names "
        "(used by --filename-gtf) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour",
        dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches",
        dest="ignore_mismatches", action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = IOTools.read_map(
            IOTools.open_file(options.filename_map), has_header=True)
        id_map = dict([(y, x) for x, y in id_map.items()])
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(IOTools.open_file(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(IOTools.open_file(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.AlignmentFile(options.filename_transcriptome,
                                                  "rb")
    else:
        transcripts_samfile = None

    if options.output_sam:
        output_samfile = pysam.AlignmentFile("-", "wh", template=genome_samfile)
    else:
        output_samfile = pysam.AlignmentFile("-", "wb", template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.AlignmentFile(options.filename_mismapped,
                                               "wb",
                                               template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.AlignmentFile(options.filename_junctions,
                                                "rb")
    else:
        junctions_samfile = None

    c = bams2bam_filter(genome_samfile,
                        output_samfile,
                        output_mismapped,
                        transcripts_samfile,
                        junctions_samfile,
                        transcripts,
                        regions=regions_to_remove,
                        unique=options.unique,
                        remove_contigs=options.remove_contigs,
                        colour_mismatches=options.colour_mismatches,
                        ignore_mismatches=options.ignore_mismatches,
                        ignore_transcripts=transcripts_samfile is None,
                        ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        outf = IOTools.open_file(options.filename_stats, "w")
        outf.write("category\tcounts\n%s\n" % c.asTable())
        outf.close()

    if options.filename_transcriptome:
        transcripts_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.stop()
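
# A hypothetical sketch (not the script above) of the --map-tsv-file handling:
# read a two-column, tab-separated mapping with a header line and invert it so
# identifiers can be looked up in the opposite direction. The helper name
# read_and_invert_map is made up for illustration.
import io


def read_and_invert_map(handle):
    mapping = {}
    next(handle)  # skip the header line
    for line in handle:
        first, second = line.rstrip("\n").split("\t")[:2]
        mapping[second] = first  # inverted: second column -> first column
    return mapping


example = io.StringIO("name\tnumber\nENST0001\t1\nENST0002\t2\n")
print(read_and_invert_map(example))  # {'1': 'ENST0001', '2': 'ENST0002'}
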
Esempio n. 30
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_test.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      help="method to use [t-test=t-test,wilcox=wilcox]",
                      choices=("t-test", "wilcox"))
    parser.add_option("-1",
                      "--infile",
                      dest="filename_input",
                      type="string",
                      help="input filename with vector of values.")
    parser.add_option("-2",
                      "--infile2",
                      dest="filename_input2",
                      type="string",
                      help="input filename with vector of values.")
    parser.add_option("--header-names",
                      dest="header",
                      type="string",
                      help="""header of value column [default=%default].""")

    parser.set_defaults(
        method="t-test",
        filename_input=None,
        header="value",
    )

    (options, args) = E.start(parser, add_pipe_options=True)

    if options.filename_input:
        infile = IOTools.open_file(options.filename_input, "r")
    else:
        infile = sys.stdin

    values, errors = IOTools.ReadList(infile, map_function=float)
    if options.filename_input:
        infile.close()

    if errors:
        E.warn("errors in input: %s" % ";".join(map(str, errors)))

    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=")
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.filename_input2:
        infile = IOTools.open_file(options.filename_input2, "r")
        values2, errors2 = IOTools.ReadList(infile, map_function=float)
        infile.close()
    else:
        values2 = None

    stat = Stats.Summary(values)

    power, diff_at_power95 = None, None
    if options.method == "t-test":
        if values2:
            result = R.t_test(values, values2, *xargs, **kwargs)
        else:
            result = R.t_test(values, *xargs, **kwargs)
            # compute power of test
            power = R.power_t_test(n=len(values),
                                   delta=abs(stat["mean"]),
                                   sd=stat["stddev"],
                                   sig_level=0.05)['power']
            diff_at_power95 = R.power_t_test(n=len(values),
                                             power=0.95,
                                             sd=stat["stddev"],
                                             sig_level=0.05)['delta']

    if options.method == "wilcox":
        result = R.wilcox_test(values, *xargs, **kwargs)

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key, value in sorted(result.items()):
        if key == "data.name":
            continue
        if key == "p.value":
            options.stdout.write("%s\t%5.2e\n" % (str(key), value))
        else:
            options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    for key, value in list(stat.items()):
        options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    if power:
        options.stdout.write("1-power\t%5.2e\n" % (1.0 - power))
        options.stdout.write("diff_at_power95\t%f\n" % diff_at_power95)

    E.stop()
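
# A hypothetical sketch (not the script above) of its extra-argument handling:
# positional arguments of the form key=value become keyword arguments for the
# R test call, everything else is passed through positionally. The helper name
# split_args is made up for illustration.
def split_args(args):
    kwargs, positional = {}, []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=", 1)
            kwargs[key] = value
        else:
            positional.append(arg)
    return positional, kwargs


print(split_args(["alternative=greater", "paired=TRUE", "mu"]))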