Example 1
def getRepeatDataFromUCSC(dbhandle,
                          repclasses,
                          outfile,
                          remove_contigs_regex=None,
                          job_memory="4G"):
    '''Download repeat data from the UCSC database and write it to
    `outfile` in :term:`gff` format.

    This method downloads repeats from the RepeatMasker track at
    UCSC.

    Arguments
    ---------
    dbhandle : object
       Database handle to UCSC mysql database
    repclasses : list
       List of repeat classes to select. If empty, all repeat classes
       will be collected.
    outfile : string
       Filename of output file in :term:`gff` format.
    remove_contigs_regex : list
       If given, remove repeats on contigs matching any of the regular
       expressions given.
    job_memory : string
       Memory to request for the sorting and conversion job.

    '''
    cc = dbhandle.execute("SHOW TABLES LIKE '%%rmsk'")
    tables = [x[0] for x in cc.fetchall()]
    if len(tables) == 0:
        raise ValueError("could not find any `rmsk` tables")

    # now collect repeats
    tmpfile = P.get_temp_file(".")

    for table in tables:

        sql = """SELECT genoName, 'repeat', 'exon', genoStart+1, genoEnd,
        '.', strand, '.',
        CONCAT('class \\"', repClass, '\\"; family \\"',
        repFamily, '\\"; repName \\"', repName, '\\";')
        FROM %(table)s"""

        if repclasses:
            repclasses_str = ",".join(
                ["'" + x.strip() + "'" for x in repclasses])
            sql += ''' WHERE repClass in (%(repclasses_str)s) ''' % locals()

        sql = sql % locals()

        E.debug("executing sql statement: %s" % sql)
        cc = dbhandle.execute(sql)
        for data in cc.fetchall():
            tmpfile.write("\t".join(map(str, data)) + "\n")

    tmpfile.close()

    # sort gff and make sure that names are correct
    tmpfilename = tmpfile.name

    statement = [
        '''cat %(tmpfilename)s
    | sort -t$'\\t' -k1,1 -k4,4n
    | cgat gff2gff
    --method=sanitize
    --sanitize-method=genome
    --skip-missing
    --genome-file=%(genome_dir)s/%(genome)s
    --log=%(outfile)s.log '''
    ]

    if remove_contigs_regex:
        statement.append('--contig-pattern="{}"'.format(
            ",".join(remove_contigs_regex)))

    statement.append('''| gzip > %(outfile)s ''')

    statement = " ".join(statement)

    P.run(statement, job_memory=job_memory)

    os.unlink(tmpfilename)
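
A minimal sketch of how this function might be driven. The connection details below are assumptions (SQLAlchemy 1.x-style string execution against UCSC's public MySQL mirror), not part of the example above:

# Hypothetical driver: build a dbhandle whose execute()/fetchall() interface
# matches what getRepeatDataFromUCSC() expects (assumes sqlalchemy + pymysql).
import sqlalchemy

def connect_ucsc(genome="hg38"):
    # UCSC's public MySQL server; host, user and genome build are assumptions
    # to verify for your own setup.
    engine = sqlalchemy.create_engine(
        "mysql+pymysql://genome@genome-mysql.soe.ucsc.edu/%s" % genome)
    return engine.connect()

# getRepeatDataFromUCSC(connect_ucsc(), ["LINE", "SINE"], "repeats.gff.gz")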
Example 2
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-vcf",
                      dest="input_vcf_file",
                      type="string",
                      help="input vcf file")

    parser.add_option("-t",
                      "--truth-vcf",
                      dest="truth_vcf_file",
                      type="string",
                      help="truth vcf file")

    parser.add_option(
        "-f",
        "--input-fasta",
        dest="input_fasta_file",
        type="string",
        help="input fasta file. faidx indexed reference sequence file to "
        "determine INDEL context [%default]")

    parser.add_option(
        "-e",
        "--input-bed",
        dest="input_bed_file",
        type="string",
        help="input file with intervals. Tab-delimited file of intervals "
        "in bed format to restrict analysis to. [%default]")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      action="append",
                      type="choice",
                      choices=("mutational-signature", "kinship"),
                      help="methods to apply [%default]")

    parser.set_defaults(
        methods=[],
        input_vcf_file=None,
        input_bed_file=None,
        input_fasta_file=None,
        truth_vcf_file=None,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) == 1:
        options.input_vcf_file = args[0]

    if options.input_vcf_file is None:
        raise ValueError("please supply a VCF file")

    if options.truth_vcf_file is None:
        raise ValueError("please supply a VCF file with truth data")

    if options.input_fasta_file is None:
        raise ValueError(
            "please supply a fasta file with the reference genome")

    if not os.path.exists(options.input_vcf_file):
        raise OSError("input vcf file {} does not exist".format(
            options.input_vcf_file))

    if not os.path.exists(options.input_vcf_file + ".tbi"):
        raise OSError("input vcf file {} needs to be indexed".format(
            options.input_vcf_file))

    if not os.path.exists(options.truth_vcf_file):
        raise OSError("truth vcf file {} does not exist".format(
            options.truth_vcf_file))

    if not os.path.exists(options.truth_vcf_file + ".tbi"):
        raise OSError("truth vcf file {} needs to be indexed".format(
            options.truth_vcf_file))

    if not os.path.exists(options.input_fasta_file):
        raise OSError("input fasta file {} does not exist".format(
            options.input_fasta_file))

    if not os.path.exists(options.input_fasta_file + ".fai"):
        raise OSError("input fasta file {} needs to be indexed".format(
            options.input_fasta_file))

    # update paths to absolute
    options.input_fasta_file = os.path.abspath(options.input_fasta_file)
    options.input_vcf_file = os.path.abspath(options.input_vcf_file)
    options.truth_vcf_file = os.path.abspath(options.truth_vcf_file)

    test_vcf = pysam.VariantFile(options.input_vcf_file)
    truth_vcf = pysam.VariantFile(options.truth_vcf_file)
    contigs = test_vcf.header.contigs
    truth_contigs = set(truth_vcf.header.contigs)

    test_vcf_samples = set(test_vcf.header.samples)
    truth_vcf_samples = set(truth_vcf.header.samples)

    common_samples = test_vcf_samples.intersection(truth_vcf_samples)
    if len(common_samples) == 0:
        raise ValueError("no common samples in test/truth VCFs")

    def pair_iterator(test_vcf, truth_vcf, contig):
        counter = E.Counter()
        test_iter = test_vcf.fetch(contig)
        truth_iter = truth_vcf.fetch(contig)

        test_record = next(test_iter)
        truth_record = next(truth_iter)
        try:
            while 1:
                if test_record.pos < truth_record.pos:
                    test_record = next(test_iter)
                    continue

                elif test_record.pos > truth_record.pos:
                    truth_record = next(truth_iter)
                    continue

                elif len(test_record.alts) > 1:
                    counter.skip_multiallelic_test += 1
                    test_record = next(test_iter)
                    continue

                elif len(truth_record.alts) > 1:
                    counter.skip_multiallelic_truth += 1
                    truth_record = next(truth_iter)
                    continue

                elif test_record.alts != truth_record.alts:
                    counter.skip_genotype_difference += 1
                    test_record = next(test_iter)
                    truth_record = next(truth_iter)
                    continue

                if test_record.ref != truth_record.ref:
                    # todo: deal with indels
                    raise ValueError("mismatching reference bases at position "
                                     "{}:{}".format(test_record.chrom,
                                                    test_record.pos))

                yield test_record, truth_record
                test_record = next(test_iter)
                truth_record = next(truth_iter)

        except StopIteration:
            pass

        E.debug(str(counter))

    counters_per_contig = {}

    for contig in contigs:
        counter_contig = collections.defaultdict(E.Counter)
        counters_per_contig[contig] = counter_contig

        E.info("processing contig {}".format(contig))

        if contig not in truth_contigs:
            E.warn(
                "skipping contig {} as it is not in truth data".format(contig))
            continue

        switch = False
        last_is_unphased = True

        for test_record, truth_record in pair_iterator(test_vcf, truth_vcf,
                                                       contig):

            for sample in common_samples:
                counter = counter_contig[sample]

                truth_phased = truth_record.samples[sample].phased
                test_phased = test_record.samples[sample].phased
                truth_genotype = truth_record.samples[sample]["GT"]
                test_genotype = test_record.samples[sample]["GT"]
                truth_alleles = set(truth_genotype)
                test_alleles = set(test_genotype)

                ignore = False
                if not truth_phased:
                    counter.truth_unphased += 1
                    ignore = True
                if not test_phased:
                    counter.test_unphased += 1
                    ignore = True
                    last_is_unphased = True
                else:
                    last_is_unphased = False

                if len(test_alleles) == 1:
                    counter.test_homozygous += 1
                    ignore = True
                else:
                    if not test_phased:
                        counter.test_unphased_hets += 1

                if len(truth_alleles) == 1:
                    counter.truth_homozygous += 1
                    ignore = True

                if ignore:
                    counter.ignore += 1
                    continue

                E.debug("comparing: {}:{} {} -> {}: {} {}".format(
                    test_record.chrom, test_record.pos, test_record.ref,
                    test_record.alts, test_genotype, truth_genotype))

                if switch:
                    truth_genotype = truth_genotype[::-1]

                counter.test_phased_hets += 1

                if truth_genotype != test_genotype:
                    if not last_is_unphased:
                        E.debug("SWITCH: {}".format(switch))
                        counter.switch += 1
                    switch = not switch

    outf = options.stdout
    outf.write("\t".join(
        ("contig", "sample", "switch_error_percent", "false_negative_rate",
         "switches", "test_phased_hets", "test_unphased_hets", "test_unphased",
         "truth_unphased", "test_homozygous", "truth_homozygous")) + "\n")

    for contig, contig_dict in list(counters_per_contig.items()):
        for sample, c in list(contig_dict.items()):
            outf.write("\t".join(
                map(str, (contig, sample,
                          "{:6.4f}".format(100.0 * c.switch /
                                           (c.test_phased_hets + 1)), "{:6.4f}"
                          .format(100.0 * c.test_unphased_hets /
                                  (c.test_phased_hets + c.test_unphased_hets)),
                          c.switch, c.test_phased_hets, c.test_unphased_hets,
                          c.test_unphased, c.truth_unphased, c.test_homozygous,
                          c.truth_homozygous))) + "\n")

    E.stop()
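
pair_iterator above is essentially a merge of two position-sorted streams, keeping only positions present in both files (the multiallelic and allele-mismatch checks are omitted here). A stripped-down, self-contained sketch of the same pattern on plain tuples, with no pysam required:

def pair_by_position(test_records, truth_records):
    """Yield (test, truth) pairs sharing a position; both inputs must be
    sorted by position. Records here are simple (pos, alt) tuples."""
    test_iter, truth_iter = iter(test_records), iter(truth_records)
    try:
        test, truth = next(test_iter), next(truth_iter)
        while True:
            if test[0] < truth[0]:
                test = next(test_iter)
            elif test[0] > truth[0]:
                truth = next(truth_iter)
            else:
                yield test, truth
                test, truth = next(test_iter), next(truth_iter)
    except StopIteration:
        return

print(list(pair_by_position([(1, "A"), (5, "T"), (9, "G")],
                            [(5, "T"), (7, "C"), (9, "G")])))
# [((5, 'T'), (5, 'T')), ((9, 'G'), (9, 'G'))]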
Example 3
def join_tables(outfile, options, args):
    '''join tables.'''

    if options.headers and options.headers[0] != "auto" and \
            len(options.headers) != len(options.filenames):
        raise ValueError("number of provided headers (%i) "
                         "is not equal to number filenames (%i)." %
                         (len(options.headers), len(options.filenames)))

    tables = []
    keys = {}
    sorted_keys = []
    sizes = {}

    if options.merge:
        titles = ["count"]
    else:
        titles = []

    headers_to_delete = []

    if options.prefixes:
        prefixes = [x.strip() for x in options.prefixes.split(",")]
        if len(prefixes) != len(options.filenames):
            raise ValueError(
                ("number of prefixes (%i) and tables (%i) "
                 "do not match") % (len(prefixes), len(options.filenames)))
    else:
        prefixes = None

    E.debug("joining on columns %s and taking columns %s" %
            (options.columns, options.take))

    for nindex, filename in enumerate(options.filenames):

        E.info("processing %s (%i/%i)" %
               (filename, nindex + 1, len(options.filenames)))

        prefix = os.path.basename(filename)

        lines = read_table(filename, options)

        try:
            # check if the table is empty
            data = next(lines).split()
        except StopIteration:
            # an empty table will raise a StopIteration
            # skip (or not skip) empty tables
            if options.ignore_empty:
                E.warn("%s is empty - skipped" % filename)
                headers_to_delete.append(nindex)
                continue

        table = {}
        sizes = {}
        max_size = 0
        ncolumns = 0

        if options.input_has_titles:
            # See https://github.com/cgat-developers/cgat-core/pull/53
            # data = next(lines).split()
            # no titles have been defined so far
            if not titles:
                key = "-".join([data[x] for x in options.columns])
                titles = [key]

            # set take based on column titles or numerically
            if options.take:
                take = []
                # convert numeric columns for filtering
                for x in options.take:
                    try:
                        take.append(int(x) - 1)
                    except ValueError:
                        # will raise error if x is not present
                        take.append(data.index(x))
            else:
                # no --take restriction: use all non-key columns
                take = None

            for x in range(len(data)):
                if x in options.columns or (take and x not in take):
                    continue
                ncolumns += 1
                if options.add_file_prefix:
                    try:
                        p = re.search(options.regex_filename,
                                      prefix).groups()[0]
                    except AttributeError:
                        E.warn("can't extract title from filename %s" % prefix)
                        p = "unknown"
                    titles.append("%s_%s" % (p, data[x]))
                elif options.use_file_prefix:
                    try:
                        p = re.search(options.regex_filename,
                                      prefix).groups()[0]
                    except AttributeError:
                        E.warn("can't extract title from filename %s" % prefix)
                        p = "unknown"
                    titles.append("%s" % p)
                elif prefixes:
                    titles.append("%s_%s" % (prefixes[nindex], data[x]))
                else:
                    titles.append(data[x])

        else:

            # set take based on numeric columns if no titles are present
            if options.take:
                take = []
                # convert numeric columns for filtering
                for x in options.take:
                    take.append(int(x) - 1)
            else:
                # no --take restriction: use all non-key columns
                take = None

            # IMS: We might still want filename titles even if the input
            # columns don't have titles.
            if options.add_file_prefix:
                if not titles:
                    titles = ["ID"]
                try:
                    p = re.search(options.regex_filename, prefix).groups()[0]
                except AttributeError:
                    E.warn("can't extract title from filename %s" % prefix)
                    p = "unknown"
                titles.append("%s_%s" % (p, data[x]))
            elif options.use_file_prefix:
                if not titles:
                    titles = ["ID"]
                try:
                    p = re.search(options.regex_filename, prefix).groups()[0]
            except AttributeError:
                    E.warn("can't extract title from filename %s" % prefix)
                    p = "unknown"
                titles.append("%s" % p)
            ncolumns = 1

        n = 0
        for line in lines:
            data = line[:-1].split("\t")
            try:
                row_keys = [data[x] for x in options.columns]
            except IndexError as msg:
                raise IndexError("error while parsing %s: %s" %
                                 (filename, msg))
            if options.sort_keys:
                if options.sort_keys == "numeric":
                    row_keys.sort(key=float)
                else:
                    row_keys.sort()
            if options.merge:
                key = n
            else:
                key = "-".join(row_keys)

            if key not in keys:
                sorted_keys.append(key)
                keys[key] = 1
                sizes[key] = 0

            if take:
                max_size = len(take)
                table[key] = [data[x] for x in take]
            else:
                max_size = max(len(data) - len(options.columns), max_size)
                table[key] = [
                    data[x] for x in range(0, len(data))
                    if x not in options.columns
                ]
            n += 1

        # enter columns of "na" for empty tables.
        if max_size == 0:
            max_size = ncolumns

        tables.append((max_size, table))

    # delete in reverse order
    if options.headers:
        for nindex in headers_to_delete[::-1]:
            del options.headers[nindex]

    if len(tables) == len(titles) - 1:

        if options.headers:
            headers = ["bin"]
            if options.headers[0] == 'auto':
                for t in range(len(tables)):
                    headers.append(os.path.basename(options.filenames[t]))
                    headers += [""] * (tables[t][0] - 1)

            else:
                for t in range(len(tables)):
                    headers.append(options.headers[t])
                    headers += [""] * (tables[t][0] - 1)

            # use headers as titles, if headers is given and skip-titles is
            # turned on
            if options.input_has_titles and options.skip_titles:
                titles = headers
            else:
                # otherwise: print the headers out right away
                outfile.write("\t".join(headers) + "\n")

        order = list(range(0, len(tables) + 1))

        if options.input_has_titles or \
           (options.use_file_prefix or options.add_file_prefix):

            if options.sort:
                sort_order = []

                if options.sort == "numeric":
                    t = list(
                        zip(list(map(int, titles[1:])),
                            list(range(1,
                                       len(titles) + 1))))
                    t.sort()

                    for tt in t:
                        sort_order.append(titles[tt[1]])

                elif options.sort == "alphabetical":
                    t = list(zip(titles[1:], list(range(1, len(titles) + 1))))
                    t.sort()

                    for tt in t:
                        sort_order.append(titles[tt[1]])
                else:
                    sort_order = options.sort

                map_title2pos = {}
                for x in range(1, len(titles)):
                    map_title2pos[titles[x]] = x

                order = [
                    0,
                ]
                for x in sort_order:
                    if x in map_title2pos:
                        order.append(map_title2pos[x])

            else:
                order = list(range(0, len(titles)))

            outfile.write("\t".join(
                [titles[order[x]] for x in range(len(titles))]))
            outfile.write("\n")

        if options.sort_keys:
            if options.sort_keys == "numeric":
                sorted_keys.sort(key=float)
            else:
                sorted_keys.sort()

        for key in sorted_keys:

            outfile.write("%s" % key)

            for x in order[1:]:

                max_size, table = tables[x - 1]
                c = 0
                if key in table:
                    outfile.write("\t")
                    outfile.write("\t".join(table[key]))
                    c = len(table[key])

                assert (max_size == 1)

                outfile.write("\t%s" % options.missing_value * (max_size - c))

            outfile.write("\n")

    else:

        # for multi-column table, just write
        if options.input_has_titles:
            outfile.write("\t".join([titles[x] for x in range(len(titles))]))
            outfile.write("\n")

        for key in sorted_keys:

            outfile.write("%s" % key)

            for x in range(len(tables)):

                max_size, table = tables[x]
                c = 0
                if key in table:
                    outfile.write("\t")
                    outfile.write("\t".join(table[key]))
                    c = len(table[key])

                outfile.write("\t%s" % options.missing_value * (max_size - c))

            outfile.write("\n")
Example 4
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-d", "--data-directory", dest="data_directory", type="string",
        help="directory in which to create links. "
        "[%default]")

    parser.add_option(
        "-f", "--force-first", dest="force_first", action="store_true",
        help="force running of pipeline in first instance [%default]")

    parser.add_option(
        "-k", "--keep-level", dest="keep_level", type="int",
        help="level to keep for the directories that are being watched [%default]")

    parser.add_option(
        "-c", "--command", dest="command", type="string",
        help="command to run when new data appears [%default]")

    parser.set_defaults(
        data_directory="data",
        last_update=3600,
        input_fastq_file=None,
        keep_level=3,
        force_first=False,
        sleep=60,
        command="daisy run -v 5 -p 100 make all",
    )

    (options, args) = E.start(parser, argv)

    if not os.path.exists("benchmark.yml"):
        raise ValueError("config file {} does not exist".format(
            "benchmark.yml"))

    with IOTools.open_file("benchmark.yml") as inf:
        config = yaml.load(inf, Loader=yaml.FullLoader)

    if "watch" not in config:
        raise ValueError("config file needs to contain a 'watch' section")

    if isinstance(config["watch"], list):
        watchlist = config["watch"]
    else:
        watchlist = [config["watch"]]

    E.info("watching with {} glob expressions".format(len(watchlist)))

    iteration = 1

    while True:

        current_time = time.time()

        c = E.Counter()

        for glob_expr in watchlist:
            filenames = glob.glob(glob_expr)
            E.debug("found {} files for {}".format(len(filenames),
                                                   glob_expr))

            for fn in filenames:
                c.found += 1
                parts = os.path.abspath(fn).split(os.sep)

                dest_fn = os.path.abspath(
                    os.path.join(
                        options.data_directory,
                        os.sep.join(parts[-options.keep_level:])))

                dirname = os.path.dirname(dest_fn)
                if not os.path.exists(dirname):
                    E.info("creating new directory {}".format(dirname))
                    os.makedirs(dirname)

                if not os.path.exists(dest_fn):
                    modification_time = os.path.getmtime(fn)
                    timedelta = current_time - modification_time
                    if timedelta > options.last_update:
                        E.info("new file detected, creating link: {}".format(dest_fn))
                        c.new_file_create += 1
                        os.symlink(os.path.abspath(fn), dest_fn)
                    else:
                        E.info(
                            "new file detected, but too recent ({}s): {}".format(
                                timedelta,
                                dest_fn))
                        c.new_file_wait += 1
                else:
                    c.existing += 1

        E.info("iteration {}: {}".format(iteration, str(c)))

        if iteration == 1 and options.force_first:
            E.run(options.command)
        elif c.new_file_create == 0:
            E.info("found no new files, waiting for {} seconds".format(options.sleep))
            time.sleep(options.sleep)
        else:
            E.run(options.command)

        iteration += 1
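
The --keep-level option controls how much of each found file's path is mirrored under the data directory before the symlink is created. A tiny, self-contained illustration of that path arithmetic (hypothetical paths):

import os

def destination_path(fn, data_directory, keep_level):
    """Mirror the trailing keep_level path components of fn under data_directory."""
    parts = os.path.abspath(fn).split(os.sep)
    return os.path.abspath(
        os.path.join(data_directory, os.sep.join(parts[-keep_level:])))

print(destination_path("/runs/2020/flowcellA/sample1.fastq.gz", "data", 3))
# -> <cwd>/data/2020/flowcellA/sample1.fastq.gz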
Example 5
def annotateGenome(iterator, fasta, options, default_code=DEFAULT_CODE):
    """annotate a genome given by the indexed *fasta* file and
    an iterator over gtf annotations.
    """

    annotations = {}
    contig_sizes = fasta.getContigSizes(with_synonyms=False)
    E.info("allocating memory for %i contigs and %i bytes" %
           (len(contig_sizes),
            sum(contig_sizes.values()) * array.array("B").itemsize))
    # AString.AString( "a").itemsize ))

    for contig, size in list(contig_sizes.items()):
        E.debug("allocating %s: %i bases" % (contig, size))
        # annotations[contig] = AString.AString( default_code * size )
        # annotations[contig] = array.array("", default_code * size)
        # Go to list for py3 compatibility, patch
        annotations[contig] = [default_code] * size

    E.info("allocated memory for %i contigs" % len(fasta))

    counter = E.Counter()

    # output splice junctions
    outfile_junctions = E.open_output_file("junctions")
    outfile_junctions.write(
        "contig\tstrand\tpos1\tpos2\tframe\tgene_id\ttranscript_id\n")
    for gtfs in iterator:

        counter.input += 1

        if counter.input % options.report_step == 0:
            E.info("iteration %i" % counter.input)

        try:
            contig = fasta.getToken(gtfs[0].contig)
        except KeyError as msg:
            E.warn("contig %s not found - annotation ignored" % gtfs[0].contig)
            counter.skipped_contig += 1
            continue

        lcontig = fasta.getLength(contig)

        # make sure that exons are sorted by coordinate
        gtfs.sort(key=lambda x: x.start)

        is_positive = Genomics.IsPositiveStrand(gtfs[0].strand)
        source = gtfs[0].source

        # process non-coding data
        if source in MAP_ENSEMBL:
            code = MAP_ENSEMBL[source]

            intervals = [(x.start, x.end) for x in gtfs]
            addSegments(annotations[contig], intervals, is_positive, code)

        elif source == "protein_coding":

            # collect exons for utr
            exons = [(x.start, x.end) for x in gtfs if x.feature == "exon"]
            cds = [(x.start, x.end) for x in gtfs if x.feature == "CDS"]
            if len(cds) == 0:
                counter.skipped_transcripts += 1
                E.warn("protein-coding transcript %s without CDS - skipped" %
                       gtfs[0].transcript_id)
                continue

            exons = Intervals.truncate(exons, cds)
            start, end = cds[0][0], cds[-1][1]

            UTR5 = [x for x in exons if x[1] < start]
            UTR3 = [x for x in exons if x[0] >= end]

            if not is_positive:
                UTR5, UTR3 = UTR3, UTR5
                splice_code = "S"
            else:
                splice_code = "s"

            addSegments(annotations[contig], UTR5, is_positive, "u")

            addIntrons(annotations[contig], UTR5, is_positive,
                       options.max_frameshift_length)

            addSegments(annotations[contig], UTR3, is_positive, "v")

            addIntrons(annotations[contig], UTR3, is_positive,
                       options.max_frameshift_length)

            # output CDS according to frame
            addCDS(annotations[contig],
                   [x for x in gtfs if x.feature == "CDS"], is_positive)

            # add introns between CDS
            addIntrons(annotations[contig], cds, is_positive,
                       options.max_frameshift_length)

            # output splice junctions
            cds = [x for x in gtfs if x.feature == "CDS"]

            # apply corrections for 1-past end coordinates
            # to point between residues within CDS
            if is_positive:
                ender = lambda x: x.end - 1
                starter = lambda x: x.start
                out_positive = "+"
            else:
                ender = lambda x: lcontig - x.start - 1
                starter = lambda x: lcontig - x.end
                out_positive = "-"
                cds.reverse()

            end = ender(cds[0])
            for c in cds[1:]:
                start = starter(c)
                outfile_junctions.write("%s\t%s\t%i\t%i\t%s\t%s\t%s\n" % (
                    contig,
                    out_positive,
                    end,
                    start,
                    c.frame,
                    c.gene_id,
                    c.transcript_id,
                ))
                end = ender(c)

    E.info("finished reading genes: %s" % str(counter))

    outfile_junctions.close()

    E.info("started counting")
    outfile = E.open_output_file("counts")
    outputCounts(outfile, annotations)
    outfile.close()

    E.info("started output")
    for k in sorted(annotations.keys()):
        # options.stdout.write(">%s\n%s\n" % (k, annotations[k].tostring()))
        options.stdout.write(">%s\n%s\n" % (k, "".join(annotations[k])))
Example 6
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-m",
        "--method",
        dest="method",
        type=str,
        choices=("add-flank", "add-upstream-flank", "add-downstream-flank",
                 "crop", "crop-unique", "complement-groups", "combine-groups",
                 "filter-range", "join-features", "merge-features", "sanitize",
                 "to-forward-coordinates", "to-forward-strand", "rename-chr"),
        help="method to apply ")

    parser.add_argument("--ignore-strand",
                        dest="ignore_strand",
                        help="ignore strand information.",
                        action="store_true")

    parser.add_argument("--is-gtf",
                        dest="is_gtf",
                        action="store_true",
                        help="input will be treated as gtf.")

    parser.add_argument("-c",
                        "--contigs-tsv-file",
                        dest="input_filename_contigs",
                        type=str,
                        help="filename with contig lengths.")

    parser.add_argument(
        "--agp-file",
        dest="input_filename_agp",
        type=str,
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument("--crop-gff-file",
                        dest="filename_crop_gff",
                        type=str,
                        help="GFF/GTF file to crop against.")

    parser.add_argument(
        "--group-field",
        dest="group_field",
        type=str,
        help="""gff field/attribute to group by such as gene_id, "
        "transcript_id, ... .""")

    parser.add_argument(
        "--filter-range",
        dest="filter_range",
        type=str,
        help="extract all elements overlapping a range. A range is "
        "specified by eithor 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to' .")

    parser.add_argument("--sanitize-method",
                        dest="sanitize_method",
                        type=str,
                        choices=("ucsc", "ensembl", "genome"),
                        help="method to use for sanitizing chromosome names. "
                        ".")

    parser.add_argument(
        "--flank-method",
        dest="flank_method",
        type=str,
        choices=("add", "extend"),
        help="method to use for adding flanks. ``extend`` will "
        "extend existing features, while ``add`` will add new features. "
        ".")

    parser.add_argument("--skip-missing",
                        dest="skip_missing",
                        action="store_true",
                        help="skip entries on missing contigs. Otherwise an "
                        "exception is raised .")

    parser.add_argument(
        "--contig-pattern",
        dest="contig_pattern",
        type=str,
        help="a comma separated list of regular expressions specifying "
        "contigs to be removed when running method sanitize .")

    parser.add_argument(
        "--assembly-report",
        dest="assembly_report",
        type=str,
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize .")

    parser.add_argument(
        "--assembly-report-hasids",
        dest="assembly_report_hasIDs",
        type=int,
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize .")

    parser.add_argument(
        "--assembly-report-ucsccol",
        dest="assembly_report_ucsccol",
        type=int,
        help="column in the assembly report containing ucsc contig ids"
        ".")

    parser.add_argument(
        "--assembly-report-ensemblcol",
        dest="assembly_report_ensemblcol",
        type=int,
        help="column in the assembly report containing ensembl contig ids")

    parser.add_argument(
        "--assembly-extras",
        dest="assembly_extras",
        type=str,
        help="additional mismatches between gtf and fasta to fix when"
        "sanitizing the genome .")

    parser.add_argument("--extension-upstream",
                        dest="extension_upstream",
                        type=float,
                        help="extension for upstream end .")

    parser.add_argument("--extension-downstream",
                        dest="extension_downstream",
                        type=float,
                        help="extension for downstream end .")

    parser.add_argument("--min-distance",
                        dest="min_distance",
                        type=int,
                        help="minimum distance of features to merge/join .")

    parser.add_argument("--max-distance",
                        dest="max_distance",
                        type=int,
                        help="maximum distance of features to merge/join .")

    parser.add_argument("--min-features",
                        dest="min_features",
                        type=int,
                        help="minimum number of features to merge/join .")

    parser.add_argument("--max-features",
                        dest="max_features",
                        type=int,
                        help="maximum number of features to merge/join .")

    parser.add_argument(
        "--rename-chr-file",
        dest="rename_chr_file",
        type=str,
        help="mapping table between old and new chromosome names."
        "TAB separated 2-column file.")

    parser.set_defaults(input_filename_contigs=False,
                        filename_crop_gff=None,
                        input_filename_agp=False,
                        genome_file=None,
                        rename_chr_file=None,
                        add_up_flank=None,
                        add_down_flank=None,
                        complement_groups=False,
                        crop=None,
                        crop_unique=False,
                        ignore_strand=False,
                        filter_range=None,
                        min_distance=0,
                        max_distance=0,
                        min_features=1,
                        max_features=0,
                        extension_upstream=1000,
                        extension_downstream=1000,
                        sanitize_method="ucsc",
                        flank_method="add",
                        output_format="%06i",
                        skip_missing=False,
                        is_gtf=False,
                        group_field=None,
                        contig_pattern=None,
                        assembly_report=None,
                        assembly_report_hasIDs=1,
                        assembly_report_ensemblcol=4,
                        assembly_report_ucsccol=9,
                        assembly_extras=None)

    (args) = E.start(parser, argv=argv)

    contigs = None
    genome_fasta = None
    chr_map = None

    if args.input_filename_contigs:
        contigs = Genomics.readContigSizes(
            iotools.open_file(args.input_filename_contigs, "r"))

    if args.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(args.genome_file)
        contigs = genome_fasta.getContigSizes()

    if args.rename_chr_file:
        chr_map = {}
        with open(args.rename_chr_file, 'r') as filein:
            reader = csv.reader(filein, delimiter='\t')
            for row in reader:
                if len(row) != 2:
                    raise ValueError(
                        "Mapping table must have exactly two columns")
                chr_map[row[0]] = row[1]
        if not chr_map:
            raise ValueError("Empty mapping dictionary")

    if args.assembly_report:
        df = pd.read_csv(args.assembly_report,
                         comment="#",
                         header=None,
                         sep="\t")
        # fixes naming inconsistency in assembly report: ensembl chromosome
        # contigs found in column 0, ensembl unassigned contigs found in
        # column 4.
        if args.assembly_report_hasIDs == 1:
            ucsccol = args.assembly_report_ucsccol
            ensemblcol = args.assembly_report_ensemblcol
            df.loc[df[1] == "assembled-molecule",
                   ensemblcol] = df.loc[df[1] == "assembled-molecule", 0]
            if args.sanitize_method == "ucsc":
                assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict()
            elif args.sanitize_method == "ensembl":
                assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict()
            else:
                raise ValueError(''' When using assembly report,
                please specify sanitize method as either
                "ucsc" or "ensembl" to specify direction of conversion
                ''')
        else:
            assembly_dict = {}
        if args.assembly_extras is not None:
            assembly_extras = args.assembly_extras.split(",")
            for item in assembly_extras:
                item = item.split("-")
                assembly_dict[item[0]] = item[1]

    if args.method in ("forward_coordinates", "forward_strand",
                       "add-flank", "add-upstream-flank",
                       "add-downstream-flank") \
       and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if args.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(iotools.open_file(args.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(args.stdin)

    if args.method in ("add-upstream-flank", "add-downstream-flank",
                       "add-flank"):

        add_upstream_flank = "add-upstream-flank" == args.method
        add_downstream_flank = "add-downstream-flank" == args.method
        if args.method == "add-flank":
            add_upstream_flank = add_downstream_flank = True

        upstream_flank = int(args.extension_upstream)
        downstream_flank = int(args.extension_downstream)
        extend_flank = args.flank_method == "extend"

        if args.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, args.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            chunk.sort(key=lambda x: (x.contig, x.start))
            lcontig = contigs[chunk[0].contig]

            if extend_flank:
                if add_upstream_flank:
                    if is_positive:
                        chunk[0].start = max(0,
                                             chunk[0].start - upstream_flank)
                    else:
                        chunk[-1].end = min(lcontig,
                                            chunk[-1].end + upstream_flank)
                if add_downstream_flank:
                    if is_positive:
                        chunk[-1].end = min(lcontig,
                                            chunk[-1].end + downstream_flank)
                    else:
                        chunk[0].start = max(0,
                                             chunk[0].start - downstream_flank)
            else:
                if add_upstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - upstream_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + upstream_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if add_downstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + downstream_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - downstream_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                args.stdout.write(str(gff) + "\n")

    elif args.method == "complement-groups":

        iterator = GTF.joined_iterator(gffs, group_field=args.group_field)

        for chunk in iterator:
            if args.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                args.stdout.write(str(x) + "\n")
                x.start = c.end

    elif args.method == "combine-groups":

        iterator = GTF.joined_iterator(gffs, group_field=args.group_field)

        for chunk in iterator:
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            args.stdout.write(str(x) + "\n")

    elif args.method == "join-features":
        for gff in combineGFF(gffs,
                              min_distance=args.min_distance,
                              max_distance=args.max_distance,
                              min_features=args.min_features,
                              max_features=args.max_features,
                              merge=False,
                              output_format=args.output_format):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "merge-features":
        for gff in combineGFF(gffs,
                              min_distance=args.min_distance,
                              max_distance=args.max_distance,
                              min_features=args.min_features,
                              max_features=args.max_features,
                              merge=True,
                              output_format=args.output_format):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "crop":
        for gff in cropGFF(gffs, args.filename_crop_gff):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "crop-unique":
        for gff in cropGFFUnique(gffs):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "filter-range":

        contig, strand, interval = None, None, None
        try:
            contig, strand, start, sep, end = re.match(
                "(\S+):(\S+):(\d+)(\.\.|-)(\d+)", args.filter_range).groups()
        except AttributeError:
            pass

        if not contig:
            try:
                contig, start, sep, end = re.match("(\S+):(\d+)(\.\.|-)(\d+)",
                                                   args.filter_range).groups()
                strand = None
            except AttributeError:
                pass

        if not contig:
            try:
                start, end = re.match("(\d+)(\.\.|\,|\-)(\d+)",
                                      args.filter_range).groups()
            except AttributeError:
                raise "can not parse range %s" % args.filter_range
            contig = None
            strand = None

        if start:
            interval = (int(start), int(end))
        else:
            interval = None

        E.debug("filter: contig=%s, strand=%s, interval=%s" %
                (str(contig), str(strand), str(interval)))

        for gff in GTF.iterator_filtered(gffs,
                                         contig=contig,
                                         strand=strand,
                                         interval=interval):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "sanitize":

        def assemblyReport(id):
            if id in assembly_dict.keys():
                id = assembly_dict[id]
            # if not in dict, the contig name is forced
            # into the desired convention; this is helpful for user-
            # modified gff files that contain additional contigs
            elif args.sanitize_method == "ucsc":
                if not id.startswith("contig") and not id.startswith("chr"):
                    id = "chr%s" % id
            elif args.sanitize_method == "ensembl":
                if id.startswith("contig"):
                    return id[len("contig"):]
                elif id.startswith("chr"):
                    return id[len("chr"):]
            return id

        if args.sanitize_method == "genome":
            if genome_fasta is None:
                raise ValueError("please specify --genome-file= when using "
                                 "--sanitize-method=genome")
            f = genome_fasta.getToken
        else:
            if args.assembly_report is None:
                raise ValueError(
                    "please specify --assembly-report= when using "
                    "--sanitize-method=ucsc or ensembl")
            f = assemblyReport

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            except KeyError:
                if args.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if args.contig_pattern:
                to_remove = [
                    re.compile(x) for x in args.contig_pattern.split(",")
                ]
                if any([x.search(gff.contig) for x in to_remove]):
                    filtered_contigs[gff.contig] += 1
                    continue

            args.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(list(skipped_contigs.keys())), str(skipped_contigs)))

        if outofrange_contigs:
            E.warn(
                "skipped %i entries on %i contigs because they are out of range: %s"
                % (sum(outofrange_contigs.values()),
                   len(list(
                       outofrange_contigs.keys())), str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(list(filtered_contigs.keys())), str(filtered_contigs)))

    elif args.method == "rename-chr":
        if not chr_map:
            raise ValueError("please supply mapping file")

        for gff in renameChromosomes(gffs, chr_map):
            args.stdout.write(str(gff) + "\n")

    else:

        for gff in gffs:

            if args.method == "forward_coordinates":
                gff.invert(contigs[gff.contig])

            if args.method == "forward_strand":
                gff.invert(contigs[gff.contig])
                gff.strand = "+"

            if agp:
                # note: this works only with forward coordinates
                gff.contig, gff.start, gff.end = agp.mapLocation(
                    gff.contig, gff.start, gff.end)

            args.stdout.write(str(gff) + "\n")

    E.stop()
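
renameChromosomes is referenced above but not shown in this listing. A minimal sketch of what such a helper could look like, given the chr_map built from --rename-chr-file (passing unmapped contigs through unchanged is an assumption, not necessarily the project's behaviour):

def renameChromosomes(gffs, chr_map):
    """Yield gff entries with contig names translated via chr_map."""
    for gff in gffs:
        gff.contig = chr_map.get(gff.contig, gff.contig)
        yield gff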
Example 7
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument(
        "-s",
        "--sample-size",
        dest="sample_size",
        type=float,
        help="sample size. If less than 1, take a proportion of the "
        "chromosome size. If 1 or greater, take a fixed number of variants.")

    parser.set_defaults(input_filename_fasta=None,
                        sample_size=0.001,
                        sample_name="NA12878")

    (args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) > 0:
        args.input_filename_fasta = args[0]

    if args.input_filename_fasta == "-":
        args.input_filename_fasta = args.stdin

    outf = args.stdout
    outf.write("##fileformat=VCFv4.1\n")
    outf.write(
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
    outf.write(
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}\n".format(
            args.sample_name))

    with pysam.FastxFile(args.input_filename_fasta) as inf:
        for record in inf:
            contig = record.name
            sequence = record.sequence
            if args.sample_size < 1.0:
                nsamples = int(float(len(sequence)) * args.sample_size)
            else:
                nsamples = int(args.sample_size)
            E.info("generating {} sampled variants for contig {}".format(
                nsamples, contig))
            sampled_positions = set()
            missing_nsamples = nsamples
            while len(sampled_positions) < nsamples:
                raw_positions = random.sample(
                    list(range(len(sequence))),
                    nsamples - len(sampled_positions))
                filtered_positions = [
                    x for x in raw_positions if sequence[x] != "N"
                ]
                sampled_positions.update(filtered_positions)
                E.debug("sample update: total={}, raw={}, filtered={}".format(
                    len(sampled_positions), len(raw_positions),
                    len(filtered_positions)))

            sampled_positions = sorted(sampled_positions)

            for position in sampled_positions:
                base = sequence[position]
                outf.write("{}\t{}\t.\t{}\t{}\t.\t.\t.\tGT\t0/0\n".format(
                    contig, position + 1, base, base))

    E.stop()
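
The resampling loop above keeps drawing random positions and discarding those that fall on an 'N' until enough positions have been collected. The same idea isolated into a small helper (hypothetical name; like the original, it will loop forever if the sequence has fewer than nsamples non-N bases):

import random

def sample_non_n_positions(sequence, nsamples):
    """Return nsamples distinct 0-based positions whose base is not 'N'."""
    positions = set()
    while len(positions) < nsamples:
        raw = random.sample(range(len(sequence)), nsamples - len(positions))
        positions.update(p for p in raw if sequence[p] != "N")
    return sorted(positions)

print(sample_non_n_positions("ACGTNNNNACGT", 4))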
Example 8
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $1.0$",
                            usage=globals()["__doc__"])

    parser.add_option("-r",
                      "--reffile",
                      dest="reffile",
                      type="string",
                      help="Supply reference gtf file name")

    parser.add_option("-d",
                      "--class-file",
                      dest="classfile",
                      type="string",
                      help="Supply database name")

    parser.add_option("-o",
                      "--outfile",
                      dest="outfile",
                      type="string",
                      help="Supply output bed file name")

    parser.add_option("-u",
                      "--indivfile",
                      dest="indivfile",
                      type="string",
                      help="Supply output bed file name for individual utrons")

    parser.add_option("-p",
                      "--partfile",
                      dest="partfile",
                      type="string",
                      help="Supply output bed file name for partnered utrons")
    parser.add_option(
        "-q",
        "--indivpartfile",
        dest="indivpartfile",
        type="string",
        help="Supply output bed file name for individual partnered utrons")
    parser.add_option("-n",
                      "--novel-file",
                      dest="novelfile",
                      type="string",
                      help="Supply output bed file name for novel introns")
    parser.add_option(
        "--novel-transcript",
        dest="novel_id",
        type="string",
        help="DEBUG: Output info for this transcript from the STDIN")
    parser.add_option(
        "--target-transcript",
        dest="target_id",
        type="string",
        help="DEBUG: Output info for this transcript from ref-file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    outlines = []
    individuals = []
    partnered = []
    individualpartnered = []
    novel = []

    db = pandas.read_csv(options.classfile, sep="\t")

    # This keeps just one entry per-transcript - why?
    #db = db.groupby("transcript_id").first()
    db = db.set_index("transcript_id")
    enshashtable = getGeneTable(options.reffile)

    for novel_transcript in GTF.transcript_iterator(GTF.iterator(
            options.stdin)):

        # Why do it on a gene by gene basis rather than transcript by transcript basis?
        transcript_id = novel_transcript[0].transcript_id

        if transcript_id == options.novel_id:
            output_novel = True
        else:
            output_novel = False

        try:
            geneid = db.loc[transcript_id].match_gene_id
        except KeyError:
            if output_novel:
                E.debug("Transcript %s not in class table" % transcript_id)
            continue

        if pandas.isnull(geneid):
            if output_novel:
                E.debug("Transcript %s matches no gene in class table" %
                        transcript_id)
            continue

        ens_gene = enshashtable[geneid]

        all_ref_introns = set()
        novel_transcript_exons = GTF.asRanges(novel_transcript, "exon")
        novel_transcript_introns = GTF.toIntronIntervals(novel_transcript)
        for ref_transcript in ens_gene["models"].values():
            ref_introns = GTF.toIntronIntervals(ref_transcript)
            all_ref_introns.update(ref_introns)

        #Identify comparison set
        def _in_exon(position, exons):
            return any(e[0] <= position <= e[1] for e in exons)

        # check if this ever gets the wrong start_codon.
        filtered_starts = [
            s for s in ens_gene["start_codons"]
            if _in_exon(s, novel_transcript_exons)
        ]

        if len(filtered_starts) == 0:
            if output_novel:
                E.debug("No starts found for %s" % transcript_id)
            continue

        #if novel_transcript[0].strand == "-":
        #    selected_start = max(filtered_starts)
        #else:
        #    selected_start = min(filtered_starts)

        selected_models = list()
        for startc in filtered_starts:
            selected_models.extend(ens_gene["start_codons"][startc])

        if output_novel:
            E.debug("Transcripts with compatible starts are %s" %
                    selected_models)

        for ref_transcript_id in selected_models:

            if output_novel and ref_transcript_id == options.target_id:
                output_ref = True
            else:
                output_ref = False

            second = ens_gene["models"][ref_transcript_id]
            ens_CDS = GTF.asRanges(second, "CDS")

            if len(ens_CDS) == 0:
                # consider only protein-coding transcripts
                if output_ref:
                    E.debug("%s is not coding" % ref_transcript_id)
                continue

            ens_exons = GTF.asRanges(second, "exon")

            first_introns = set(novel_transcript_introns)
            second_introns = set(GTF.toIntronIntervals(second))

            first_CDSintrons = [
                intron for intron in first_introns
                if (intron[0] > ens_CDS[0][0] and intron[1] < ens_CDS[-1][1])
            ]

            second_CDSintrons = [
                intron for intron in second_introns
                if (intron[0] > ens_CDS[0][0] and intron[1] < ens_CDS[-1][1])
            ]

            first_CDSintrons = set(first_CDSintrons)
            second_CDSintrons = set(second_CDSintrons)

            if not first_CDSintrons == second_CDSintrons:
                if output_ref:
                    E.debug("CDS chains do not match. Chains are:")
                    first_CDSintrons = sorted(list(first_CDSintrons))
                    second_CDSintrons = sorted(list(second_CDSintrons))
                    output = "\n".join(
                        map(str, zip(first_CDSintrons, second_CDSintrons)))
                    E.debug(output)
                continue  # match CDS intron chain

            firstUTRintrons = first_introns - first_CDSintrons

            if len(firstUTRintrons) == 0:
                if output_ref:
                    E.debug("No UTR introns")
                continue

            secondUTRintrons = second_introns - second_CDSintrons

            found = False
            for intron in first_introns:
                if (intron[0] < ens_CDS[-1][1] and
                    intron[1] > ens_CDS[-1][1]) or \
                    (intron[0] < ens_CDS[0][0] and
                     intron[1] > ens_CDS[0][0]):

                    found = True
                    break  # ensure pruned transcript doesn't have
                    # introns overlapping start or stop codons in ensembl
                    # transcript
            if found:
                if output_ref:
                    E.debug("Start or stop in intron")
                continue

            if second[0].strand == "+":
                ens_stop = ens_CDS[-1][1]
                UTR3introns = [
                    intron for intron in firstUTRintrons
                    if intron[0] >= ens_CDS[-1][1]
                    and intron[1] < ens_exons[-1][1]
                ]
                secondUTR3introns = [
                    intron for intron in secondUTRintrons
                    if intron[0] >= ens_CDS[-1][1]
                    and intron[1] < ens_exons[-1][1]
                ]
            else:
                ens_stop = ens_CDS[0][0]
                UTR3introns = [
                    intron for intron in firstUTRintrons if
                    intron[1] <= ens_CDS[0][0] and intron[0] > ens_exons[0][0]
                ]
                secondUTR3introns = [
                    intron for intron in secondUTRintrons if
                    intron[1] <= ens_CDS[0][0] and intron[0] > ens_exons[0][0]
                ]

            if len(UTR3introns) == 0:
                if output_ref:
                    E.debug("No 3' UTR introns")
                continue

            outbed = Bed.Bed()
            outbed.fields = ['.', '.', '.', '.', '.', '.', '.', '.', '.']
            outbed.fromIntervals(UTR3introns)
            outbed.contig = novel_transcript[0].contig
            outbed["name"] = novel_transcript[0].transcript_id
            outbed["strand"] = novel_transcript[0].strand
            outlines.append(outbed)  # get output for each transcript

            for item in UTR3introns:
                outbed2 = Bed.Bed()
                outbed2.fields = ['.', '.', '.', '.']
                outbed2.fromIntervals([item])
                outbed2.contig = novel_transcript[0].contig
                outbed2['name'] = novel_transcript[0].transcript_id
                outbed2["strand"] = novel_transcript[0].strand
                outbed2["thickStart"] = ens_stop
                individuals.append(outbed2)  # get output for each intron

            UTR3introns = set(UTR3introns)
            secondUTR3introns = set(secondUTR3introns)
            extraUTR3introns = list(UTR3introns - secondUTR3introns)

            if output_ref and len(secondUTR3introns - UTR3introns) > 0:
                E.debug("Following introns are in the UTR of %s but not %s" %
                        (options.target_id, options.novel_id))
                E.debug(secondUTR3introns - UTR3introns)

            # get only introns that are not in matched transcript
            if len(extraUTR3introns) != 0 and len(secondUTR3introns -
                                                  UTR3introns) == 0:
                outbed3 = Bed.Bed()
                outbed3.fields = ['.'] * 9
                outbed3.fromIntervals(extraUTR3introns)
                outbed3.contig = novel_transcript[0].contig
                outbed3["name"] = novel_transcript[
                    0].transcript_id + ":" + second[0].transcript_id
                outbed3["strand"] = novel_transcript[0].strand
                partnered.append(outbed3)

                for item in extraUTR3introns:
                    outbed4 = Bed.Bed()
                    outbed4.fields = ['.', '.', '.', '.']
                    outbed4.fromIntervals([item])
                    outbed4.contig = novel_transcript[0].contig
                    outbed4["name"] = novel_transcript[
                        0].transcript_id + ":" + second[0].transcript_id
                    outbed4["strand"] = novel_transcript[0].strand
                    outbed4["thickStart"] = ens_stop
                    individualpartnered.append(outbed4)

            if len(all_ref_introns) == 0:
                ens_starts, ens_ends = [], []
            else:
                ens_starts, ens_ends = zip(*all_ref_introns)

            novelEvents = [
                i for i in UTR3introns
                if i[0] not in ens_starts and i[1] not in ens_ends
            ]

            for item in novelEvents:
                outbed5 = Bed.Bed()
                outbed5.fields = ['.'] * 4
                outbed5.fromIntervals([item])
                outbed5.contig = novel_transcript[0].contig
                outbed5["name"] = novel_transcript[
                    0].transcript_id + ":" + second[0].transcript_id
                outbed5["strand"] = novel_transcript[0].strand
                outbed5["thickStart"] = ens_stop
                novel.append(outbed5)

    with IOTools.open_file(options.outfile, "w") as outf:
        for line in outlines:
            outf.write(str(line) + "\n")

    if options.indivfile is not None:
        with IOTools.open_file(options.indivfile, "w") as outf2:
            for line in individuals:
                outf2.write(str(line) + "\n")

    if options.partfile is not None:
        with IOTools.open_file(options.partfile, "w") as outf3:
            for line in partnered:
                outf3.write(str(line) + "\n")

    if options.indivpartfile is not None:
        with IOTools.open_file(options.indivpartfile, "w") as outf4:
            for line in individualpartnered:
                outf4.write(str(line) + "\n")

    if options.novelfile is not None:
        with IOTools.open_file(options.novelfile, "w") as outf5:
            for line in novel:
                outf5.write(str(line) + "\n")
    # write footer and output benchmark information.
    E.stop()
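# A minimal, self-contained sketch of the comparison performed above, using
# plain (start, end) tuples instead of GTF records (all coordinates below are
# invented).  A novel transcript is accepted when its introns inside the
# reference CDS reproduce the reference intron chain exactly; any additional
# introns downstream of the stop codon are then reported as candidate
# 3' UTR introns.
def extra_utr3_introns(novel_introns, ref_introns, cds_start, cds_end,
                       strand="+"):

    def in_cds(intron):
        return intron[0] > cds_start and intron[1] < cds_end

    novel_introns, ref_introns = set(novel_introns), set(ref_introns)
    if {i for i in novel_introns if in_cds(i)} != \
       {i for i in ref_introns if in_cds(i)}:
        return set()  # CDS intron chains differ - not a match
    if strand == "+":
        utr3 = {i for i in novel_introns if i[0] >= cds_end}
    else:
        utr3 = {i for i in novel_introns if i[1] <= cds_start}
    return utr3 - ref_introns

# one shared CDS intron, one extra intron after the stop codon
print(extra_utr3_introns(novel_introns=[(150, 200), (410, 470)],
                         ref_introns=[(150, 200)],
                         cds_start=100, cds_end=400))  # {(410, 470)}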
Example n. 9
def read_liftover_chain(infile):

    E.debug("started reading mapping information")

    map_id2chromosome = [
        "",
    ]
    map_chromosome2id = {}
    n = 0

    Chain = collections.namedtuple("Chain", [
        "score", "target_name", "target_size", "target_strand", "target_start",
        "target_end", "query_name", "query_size", "query_strand",
        "query_start", "query_end", "chainid"
    ])

    def blocks(infile):

        chain_data, alignment_data = None, []
        for line in infile:
            if line.startswith("chain"):
                chain_data = Chain._make(line[:-1].split(" ")[1:])

                if chain_data.target_strand == "-":
                    raise NotImplementedError("target strand is negative")
                alignment_data = []
            elif line.strip() == "":
                if chain_data is not None:
                    yield chain_data, alignment_data
                    chain_data, alignment_data = None, []
            else:
                alignment_data.append(list(map(int, line.split("\t"))))

        # yield the final block if the file does not end in a blank line
        if chain_data is not None:
            yield chain_data, alignment_data

    map_chromosomes = collections.defaultdict(quicksect.IntervalTree)
    map_contig2length = collections.defaultdict(int)

    for chain_data, alignment_data in blocks(infile):

        map_contig2length[chain_data.query_name] = int(chain_data.query_size)

        # target maps to query.
        # Coordinates are zero-based, half-open. When
        # the strand value is "-", position coordinates are listed in
        # terms of the reverse-complemented sequence.
        x = int(chain_data.target_start)
        y = int(chain_data.query_start)
        # revert coordinates for negative strands (it seems that
        # the mapping file uses reverse coordinates, while liftover
        # output doesn't)
        invert = chain_data.query_strand == "-"
        mm = map_chromosomes[chain_data.target_name]

        for d in alignment_data:
            if len(d) == 3:
                size, increment_x, increment_y = d
            else:
                size, increment_x, increment_y = d[0], 0, 0

            mm.add(x, x + size, (chain_data.query_name, y, y + size, invert))

            x += increment_x + size
            y += increment_y + size

            if y < 0:
                raise ValueError(
                    "illegal mapping in chain {}".format(chain_data))

    return map_chromosomes, map_contig2length
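# Hypothetical usage sketch for the mapper returned above.  It assumes
# quicksect's IntervalTree.search(start, end) method, whose hits expose
# .start and .data, with .data holding the (query_name, query_start,
# query_end, invert) tuple stored by read_liftover_chain(); the chain file
# name below is an assumption as well.
def lift_point(map_chromosomes, contig, pos):
    for hit in map_chromosomes[contig].search(pos, pos + 1):
        query_name, qstart, qend, invert = hit.data
        # note: when invert is True the returned coordinate is still in
        # reverse-complement terms and needs map_contig2length to flip back
        return query_name, qstart + (pos - hit.start), invert
    return None  # position not covered by any chain block

# with iotools.open_file("hg19ToHg38.over.chain.gz") as inf:
#     map_chromosomes, map_contig2length = read_liftover_chain(inf)
#     print(lift_point(map_chromosomes, "chr1", 1000000))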
Example n. 10
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e",
        "--exclusive-overlap",
        dest="exclusive",
        action="store_true",
        help="Intervals reported will be merged across the "
        "positive set and do not overlap any interval in any of the "
        "other sets [default=%default].")

    parser.add_option("-p",
                      "--pattern-identifier",
                      dest="pattern_id",
                      type="string",
                      help="pattern to convert a filename "
                      "to an id [default=%default].")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("merged-combinations", "unmerged-combinations"),
                      help="method to perform [default=%default]")

    parser.set_defaults(
        pattern_id="(.*).bed.gz",
        exclusive=False,
        method="merged-combinations",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) < 2:
        raise ValueError("at least two arguments required")

    tags, bedfiles = [], []
    for infile in args:
        bedfiles.append(pysam.Tabixfile(infile, "r"))
        tags.append(re.search(options.pattern_id, infile).groups()[0])

    indices = list(range(len(bedfiles)))
    is_exclusive = options.exclusive

    if options.method == "merged-combinations":

        if is_exclusive:
            start = 1
        else:
            start = 2

        options.stdout.write("combination\twithout\tcounts\n")

        for ncombinants in range(start, len(bedfiles) + 1):
            for combination in itertools.combinations(indices, ncombinants):
                other = [x for x in indices if x not in combination]
                tag = ":".join([tags[x] for x in combination])
                E.debug("combination %s started" % tag)
                E.debug("other: %s" % ":".join([tags[x] for x in other]))

                other_bed = [bedfiles[x] for x in other]
                outf = iotools.open_file(E.get_output_file(tag),
                                         "w",
                                         create_dir=True)
                c = E.Counter()
                for contig, start, end in combineMergedIntervals(
                    [bedfiles[x] for x in combination]):
                    c.found += 1
                    if is_exclusive and isContainedInOne(
                            contig, start, end, other_bed):
                        c.removed += 1
                        continue
                    c.output += 1
                    outf.write("%s\t%i\t%i\n" % (contig, start, end))

                outf.close()
                E.info("combination %s finished: %s" % (tag, c))

                options.stdout.write("%s\t%s\t%i\n" %
                                     (":".join([tags[x] for x in combination]),
                                      ":".join([tags[x]
                                                for x in other]), c.output))

    elif options.method == "unmerged-combinations":
        options.stdout.write("track\tcombination\twithout\tcounts\n")

        for foreground in indices:

            start = 0

            background = [x for x in indices if x != foreground]
            for ncombinants in range(0, len(background) + 1):
                for combination in itertools.combinations(
                        background, ncombinants):
                    other = [x for x in background if x not in combination]
                    combination_bed = [bedfiles[x] for x in combination]
                    other_bed = [bedfiles[x] for x in other]
                    tag = ":".join([tags[foreground]] +
                                   [tags[x] for x in combination])

                    E.debug("fg=%i, combination=%s, other=%s" %
                            (foreground, combination, other))
                    E.debug("combination %s started" % tag)
                    E.debug("other: %s" % ":".join([tags[x] for x in other]))

                    outf = iotools.open_file(E.get_output_file(tag),
                                             "w",
                                             create_dir=True)
                    c = E.Counter()
                    for bed in combineUnmergedIntervals(
                            bedfiles[foreground], combination_bed):
                        c.found += 1
                        if is_exclusive and isContainedInOne(
                                bed.contig, bed.start, bed.end, other_bed):
                            c.removed += 1
                            continue
                        c.output += 1
                        outf.write("%s\n" % str(bed))

                    outf.close()
                    E.info("combination %s finished: %s" % (tag, c))

                    options.stdout.write(
                        "%s\t%s\t%s\t%i\n" % (tags[foreground], ":".join([
                            tags[x] for x in combination
                        ]), ":".join([tags[x] for x in other]), c.output))

    E.stop()
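# Sketch of the combination bookkeeping used above, on plain labels rather
# than tabix-indexed bed files (the tags are invented).  For the
# "merged-combinations" method every combination of two or more sets is
# enumerated, and the remaining sets form the "without" column.
import itertools

tags = ["H3K4me3", "H3K27ac", "CTCF"]
indices = list(range(len(tags)))
for ncombinants in range(2, len(tags) + 1):
    for combination in itertools.combinations(indices, ncombinants):
        other = [x for x in indices if x not in combination]
        print(":".join(tags[x] for x in combination), "\t",
              ":".join(tags[x] for x in other))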
Example n. 11
def buildSpikeResults(infile, outfile):
    '''build matrices with results from spike-in and upload
    into database.

    The method will output several files:

    .spiked.gz: Number of intervals that have been spiked in
               for each bin of expression and fold-change

    .power.gz: Global power analysis - aggregates over all
        ranges of fold-change and expression and outputs the
        power, the proportion of intervals overall that
        could be detected as differentially methylated.

        This is a table with the following columns:

        fdr - fdr threshold
        power - power level, number of intervals detectable
        intervals - number of intervals in observed data at given
                    level of fdr and power.
        intervals_percent - percentage of intervals in observed data
              at given level of fdr and power

    The method will also upload the results into the database.

    Arguments
    ---------
    infile : string
        Input filename in :term:`tsv` format. Usually the output of
        :mod:`scripts/runExpression`.
    outfile : string
        Output filename in :term:`tsv` format.

    '''

    expression_nbins = 10
    fold_nbins = 10

    spikefile = P.snip(infile, '.tsv.gz') + '.spike.gz'

    if not os.path.exists(spikefile):
        E.warn('no spike data: %s' % spikefile)
        iotools.touch_file(outfile)
        return

    ########################################
    # output and load spiked results
    tmpfile_name = P.get_temp_filename(shared=True)

    statement = '''zcat %(spikefile)s
    | grep -e "^spike" -e "^test_id"
    > %(tmpfile_name)s
    '''
    P.run(statement)

    E.debug("outputting spiked counts")
    (spiked, spiked_d2hist_counts, xedges, yedges,
     spiked_l10average, spiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".spiked.gz",
            infile_name=tmpfile_name,
            expression_nbins=expression_nbins,
            fold_nbins=fold_nbins)

    ########################################
    # output and load unspiked results
    statement = '''zcat %(infile)s
    | grep -v -e "^spike"
    > %(tmpfile_name)s
    '''
    P.run(statement)
    E.debug("outputting unspiked counts")

    (unspiked, unspiked_d2hist_counts, unspiked_xedges,
     unspiked_yedges, unspiked_l10average, unspiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".unspiked.gz",
            infile_name=tmpfile_name,
            expression_bins=xedges,
            fold_bins=yedges)

    E.debug("computing power")

    assert numpy.array_equal(xedges, unspiked_xedges)

    tmpfile = iotools.open_file(tmpfile_name, "w")
    tmpfile.write("\t".join(("expression", "fold", "fdr", "counts",
                             "percent")) + "\n")

    fdr_thresholds = [0.01, 0.05] + list(numpy.arange(0.1, 1.0, 0.1))
    power_thresholds = numpy.arange(0.1, 1.1, 0.1)

    spiked_total = float(spiked_d2hist_counts.sum().sum())
    unspiked_total = float(unspiked_d2hist_counts.sum().sum())

    outf = iotools.open_file(outfile, "w")
    outf.write("fdr\tpower\tintervals\tintervals_percent\n")

    # significant results
    for fdr in fdr_thresholds:
        take = spiked['qvalue'] < fdr

        # compute 2D histogram in spiked data below fdr threshold
        spiked_d2hist_fdr, xedges, yedges = \
            numpy.histogram2d(spiked_l10average[take],
                              spiked_l2fold[take],
                              bins=(xedges, yedges))

        # convert to percentage of spike-ins per bin
        spiked_d2hist_fdr_normed = spiked_d2hist_fdr / spiked_d2hist_counts
        spiked_d2hist_fdr_normed = numpy.nan_to_num(spiked_d2hist_fdr_normed)

        # set values without data to -1
        spiked_d2hist_fdr_normed[spiked_d2hist_counts == 0] = -1.0

        # output to table for database upload
        for x, y in itertools.product(list(range(len(xedges) - 1)),
                                      list(range(len(yedges) - 1))):
            tmpfile.write("\t".join(
                map(str, (xedges[x], yedges[y], fdr, spiked_d2hist_fdr[x, y],
                          100.0 * spiked_d2hist_fdr_normed[x, y]))) + "\n")

        # take elements in spiked_hist_fdr above a certain threshold
        for power in power_thresholds:
            # select 2D bins at a given power level
            power_take = spiked_d2hist_fdr_normed >= power

            # select the counts in the unspiked data according
            # to this level
            power_counts = unspiked_d2hist_counts[power_take]

            outf.write("\t".join(
                map(str, (fdr, power, power_counts.sum().sum(), 100.0 *
                          power_counts.sum().sum() / unspiked_total))) + "\n")

    tmpfile.close()
    outf.close()

    # upload into table
    method = P.snip(os.path.dirname(outfile), ".dir")
    tablename = P.to_table(
        P.snip(outfile, "power.gz") + method + ".spike.load")

    P.load(tmpfile_name,
           outfile + ".log",
           tablename=tablename,
           options="--add-index=fdr")

    os.unlink(tmpfile_name)
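# A minimal numpy sketch of the power calculation above: the fraction of
# spike-ins recovered per (expression, fold-change) bin defines the power of
# that bin, and the unspiked intervals falling into bins at or above a power
# threshold are the intervals deemed detectable.  All counts below are
# invented.
import numpy

spiked_total_per_bin = numpy.array([[10., 10.], [10., 10.]])
spiked_detected_per_bin = numpy.array([[9., 2.], [7., 0.]])
unspiked_per_bin = numpy.array([[100., 50.], [80., 20.]])

power_per_bin = numpy.nan_to_num(spiked_detected_per_bin / spiked_total_per_bin)
for power in (0.5, 0.8):
    detectable = unspiked_per_bin[power_per_bin >= power].sum()
    print(power, detectable, 100.0 * detectable / unspiked_per_bin.sum())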
Example n. 12
def outputSpikeCounts(outfile,
                      infile_name,
                      expression_nbins=None,
                      fold_nbins=None,
                      expression_bins=None,
                      fold_bins=None):
    """count significant results in bins of expression and fold change.

    This method groups the results of a DE analysis into a 2-dimensional
    histogram by tag counts/expression level and fold change.

    Either supply one of `nbins` or `bins` for the histograms.

    Arguments
    ---------
    outfile : string
        Output filename
    infile_name : string
        Input filename in :term:`tsv` format. Usually the output of
        :mod:`scripts/runExpression`.
    expression_nbins : int
        Number of bins to use for tag count histogram.
    fold_nbins : int
        Number of bins to use for fold-change histogram.
    expression_bins : list
        List of bins to use for tag count histogram.
    fold_bins : list
        List of bins to use for fold-change histogram.
    """

    df = pandas.read_csv(infile_name, sep="\t", index_col=0)

    E.debug("read %i rows and %i columns of data" % df.shape)

    if "edger" in outfile.lower():
        # edger: treatment_mean and control_mean do not exist
        # use supplied values directly.
        l10average = numpy.log(df['treatment_mean'])
        l2fold = numpy.log2(df['fold'])
    else:
        # use pseudocounts to compute fold changes
        treatment_mean = df['treatment_mean'] + 1
        control_mean = df['control_mean'] + 1
        # build log2 average values
        l10average = numpy.log((treatment_mean + control_mean) / 2)
        l2fold = numpy.log2(treatment_mean / control_mean)

    if expression_nbins is not None:
        mm = math.ceil(max(l10average))
        expression_bins = numpy.arange(0, mm, mm / expression_nbins)

    if fold_nbins is not None:
        mm = math.ceil(max(abs(min(l2fold)), abs(max(l2fold))))
        # ensure that range is centered on exact 0
        n = math.ceil(fold_nbins / 2.0)
        fold_bins = numpy.concatenate(
            (-numpy.arange(0, mm, mm / n)[:0:-1], numpy.arange(0, mm, mm / n)))

    # compute expression bins
    d2hist_counts, xedges, yedges = numpy.histogram2d(l10average,
                                                      l2fold,
                                                      bins=(expression_bins,
                                                            fold_bins))

    dd = pandas.DataFrame(d2hist_counts)
    dd.index = list(xedges[:-1])
    dd.columns = list(yedges[:-1])
    dd.to_csv(iotools.open_file(outfile, "w"), sep="\t")

    return df, d2hist_counts, xedges, yedges, l10average, l2fold
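# Self-contained sketch of the binning performed by outputSpikeCounts(),
# using synthetic values in place of a DE result table: expression and fold
# change are histogrammed jointly and the matrix is labelled with the lower
# bin edges, as in the function above.
import numpy
import pandas

rng = numpy.random.RandomState(0)
l10average = rng.uniform(0, 10, size=1000)   # stand-in for log mean expression
l2fold = rng.normal(0, 2, size=1000)         # stand-in for log2 fold change

d2hist_counts, xedges, yedges = numpy.histogram2d(l10average, l2fold,
                                                  bins=(10, 10))
dd = pandas.DataFrame(d2hist_counts,
                      index=list(xedges[:-1]),
                      columns=list(yedges[:-1]))
print(dd.iloc[:3, :3])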
Example n. 13
def main(argv=None):
    """script main.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-o",
                        "--output-format",
                        dest="output_format",
                        type=str,
                        choices=("bedgraph", "wiggle", "bigbed", "bigwig",
                                 "bed"),
                        help="output format [default=%default]")

    parser.add_argument("-s",
                        "--shift-size",
                        dest="shift",
                        type=int,
                        help="shift reads by a certain amount (ChIP-Seq) ")

    parser.add_argument("-e",
                        "--extend",
                        dest="extend",
                        type=int,
                        help="extend reads by a certain amount "
                        "(ChIP-Seq) ")

    parser.add_argument("-p",
                        "--wiggle-span",
                        dest="span",
                        type=int,
                        help="span of a window in wiggle tracks ")

    parser.add_argument("-m",
                        "--merge-pairs",
                        dest="merge_pairs",
                        action="store_true",
                        help="merge paired-ended reads into a single "
                        "bed interval [default=%default].")

    parser.add_argument("--scale-base",
                        dest="scale_base",
                        type=float,
                        help="number of reads/pairs to scale bigwig file to. "
                        "The default is to scale to 1M reads ")

    parser.add_argument(
        "--scale-method",
        dest="scale_method",
        type=str,
        choices=(
            "none",
            "reads",
        ),
        help="scale bigwig output. 'reads' will normalize by "
        "the total number reads in the bam file that are used "
        "to construct the bigwig file. If --merge-pairs is used "
        "the number of pairs output will be used for "
        "normalization. 'none' will not scale the bigwig file")

    parser.add_argument("--max-insert-size",
                        dest="max_insert_size",
                        type=int,
                        help="only merge if insert size less that "
                        "# bases. 0 turns of this filter ")

    parser.add_argument("--min-insert-size",
                        dest="min_insert_size",
                        type=int,
                        help="only merge paired-end reads if they are "
                        "at least # bases apart. "
                        "0 turns of this filter.")

    parser.set_defaults(
        samfile=None,
        output_format="wiggle",
        shift=0,
        extend=0,
        span=1,
        merge_pairs=None,
        min_insert_size=0,
        max_insert_size=0,
        scale_method='none',
        scale_base=1000000,
    )

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser,
                              argv=argv,
                              add_output_options=True,
                              unknowns=True)

    if len(unknown) >= 1:
        args.samfile = unknown[0]
    if len(unknown) == 2:
        args.output_filename_pattern = unknown[1]
    if not args.samfile:
        raise ValueError("please provide a bam file")

    # Read BAM file using Pysam
    samfile = pysam.AlignmentFile(args.samfile, "rb")

    # Create temporary files / folders
    tmpdir = tempfile.mkdtemp()
    E.debug("temporary files are in %s" % tmpdir)
    tmpfile_wig = os.path.join(tmpdir, "wig")
    tmpfile_sizes = os.path.join(tmpdir, "sizes")

    # Create dictionary of contig sizes
    contig_sizes = dict(list(zip(samfile.references, samfile.lengths)))
    # write contig sizes
    outfile_size = iotools.open_file(tmpfile_sizes, "w")
    for contig, size in sorted(contig_sizes.items()):
        outfile_size.write("%s\t%s\n" % (contig, size))
    outfile_size.close()

    # Shift and extend only available for bigwig format
    if args.shift or args.extend:
        if args.output_format != "bigwig":
            raise ValueError(
                "shift and extend only available for bigwig output")

    # Output filename required for bigwig / bigbed computation
    if args.output_format == "bigwig":
        if not args.output_filename_pattern:
            raise ValueError(
                "please specify an output file for bigwig computation.")

        # Define executable to use for binary conversion
        if args.output_format == "bigwig":
            executable_name = "wigToBigWig"
        else:
            raise ValueError("unknown output format `%s`" % args.output_format)

        # check required executable file is in the path
        executable = iotools.which(executable_name)
        if not executable:
            raise OSError("could not find %s in path." % executable_name)

        # Open output file
        outfile = iotools.open_file(tmpfile_wig, "w")
        E.info("starting output to %s" % tmpfile_wig)
    else:
        outfile = iotools.open_file(tmpfile_wig, "w")
        E.info("starting output to stdout")

    # Set up output write functions
    if args.output_format in ("wiggle", "bigwig"):
        # wiggle is one-based, so add 1, also step-size is 1, so need
        # to output all bases
        if args.span == 1:
            outf = lambda outfile, contig, start, end, val: \
                outfile.write(
                    "".join(["%i\t%i\n" % (x, val)
                             for x in range(start + 1, end + 1)]))
        else:
            outf = SpanWriter(args.span)
    elif args.output_format == "bedgraph":
        # bed is 0-based, open-closed
        outf = lambda outfile, contig, start, end, val: \
            outfile.write("%s\t%i\t%i\t%i\n" % (contig, start, end, val))

    # initialise counters
    ninput, nskipped, ncontigs = 0, 0, 0

    # set output file name
    output_filename_pattern = args.output_filename_pattern
    if output_filename_pattern:
        output_filename = os.path.abspath(output_filename_pattern)

    # shift and extend or merge pairs. Output temporary bed file
    if args.shift > 0 or args.extend > 0 or args.merge_pairs:
        # Workflow 1: convert to bed intervals and use bedtools
        # genomecov to build a coverage file.
        # Convert to bigwig with the UCSC tool bedGraphToBigWig

        if args.merge_pairs:
            # merge pairs using bam2bed
            E.info("merging pairs to temporary file")
            counter = merge_pairs(samfile,
                                  outfile,
                                  min_insert_size=args.min_insert_size,
                                  max_insert_size=args.max_insert_size,
                                  bed_format=3)
            E.info("merging results: {}".format(counter))
            if counter.output == 0:
                raise ValueError("no pairs output after merging")
        else:
            # create bed file with shifted/extended tags
            shift, extend = args.shift, args.extend
            shift_extend = shift + extend
            counter = E.Counter()

            for contig in samfile.references:
                E.debug("output for %s" % contig)
                lcontig = contig_sizes[contig]

                for read in samfile.fetch(contig):
                    pos = read.pos
                    if read.is_reverse:
                        start = max(0, read.pos + read.alen - shift_extend)
                    else:
                        start = max(0, read.pos + shift)

                    # intervals extending beyond contig are removed
                    if start >= lcontig:
                        continue

                    end = min(lcontig, start + extend)
                    outfile.write("%s\t%i\t%i\n" % (contig, start, end))
                    counter.output += 1

        outfile.close()

        if args.scale_method == "reads":
            scale_factor = float(args.scale_base) / counter.output

            E.info("scaling: method=%s scale_quantity=%i scale_factor=%f" %
                   (args.scale_method, counter.output, scale_factor))
            scale = "-scale %f" % scale_factor
        else:
            scale = ""

        # Convert bed file to coverage file (bedgraph)
        tmpfile_bed = os.path.join(tmpdir, "bed")
        E.info("computing coverage")
        # calculate coverage - format is bedgraph
        statement = """bedtools genomecov -bg -i %(tmpfile_wig)s %(scale)s
        -g %(tmpfile_sizes)s > %(tmpfile_bed)s""" % locals()
        E.run(statement)

        # Convert bedgraph to bigwig
        E.info("converting to bigwig")
        tmpfile_sorted = os.path.join(tmpdir, "sorted")
        statement = ("sort -k 1,1 -k2,2n %(tmpfile_bed)s > %(tmpfile_sorted)s;"
                     "bedGraphToBigWig %(tmpfile_sorted)s %(tmpfile_sizes)s "
                     "%(output_filename_pattern)s" % locals())
        E.run(statement)

    else:

        # Workflow 2: use pysam column iterator to build a
        # wig file. Then convert to bigwig of bedgraph file
        # with UCSC tools.
        def column_iter(iterator):
            start = None
            end = 0
            n = None
            for t in iterator:
                if t.pos - end > 1 or n != t.n:
                    if start is not None:
                        yield start, end, n
                    start = t.pos
                    end = t.pos
                    n = t.n
                end = t.pos
            if start is not None:
                yield start, end, n

        if args.scale_method != "none":
            raise NotImplementedError(
                "scaling not implemented for pileup method")

        # Bedgraph track definition
        if args.output_format == "bedgraph":
            outfile.write("track type=bedGraph\n")

        for contig in samfile.references:
            # if contig != "chrX": continue
            E.debug("output for %s" % contig)
            lcontig = contig_sizes[contig]

            # Write wiggle header
            if args.output_format in ("wiggle", "bigwig"):
                outfile.write("variableStep chrom=%s span=%i\n" %
                              (contig, args.span))

            # Generate pileup per contig using pysam and iterate over columns
            for start, end, val in column_iter(samfile.pileup(contig)):
                # patch: there was a problem with bam files and reads
                # overextending at the end. These are usually Ns, but
                # need to check as otherwise wigToBigWig fails.
                if lcontig <= end:
                    E.warn("read extending beyond contig: %s: %i > %i" %
                           (contig, end, lcontig))
                    end = lcontig
                    if start >= end:
                        continue

                if val > 0:
                    outf(outfile, contig, start, end, val)
            ncontigs += 1

        # Close output file
        if isinstance(outf, SpanWriter):
            outf.flush(outfile)
        else:
            outfile.flush()

        E.info("finished output")

        # Report counters
        E.info("ninput=%i, ncontigs=%i, nskipped=%i" %
               (ninput, ncontigs, nskipped))

        # Convert to binary formats
        if args.output_format == "bigwig":
            outfile.close()

            E.info("starting %s conversion" % executable)
            try:
                retcode = subprocess.call(" ".join(
                    (executable, tmpfile_wig, tmpfile_sizes,
                     output_filename_pattern)),
                                          shell=True)
                if retcode != 0:
                    E.warn("%s failed with non-zero exit status %i" %
                           (executable, retcode))
                    return retcode
            except OSError as msg:
                E.warn("Error while executing bigwig: %s" % msg)
                return 1
            E.info("finished bigwig conversion")
        else:
            with open(tmpfile_wig) as inf:
                sys.stdout.write(inf.read())

    # Cleanup temp files
    shutil.rmtree(tmpdir)

    E.stop()
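# A small self-contained check of the run-length collapsing performed by
# column_iter() above: consecutive pileup columns with the same depth are
# merged into (start, end, depth) runs.  FakeColumn stands in for pysam's
# pileup column, carrying only the .pos and .n attributes used here.
import collections

FakeColumn = collections.namedtuple("FakeColumn", ["pos", "n"])

def column_iter(iterator):
    start, end, n = None, 0, None
    for t in iterator:
        if t.pos - end > 1 or n != t.n:
            if start is not None:
                yield start, end, n
            start, end, n = t.pos, t.pos, t.n
        end = t.pos
    if start is not None:
        yield start, end, n

columns = [FakeColumn(p, d) for p, d in
           [(10, 1), (11, 1), (12, 2), (13, 2), (20, 1)]]
print(list(column_iter(columns)))   # [(10, 11, 1), (12, 13, 2), (20, 20, 1)]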
Example n. 14
def compareCheckSums(infiles, outfile):
    '''compare checksum files against existing reference data.
    '''

    outf = iotools.open_file(outfile, "w")
    outf.write("\t".join((
        ("track", "status", "job_finished", "nfiles", "nref", "missing",
         "extra", "different", "different_md5", "different_lines", "same",
         "same_md5", "same_lines", "same_exist", "files_missing",
         "files_extra", "files_different_md5", "files_different_lines"))) +
               "\n")

    for infile in infiles:
        E.info("working on {}".format(infile))
        track = P.snip(infile, ".stats")

        logfiles = glob.glob(track + "*.log")
        job_finished = True
        for logfile in logfiles:
            is_complete = iotools.is_complete(logfile)
            E.debug("logcheck: {} = {}".format(logfile, is_complete))
            job_finished = job_finished and is_complete

        reffile = track + ".ref"

        # regular expression of files to test only for existence
        regex_exist = PARAMS.get('%s_regex_exist' % track, None)
        if regex_exist:
            regex_exist = re.compile("|".join(P.as_list(regex_exist)))

        regex_linecount = PARAMS.get('%s_regex_linecount' % track, None)
        if regex_linecount:
            regex_linecount = re.compile("|".join(P.as_list(regex_linecount)))

        regex_md5 = PARAMS.get('%s_regex_md5' % track, None)
        if regex_md5:
            regex_md5 = re.compile("|".join(P.as_list(regex_md5)))

        if not os.path.exists(reffile):
            raise ValueError('no reference data defined for %s' % track)

        cmp_data = pandas.read_csv(iotools.open_file(infile),
                                   sep="\t",
                                   index_col=0)

        ref_data = pandas.read_csv(iotools.open_file(reffile),
                                   sep="\t",
                                   index_col=0)

        shared_files = set(cmp_data.index).intersection(ref_data.index)
        missing = set(ref_data.index).difference(cmp_data.index)
        extra = set(cmp_data.index).difference(ref_data.index)

        different = set(shared_files)

        # remove those for which only check for existence
        if regex_exist:
            same_exist = set([x for x in different if regex_exist.search(x)])

            different = set(
                [x for x in different if not regex_exist.search(x)])
        else:
            same_exist = set()

        # select those for which only check for number of lines
        if regex_linecount:
            check_lines = [x for x in different if regex_linecount.search(x)]

            dd = (cmp_data['nlines'][check_lines] !=
                  ref_data['nlines'][check_lines])
            different_lines = set(dd.index[dd])
            different = different.difference(check_lines)

            dd = (cmp_data['nlines'][check_lines] == ref_data['nlines']
                  [check_lines])
            same_lines = set(dd.index[dd])

        else:
            different_lines = set()
            same_lines = set()

        # remainder - check md5
        if regex_md5:
            check_md5 = [x for x in different if regex_md5.search(x)]

            dd = (cmp_data['md5'][check_md5] != ref_data['md5'][check_md5])
            different_md5 = set(dd.index[dd])

            dd = (cmp_data['md5'][check_md5] == ref_data['md5'][check_md5])
            same_md5 = set(dd.index[dd])

        else:
            different_md5 = set()
            same_md5 = set()

        if job_finished and (len(missing) + len(extra) + len(different_md5) +
                             len(different_lines) == 0):
            status = "OK"
        else:
            status = "FAIL"

        outf.write("\t".join(
            map(str, (
                track,
                status,
                job_finished,
                len(cmp_data),
                len(ref_data),
                len(missing),
                len(extra),
                len(different_md5) + len(different_lines),
                len(different_md5),
                len(different_lines),
                len(same_md5) + len(same_lines) + len(same_exist),
                len(same_md5),
                len(same_lines),
                len(same_exist),
                ",".join(missing),
                ",".join(extra),
                ",".join(different_md5),
                ",".join(different_lines),
            ))) + "\n")

    outf.close()
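# Minimal pandas sketch of the comparison logic above: align a result table
# against a reference table by file name, then classify entries as missing,
# extra, or different by md5 (the frames below are invented).
import pandas

cmp_data = pandas.DataFrame(
    {"md5": ["aaa", "bbb", "ccc"]}, index=["a.tsv", "b.tsv", "c.tsv"])
ref_data = pandas.DataFrame(
    {"md5": ["aaa", "xxx", "ddd"]}, index=["a.tsv", "b.tsv", "d.tsv"])

shared = set(cmp_data.index) & set(ref_data.index)
missing = set(ref_data.index) - set(cmp_data.index)
extra = set(cmp_data.index) - set(ref_data.index)
different_md5 = {x for x in shared
                 if cmp_data.loc[x, "md5"] != ref_data.loc[x, "md5"]}
print(sorted(missing), sorted(extra), sorted(different_md5))
# ['d.tsv'] ['c.tsv'] ['b.tsv']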
Example n. 15
def runDRMAA(data, environment):
    '''run jobs in data using drmaa to connect to the cluster.'''

    # SNS: Error detection is now taken care of by the Cluster.py
    # expandStatement function

    # working directory - needs to be the one from which the
    # the script is called to resolve input files.
    cwd = os.getcwd()

    session = drmaa.Session()
    session.initialize()

    jobids = []
    kwargs = {}

    for filename, cmd, options, tmpdir, subdirs in data:

        from_stdin, to_stdout = True, True

        if subdirs:
            outdir = "%s.dir/" % (filename)
            os.mkdir(outdir)
            cmd = re.sub("%DIR%", outdir, cmd)

        x = re.search("'--log=(\S+)'", cmd) or re.search("'--L\s+(\S+)'", cmd)
        if x:
            logfile = filename + ".log"
            cmd = cmd[:x.start()] + "--log=%s" % logfile + cmd[x.end():]
        else:
            logfile = filename + ".out"

        if "%STDIN%" in cmd:
            cmd = re.sub("%STDIN%", filename, cmd)
            from_stdin = False

        if "%STDOUT%" in cmd:
            cmd = re.sub("%STDOUT%", filename + ".out", cmd)
            to_stdout = False

        cmd = " ".join(re.sub("\t+", " ", cmd).split("\n"))
        E.info("running statement:\n%s" % cmd)

        job_script = tempfile.NamedTemporaryFile(dir=os.getcwd(),
                                                 delete=False,
                                                 mode="w+t")
        job_script.write("#!/bin/bash\n")  # -l -O expand_aliases\n" )
        job_script.write(Cluster.expandStatement(cmd) + "\n")
        job_script.close()

        job_path = os.path.abspath(job_script.name)

        os.chmod(job_path, stat.S_IRWXG | stat.S_IRWXU)

        # get session for process - only one is permitted

        job_name = os.path.basename(kwargs.get("outfile", "farm.py"))

        options_dict = vars(options)
        options_dict["workingdir"] = os.getcwd()

        if options.job_memory:
            job_memory = options.job_memory
        elif options.cluster_memory_default:
            job_memory = options.cluster_memory_default
        else:
            job_memory = "2G"

        jt = Cluster.setupDrmaaJobTemplate(session, options_dict, job_name,
                                           job_memory)

        jt.remoteCommand = job_path

        # update the environment
        e = {'BASH_ENV': options.bashrc}
        if environment:
            for en in environment:
                try:
                    e[en] = os.environ[en]
                except KeyError:
                    raise KeyError(
                        "could not export environment variable '%s'" % en)
        jt.jobEnvironment = e

        # SNS: Native specification setting abstracted
        # to Pipeline/Cluster.setupDrmaaJobTemplate()

        # use stdin for data
        if from_stdin:
            jt.inputPath = ":" + filename

        # set paths.

        # later: allow redirection of stdout and stderr to files
        # could this even be across hosts?
        if to_stdout:
            jt.outputPath = ":" + filename + ".out"
        else:
            jt.outputPath = ":" + filename + ".stdout"

        jt.errorPath = ":" + filename + ".err"

        jobid = session.runJob(jt)
        jobids.append((jobid, job_path, filename, cmd, logfile))

    E.debug("%i jobs have been submitted" % len(jobids))

    results = []

    for jobid, job_path, filename, cmd, logfile in jobids:

        try:
            retval = session.wait(jobid, drmaa.Session.TIMEOUT_WAIT_FOREVER)
        except Exception as msg:
            # ignore message 24 in PBS
            # code 24: drmaa: Job finished but resource usage information
            # and/or termination status could not be provided.":
            if not msg.message.startswith("code 24"):
                raise
            retval = None

        if retval and retval.exitStatus != 0:
            raise OSError("Child exited with non-zero status %i:\n%s\n" %
                          (retval.exitStatus, cmd))

        results.append((retval, filename, cmd, logfile, 1))

        os.unlink(job_path)

    session.deleteJobTemplate(jt)
    session.exit()

    return results
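# Small check of the log-file rewriting applied to each statement above: if
# the command already names a log file it is redirected to a chunk-specific
# one, otherwise output is captured in <chunk>.out.  The command string and
# chunk name below are invented.
import re

def rewrite_log(cmd, filename):
    x = re.search(r"'--log=(\S+)'", cmd) or re.search(r"'--L\s+(\S+)'", cmd)
    if x:
        logfile = filename + ".log"
        return cmd[:x.start()] + "--log=%s" % logfile + cmd[x.end():], logfile
    return cmd, filename + ".out"

print(rewrite_log("cgat gtf2gtf '--log=run.log' --method=sort", "chunk_0001"))
# ("cgat gtf2gtf --log=chunk_0001.log --method=sort", "chunk_0001.log")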
Example n. 16
def annotateGenome(iterator, fasta, options):
    """perform a full segmentation of the genome (UTR, exon, intron ...)
    """

    ninput, noutput, nadded, nambiguous, nframeshifts, nunknown = 0, 0, 0, 0, 0, 0
    last = None
    is_ambiguous = False

    for this in iterator:
        ninput += 1

        E.debug("last=%s" % str(last))
        E.debug("this=%s" % str(this))
        E.debug("is_ambiguous=%s" % str(is_ambiguous))

        if last and last.contig == this.contig:
            # check if file is sorted correctly
            assert last.start <= this.start, "input file needs to be sorted by contig, start"
            if last.end <= this.start:
                if not is_ambiguous:
                    if last.gene_id != this.gene_id:
                        nadded += addIntergenicSegment(last, this, fasta,
                                                       options)
                    else:
                        d = this.start - last.end
                        if d >= options.min_intron_length:
                            nadded += addSegment("intronic", last.end,
                                                 this.start, last, options)
                        elif d <= options.max_frameshift_length:
                            nframeshifts += addSegment("frameshift", last.end,
                                                       this.start, last,
                                                       options)
                        else:
                            nunknown += addSegment("unknown", last.end,
                                                   this.start, last, options)
                else:
                    if last.feature == this.feature and \
                       last.gene_id == this.gene_id:
                        nambiguous += addSegment(last.feature, last.end,
                                                 this.start, last, options)
                    else:
                        nambiguous += addSegment("ambiguous", last.end,
                                                 this.start, last, options)
                    is_ambiguous = False
                last = this
            elif last.end > this.start:
                if last.gene_id != this.gene_id:
                    # flag next region as ambiguous
                    is_ambiguous = True
                last.end = this.end
        else:
            nadded += addIntergenicSegment(last, None, fasta, options)
            nadded += addIntergenicSegment(None, this, fasta, options)
            last = this

        options.stdout.write("%s\n" % str(this))
        noutput += 1

    E.info(
        "ninput=%i, noutput=%i, nadded=%i, nambiguous=%i, nframeshifts=%i, nunknown=%i"
        % (ninput, noutput, nadded, nambiguous, nframeshifts, nunknown))
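# Stand-alone sketch of the gap classification used above: the distance
# between two consecutive features of the same gene decides whether the gap
# is called intronic, a frameshift, or unknown.  Thresholds correspond to the
# script options; the default values below are illustrative only.
def classify_gap(gap, min_intron_length=30, max_frameshift_length=4):
    if gap >= min_intron_length:
        return "intronic"
    elif gap <= max_frameshift_length:
        return "frameshift"
    return "unknown"

print([classify_gap(g) for g in (1, 10, 500)])
# ['frameshift', 'unknown', 'intronic']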
Example n. 17
def main(argv=None):

    parser = getOptionParser()

    (options, args) = E.Start(parser, add_cluster_options=True)

    if len(args) == 0:
        raise ValueError(
            "command line argument missing - see usage information")

    options.renumber_column = [x.split(":") for x in options.renumber_column]

    cmd = args[0]
    if len(args) > 1:
        cmd += " '" + "' '".join(args[1:]) + "'"

    if options.dry_run:

        cmd = re.sub("%DIR%", "", cmd)
        retcode = subprocess.call(cmd,
                                  shell=True,
                                  stdin=sys.stdin,
                                  stdout=sys.stdout,
                                  cwd=os.getcwd(),
                                  close_fds=True)
        E.Stop()
        sys.exit(0)

    failed_requests = []
    started_requests = []
    niterations = 0

    if not options.collect:
        tmpdir = os.path.abspath(tempfile.mkdtemp(dir=options.tmpdir))

        E.info(" working in directory %s" % tmpdir)

        if options.split_at_lines:
            chunk_iterator = chunk_iterator_lines
            args = (options.split_at_lines, )
        elif options.split_at_column:
            chunk_iterator = chunk_iterator_column
            args = (options.split_at_column - 1, options.max_files)
        elif options.split_at_regex:
            chunk_iterator = chunk_iterator_regex_split
            args = (re.compile(options.split_at_regex), 0, options.chunksize,
                    options.max_lines)
        elif options.group_by_regex:
            chunk_iterator = chunk_iterator_regex_group
            args = (re.compile(options.group_by_regex), 0, options.chunksize)
        else:
            raise ValueError("please specify a way to chunk input data")

        data = [(x, cmd, options, None, options.subdirs)
                for x in chunk_iterator(options.stdin,
                                        args,
                                        prefix=tmpdir,
                                        use_header=options.input_header)]

        started_requests = [(x[0], x[0] + ".out") for x in data]

        if len(data) == 0:
            E.warn("no data received")
            E.Stop()
            sys.exit(0)

        if options.method == "multiprocessing":
            pool = Pool(options.cluster_num_jobs)
            results = pool.map(runCommand, data, chunksize=1)
        elif options.method == "drmaa":
            results = runDRMAA(data, environment=options.environment)
        elif options.method == "threads":
            pool = ThreadPool(options.cluster_num_jobs)
            results = pool.map(runCommand, data, chunksize=1)

        niterations = 0
        for retcode, filename, cmd, logfile, iterations in results:
            niterations += iterations
            if not hasFinished(retcode, filename, options.output_tag, logfile):
                failed_requests.append((filename, cmd))

    else:
        tmpdir = options.collect
        started_requests = [(x[:-4], x) for x in glob.glob(tmpdir + "/*.out")]

        E.info("collecting %i files from %s" % (len(started_requests), tmpdir))

    if failed_requests:
        for fn, cmd in failed_requests:
            E.error("failed request: filename= %s, cmd= %s" % (fn, cmd))
    else:
        E.info("building result from %i parts" % len(started_requests))

        if options.renumber:
            mapper = MapperLocal(pattern=options.renumber)
        else:
            mapper = MapperEmpty()

        # deal with stdout
        name = None
        index = None

        for pattern, column in options.renumber_column:

            if re.search(pattern, "stdout"):
                try:
                    index = int(column) - 1
                except ValueError:
                    name = column
                break

        if options.binary:
            ResultBuilderBinary()(started_requests, options.stdout, options)
        else:
            regex = None
            if options.output_regex_header:
                regex = re.compile(options.output_regex_header)
            ResultBuilder(mapper=mapper,
                          field_index=index,
                          field_name=name,
                          header_regex=regex)(started_requests, options.stdout,
                                              options)

        # deal with logfiles : combine them into a single file
        rr = re.search("'--log=(\S+)'", cmd) or re.search("'--L\s+(\S+)'", cmd)
        if rr:
            E.info("logging output goes to %s" % rr.groups()[0])
            logfile = iotools.openFile(rr.groups()[0], "a")
            ResultBuilderLog()([(x[0], "%s.log" % x[0])
                                for x in started_requests], logfile, options)
            logfile.close()

        # deal with other files
        if options.subdirs:

            files = glob.glob("%s/*.dir/*" % tmpdir)
            # remove directory
            filenames = set([os.path.basename(x) for x in files])
            xx = len(".out")

            for filename in filenames:

                _, filetype = os.path.splitext(filename)

                name = None
                index = None

                for pattern, column in options.renumber_column:
                    if re.search(pattern, filename):
                        try:
                            index = int(column) - 1
                        except ValueError:
                            name = column
                        break

                if options.binary:
                    builder = ResultBuilderBinary(mapper=mapper)
                elif filetype in (".fa", ".fasta"):
                    builder = ResultBuilderFasta(mapper=mapper)
                elif filetype in (".mali", ):
                    builder = ResultBuilderFasta(mapper=MapperEmpty())
                elif filetype in (".psl"):
                    builder = ResultBuilderPSL(mapper=mapper)
                elif filetype in (".gtf", ".gff"):
                    builder = ResultBuilderGFF(mapper=mapper,
                                               field_index=index,
                                               field_name=name)
                elif filetype in (".png"):
                    builder = ResultBuilderCopies(mapper=mapper)
                else:
                    builder = ResultBuilder(mapper=mapper,
                                            field_index=index,
                                            field_name=name)

                E.debug("chose the following builder for %s: %s: %s" %
                        (filename, filetype, str(builder)))

                E.info("collecting results for %s" % filename)

                input_filenames = []
                for fi, fn in started_requests:
                    fn = fn[:-xx] + ".dir/" + filename
                    if os.path.exists(fn):
                        input_filenames.append((fi, fn))

                E.info("output of %i files goes to %s" %
                       (len(filenames), filename))

                outfile = iotools.openFile(options.output_pattern % filename,
                                           "w")
                builder(input_filenames, outfile, options)
                outfile.close()

    if not options.debug and (not options.resume or not options.collect):
        if len(failed_requests) == 0:
            E.info("removing directory %s" % tmpdir)
            shutil.rmtree(tmpdir)
        else:
            E.info("directory %s not removed due to %i failed jobs" %
                   (tmpdir, len(failed_requests)))

    E.info("job control: nstarted=%i, nfinished=%i, nerrors=%i, nrepeats=%i" %
           (len(started_requests), len(started_requests) -
            len(failed_requests), len(failed_requests), niterations))

    E.Stop()
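# Hypothetical minimal version of the line-based chunking that farm.py's
# chunk_iterator_lines performs: split an input stream into temporary files
# of at most `nlines` lines each and yield their names.  This is a sketch
# only; the real iterator additionally handles headers and output prefixes.
import os

def chunk_by_lines(infile, nlines, prefix="."):
    chunk, outfile, filename = 0, None, None
    for i, line in enumerate(infile):
        if i % nlines == 0:
            if outfile is not None:
                outfile.close()
                yield filename
            filename = os.path.join(prefix, "chunk_%06i.in" % chunk)
            outfile = open(filename, "w")
            chunk += 1
        outfile.write(line)
    if outfile is not None:
        outfile.close()
        yield filename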
Example n. 18
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=('join', ),
                      help="method to apply [default=%default].")

    parser.set_defaults(method="join", )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply at least two fastq files on the commandline")

    fn1, fn2 = args
    c = E.Counter()
    outfile = options.stdout

    if options.method == "join":
        # merge based on diagonals in a dotplot (the offset computation is
        # shown in isolation in the sketch after this example)
        iter1 = Fastq.iterate(iotools.open_file(fn1))
        iter2 = Fastq.iterate(iotools.open_file(fn2))
        tuple_size = 2
        for left, right in zip(iter1, iter2):
            c.input += 1

            # build dictionary of tuples
            s1, q1 = left.seq, left.quals
            d = collections.defaultdict(list)
            for x in range(len(s1) - tuple_size):
                d[s1[x:x + tuple_size]].append(x)

            s2, q2 = right.seq, right.quals
            s2 = Genomics.reverse_complement(s2)
            q2 = q2[::-1]

            # compute list of offsets/diagonals
            offsets = collections.defaultdict(int)
            for x in range(len(s2) - tuple_size):
                kmer = s2[x:x + tuple_size]
                for y in d[kmer]:
                    offsets[x - y] += 1

            # find maximum diagonal
            # rank diagonals by support; avoid shadowing the sorted() builtin
            ranked = sorted([(y, x) for x, y in list(offsets.items())])
            max_count, max_offset = ranked[-1]

            E.debug('%s: maximum offset at %i' % (left.identifier, max_offset))

            # simple merge sequence
            take = len(s2) - max_offset
            merged_seq = s1 + s2[take:]

            # simple merge quality scores
            merged_quals = q1 + q2[take:]

            new_entry = copy.copy(left)
            new_entry.seq = merged_seq
            new_entry.quals = merged_quals
            outfile.write(str(new_entry) + "\n")
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()
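A minimal, self-contained sketch of the k-mer offset ("diagonal in a dotplot") computation used by the join method above; best_offset is an illustrative helper, not part of the script.

import collections

def best_offset(s1, s2, k=2):
    # index every k-mer of s1 by its start position
    positions = collections.defaultdict(list)
    for x in range(len(s1) - k + 1):
        positions[s1[x:x + k]].append(x)

    # every k-mer shared between s1 and s2 votes for one offset (diagonal)
    offsets = collections.defaultdict(int)
    for x in range(len(s2) - k + 1):
        for y in positions[s2[x:x + k]]:
            offsets[x - y] += 1

    # the best alignment of the two reads is the offset with most votes
    # (raises ValueError if the reads share no k-mer at all)
    count, offset = max((count, offset) for offset, count in offsets.items())
    return offset, count

# best_offset("ACGTACGT", "ACGTACGTTT") returns (0, 7): the reads align
# without shift, supported by seven shared 2-mers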
Example n. 19
def loadTagDataPandas(tags_filename, design_filename):
    '''load tag data for deseq/edger analysis.

    *tags_filename* is a tab-separated file with counts.

    *design_filename* is a tab-separated file with the
    experimental design with four columns::

      track   include group   pair
      CW-CD14-R1      0       CD14    1
      CW-CD14-R2      0       CD14    1
      CW-CD14-R3      1       CD14    1
      CW-CD4-R1       1       CD4     1
      FM-CD14-R1      1       CD14    2
      FM-CD4-R2       0       CD4     2
      FM-CD4-R3       0       CD4     2
      FM-CD4-R4       0       CD4     2

    track
        name of track - should correspond to column header in *infile*
    include
        flag to indicate whether or not to include this data
    group
        group indicator - experimental group
    pair
        pair that sample belongs to (for paired tests)

    This method returns a tuple of two pandas data frames:

    counts_table : data frame with counts, restricted to the samples
        flagged with ``include`` in the design
    design_table : data frame with the design, restricted to the
        included samples

    '''
    counts_table = pd.read_table(tags_filename, sep="\t",
                                 index_col=0, comment="#")

    E.info("read data: %i observations for %i samples" % counts_table.shape)

    E.debug("sample names: %s" % list(counts_table.columns))

    inf = iotools.open_file(design_filename)
    design_table = pd.read_csv(inf, sep="\t", index_col=0)
    inf.close()

    E.debug("design names: %s" % list(design_table.index))

    missing = set(counts_table.columns).difference(design_table.index)

    if missing:
        E.warn("missing samples from design file are ignored: %s" % missing)

    # remove unnecessary samples
    design_table = design_table[design_table["include"] != 0]
    E.debug("included samples: %s" % list(design_table.index))

    counts_table = counts_table[list(design_table.index)]
    E.info("filtered data: %i observations for %i samples" %
           counts_table.shape)

    return counts_table, design_table
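A hedged usage sketch for the loader above; the file names are illustrative and the files are assumed to follow the layout described in the docstring.

# counts.tsv: first column is the feature id, remaining columns one per track
# design.tsv: columns track, include, group, pair as documented above
counts_table, design_table = loadTagDataPandas("counts.tsv", "design.tsv")

# samples flagged include=0 have been dropped from both tables
assert list(counts_table.columns) == list(design_table.index)

# group/pair vectors for downstream DESeq/edgeR wrappers
groups = design_table["group"].tolist()
pairs = design_table["pair"].tolist()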
Example n. 20
def main(argv):

    options = P.initialize(argv, config_file="benchmark.yml")

    # compatibility with cgatcore < 0.6.3
    if isinstance(options, tuple):
        options = options[0]

    # not sure what this does
    # if not options.config_file:
    #     P.get_parameters(options.config_file)
    # else:
    #     sys.exit(P.main(options, args))

    params = P.get_params()

    with arvados_enabled(always_mount=options.always_mount):
        mountpoint = params.get("mount_point", None)
        if mountpoint:
            redirect_defaults2mountpoint(mountpoint)

        # A selection of command line arguments is added to PARAMS as
        # 'extras', as this is not implemented in ruffus 2.6.3
        kwargs = collections.defaultdict(dict)
        if options.only_info:
            kwargs["extras"].update({'only_info': True})
            P.PARAMS["only_info"] = True
        if options.is_test:
            kwargs["extras"].update({'is_test': True})
            P.PARAMS["is_test"] = True

        E.debug("construction of workflow started")
        pipeline = ruffus.Pipeline('benchmark')
        # Tool execution
        suffix, tool_runners = add_tools_to_pipeline(pipeline,
                                                     map_tool_to_runner,
                                                     config=P.PARAMS,
                                                     **kwargs)

        E.debug("added {} tools to workflow".format(len(tool_runners)))
        # Optionally, add externally computed files as
        # pseudo-tools:
        if "external" in P.PARAMS["setup"]:
            external_runners = add_external_data_to_pipeline(pipeline,
                                                             config=P.PARAMS,
                                                             **kwargs)
            tool_runners.extend(external_runners)

        # Optionally, combine tool runs into aggregate
        # outputs. The type of the output is preserved
        # (VCF -> VCF, etc.)
        # For example, call individual members in a trio
        # and then build a combined VCF to analyse mendelian
        # inconsistencies.
        if "collate" in P.PARAMS["setup"]:
            collate_runners = add_collations_to_pipeline(
                pipeline,
                map_collate_to_runner,
                P.PARAMS["setup"]["collate"],
                tasks=tool_runners,
                config=P.PARAMS)
            if P.PARAMS["setup"].get("only_collate", False):
                tool_runners = []
            if P.PARAMS["setup"].get("no_collate_metrics", False):
                collate_runners = []
            E.debug("added {} collators to workflow".format(
                len(collate_runners)))
        else:
            collate_runners = []

        # Optionally, split up the output before applying
        # additional analyses. The type of the output is preserved
        # (VCF -> VCF, etc).
        # For example, identify false positives, false negatives
        # and true positives and collect metrics individually.
        if "split" in P.PARAMS["setup"]:
            split_runners = add_splits_to_pipeline(pipeline,
                                                   map_split_to_runner,
                                                   tool_runners,
                                                   P.PARAMS["setup"]["split"],
                                                   tasks=tool_runners,
                                                   config=P.PARAMS)
            if P.PARAMS["setup"].get("only_split", False):
                tool_runners = []
            E.debug("added {} splitters to workflow".format(
                len(split_runners)))
        else:
            split_runners = []

        metric_runners = []
        for prefix, r in zip(["tool", "collate", "split"],
                             [tool_runners, collate_runners, split_runners]):
            if not r:
                continue

            metrics = None

            if prefix == "collate" and "collate_metrics" in P.PARAMS["setup"]:
                metrics = P.PARAMS["setup"]["collate_metrics"]
            elif prefix == "split" and "split_metrics" in P.PARAMS["setup"]:
                metrics = P.PARAMS["setup"]["split_metrics"]
            elif "metrics" in P.PARAMS["setup"]:
                metrics = P.PARAMS["setup"]["metrics"]
            else:
                raise KeyError(
                    "configuration file requires a 'setup:metrics' section")

            # Metric execution
            mm = add_metrics_to_pipeline(pipeline,
                                         metrics,
                                         map_metric_to_runner,
                                         r,
                                         suffix=suffix,
                                         prefix=prefix + "_",
                                         config=P.PARAMS,
                                         **kwargs)

            if len(mm) == 0:
                raise ValueError(
                    "workflow construction error: "
                    "no metric tasks result for metrics {}".format(metrics))

            metric_runners.extend(mm)
            E.debug("added {} {}_metrics to workflow".format(len(mm), prefix))

        # add plot task
        if "aggregate" in P.PARAMS["setup"]:
            aggregate_metrics = add_collations_to_pipeline(
                pipeline,
                map_collate_to_runner,
                P.PARAMS["setup"]["aggregate"],
                metric_runners,
                config=P.PARAMS)

            E.debug("added metric aggregation to workflow")
        else:
            aggregate_metrics = []

        add_upload_to_pipeline(pipeline, metric_runners + aggregate_metrics,
                               P.PARAMS)
        E.debug("added upload to workflow".format(prefix))

        # add export task
        export = P.PARAMS["setup"].get("export", ["tools", "collate", "split"])
        map_export2runner = {
            "collate": collate_runners,
            "tools": tool_runners,
            "split": split_runners
        }

        export_runners = []
        for e in export:
            try:
                export_runners.extend(map_export2runner[e])
            except KeyError:
                raise KeyError("unknown export section: {}".format(e))

        add_export_to_pipeline(pipeline,
                               export_runners,
                               suffix=suffix,
                               config=P.PARAMS)

        E.debug("added export to workflow")

        add_all_task_to_pipeline(pipeline, metric_runners + aggregate_metrics)

        # Collate output files to facilitate analysis
        if "collation" in P.PARAMS:
            collators = add_collations_to_pipeline(pipeline,
                                                   map_collate_to_runner,
                                                   P.PARAMS["collation"],
                                                   config=P.PARAMS)

        E.debug("construction of workflow completed")

        E.debug("starting workflow")
        P.run_workflow(options, pipeline=pipeline)
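The workflow construction above is driven by the 'setup' section of benchmark.yml. The following is a hedged sketch of the configuration shape the code reads via P.get_params(), written as a Python dictionary; the concrete names and values are illustrative assumptions, not a documented schema.

PARAMS_SKETCH = {
    "setup": {
        "metrics": ["metric_a"],                  # required unless *_metrics are given
        "collate": {"combined": {}},              # optional: combine tool outputs
        "collate_metrics": ["metric_b"],          # optional: metrics on collated outputs
        "only_collate": False,
        "no_collate_metrics": False,
        "split": {"by_region": {}},               # optional: split outputs before metrics
        "split_metrics": ["metric_c"],
        "only_split": False,
        "aggregate": {"summary": {}},             # optional: aggregate metric tables
        "external": {"precomputed_calls": {}},    # optional: pseudo-tools from existing files
        "export": ["tools", "collate", "split"],  # which runner groups to export
    },
    "collation": {"all_metrics": {}},             # optional: collate output files for analysis
    "mount_point": None,                          # optional arvados mount point
}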
Example n. 21
def save_table(table: pandas.DataFrame,
               url: str,
               tablename: str,
               schema: str = None,
               dtypes=None,
               indices=["instance_id"]):
    logger = P.get_logger()
    table.columns = sql_sanitize_columns(table.columns)

    engine = create_engine(url)

    # pandas/sqlite3 prefers the raw connection, otherwise error:
    # AttributeError: 'Engine' object has no attribute 'rollback'
    if url.startswith("sqlite"):
        _engine = engine.raw_connection()
        # In pandas >= 0.23 and using sqlite as a backend, the
        # pandas.DataFrame.to_sql command fails with "OperationalError:
        # (sqlite3.OperationalError) too many SQL variables". The reason is a
        # fixed limit in sqlite, SQLITE_MAX_VARIABLE_NUMBER, which is by
        # default set to 999.
        sql_chunk_size = 999 // (len(table.columns) + 1)
    else:
        _engine = engine
        sql_chunk_size = None

    # lower case all table names. Otherwise issues with psql
    # mixed case access
    tablename = tablename.lower()
    create_index = False

    try:
        retry_table_to_sql(table,
                           tablename,
                           _engine,
                           schema=schema,
                           if_exists="fail",
                           index=False,
                           dtype=dtypes,
                           chunksize=sql_chunk_size)
        E.debug(f"table {tablename} was new")
        create_index = True
    except TableExistsException:
        E.debug(f"table {tablename} already exists - appending")

    if create_index:
        # sqlite requires an index name
        if schema:
            tablename = "{}.{}".format(schema, tablename)

        for field in indices:
            E.debug(f"creating index on {field} for {tablename}")
            try:
                retry_sql_execute(
                    _engine,
                    str(
                        text("CREATE INDEX {} ON {} ({})".format(
                            re.sub("[-.]", "_", tablename) + "_" + field,
                            tablename, field))))
            except IndexExistsException:
                pass
            except TypeError as ex:
                logger.warn("could not create index: {}".format(str(ex)))
            except sqlalchemy.exc.ProgrammingError as ex:
                logger.warn("could not create index: {}".format(str(ex)))
    else:
        reconcile_columns(tablename, engine, table)
        retry_table_to_sql(table,
                           tablename,
                           _engine,
                           schema=schema,
                           if_exists="append",
                           index=False,
                           dtype=dtypes,
                           chunksize=sql_chunk_size)
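The sqlite chunk size above works around SQLITE_MAX_VARIABLE_NUMBER (999 bound parameters per statement in older sqlite builds). Below is a hedged sketch of the same calculation in isolation; the helper name is illustrative.

def sqlite_chunk_size(n_columns, max_variables=999):
    # each row of a multi-row INSERT binds one variable per column; the +1
    # mirrors the calculation above and leaves headroom for one extra slot
    return max_variables // (n_columns + 1)

# a 20-column table is written 47 rows per statement: 47 * 21 = 987 <= 999
print(sqlite_chunk_size(20))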
Example n. 22
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--inplace",
                        dest="inplace",
                        action="store_true",
                        help="update option list in place. New options will"
                        "be added to the list given by --options-tsv-file. "
                        "Options will only be added, not removed ")

    parser.add_argument("--options-tsv-file",
                        dest="tsv_file",
                        type=str,
                        help="existing table with options. Will be updated if "
                        "--in-place is set [default]")

    parser.set_defaults(inplace=False, tsv_file=None)

    # add common options (-h/--help, ...) and parse command line
    args = E.start(parser, argv=argv)

    # start from an empty table so that the lookups further down are
    # well defined when no --options-tsv-file is given
    old_options = pandas.DataFrame()
    if args.tsv_file:
        if not os.path.exists(args.tsv_file):
            raise OSError("filename %s not found, see --options-tsv-file" %
                          args.tsv_file)
        old_options = pandas.read_csv(
            iotools.open_file(args.tsv_file),
            sep="\t",
            index_col=0,
        )
        old_options = old_options.fillna("")

    global ORIGINAL_START
    ORIGINAL_START = E.start

    all_options = collections.defaultdict(list)

    for label, expression in EXPRESSIONS:

        files = glob.glob(expression)
        files.sort()

        for f in files:

            E.debug("processing %s" % f)
            if os.path.isdir(f):
                continue
            if os.path.basename(f) in EXCLUDE:
                continue
            collected_options = collectOptionsFromScript(os.path.abspath(f))
            for o in collected_options:
                all_options[o].append(f)

    # add old options
    for x in old_options.index:
        if x not in all_options:
            all_options[x].append("--")

    if args.inplace:
        outfile = iotools.open_file(args.tsv_file, "w")
        E.info("updating file '%s'" % args.tsv_file)
    else:
        outfile = args.stdout

    outfile.write("option\taction\tcomment\talternative\tfiles\n")
    for o, v in sorted(all_options.items()):
        try:
            action, comment, alternative, ff = old_options.xs(o)

        except KeyError:
            action, comment, alternative, ff = "", "", "", ""

        if comment == "nan":
            comment = ""
        if alternative == "nan":
            alternative = ""

        outfile.write("\t".join(
            (list(map(str, (o, action, comment, alternative, ",".join(v)))))) +
                      "\n")

    if outfile != args.stdout:
        outfile.close()

    # write footer and output benchmark information.
    E.stop()
Example n. 23
    def __del__(self):
        E.debug(f"closing table cache {id(self)}")
        self.close()
Example n. 24
    def __enter__(self):
        table_cache = TableCache(self.database_url, self.schema)
        E.debug(f"{os.getpid()}: created resource={id(self)}: cache={id(table_cache)}")
        self.table_cache = table_cache
        return self
Example n. 25
    for glob_expression, template, dest in dirs:

        if not os.path.exists(dest):
            os.mkdir(dest)

        files = glob.glob(os.path.abspath(glob_expression))

        for filename in files:
            dirname, name = os.path.split(filename)
            prefix = name[:-3]

            # if os.path.exists( os.path.join( dirname, "_%s.pyx" % prefix )):
            #     E.warn( "ignoring pyximport file _%s.pyx" % prefix )
            #     continue

            filename = os.path.join(os.path.abspath(dest), "%s.rst" % prefix)
            if os.path.exists(filename):
                nskipped += 1
                continue

            E.debug("adding %s" % filename)
            outfile = open(filename, "w")
            outfile.write(template % locals())
            outfile.close()

            ncreated += 1

    E.info("ncreated=%i, nskipped=%i" % (ncreated, nskipped))

    E.Stop()
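The fragment above iterates over (glob_expression, template, dest) triples and writes one reST stub per matching module, skipping stubs that already exist. Below is a hedged sketch of the inputs it appears to expect; the glob, destination directory and template body are illustrative assumptions.

# the template is filled with locals() inside the loop, so %(prefix)s is
# replaced with the module name without its ".py" suffix
template = """%(prefix)s
=======================================================

.. automodule:: %(prefix)s
   :members:
"""

dirs = [
    ("scripts/*.py", template, "rst_scripts"),
]

# counters used in the loop above
ncreated, nskipped = 0, 0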