Example #1
def iterator_sorted(gff_iterator, sort_order="gene"):
    '''sort input and yield sorted output.'''
    entries = list(gff_iterator)
    if sort_order == "gene":
        entries.sort(key=lambda x: (x.gene_id, x.contig, x.start))
    elif sort_order == "gene+transcript":
        entries.sort(
            key=lambda x: (x.gene_id, x.transcript_id, x.contig, x.start))
    elif sort_order == "contig+gene":
        entries.sort(
            key=lambda x: (x.contig, x.gene_id, x.transcript_id, x.start))
    elif sort_order == "transcript":
        entries.sort(key=lambda x: (x.transcript_id, x.contig, x.start))
    elif sort_order == "position":
        entries.sort(key=lambda x: (x.contig, x.start))
    elif sort_order == "position+gene":
        entries.sort(key=lambda x: (x.gene_id, x.start))
        genes = list(flat_gene_iterator(entries))
        genes.sort(key=lambda x: (x[0].contig, x[0].start))
        entries = iotools.flatten(genes)
    elif sort_order == "gene+exon":
        entries.sort(key=lambda x: (x.gene_id, x.exon_number))

    for entry in entries:
        yield entry
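In the "position+gene" branch above, entries are first grouped into per-gene lists by flat_gene_iterator, the groups are sorted by the position of their first entry, and iotools.flatten turns the list of groups back into a single flat list of entries. A minimal sketch of that final step, assuming iotools.flatten performs a one-level flattening of nested lists (the string entries below are stand-ins for GTF records):

# hypothetical per-gene groups, already sorted by contig/start of their first entry
genes = [["g1_exon1", "g1_exon2"], ["g2_exon1"], ["g3_exon1", "g3_exon2"]]

# equivalent in effect to iotools.flatten(genes) for a plain list of lists
entries = [entry for gene in genes for entry in gene]
print(entries)  # ['g1_exon1', 'g1_exon2', 'g2_exon1', 'g3_exon1', 'g3_exon2']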
Example #2
def connect():
    # presumed enclosing helper; the original snippet starts mid-function
    dbh = sqlite3.connect(P.get_params()['database'])
    return dbh


@transform(P.get_params()["input_globs"].get("default", INPUT_FORMATS),
           regex("(.*)"), r"\1")
def unprocessReads(infiles, outfiles):
    """dummy task - no processing of reads."""


# if preprocess tools are specified, preprocessing is done on output that has
# already been generated in the first run
if P.get_params().get("preprocessors", None):
    if P.get_params()["auto_remove"]:
        # check if FastQC has been run
        for x in iotools.flatten([glob.glob(y) for y in
                                  P.get_params()["input_globs"].get("default", INPUT_FORMATS)]):
            f = "fastqc.dir/" + re.match(REGEX_TRACK, x).group(1) + ".fastqc"
            if not os.path.exists(f):
                raise ValueError(
                    "file %s missing, "
                    "you need to run the pipeline once before "
                    "specifying 'auto_remove'" % f)

        @follows(mkdir("fasta.dir"))
        @transform(unprocessReads,
                   regex(SEQUENCEFILES_REGEX),
                   r"fasta.dir/\1.fasta")
        def makeAdaptorFasta(infile, outfile):
            '''Make a single fasta file for each sample of all contaminant adaptor
            sequences for removal
            '''
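The auto_remove check above relies on iotools.flatten([glob.glob(y) for y in ...]) to collapse the per-pattern lists returned by glob.glob into a single flat list of input files. A minimal equivalent in plain Python, assuming flatten does one level of flattening (the glob patterns are placeholders):

import glob

patterns = ["*.fastq.gz", "*.fastq.1.gz"]        # hypothetical input globs
per_pattern = [glob.glob(p) for p in patterns]   # one list of matches per pattern
all_files = [f for matches in per_pattern for f in matches]  # flattened file list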
Example #3
def save_metric_data(meta_data, table_cache, schema, instance_id: int, session):

    logger = P.get_logger()
    metric_table_filter = None
    if "metric_no_upload" in meta_data:
        if meta_data["metric_no_upload"] == "*":
            logger.warn("upload turned off for metric {}".format(
                meta_data["metric_name"]))
            return
        else:
            metric_table_filter = re.compile(meta_data["metric_no_upload"])

    # multiple tablenames for multiple metric output
    #
    # Tables are added into schemas to avoid cluttering
    # the public namespace.
    # (if only blobs, no metric output file)
    if "metric_output_files" in meta_data:
        assert len(meta_data["metric_output_files"]) == \
            len(meta_data["metric_tablenames"])

        for output_file, tablename in zip(
                meta_data["metric_output_files"],
                meta_data["metric_tablenames"]):

            if metric_table_filter and metric_table_filter.search(tablename):
                logger.warn("upload for table {} turned off".format(
                    tablename))
                continue

            if not os.path.exists(output_file):
                logger.warning("output file {} does not exist - ignored".format(
                    output_file))
                continue

            if IOTools.is_empty(output_file):
                logger.warn("output file {} is empty - ignored".format(
                    output_file))
                continue

            # table = pandas.DataFrame({"values": [1, 2]})
            try:
                table = pandas.read_csv(output_file,
                                        sep="\t",
                                        comment="#",
                                        skip_blank_lines=True)
            # pandas.errors.ParserError (formerly pandas.parser.CParserError)
            # is a subclass of ValueError, so it has to be caught first.
            except pandas.errors.ParserError as e:
                logger.warning("malformatted table {} can not be read: {}".format(
                    output_file, str(e)))
                continue
            except ValueError as e:
                logger.warning("table {} can not be read: {}".format(
                    output_file, str(e)))
                continue

            if table.empty:
                logger.warn("table {} is empty - ignored".format(output_file))
                continue

            tablename, table, dtypes = transform_table_before_upload(tablename,
                                                                     table,
                                                                     instance_id,
                                                                     meta_data,
                                                                     table_cache)

            if schema is None:
                tn = tablename
            else:
                tn = "{}.{}".format(schema, tablename)

            # add foreign key
            table["instance_id"] = instance_id
            logger.debug(f"saving data {table.shape} from {output_file} to table {tn} under {instance_id}")
            table_cache.add_table(table, tablename, dtypes)

    if "metric_blob_globs" in meta_data:
        metric_dir = meta_data["metric_outdir"]
        files = [glob.glob(os.path.join(metric_dir, x))
                 for x in meta_data["metric_blob_globs"]]
        files = IOTools.flatten(files)
        logger.debug(
            "uploading binary data in {} files from {} to "
            "table binary_data".format(len(files), metric_dir))
        table = []
        for fn in files:
            with IOTools.open_file(fn, "rb", encoding=None) as inf:
                data_row = BenchmarkBinaryData(
                    instance_id=instance_id,
                    filename=os.path.basename(fn),
                    path=fn,
                    data=inf.read())
                session.add(data_row)
            session.commit()

    if meta_data.get("metric_tableindices", None):
        table_cache.add_indices(meta_data["metric_tableindices"])
Example #4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: diff_bed.py 2866 2010-03-03 10:18:49Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-u",
        "--update",
        dest="filename_update",
        type="string",
        help="if filename is given, previous results will be read from there "
        "and only changed sets will be computed [default=%default].")

    parser.add_option(
        "-p",
        "--pattern-identifier",
        dest="pattern_id",
        type="string",
        help="pattern to convert a filename to an id [default=%default].")

    parser.add_option(
        "-t",
        "--tracks",
        dest="tracks",
        action="store_true",
        help="compare files against all tracks in the first file "
        "[default=%default]")

    parser.set_defaults(
        filename_update=None,
        pattern_id="(.*).bed",
        tracks=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) < 2:
        raise ValueError("at least two arguments required")

    if options.filename_update:
        infile = iotools.open_file(options.filename_update, "r")
        previous_results = {}
        for line in infile:
            if line.startswith("#"):
                continue
            if line.startswith("set1"):
                continue
            data = line[:-1].split("\t")
            set1, set2 = data[0], data[1]

            if set1 not in previous_results:
                previous_results[set1] = {}
            if set2 not in previous_results:
                previous_results[set2] = {}

            previous_results[set1][set2] = "\t".join(data[2:])
            rev = [(data[x + 1], data[x]) for x in range(2, len(data), 2)]
            previous_results[set2][set1] = "\t".join(iotools.flatten(rev))
    else:
        previous_results = {}

    pattern_id = re.compile(options.pattern_id)

    def getTitle(x):
        try:
            return pattern_id.search(x).groups()[0]
        except AttributeError:
            return x

    ncomputed, nupdated = 0, 0

    if options.tracks:
        counter = CounterTracks(args[0])
        options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())
        for filename in args[1:]:
            title1 = getTitle(filename)
            for title2 in counter.getTracks():

                if previous_results:
                    try:
                        prev = previous_results[title1][title2]
                    except KeyError:
                        pass
                    else:
                        options.stdout.write("%s\t%s\t%s\n" %
                                             ((title1, title2, prev)))
                        nupdated += 1
                        continue

                counter.count(filename, title2)
                options.stdout.write("%s\t%s\t%s\n" %
                                     ((title1, title2, str(counter))))
                ncomputed += 1
    else:
        counter = Counter()
        options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())

        for x in range(len(args)):

            title1 = getTitle(args[x])

            for y in range(0, x):
                title2 = getTitle(args[y])
                if previous_results:
                    try:
                        prev = previous_results[title1][title2]
                    except KeyError:
                        pass
                    else:
                        options.stdout.write("%s\t%s\t%s\n" %
                                             ((title1, title2, prev)))
                        nupdated += 1
                        continue

                counter.count(args[x], args[y])
                options.stdout.write("%s\t%s\t%s\n" %
                                     ((title1, title2, str(counter))))
                ncomputed += 1

    E.info("nupdated=%i, ncomputed=%i" % (nupdated, ncomputed))
    E.stop()
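In the --update branch, each line of the previous results holds the counts for an ordered pair (set1, set2). The rev comprehension swaps adjacent fields so the same counts can be reused for the reversed pair, and iotools.flatten turns the list of swapped tuples back into a flat, tab-joinable list. A short sketch with made-up values:

# hypothetical previous-results row: set1, set2, then paired per-set counts
data = ["setA", "setB", "10", "3", "7", "2"]

rev = [(data[x + 1], data[x]) for x in range(2, len(data), 2)]
# rev == [('3', '10'), ('2', '7')]

swapped = [field for pair in rev for field in pair]
print("\t".join(swapped))  # -> "3\t10\t2\t7"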
Example #5
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-s",
                        "--ignore-strand",
                        dest="ignore_strand",
                        action="store_true",
                        help="ignore strand information.")

    parser.add_argument(
        "-u",
        "--update",
        dest="filename_update",
        type=str,
        help="if filename is given, previous results will be read"
        "from there and only changed sets will be computed ")

    parser.add_argument("-p",
                        "--pattern-identifier",
                        dest="pattern_id",
                        type=str,
                        help="pattern to convert a filename to an id")

    parser.add_argument("-g",
                        "--output-only-genes",
                        dest="output_only_genes",
                        action="store_true",
                        help="only output gene stats (includes gene lists)")

    parser.set_defaults(
        ignore_strand=False,
        filename_update=None,
        pattern_id="(.*).gtf",
        output_only_genes=False,
    )

    (args, unknown) = E.start(parser, unknowns=True)

    if len(unknown) < 2:
        print(USAGE)
        raise ValueError("at least two arguments required")

    if args.filename_update:
        infile = iotools.open_file(args.filename_update, "r")
        previous_results = {}
        for line in infile:
            if line.startswith("#"):
                continue
            if line.startswith("set1"):
                continue
            data = line[:-1].split("\t")
            set1, set2 = data[0], data[1]

            if set1 not in previous_results:
                previous_results[set1] = {}
            if set2 not in previous_results:
                previous_results[set2] = {}

            previous_results[set1][set2] = "\t".join(data[2:])
            rev = [(data[x + 1], data[x]) for x in range(2, len(data), 2)]
            previous_results[set2][set1] = "\t".join(iotools.flatten(rev))
    else:
        previous_results = {}

    if args.output_only_genes:
        counter = CounterGenes()
    else:
        counter = Counter()

    args.stdout.write("set1\tset2\t%s\n" % counter.getHeader())

    pattern_id = re.compile(args.pattern_id)

    def getTitle(x):
        try:
            return pattern_id.search(x).groups()[0]
        except AttributeError:
            return x

    ncomputed, nupdated = 0, 0
    for x in range(len(unknown)):
        title1 = getTitle(unknown[x])
        for y in range(0, x):
            title2 = getTitle(unknown[y])
            if previous_results:
                try:
                    prev = previous_results[title1][title2]
                except KeyError:
                    pass
                else:
                    args.stdout.write("%s\t%s\t%s\n" %
                                      ((title1, title2, prev)))
                    nupdated += 1
                    continue

            counter.count(unknown[x], unknown[y])
            args.stdout.write("%s\t%s\t%s\n" %
                              ((title1, title2, str(counter))))
            ncomputed += 1

    E.info("nupdated=%i, ncomputed=%i" % (nupdated, ncomputed))

    E.stop()
Example #6
def loadNormalisedExpression(infiles, outfiles):
    for infile in iotools.flatten(infiles):
        outfile = P.snip(infile, ".tsv.gz") + ".load"
        P.load(infile, outfile)
Example #7
def loadDifferentialExpression(infiles, outfiles):
    for infile in iotools.flatten(infiles):
        outfile = P.snip(infile, ".tsv") + ".load"
        P.load(infile, outfile)
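Both load tasks follow the same pattern: flatten a possibly nested list of input files, derive the load target by stripping the table suffix, and hand each file to the pipeline loader. A minimal sketch of the filename handling only, assuming P.snip simply removes a trailing suffix (the snip below is a hypothetical stand-in, not the cgatcore implementation):

def snip(filename, suffix):
    # assumed behaviour of P.snip: drop a trailing suffix if present
    return filename[:-len(suffix)] if filename.endswith(suffix) else filename

infile = "expression_normalised.tsv.gz"   # hypothetical input
outfile = snip(infile, ".tsv.gz") + ".load"
print(outfile)  # expression_normalised.load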