Code example #1
    def run(self, infile, outfile, params):
        """Identify the reference sequence a BAM file was mapped against.

        Writes a one-row tab-separated table (filename, reference, path)
        to *outfile*. The reference column is "corrupted" when the BAM
        header could not be read and "unknown" when no candidate fasta
        matched.
        """
        if params.reference_fasta_map is None:
            raise ValueError("bam2reference requires a reference sequence map")

        reference_fasta_map = build_reference_fasta_map(
            params.reference_fasta_map)

        fasta_files = resolve_argument(
            list(reference_fasta_map.values()), ",").split(",")

        reference, differences = get_reference_for_bam(infile, fasta_files)
        if reference is None:
            # no match found: distinguish an unreadable BAM from a BAM whose
            # contigs simply do not match any candidate fasta
            if differences is None:
                reference = "corrupted"
            else:
                reference = "unknown"
                E.debug("differences: {}".format(str(differences)))
            path = ""
        else:
            # invert the name -> path map so the short name can be reported
            path2name = {p: n for n, p in reference_fasta_map.items()}
            path = path2name.get(reference, os.path.basename(reference))

        with IOTools.open_file(outfile, "w") as outf:
            outf.write("filename\treference\tpath\n")
            outf.write("\t".join((infile, reference, path)) + "\n")

        return None
Code example #2
    def run(self, infiles, outfile, params):
        """Concatenate FASTQ files and filter them with ``daisy fastq2fastq``.

        Reads passing the ONT quality/length filter go to *outfile*
        (``*-pass.fastq.gz``); removed reads go to a parallel
        ``*-fail.fastq.gz``. Input files may first be dropped by minimum
        size (``params.min_size_bytes``) and by modification time
        (``params.newer_than``).
        """
        if not outfile.endswith("-pass.fastq.gz"):
            raise ValueError(
                "outfile must end in -pass.fastq.gz, got {}".format(outfile))

        if params.min_size_bytes:
            before = len(infiles)
            min_bytes = params.min_size_bytes
            infiles = [fn for fn in infiles
                       if os.path.getsize(fn) >= min_bytes]
            E.debug(
                "removing small files: after={}, before={}, removed={}".format(
                    len(infiles), before, before - len(infiles)))

        if params.newer_than:
            before = len(infiles)
            # keep only files modified after the reference file
            cutoff = os.path.getmtime(params.newer_than)
            infiles = [fn for fn in infiles
                       if os.path.getmtime(fn) > cutoff]
            E.debug(
                "removing old files: after={}, before={}, removed={}".format(
                    len(infiles), before, before - len(infiles)))

        if not infiles:
            E.warn("no files left after filtering, creating empty file")
            IOTools.touch_file(outfile)
            return

        infiles = " ".join(infiles)
        outfile_fail = IOTools.snip(
            outfile, "-pass.fastq.gz") + "-fail.fastq.gz"

        statement = ("zcat {infiles} "
                     "| daisy fastq2fastq "
                     "--method=filter-ONT "
                     "--min-average-quality={params.min_average_quality} "
                     "--log={outfile}.log "
                     "--min-length={params.min_length} "
                     "--output-removed-fastq={outfile_fail} "
                     "- "
                     "| gzip "
                     "> {outfile}".format(**locals()))
        return P.run(statement)
Code example #3
def main(argv=None):
    """Report the version of every registered tool, metric, split and
    collate runner.

    Writes a tab-separated table (section, task, version, comments) to
    stdout; runners whose version cannot be determined are marked
    "unavailable".
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    total_counter = E.Counter()
    table = []

    for section, map_task2runner in [("tool", map_tool_to_runner),
                                     ("metric", map_metric_to_runner),
                                     ("split", map_split_to_runner),
                                     ("collate", map_collate_to_runner)]:
        E.debug("processing section: {}".format(section))
        counter = E.Counter()

        for task, taskf in sorted(map_task2runner.items()):
            counter.ntasks += 1
            comments = []
            try:
                version = taskf().get_version()
                counter.version_ok += 1
            except Exception:
                # runner could not report a version; record it and move on
                version = ""
                comments.append("unavailable")
                counter.version_fail += 1

            comments = "; ".join(comments)
            table.append((section, task, version, comments))

        E.info("{}: {}".format(section, counter))
        total_counter += counter

    options.stdout.write("section\ttask\tversion\tcomments\n")
    for row in table:
        options.stdout.write("\t".join(map(str, row)) + "\n")

    # BUG FIX: previously logged the last section's counter here instead
    # of the accumulated total.
    E.info("{}: {}".format("total", total_counter))
    E.stop()
Code example #4
def match_sequence_dictionaries(sequence_dict, fastafiles):
    """match a sequence dictionary (contig, length) against
    a collection of fasta files.

    :param sequence_dict: dictionary of contig/length pairs. A length
       of 0 or less disables the length check for that contig.
    :param fastafiles: list of :term:`fasta` formatted files. The
       fasta files need to indexed with samtools faidx.
    :return: a tuple (fastafn, diffs). Fastafn is the filename of
       the fasta file that has been matched, None if no match
       has been found. Diffs contains a list of
       (filename, missing_contig, length_mismatch_contig) tuples for
       files in fastafiles that have been examined and rejected. If
       sequence_dict is empty, fastafn will be None and the list of
       diffs empty.
    """
    if not sequence_dict:
        return None, []

    matched_fn = None
    diffs = []
    # match by sequence dictionary with optional length
    for fastafn, fastadict in sequence_length_dicts_iterator(fastafiles):
        E.debug("inspected {}".format(fastafn))
        contig_missing = None
        length_mismatch = None
        for reference, length in sequence_dict.items():
            if reference not in fastadict:
                contig_missing = reference
                break
            if length > 0 and length != fastadict[reference]:
                length_mismatch = reference
                break

        if not (length_mismatch or contig_missing):
            # BUG FIX: record the match explicitly. Previously the loop
            # variable `fastafn` was returned directly, so when every
            # candidate mismatched the last filename examined was
            # returned as if it had matched.
            matched_fn = fastafn
            break
        else:
            diffs.append((fastafn, contig_missing, length_mismatch))

    return matched_fn, diffs
Code example #5
def main(argv):
    """Construct and run the daisy benchmark workflow.

    Builds a ruffus pipeline from the configuration: tool tasks, then
    optional collate/split stages, metrics per stage, aggregation,
    upload and export tasks. Exits early (via P.main) when no config
    file is given on the command line.
    """
    options, args = P.parse_commandline(argv)

    if options.config_file:
        PARAMS = P.get_parameters(options.config_file)
    else:
        sys.exit(P.main(options, args))

    with arvados_enabled(always_mount=options.always_mount):
        mountpoint = PARAMS.get("mount_point", None)
        if mountpoint:
            redirect_defaults2mountpoint(mountpoint)

        with LibraryContext(PARAMS, options, args, argv, "daisy"):
            # A selection of command line arguments are added to PARAMS
            # as 'extras' not implemented in ruffus 2.6.3
            kwargs = collections.defaultdict(dict)
            if options.only_info:
                kwargs["extras"].update({'only_info': True})
                P.PARAMS["only_info"] = True
            if options.is_test:
                kwargs["extras"].update({'is_test': True})
                P.PARAMS["is_test"] = True

            E.debug("construction of workflow started")
            pipeline = ruffus.Pipeline('benchmark')
            # Tool execution
            suffix, tool_runners = add_tools_to_pipeline(pipeline,
                                                         map_tool_to_runner,
                                                         config=P.PARAMS,
                                                         **kwargs)

            E.debug("added tools to workflow ")
            # Optionally, add externally computed files as
            # pseudo-tools:
            if "external" in P.PARAMS["setup"]:
                external_runners = add_external_data_to_pipeline(
                    pipeline, config=P.PARAMS, **kwargs)
                tool_runners.extend(external_runners)

            # Optionally, combine tool runs into aggregate
            # outputs. The type of the output is preserved
            # (VCF -> VCF, etc.)
            # For example, call individual members in a trio
            # and then build a combined VCF to analyse mendelian
            # inconsistencies.
            if "collate" in P.PARAMS["setup"]:
                collate_runners = add_collations_to_pipeline(
                    pipeline,
                    map_collate_to_runner,
                    P.PARAMS["setup"]["collate"],
                    tasks=tool_runners,
                    config=P.PARAMS)
                if P.PARAMS["setup"].get("only_collate", False):
                    tool_runners = []
                if P.PARAMS["setup"].get("no_collate_metrics", False):
                    collate_runners = []
                E.debug("added collators to workflow ")
            else:
                collate_runners = []

            # Optionally, split up the output before applying
            # additional analyses. The type of the output is preserved
            # (VCF -> VCF, etc).
            # For example, identify false positives, false negatives
            # and true positives and collect metrics individually.
            if "split" in P.PARAMS["setup"]:
                split_runners = add_splits_to_pipeline(
                    pipeline,
                    map_split_to_runner,
                    tool_runners,
                    P.PARAMS["setup"]["split"],
                    tasks=tool_runners,
                    config=P.PARAMS)
                if P.PARAMS["setup"].get("only_split", False):
                    tool_runners = []
                E.debug("added splitters to workflow ")
            else:
                split_runners = []

            metric_runners = []
            for prefix, r in zip(
                ["tool", "collate", "split"],
                [tool_runners, collate_runners, split_runners]):
                if not r:
                    continue

                metrics = None

                # per-stage metric lists take precedence over the
                # generic 'metrics' section
                if prefix == "collate" and "collate_metrics" in P.PARAMS[
                        "setup"]:
                    metrics = P.PARAMS["setup"]["collate_metrics"]
                elif prefix == "split" and "split_metrics" in P.PARAMS["setup"]:
                    metrics = P.PARAMS["setup"]["split_metrics"]
                elif "metrics" in P.PARAMS["setup"]:
                    metrics = P.PARAMS["setup"]["metrics"]
                else:
                    raise KeyError(
                        "configuration file requires a 'setup:metrics' section"
                    )

                # Metric execution
                mm = add_metrics_to_pipeline(pipeline,
                                             metrics,
                                             map_metric_to_runner,
                                             r,
                                             suffix=suffix,
                                             prefix=prefix + "_",
                                             config=P.PARAMS,
                                             **kwargs)

                if len(mm) == 0:
                    raise ValueError(
                        "workflow construction error: "
                        "no metric tasks result for metrics {}".format(
                            metrics))

                metric_runners.extend(mm)
                E.debug("added {}_metrics to workflow".format(prefix))

            # add plot task
            if "aggregate" in P.PARAMS["setup"]:
                aggregate_metrics = add_collations_to_pipeline(
                    pipeline,
                    map_collate_to_runner,
                    P.PARAMS["setup"]["aggregate"],
                    metric_runners,
                    config=P.PARAMS)

                E.debug("added metric aggregation to workflow")
            else:
                aggregate_metrics = []

            add_upload_to_pipeline(pipeline,
                                   metric_runners + aggregate_metrics,
                                   P.PARAMS)
            # BUG FIX: removed a stray .format(prefix) on a message with
            # no placeholder (it relied on a leaked loop variable and
            # had no effect on the output).
            E.debug("added upload to workflow")

            # add export task
            export = P.PARAMS["setup"].get("export",
                                           ["tools", "collate", "split"])
            map_export2runner = {
                "collate": collate_runners,
                "tools": tool_runners,
                "split": split_runners
            }

            export_runners = []
            for e in export:
                try:
                    export_runners.extend(map_export2runner[e])
                except KeyError:
                    raise KeyError("unknown export section: {}".format(e))

            add_export_to_pipeline(pipeline,
                                   export_runners,
                                   suffix=suffix,
                                   config=P.PARAMS)

            E.debug("added export to workflow")

            add_all_task_to_pipeline(pipeline,
                                     metric_runners + aggregate_metrics)

            # Collate output files to facilitate analysis
            if "collation" in P.PARAMS:
                collators = add_collations_to_pipeline(pipeline,
                                                       map_collate_to_runner,
                                                       P.PARAMS["collation"],
                                                       config=P.PARAMS)

            E.debug("construction of workflow completed")

    E.stop()
Code example #6
def purge_run_id(run_id, url, dry_run=False, schemas=None):
    """remove a run from a database.

    :param run_id: id of the run to delete.
    :param url: sqlalchemy database URL.
    :param dry_run: if True, log the tables that would be touched but
        do not execute any DELETE statements.
    :param schemas: list of schemas to scan for tables with an
        ``instance_id`` column. If None, schemas are auto-detected
        (excluding 'public' and 'information_schema').
    """
    engine = sqlalchemy.create_engine(url)
    connection = engine.connect()
    # BUG FIX: the connection was previously never closed; ensure it is
    # released even when a delete fails.
    try:
        # automap
        metadata = sqlalchemy.MetaData()
        metadata.reflect(engine)
        base = automap_base(metadata=metadata)
        base.prepare()

        if schemas is None:
            insp = reflection.Inspector.from_engine(engine)
            schemas = insp.get_schema_names()
            # note: default sqlite schema is "main"
            if 'public' in schemas:
                schemas.remove('public')
            if 'information_schema' in schemas:
                schemas.remove('information_schema')

        E.debug("getting instance_id list of run_id={}".format(run_id))
        instance_ids = set(get_instance_ids_for_run_id(run_id, engine))
        E.debug("found {} instances for run_id={}".format(len(instance_ids),
                                                          run_id))
        non_metric_tables = [
            'run', 'arvados_job', 'instance', 'binary_data', 'metric_timings',
            'tool_timings', 'metric_storage', 'tags'
        ]

        # delete from tables with field "instance_id"
        if instance_ids:
            for schema in schemas:
                # automap the schema
                metadata_schema = sqlalchemy.MetaData()
                metadata_schema.reflect(engine, schema=schema)
                base_schema = automap_base(metadata=metadata_schema)
                base_schema.prepare()
                for table_name in list(base_schema.metadata.tables.keys()):
                    table = sqlalchemy.Table(table_name,
                                             metadata_schema,
                                             autoload=True)
                    if "instance_id" not in table.c:
                        continue
                    E.info("deleting data in {}".format(table_name))
                    delete = table.delete().where(
                        table.c.instance_id.in_(instance_ids))
                    # E.debug(delete)
                    if not dry_run:
                        connection.execute(delete)

        # delete from tables with field "run_id"
        for table_name in base.metadata.tables.keys():
            table = sqlalchemy.Table(table_name, metadata, autoload=True)
            if "run_id" not in table.c:
                continue
            E.info("deleting data in {} for run_id {}".format(table_name,
                                                              run_id))
            delete = table.delete().where(table.c.run_id == run_id)
            # E.debug(delete)
            if not dry_run:
                connection.execute(delete)

        # finally remove the run record itself
        table = sqlalchemy.Table('run', metadata, autoload=True)
        delete = table.delete().where(table.c.id == run_id)
        E.info("deleting data in 'run' for id {}".format(run_id))
        # E.debug(delete)
        if not dry_run:
            connection.execute(delete)
    finally:
        connection.close()