Example 1
def run_consolidate(dataset_file,
                    output_file,
                    datastore_file,
                    consolidate,
                    n_files,
                    task_id=Constants.TOOL_ID):
    datastore_files = []
    with openDataSet(dataset_file) as ds_in:
        if consolidate:
            if len(ds_in.toExternalFiles()) != 1:
                new_resource_file = op.splitext(output_file)[0] + ".bam"
                ds_in.consolidate(new_resource_file, numFiles=n_files)
            # always display the BAM/BAI if consolidation is enabled
            # XXX there is no uniqueness constraint on the sourceId, but this
            # seems sloppy nonetheless - unfortunately I don't know how else to
            # make view rule whitelisting work
            for ext_res in ds_in.externalResources:
                if ext_res.resourceId.endswith(".bam"):
                    ds_file = DataStoreFile(ext_res.uniqueId,
                                            task_id + "-out-2",
                                            ext_res.metaType, ext_res.bam)
                    datastore_files.append(ds_file)
                    for index in ext_res.indices:
                        if index.metaType in Constants.BAI_FILE_TYPES:
                            ds_file = DataStoreFile(index.uniqueId,
                                                    task_id + "-out-3",
                                                    index.metaType,
                                                    index.resourceId)
                            datastore_files.append(ds_file)
        ds_in.newUuid()
        ds_in.write(output_file)
    datastore = DataStore(datastore_files)
    datastore.write_json(datastore_file)
    return 0
Example 2
def run_consolidate(dataset_file,
                    output_file,
                    datastore_file,
                    consolidate,
                    n_files,
                    consolidate_f=lambda ds: ds.consolidate):
    # XXX https://github.com/pysam-developers/pysam/issues/939
    pysam.set_verbosity(0)  # pylint: disable=no-member
    datastore_files = []
    with openDataSet(dataset_file) as ds_in:
        if consolidate:
            if len(ds_in.toExternalFiles()) <= 0:
                raise ValueError(
                    "DataSet {} must contain one or more files!".format(
                        dataset_file))
            new_resource_file = bam_of_dataset(output_file)
            consolidate_f(ds_in)(new_resource_file,
                                 numFiles=n_files,
                                 useTmp=False)
            # always display the BAM/BAI if consolidation is enabled
            # XXX there is no uniqueness constraint on the sourceId, but this
            # seems sloppy nonetheless - unfortunately I don't know how else to
            # make view rule whitelisting work
            reads_name = get_reads_name(ds_in)
            for ext_res in ds_in.externalResources:
                if ext_res.resourceId.endswith(".bam"):
                    ds_file = DataStoreFile(ext_res.uniqueId,
                                            Constants.TOOL_ID + "-out-2",
                                            ext_res.metaType,
                                            ext_res.bam,
                                            name=reads_name,
                                            description=reads_name)
                    datastore_files.append(ds_file)
                    # Prevent duplicate index files from being added to the
                    # datastore, since the consolidated dataset may contain
                    # multiple indices pointing to the same physical file
                    added_resources = set()
                    for index in ext_res.indices:
                        if (index.metaType in Constants.BAI_FILE_TYPES
                                and index.resourceId not in added_resources):
                            added_resources.add(index.resourceId)
                            ds_file = DataStoreFile(
                                index.uniqueId,
                                Constants.TOOL_ID + "-out-3",
                                index.metaType,
                                index.resourceId,
                                name="Index of {}".format(reads_name.lower()),
                                description="Index of {}".format(
                                    reads_name.lower()))
                            datastore_files.append(ds_file)
        ds_in.newUuid()
        ds_in.write(output_file)
    datastore = DataStore(datastore_files)
    datastore.write_json(datastore_file)
    return 0
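A minimal invocation sketch for the variant above, using the default consolidate_f; the file names and the n_files value are assumptions:

# Hypothetical call: consolidate a dataset's BAM resources and emit the
# updated dataset XML plus a datastore JSON describing the BAM/BAI files.
run_consolidate(dataset_file="mapped.alignmentset.xml",   # assumed input path
                output_file="consolidated.alignmentset.xml",
                datastore_file="consolidated.datastore.json",
                consolidate=True,
                n_files=1)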
Example 3
    def test_datastore_file(self):
        tmpfile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        ds = DataStoreFile(str(uuid.uuid4()), "pbcommand.tasks.dev_task",
                           FileTypes.DS_SUBREADS.file_type_id, tmpfile, False,
                           "Subreads", "Subread DataSet XML")
        log.info("DataStoreFile: {s}".format(s=ds))
        ds2 = DataStoreFile.from_dict(ds.to_dict())
        for attr in [
                "uuid", "file_type_id", "file_id", "path", "is_chunked",
                "name", "description"
        ]:
            self.assertEqual(getattr(ds2, attr), getattr(ds, attr))
Example 4
def to_reports(subreads, output_dir):
    output_files = []
    log.info("Loading {f}".format(f=subreads))
    ds = SubreadSet(subreads)
    ds.loadStats()
    for base, module in [("filter_stats_xml", filter_stats_xml),
                         ("adapter_xml", adapter_xml),
                         ("loading_xml", loading_xml),
                         ("control", control)]:
        constants = getattr(module, "Constants")
        task_id = constants.TOOL_ID
        to_report = getattr(module, "to_report_impl")
        try:
            rpt_output_dir = os.path.join(output_dir, base)
            os.mkdir(rpt_output_dir)
            file_name = os.path.join(rpt_output_dir, "{b}.json".format(b=base))
            report = to_report(ds, rpt_output_dir)
            log.info("Writing {f}".format(f=file_name))
            report.write_json(file_name)
            output_files.append(DataStoreFile(
                uuid=report.uuid,
                source_id=task_id,
                type_id=FileTypes.REPORT.file_type_id,
                path=file_name,
                is_chunked=False,
                name=base))
        except InvalidStatsError as e:
            log.error("This dataset lacks some required statistics")
            log.error("Skipping generation of {b} report".format(b=base))
    datastore = DataStore(output_files)
    return datastore
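A minimal usage sketch, assuming a SubreadSet XML with its sts.xml statistics available and an existing output directory; the paths are illustrative only:

# Hypothetical call: build the per-module reports under "reports/" and
# persist the resulting DataStore as JSON.
datastore = to_reports("movie.subreadset.xml", "reports")
datastore.write_json("reports/datastore.json")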
Example 5
def to_datastore_file(file_name, file_id, file_type, label):
    return DataStoreFile(uuid.uuid4(),
                         file_id,
                         file_type.file_type_id,
                         op.abspath(file_name),
                         name=label,
                         description=label)
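For illustration, wrapping an existing report JSON might look like the following; the path, source id, and label are assumptions, while FileTypes.REPORT is the same constant used in the other examples:

# Hypothetical call: describe a report JSON as a DataStoreFile entry.
report_file = to_datastore_file("mapping_stats.report.json",
                                "pbreports.tasks.mapping_stats-out-0",  # assumed source id
                                FileTypes.REPORT,
                                "Mapping Statistics")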
Example 6
def _to_datastore_file(file_name, file_id, file_type, description):
    return DataStoreFile(uuid.uuid4(),
                         file_id,
                         file_type.file_type_id,
                         op.abspath(file_name),
                         name=op.basename(file_name),
                         description=description)
Example 7
def __create_zipped_fastx(file_type_id, source_id, ds_files, output_file):
    fastx_files = [f.path for f in ds_files if f.file_type_id == file_type_id]
    with tarfile.open(output_file, mode="w:gz") as tgz_out:

        def _write_fastx(fh):
            # strip a trailing .gz extension from the archive member name
            arcname = re.sub(r"\.gz$", "", op.basename(fh.name))
            fastx_in_info = tgz_out.gettarinfo(fileobj=fh, arcname=arcname)
            # XXX This is very slow but necessary: the tar header needs the
            # uncompressed size, so seek to the end of the decompressed stream
            if fh.name.endswith(".gz"):
                fastx_in_info.size = fh.seek(0, io.SEEK_END)
                fh.seek(0)
            tgz_out.addfile(fastx_in_info, fileobj=fh)

        for file_name in fastx_files:
            if file_name.endswith(".zip"):
                with ZipFile(file_name, "r") as zip_in:
                    for fn in zip_in.namelist():
                        with zip_in.open(fn, mode="r") as fastx_in:
                            _write_fastx(fastx_in)
            elif file_name.endswith(".gz"):
                with gzip.open(file_name, "r") as fastx_in:
                    _write_fastx(fastx_in)
            else:
                # binary mode so tarfile.addfile() receives bytes
                with open(file_name, "rb") as fastx_in:
                    _write_fastx(fastx_in)

    file_type_label = file_type_id.split(".")[-1].upper()
    return DataStoreFile(uuid.uuid4(),
                         source_id,
                         FileTypes.TGZ.file_type_id,
                         op.abspath(output_file),
                         name="All Barcodes ({l})".format(l=file_type_label))
Example 8
    def _update_analysis_reports_and_datastore(tnode_, task_):
        assert (len(tnode_.meta_task.output_file_display_names) ==
                len(tnode_.meta_task.output_file_descriptions) ==
                len(tnode_.meta_task.output_types) == len(task_.output_files))
        for i_file, (file_type_, path_, name, description) in enumerate(zip(
                tnode_.meta_task.output_types, task_.output_files,
                tnode_.meta_task.output_file_display_names,
                tnode_.meta_task.output_file_descriptions)):
            source_id = "{t}-out-{i}".format(t=task_.task_id, i=i_file)
            if tnode_.meta_task.datastore_source_id is not None:
                source_id = tnode_.meta_task.datastore_source_id
            ds_uuid = _get_or_create_uuid_from_file(path_, file_type_)
            is_chunked_ = _is_chunked_task_node_type(tnode_)
            ds_file_ = DataStoreFile(ds_uuid, source_id, file_type_.file_type_id, path_, is_chunked=is_chunked_, name=name, description=description)
            ds.add(ds_file_)
            ds.write_update_json(job_resources.datastore_json)

            # Update Services
            services_add_datastore_file(ds_file_)

            dsr = DU.datastore_to_report(ds)
            R.write_report_to_html(dsr, os.path.join(job_resources.html, 'datastore.html'))
            if file_type_ == FileTypes.REPORT:
                T.write_task_report(job_resources, task_.task_id, path_, DU._get_images_in_dir(task_.output_dir))
                update_analysis_file_links(tnode_.idx, path_)
Example 9
def _to_ds_file(d):
    # is_chunked isn't exposed at the service level
    return DataStoreFile(d['uuid'],
                         d['sourceId'],
                         d['fileTypeId'],
                         d['path'],
                         is_chunked=False,
                         name=d.get("name", ""),
                         description=d.get("description", ""))
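As a sketch, the dictionary this helper expects mirrors the datastore JSON fields used throughout these examples (uuid, sourceId, fileTypeId, path, with optional name/description); the concrete values below are made up:

# Hypothetical service payload; the values are illustrative only.
d = {"uuid": "d5e0dad6-e8f7-42a2-ad01-2f5a36fbfbe4",
     "sourceId": "pbcommand.tasks.dev_task-out-0",
     "fileTypeId": FileTypes.DS_SUBREADS.file_type_id,
     "path": "/path/to/movie.subreadset.xml",
     "name": "Subreads"}
ds_file = _to_ds_file(d)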
Example 10
def _make_datastore(subreads):
    files = [
        DataStoreFile(uuid.uuid4(), "barcoding.tasks.lima-out-0",
                      FileTypes.DS_SUBREADS.file_type_id, subreads)
    ]
    ds = DataStore(files)
    ds_path = tempfile.NamedTemporaryFile(suffix=".datastore.json").name
    ds.write_json(ds_path)
    return ds_path
Example 11
def dataset_to_datastore(dataset_file,
                         datastore_file,
                         source_id="dataset_to_datastore"):
    """Copied from pbcoretools.tasks.barcoding"""
    # FIXME: replace barcoding
    dsmd = get_dataset_metadata(dataset_file)
    ds_file = DataStoreFile(dsmd.uuid, source_id, dsmd.metatype, dataset_file)
    ds_out = DataStore([ds_file])
    ds_out.write_json(datastore_file)
    return 0
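A minimal sketch of calling this wrapper on an existing dataset XML; both paths and the source id are assumptions:

# Hypothetical call: expose a single dataset XML as a one-entry datastore.
dataset_to_datastore("movie.subreadset.xml",
                     "movie.datastore.json",
                     source_id="pbcoretools.tasks.gather-out-0")  # assumed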
Example 12
def _merge_chunks(file_names, datastore_file):
    output_file = op.basename(file_names[0])
    ds = openDataSet(*file_names)
    if len(file_names) > 1:
        bam_file = ".".join(output_file.split(".")[:-2] + ["bam"])
        ds.consolidate(bam_file, useTmp=False)
        bc_file = ds.subdatasets[0].externalResources[0].barcodes
        if bc_file is not None:
            bc_file = op.realpath(bc_file)
            ds.externalResources[0].barcodes = bc_file
    ds.write(output_file, relPaths=False)
    log.info("Wrote %s", output_file)
    return DataStoreFile(ds.uuid, datastore_file.source_id, ds.datasetType,
                         op.abspath(output_file))
Example 13
    def test_datastore_paths(self):
        tmpfile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        base_dir = os.path.dirname(tmpfile)
        tmp_ds = os.path.join(base_dir, "datastore.json")
        dsf = DataStoreFile(str(uuid.uuid4()), "pbcommand.tasks.dev_task",
                            FileTypes.DS_SUBREADS.file_type_id,
                            os.path.basename(tmpfile), False, "Subreads",
                            "Subread DataSet XML")
        ds = DataStore([dsf])
        ds.write_json(tmp_ds)
        with open(tmp_ds) as json_in:
            d = json.loads(json_in.read())
            self.assertFalse(os.path.isabs(d['files'][0]['path']))
        ds = DataStore.load_from_json(tmp_ds)
        # dict views are not indexable in Python 3, so materialize them first
        self.assertEqual(list(ds.files.values())[0].path, tmpfile)
Example 14
    def _update_analysis_reports_and_datastore(tnode_, task_):
        for file_type_, path_ in zip(tnode_.meta_task.output_types, task_.output_files):
            source_id = "{t}-{f}".format(t=task_.task_id, f=file_type_.file_type_id)
            ds_uuid = _get_dataset_uuid_or_create_uuid(path_)
            ds_file_ = DataStoreFile(ds_uuid, source_id, file_type_.file_type_id, path_)
            ds.add(ds_file_)
            ds.write_update_json(job_resources.datastore_json)

            # Update Services
            services_add_datastore_file(ds_file_)

            dsr = DU.datastore_to_report(ds)
            R.write_report_to_html(dsr, os.path.join(job_resources.html, 'datastore.html'))
            if file_type_ == FileTypes.REPORT:
                T.write_task_report(job_resources, task_.task_id, path_, DU._get_images_in_dir(task_.output_dir))
                update_analysis_file_links(tnode_.idx, path_)
Example 15
    def to_f(x):
        source_id = "out-1"
        sset_out = sset.copy()
        sset_out.newUuid(random=True)
        if add_parent:
            sset_out.metadata.addParentDataSet(sset.uuid,
                                               sset.datasetType,
                                               createdBy="AnalysisJob",
                                               timeStampedName="")
        file_name = "file-{x:03d}.subreadset.xml".format(x=x)
        out_path = os.path.join(p, file_name)
        sset_out.write(out_path)
        sset_uuid = sset_out.uniqueId
        name = "subreadset-{}".format(x)
        dsf = DataStoreFile(sset_uuid,
                            source_id,
                            FileTypes.DS_SUBREADS.file_type_id,
                            file_name,
                            name=name,
                            description="{} Example Description".format(name))
        return dsf
Example 16
def job_resource_create_and_setup_logs(job_root_dir, bg, task_opts,
                                       workflow_level_opts, ep_d):
    """
    Create job resource dirs and setup log handlers

    :type job_root_dir: str
    :type bg: BindingsGraph
    :type task_opts: dict
    :type workflow_level_opts: WorkflowLevelOptions
    :type ep_d: dict
    """

    job_resources = to_job_resources_and_create_dirs(job_root_dir)

    pb_log_path = os.path.join(job_resources.logs, 'pbsmrtpipe.log')
    master_log_path = os.path.join(job_resources.logs, "master.log")
    master_log_level = logging.INFO
    stdout_level = logging.INFO
    if workflow_level_opts.debug_mode:
        master_log_level = logging.DEBUG
        stdout_level = logging.DEBUG

    setup_internal_logs(master_log_path, master_log_level, pb_log_path,
                        stdout_level)

    log.info("Starting pbsmrtpipe v{v}".format(v=pbsmrtpipe.get_version()))
    log.info("\n" + _log_pbsmrptipe_header())

    BU.write_binding_graph_images(bg, job_resources.workflow)

    write_entry_points_json(job_resources.entry_points_json, ep_d)

    # Need to map entry points to a FileType and store in the DataStore? or
    # does DataStore only represent outputs?
    smrtpipe_log_df = DataStoreFile(str(uuid.uuid4()),
                                    "pbsmrtpipe::pbsmrtpipe.log",
                                    FileTypes.LOG.file_type_id,
                                    pb_log_path,
                                    name="Analysis Log",
                                    description="pbsmrtpipe log")
    master_log_df = DataStoreFile(str(uuid.uuid4()),
                                  "pbsmrtpipe::master.log",
                                  FileTypes.LOG.file_type_id,
                                  master_log_path,
                                  name="Master Log",
                                  description="Master log")
    ds = write_and_initialize_data_store_json(job_resources.datastore_json,
                                              [smrtpipe_log_df, master_log_df])
    slog.info("successfully initialized datastore.")

    write_workflow_settings(
        workflow_level_opts,
        os.path.join(job_resources.workflow, 'options-workflow.json'))
    if workflow_level_opts.system_message is not None:
        slog.info("Command: {m}".format(m=workflow_level_opts.system_message))

    slog.info("Entry Points:")
    slog.info("\n" + pprint.pformat(ep_d, indent=4))

    slog.info("Workflow Options:")
    slog.info("\n" + pprint.pformat(workflow_level_opts.to_dict(), indent=4))

    slog.info("Task Options:")
    slog.info("\n" + pprint.pformat(task_opts, indent=4))

    task_opts_path = os.path.join(job_resources.workflow, 'options-task.json')
    with open(task_opts_path, 'w') as f:
        f.write(json.dumps(task_opts, sort_keys=True, indent=4))

    env_path = os.path.join(job_resources.workflow, '.env.json')
    IO.write_env_to_json(env_path)
    log.info("wrote current env to {e}".format(e=env_path))

    try:
        sa_system, sa_components = IO.get_smrtanalysis_system_and_components_from_env(
        )
        log.info(sa_system)
        for c in sa_components:
            log.info(c)
    except Exception:
        # black hole exception
        log.warn("unable to determine SMRT Analysis version.")
        pass

    slog.info(
        "completed setting up job directory resources and logs in {r}".format(
            r=job_root_dir))
    return job_resources, ds, master_log_df
Example 17
def job_resource_create_and_setup_logs(job_root_dir, bg, task_opts,
                                       workflow_level_opts, ep_d):
    """
    Create job resource dirs and setup log handlers

    :type job_root_dir: str
    :type bg: BindingsGraph
    :type task_opts: dict
    :type workflow_level_opts: WorkflowLevelOptions
    :type ep_d: dict
    """

    job_resources = to_job_resources_and_create_dirs(job_root_dir)

    pb_log_path = os.path.join(job_resources.logs, 'pbsmrtpipe.log')
    master_log_path = os.path.join(job_resources.logs, "master.log")
    master_log_level = logging.INFO
    stdout_level = logging.INFO
    if workflow_level_opts.debug_mode:
        master_log_level = logging.DEBUG
        stdout_level = logging.DEBUG

    setup_internal_logs(master_log_path, master_log_level, pb_log_path,
                        stdout_level)

    log.info("Starting pbsmrtpipe {v}".format(v=pbsmrtpipe.get_version()))
    log.info("\n" + _log_pbsmrptipe_header())

    BU.write_binding_graph_images(bg, job_resources.workflow)

    write_entry_points_json(job_resources.entry_points_json, ep_d)

    # Need to map entry points to a FileType and store in the DataStore? or
    # does DataStore only represent outputs?

    # For historical reasons, this is a bit non-obvious. The "master" log now
    # lives at the SMRT Link level, so we've promoted the pbsmrtpipe "master"
    # log (i.e., master.log) to be the Analysis Details Log, using the
    # pbsmrtpipe::pbsmrtpipe.log source ID. There's also a friction point of
    # marketing using "Analysis" vs. "pbsmrtpipe", which has generated some
    # inconsistency.
    smrtpipe_log_df = DataStoreFile(str(uuid.uuid4()),
                                    GlobalConstants.SOURCE_ID_INFO_LOG,
                                    FileTypes.LOG.file_type_id,
                                    pb_log_path,
                                    name="Analysis Log",
                                    description="pbsmrtpipe INFO log")
    master_log_df = DataStoreFile(str(uuid.uuid4()),
                                  GlobalConstants.SOURCE_ID_MASTER_LOG,
                                  FileTypes.LOG.file_type_id,
                                  master_log_path,
                                  name="Analysis Details Log",
                                  description="Analysis Details log")

    ds = write_and_initialize_data_store_json(job_resources.datastore_json,
                                              [smrtpipe_log_df, master_log_df])
    slog.info("successfully initialized datastore.")

    write_workflow_settings(
        workflow_level_opts,
        os.path.join(job_resources.workflow, 'options-workflow.json'))
    if workflow_level_opts.system_message is not None:
        slog.info("Command: {m}".format(m=workflow_level_opts.system_message))

    slog.info("Entry Points:")
    slog.info("\n" + pprint.pformat(ep_d, indent=4))

    slog.info("Workflow Options:")
    slog.info("\n" + pprint.pformat(workflow_level_opts.to_dict(), indent=4))

    slog.info("Task Options:")
    slog.info("\n" + pprint.pformat(task_opts, indent=4))

    task_opts_path = os.path.join(job_resources.workflow, 'options-task.json')
    with open(task_opts_path, 'w') as f:
        f.write(json.dumps(task_opts, sort_keys=True, indent=4))

    env_path = os.path.join(job_resources.workflow, '.env.json')
    IO.write_env_to_json(env_path)
    log.info("wrote current env to {e}".format(e=env_path))

    slog.info(
        "completed setting up job directory resources and logs in {r}".format(
            r=job_root_dir))
    return job_resources, ds, master_log_df