Example #1
    def determine_appropriate_source_from_hints(
        self,
        tool: Tool,
        inpid: str,
        source: Union[str, List[str], Dict[str, Union[str, List[str]]]],
    ) -> Optional[Union[str, List[str]]]:

        if isinstance(source, (str, list)):
            return source

        if not isinstance(source, dict):
            Logger.critical(
                f"The input to the tool '{tool.id()}'.'{inpid}' did not have the correct format for doc.source, "
                f"expected Union[str, List[str], Dict[str, Union[str, List[str]]]], received '{type(source)}'"
            )
            return None

        tishj = ", ".join(source.keys())
        if not self.source_hints:
            Logger.warn(
                f"There were no source hints specified to find an input for {tool.id()}.{inpid}, expected one "
                f"or more of {tishj}. You can specify source hints with --source-hint (in janis prepare)."
            )
            return None

        for hint in self.source_hints:
            if hint in source:
                return source[hint]

        shj = ", ".join(self.source_hints)
        Logger.warn(
            f"Couldn't find any of the specified source_hints ({shj}) in the tool input {tool.id()}.{inpid}'s source fields ({tishj})"
        )
        return None
Example #2
    def query_tasks(self, status, name) -> Dict[str, WorkflowModel]:

        rows: List[TaskRow] = self.get_lazy_db_connection().get_all_tasks()

        failed = []
        relevant = {}

        for row in rows:
            if not os.path.exists(row.outputdir):
                failed.append(row.wid)
                continue
            try:
                metadb = WorkflowManager.has(
                    row.outputdir, wid=row.wid, name=name, status=status
                )
                if metadb:
                    model = metadb.to_model()
                    model.outdir = row.outputdir
                    relevant[row.wid] = model
            except Exception as e:
                Logger.critical(f"Couldn't check workflow '{row.wid}': {e}")
                failed.append(row.wid)

        if failed:
            failedstr = ", ".join(failed)
            Logger.warn(
                f"Couldn't get information for tasks: {failedstr}, run"
                f"'janis cleanup' to clean up your tasks."
            )

        return relevant
Example #3
    def cleanup_missing_tasks(self):
        from tabulate import tabulate

        rows: List[TaskRow] = self.get_lazy_db_connection().get_all_tasks()

        failed = []

        for row in rows:
            if not os.path.exists(row.outputdir):
                failed.append((row.wid, row.outputdir))
                continue
            try:
                _ = WorkflowManager.from_path_with_wid(
                    row.outputdir, row.wid, readonly=True
                )
            except Exception:
                failed.append((row.wid, row.outputdir))

        if failed:
            Logger.warn(f"Removing the following tasks:\n" + tabulate(failed))

            if "y" in str(input(f"Remove {len(failed)} tasks (Y / n)? ")).lower():
                self.get_lazy_db_connection().remove_by_ids([r[0] for r in failed])
                Logger.info("Cleaned up tasks")
            else:
                Logger.info("Skipping cleaning of tasks")
def unpickle_obj(obj):
    if obj is None:
        return None
    try:
        return pickle.loads(obj)
    except Exception as ex:
        Logger.warn(f"Couldn't unpickle {repr(obj)} as encountered {repr(ex)}")
        return None
def pickle_obj(obj):
    if obj is None:
        return None
    try:
        return pickle.dumps(obj, protocol=2)
    except Exception as ex:
        Logger.warn(f"Couldn't pickle {repr(obj)} as encountered {repr(ex)}")
        return None
def send_slack_notification(result: Dict, option: NotificationOption):
    Logger.info("sending notification to Slack")

    if len(result["failed"]) == 0 and not result["execution_error"]:
        failed = False
        status = "Test Succeeded"
        icon = ":white_check_mark:"
    else:
        failed = True
        status = "Test Failed"
        icon = ":x:"

    test_description = ""
    if option.test_id:
        test_description = f" *{option.test_id}*"

    summary_block = {
        "type": "section",
        "text": {
            "type": "mrkdwn",
            "text": f"{icon} {status}{test_description}: {option.tool_name} - {option.test_case}",
        },
    }

    blocks = [summary_block]

    if failed and result["failed"]:
        failed_expected_output = []

        for f in result["failed"]:
            failed_expected_output.append(f":black_small_square: {f}")

        failed_block = {
            "type": "section",
            "text": {"type": "mrkdwn", "text": "\n".join(failed_expected_output)},
        }

        blocks.append(failed_block)

    if result["execution_error"]:
        text = result["execution_error"].replace("\n", "<br />")
        execution_error_block = {
            "type": "section",
            "text": {"type": "mrkdwn", "text": f"{result['execution_error']}"},
        }

        blocks.append(execution_error_block)

    request = {"blocks": blocks}
    resp = requests.post(url=option.url, json=request)

    if resp.status_code == requests.codes.ok:
        Logger.info("Notification sent")
    else:
        Logger.warn("Failed to send slack notification")
        Logger.warn(f"{resp.status_code}: {resp.text}")

    return resp.status_code, resp.text
def execute(args):
    output = None
    if args.output:
        output = ast.literal_eval(args.output)

    try:
        available_test_cases = find_test_cases(args.tool)
        if args.test_case:
            if args.test_case not in available_test_cases:
                raise TestCasesNotFound(
                    f"Test case with name `{args.test_case}` NOT found."
                )
            test_cases = [args.test_case]
        else:
            test_cases = available_test_cases

    except Exception as e:
        Logger.critical("Unexpected error occurred when searching for test cases")
        Logger.critical(str(e))
        exit()

    for tc_name in test_cases:

        result = run_test_case(
            tool_id=args.tool,
            test_case=tc_name,
            engine=args.engine,
            output=output,
            config=args.config,
        )
        result["test_case"] = tc_name
        cli_logging(result)

        try:
            # send output to test framework API
            if args.test_manager_url and args.test_manager_token:
                option = UpdateStatusOption(
                    url=args.test_manager_url, token=args.test_manager_token
                )
                update_status(result, option)
        except Exception as e:
            Logger.warn(f"Failed to update test status to {args.test_manager_url}: {e}")

        try:
            # Send notification to Slack
            if args.slack_notification_url:
                option = NotificationOption(
                    url=args.slack_notification_url,
                    tool_name=args.tool,
                    test_case=tc_name,
                    test_id=args.test_id,
                )
                send_slack_notification(result=result, option=option)
        except Exception as e:
            Logger.warn(
                f"Failed to send notification to Slack {args.slack_notification_url}: {e}"
            )
    def __init__(self, host, repository, image, tag, chash: str):
        self.host = host
        self.repository = repository
        self.image = image
        self.tag = tag
        self.chash = chash
        if image is None:
            Logger.warn(f"{str(self)} didn't have an image, so defaulting to 'ubuntu'")
            self.image = "ubuntu"
    def create_task_base(self, wf: Workflow, job: PreparedJob):

        forbiddenids = set()
        if job.store_in_central_db:
            try:
                with self.with_cursor() as cursor:
                    forbiddenids = set(
                        t[0] for t in cursor.execute("SELECT id FROM tasks").fetchall()
                    )
            except sqlite3.OperationalError as e:
                if "no such column: id" in repr(e):
                    from shutil import move

                    dt = datetime.utcnow()
                    np = f"{job.db_path}.original-{dt.strftime('%Y%m%d')}"
                    Logger.warn(f"Moving old janis-db to '{np}'")
                    move(job.db_path, np)
                    self._taskDB = None
                    return self.create_task_base(wf=wf, job=job)
                raise

        submission_id = generate_new_id(forbiddenids)

        output_dir = fully_qualify_filename(job.output_dir)

        if not job.execution_dir:
            job.execution_dir = os.path.join(output_dir, "janis")
            Logger.debug(
                f"No execution-dir was provided, constructed one from the output-dir: {job.execution_dir}"
            )
        job.execution_dir = fully_qualify_filename(job.execution_dir)

        Logger.info(
            f"Starting task with id = '{submission_id}' | output dir: {job.output_dir} | execution dir: {job.execution_dir}"
        )

        row = TaskRow(
            submission_id, execution_dir=job.execution_dir, output_dir=output_dir
        )
        WorkflowManager.create_dir_structure(job.execution_dir)

        if job.store_in_central_db:
            self.get_lazy_db_connection().insert_task(row)
        else:
            Logger.info(
                f"Not storing task '{submission_id}' in database. To watch, use: 'janis watch {output_dir}'"
            )

        if self._connection:
            self._connection.commit()
            self._connection.close()
            self._taskDB = None
            self._connection = None
        return row
Example #10
def get_tag_and_cleanup_prefix(
    prefix,
) -> Optional[Tuple[str, str, bool, Optional[DataType]]]:
    """
    :param prefix:
    :return: (raw_element, potentialID, hasSeparator, potentialType)
    """
    # cases:
    # -a ADAPTER
    # --adapter=ADAPTER
    # --quality-cutoff=[5'CUTOFF,]3'CUTOFF
    el = prefix.lstrip()
    has_equals = False
    pretag = None
    potential_type = None

    # if prefix is split by ':' or '='
    if ":" in el or "=" in el:
        parts = None
        if ":" in el:
            parts = el.split(":")
        elif "=" in el:
            parts = el.split("=")
            has_equals = True

        if len(parts) > 2:
            Logger.warn(
                f"Unexpected number of components in the tag '{el}' to guess the type, using '{parts[0]}' and skipping type inference"
            )
        else:
            el, pt = parts[0], guess_type(parts[1])

            if not potential_type and pt:
                potential_type = pt

    if " " in el:
        el = el.split(" ")[0]

    titleComponents = [l.strip().lower() for l in el.split("-") if l]
    if len(titleComponents) == 0:
        Logger.critical(
            f"Tag '{prefix}' did not produce any title components"
        )
        return None
    tag = "_".join(titleComponents)

    if tag.lower() in common_replacements:
        tag = common_replacements[tag.lower()]

    if tag.lower() == "outputfilename":
        potential_type = Filename

    return el, tag, has_equals, potential_type
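# The cases listed in the comments above (e.g. '-a ADAPTER', '--adapter=ADAPTER') can be
# illustrated with a minimal, self-contained sketch of the same normalisation, leaving out
# the type-guessing step. This is an illustration only, not part of the original module.
def _normalise_prefix_sketch(prefix: str) -> str:
    el = prefix.lstrip()
    # strip the value part after ':' or '=' (the real function also guesses a type from it)
    if ":" in el:
        el = el.split(":")[0]
    elif "=" in el:
        el = el.split("=")[0]
    # drop anything after the first space, then join hyphen-separated components with '_'
    if " " in el:
        el = el.split(" ")[0]
    return "_".join(c.strip().lower() for c in el.split("-") if c)

assert _normalise_prefix_sketch("-a ADAPTER") == "a"
assert _normalise_prefix_sketch("--adapter=ADAPTER") == "adapter"
assert _normalise_prefix_sketch("--quality-cutoff=[5'CUTOFF,]3'CUTOFF") == "quality_cutoff"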
    def get_by_id(
        self, submission_id, allow_operational_errors=True
    ) -> Optional[SubmissionModel]:
        s = self.get(
            where=("id = ?", [submission_id]),
            allow_operational_errors=allow_operational_errors,
        )
        if s is None:
            return None
        if len(s) != 1:
            Logger.warn(
                f"Couldn't get submission with id={submission_id}, query returned {len(s)} results."
            )
            return None
        return s[0]
    def copy_outputs_if_required(self):
        if self.database.progressDB.has(ProgressKeys.copiedOutputs):
            return Logger.debug(f"Workflow '{self.wid}' has copied outputs, skipping")

        if self.database.workflowmetadata.status != TaskStatus.COMPLETED:
            return Logger.warn(
                f"Skipping copying outputs as workflow "
                f"status was not completed ({self.database.workflowmetadata.status})"
            )

        wf_outputs = self.database.outputsDB.get_all()
        engine_outputs = self.get_engine().outputs_task(self.get_engine_wid())
        eoutkeys = engine_outputs.keys()
        fs = self.environment.filescheme

        for out in wf_outputs:
            eout = engine_outputs.get(out.tag)

            if eout is None:
                Logger.warn(
                    f"Couldn't find expected output with tag {out.tag}, found outputs ({', '.join(eoutkeys)}"
                )
                continue
            originalfile, newfilepath = self.copy_output(
                fs=fs,
                outputid=out.tag,
                prefix=out.prefix,
                tag=out.tags,
                secondaries=out.secondaries,
                extension=out.extension,
                engine_output=eout,
                iscopyable=out.iscopyable,
            )

            if isinstance(originalfile, list):
                originalfile = recursively_join(originalfile, "|")

            if isinstance(newfilepath, list):
                newfilepath = recursively_join(newfilepath, "|")

            self.database.outputsDB.update_paths(
                tag=out.tag, original_path=originalfile, new_path=newfilepath
            )

        self.database.progressDB.set(ProgressKeys.copiedOutputs)
        Logger.info(f"View the task outputs: file://{self.get_task_path()}")
    def get_ids(self, db_path):
        try:
            with self.with_cursor() as cursor:
                return set(
                    t[0] for t in cursor.execute("SELECT id FROM tasks").fetchall()
                )
        except sqlite3.OperationalError as e:
            if "no such column: id" in repr(e):
                from shutil import move

                dt = datetime.utcnow()
                np = f"{db_path}.original-{dt.strftime('%Y%m%d')}"
                Logger.warn(f"Moving old janis-db to '{np}'")
                move(db_path, np)
                self._taskDB = None
                return self.get_ids(db_path)
            raise
    def parse(container: str):
        if "/" in container:
            matched = docker_string_regex.match(container)
            if not matched:
                raise Exception(f"Invalid docker container '{container}'")
            name, tag, chash = matched.groups()
        else:
            if "@" in container or ":" in container:
                if "@" in container:
                    parts = container.split("@")
                else:
                    parts = container.split(":")
                if len(parts) != 2:
                    # This might happen if you use a library container with a tag AND a hash on dockerhub
                    # raise an issue if this happens
                    raise Exception(
                        f"Unexpected format for container: {str(container)}. If you're using a library container with a tag AND a hash, please raise an issue on GitHub"
                    )
                name, tagorhash = parts

                if ContainerInfo.validate_docker_digest(tagorhash) is False:
                    tag, chash = tagorhash, None
                else:
                    tag, chash = None, tagorhash
            else:
                name, tag, chash = container, None, None

        host, repo, image = ContainerInfo.deconstruct_image_name(name)

        has_hash = chash is not None
        final_tag = None
        if not has_hash:
            final_tag = "latest" if tag is None else tag
        else:
            if ContainerInfo.validate_docker_digest(chash) is False:
                Logger.warn(
                    f"Invalid format for docker hash ({chash}) in container {container}"
                )
                return False
            # final_tag = chash if tag is None else f"{tag}@{chash}"

        return ContainerInfo(
            host=host, repository=repo, image=image, tag=final_tag, chash=chash
        )
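# A minimal, self-contained sketch (illustration only) of the tag / digest split performed
# above for containers without a registry path, e.g. 'ubuntu:18.04' vs 'ubuntu@sha256:<hex>'.
# The simplified regex below stands in for ContainerInfo.validate_docker_digest, which is
# not shown in this snippet.
import re

def _split_tag_or_digest_sketch(container: str):
    if "@" in container:
        name, tagorhash = container.split("@", 1)
    elif ":" in container:
        name, tagorhash = container.split(":", 1)
    else:
        return container, "latest", None
    # assumed digest shape: 'sha256:' followed by 64 hex characters
    if re.fullmatch(r"sha256:[0-9a-f]{64}", tagorhash):
        return name, None, tagorhash  # (name, tag, hash)
    return name, tagorhash, None

print(_split_tag_or_digest_sketch("ubuntu:18.04"))  # ('ubuntu', '18.04', None)
print(_split_tag_or_digest_sketch("ubuntu"))        # ('ubuntu', 'latest', None)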
    def evaluate_output_selector(self, selector, inputs: dict):
        if selector is None:
            return None

        if isinstance(selector, str):
            return selector

        if isinstance(selector, list):
            return [self.evaluate_output_selector(s, inputs) for s in selector]

        if isinstance(selector, InputSelector):
            if selector.input_to_select not in inputs:
                Logger.warn(f"Couldn't find the input {selector.input_to_select}")
                return None
            return inputs[selector.input_to_select]

        raise Exception(
            f"Janis assistant cannot evaluate selecting the output from a {type(selector).__name__} type"
        )
Example #16
    def filter_updates(self,
                       jobs: List[RunJobModel],
                       add_inserts_to_cache=True
                       ) -> Tuple[List[RunJobModel], List[RunJobModel]]:
        # don't call super, it'll break because of the cache

        updates = []
        inserts = []

        if len(jobs) == 0:
            return updates, inserts

        self.populate_cache_if_required()

        idkeys = set(self.get_id_keys())
        idkeys_ordered = list(idkeys)
        dbalias_map = {t.dbalias: t.name for t in self._base.keymap()}
        skipped = 0

        for job in jobs:
            el_idkey = tuple(
                [getattr(job, dbalias_map[_k]) for _k in idkeys_ordered])

            jstatus = self._cache_completed_ids.get(el_idkey)
            if jstatus is None:
                inserts.append(job)
            elif job.status.value != jstatus:
                updates.append(job)
            elif jstatus:
                skipped += 1
            self._cache_completed_ids[el_idkey] = job.status.value

        if skipped:
            Logger.log(
                f"Skipped updating {skipped} jobs as those jobs were already in a final state"
            )
        memory = getsizeof(self._cache_completed_ids) // 1024
        if (self.job_cache_last_idx < len(self.job_cache_warnings)
                and memory > self.job_cache_warnings[self.job_cache_last_idx]):
            Logger.warn(f"Job cache is using {memory} MB")
            self.job_cache_last_idx += 1

        return updates, inserts
def get_file_from_searchname(name, cwd):
    if cwd == ".":
        cwd = os.getcwd()
    Logger.log(f"Searching for a file called '{name}'")
    resolved = os.path.expanduser(name)
    if os.path.exists(resolved) and os.path.isfile(resolved):
        Logger.log(f"Found file called '{name}'")
        return resolved

    Logger.log(f"Searching for file '{name}' in the cwd, '{cwd}'")
    with Path(cwd):
        if os.path.exists(name) and os.path.isfile(name):
            Logger.log(f"Found file in '{cwd}' called '{name}'")
            return os.path.join(cwd, name)

    Logger.log(
        f"Attempting to get search path $JANIS_SEARCHPATH from environment variables"
    )
    search_path = os.getenv("JANIS_SEARCHPATH")
    if search_path:
        Logger.log(
            f"Got value for env JANIS_SEARCHPATH '{search_path}', searching for file '{name}' here."
        )
        if os.path.exists(search_path):
            with Path(search_path):
                if os.path.exists(name) and os.path.isfile(name):
                    Logger.log(
                        f"Found file in '{search_path}' called '{name}'")
                    return os.path.join(search_path, name)
        else:
            Logger.warn(
                f"Search path '{search_path}' (obtained from $JANIS_SEARCHPATH) does not exist"
            )
    else:
        Logger.log(
            "Couldn't find JANIS_SEARCHPATH in environment variables, skipping"
        )

    Logger.log(
        f"Couldn't find a file with filename '{name}' in any of the following: "
        f"full path, current working directory ({cwd}) or the search path.")
    return None
    def start_engine_if_required(self):
        # engine should be loaded from the DB
        engine = self.get_engine()
        self.environment.engine = engine

        is_allegedly_started = engine.test_connection()

        if is_allegedly_started:
            return

        if not isinstance(engine, Cromwell):
            engine.start_engine()
            return

        additional_cromwell_params = []
        if not engine.config:
            Logger.info("Skipping start database as Janis is not managing the config")
        else:
            dbconfig: JanisDatabaseConfigurationHelper = self.database.workflowmetadata.dbconfig
            dbtype = dbconfig.which_db_to_use()
            if dbtype == dbconfig.DatabaseTypeToUse.existing:
                engine.config.database = dbconfig.get_config_for_existing_config()
            elif dbtype == dbconfig.DatabaseTypeToUse.filebased:
                engine.config.database = dbconfig.get_config_for_filebased_db(
                    path=self.get_path_for_component(self.WorkflowManagerPath.database)
                    + "/cromwelldb"
                )
            elif dbtype == dbconfig.DatabaseTypeToUse.managed:
                cromwelldb_config = self.start_mysql_and_prepare_cromwell_config()
                additional_cromwell_params.append(
                    "-Ddatabase.db.url=" + cromwelldb_config.db.url
                )
                engine.config.database = cromwelldb_config
            else:
                Logger.warn(
                    "Skipping database config as '--no-database' option was provided."
                )

        engine.start_engine(additional_cromwell_options=additional_cromwell_params)
        # Write the new engine details back into the database (e.g. PID, host and is_started)
        self.database.workflowmetadata.engine = engine
    def process_single_input(self, key: str, dt: DataType, value):
        if value is None:
            return None

        if isinstance(value, list):
            if not isinstance(dt, Array):
                Logger.warn(
                    f"{key} provided list of values, but type was not an array"
                )
                subtype = dt
            else:
                subtype = dt.subtype()
            return [
                self.process_single_input(f"{key}.{idx}", subtype, value[idx])
                for idx in range(len(value))
            ]

        if not isinstance(value, dict):
            return value

        if dt.is_base_type((File, Directory)):
            if "path" in value:
                return value["path"]
            else:
                Logger.warn(
                    f"Couldn't unwrap dictionary for input {key} ('{value}') as it didn't provide a value for 'path'"
                )
        else:
            Logger.warn(
                f"Couldn't unwrap dictionary for input {key} ('{value}') as the input isn't expected to be a file"
            )
        return value
    def check_extensions(inpid: str, datatype: DataType, path: str):
        """
        This method only WARNS about incorrect extension
        """

        if not isinstance(datatype, File):
            return

        if not isinstance(path, str):
            Logger.warn(
                f"Expecting string type input '{inpid}' of type File, but received '{type(path)}'"
            )
            return

        # check extension (and in future, secondaries)
        pre_extensions = [
            datatype.extension,
            *list(datatype.alternate_extensions or []),
        ]
        extensions = {ext for ext in pre_extensions if ext is not None}

        if len(extensions) == 0:
            # skip because no extension
            return

        has_extension = False
        for ext in extensions:
            if path.endswith(ext):
                has_extension = True
                break

        if has_extension:
            # looks like we're sweet
            Logger.debug(
                f"Validated that the input for {inpid} had the expected extension for {datatype.id()}"
            )
            return

        Logger.warn(
            f"The input for '{inpid}' ({datatype.name()}) did not have the expected extension "
            f"{' OR '.join(extensions)}: {path}"
        )
def guess_datatype_by_filename(filename: str):
    """
    We'll try to guess which datatype a file with name 'filename' is.
    Primarily, this will look at the extension, and whether the secondary files exist
    :param filename:
    :return:
    """
    dts = JanisShed.get_all_datatypes()
    fs = FileScheme.get_type_by_prefix(filename)()
    if not isinstance(fs, LocalFileScheme):
        Logger.warn(
            f"The filescheme detected by Janis for '{filename}' was not LOCAL. This guess datatype process may rely on "
            f"polling the {fs.id()} file system to check if related files exist. This might have some financial cost involved."
        )

    file_exists_map = {}

    # each match has a score
    matches: List[Tuple[int, File]] = []

    for datatype in dts:
        if isclass(datatype):
            if not issubclass(datatype, File):
                continue
            datatype = get_instantiated_type(datatype)
        elif not isinstance(datatype, File):
            continue
        if not datatype.extension:
            continue
        datatype: File = datatype

        extensions = {datatype.extension, *(datatype.alternate_extensions or [])}

        matching_extension = None
        for ext in extensions:
            if filename.endswith(ext):
                matching_extension = ext
                break

        secondaries_match = True

        if datatype.secondary_files():
            for secondary in datatype.secondary_files():
                secondary_filename = apply_secondary_file_format_to_filename(
                    filename, secondary
                )
                if secondary not in file_exists_map:
                    file_exists_map[secondary] = fs.exists(secondary_filename)
                if not file_exists_map[secondary]:
                    secondaries_match = False
                    break
            if secondaries_match is False:
                continue

        # we got here, we're G

        if matching_extension is not None and secondaries_match:
            extension_reward = len(matching_extension) * EXTENSION_REWARD_MULTIPLER
            secondaries_reward = (
                len(datatype.secondary_files() or []) * SECONDARIES_REWARD_MULTIPLER
            )
            score = extension_reward + secondaries_reward

            matches.append((score, datatype))

    if len(matches) == 0:
        return None
    elif len(matches) == 1:
        return matches[0][1]
    else:
        matches = sorted(matches, key=lambda a: a[0], reverse=True)
        matched_dt = matches[0][1]
        ranked = ", ".join(f"{match[1].name()} ({match[0]})" for match in matches[1:])
        Logger.debug(
            f"There were {len(matches)} matching datatypes. Using {matched_dt.name()} ({matches[0][0]}) "
            f"as it was the best match, ahead of: {ranked}"
        )
        return matched_dt
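# An illustrative, self-contained sketch of the scoring idea above, using stand-in
# candidate types rather than real janis datatypes: a longer matching extension and more
# existing secondary files score higher. The reward multipliers and the secondary-file
# handling (plain suffix append) are simplifying assumptions for this sketch.
import os

EXT_REWARD, SEC_REWARD = 10, 20

def _guess_type_sketch(filename, candidates):
    scored = []
    for name, ext, secondaries in candidates:
        if not filename.endswith(ext):
            continue
        if not all(os.path.exists(filename + s) for s in secondaries):
            continue  # a required secondary file is missing, so rule this type out
        scored.append((len(ext) * EXT_REWARD + len(secondaries) * SEC_REWARD, name))
    return max(scored)[1] if scored else None

# 'IndexedBam' only wins when 'sample.bam.bai' exists alongside 'sample.bam'
print(_guess_type_sketch("sample.bam", [("Bam", ".bam", []), ("IndexedBam", ".bam", [".bai"])]))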
Example #22
    def localise_inputs(
        self,
        inpid: str,
        inptype: DataType,
        dest_dir: str,
        source: Union[str, List[str]],
        localise_secondary_files: bool = True,
    ):
        if isinstance(source, list):
            return [
                self.localise_inputs(
                    inpid, inptype, dest_dir, s, localise_secondary_files
                )
                for s in source
            ]

        fs = FileScheme.get_type_by_prefix(source)()
        if isinstance(fs, LocalFileScheme):
            return source

        out_path = self.generate_file_path(source, dest_dir)
        if os.path.exists(out_path):
            Logger.info(
                f"A file already exists when localising '{inpid}' at '{out_path}'. If this isn't the right file, "
                f"you'll need to manually remove this file before proceeding")
        else:
            try:
                Logger.info(f"Downloading file from {source} -> {out_path}")
                fs.cp_from(source, out_path)
            except Exception as e:
                Logger.critical(
                    f"Couldn't localise source from {source} -> {out_path}: {repr(e)}"
                )
                raise

        if localise_secondary_files:
            try:
                # Handle normal input type or array input type
                secondary_files = inptype.secondary_files()
                if inptype.is_array():
                    secondary_files = inptype.subtype().secondary_files()

                for sec in secondary_files or []:
                    sec_source = apply_secondary_file_format_to_filename(
                        source, sec)
                    out_sec_path = apply_secondary_file_format_to_filename(
                        out_path, sec)

                    if os.path.exists(out_sec_path):
                        Logger.info(
                            f"The secondary file ({sec}) already exists when localising '{inpid}' at '{out_sec_path}'. "
                            f"If this isn't the right file, you'll need to manually remove this file before proceeding"
                        )
                    elif not fs.exists(sec_source):
                        Logger.warn(
                            f"Couldn't find the secondary file for {inpid}, expected at {sec_source}, skipping for now"
                        )
                    else:
                        fs.cp_from(sec_source, out_sec_path)

            except Exception as e:
                Logger.critical(
                    f"Couldn't localise secondary file due to: {e}")

        return out_path
Example #23
    def check_input_for_correctness(self, inpid: str, dt: DataType, value: any):
        if isinstance(dt, Array):
            if isinstance(value, list):
                return [
                    self.check_input_for_correctness(f"{inpid}[{idx}]", dt.subtype(), v)
                    for idx, v in enumerate(value)
                ]

        if not isinstance(dt, File):
            return value

        if not isinstance(value, str):
            Logger.warn(
                f"Expecting string type input '{inpid}' for type File, but received '{type(value)}'. Janis won't transform this value, but you should confirm your inputs."
            )
            return value

        guessed_datatype = guess_datatype_by_filename(value)

        if not guessed_datatype:
            Logger.info(
                f"Couldn't guess datatype for {value}. Returning the value instead."
            )
            return value

        if dt.can_receive_from(guessed_datatype):
            Logger.debug(f"Input '{inpid}' had a compatible type")
            return value

        message_prefix = (
            f"The value for input '{inpid}' did not match the expected type {dt.name()} "
            f"through the extension and / or existence of secondary files"
        )
        try:
            transformation = JanisShed.get_transformation_graph().find_connection(
                guessed_datatype, dt
            )
            steps = (
                "".join(t.type1.name() + " -> " for t in transformation)
                + transformation[-1].type2.name()
            )
            Logger.warn(
                message_prefix
                + f",\nJanis guessed the actual datatype for '{inpid}' from data '{value}' to be {guessed_datatype.id()}, "
                f"and Janis was able to determine a transformation in {len(transformation)} step(s): {steps}"
            )
            wf = JanisTransformation.convert_transformations_to_workflow(transformation)

            trans = wf.translate("wdl", to_console=False)[0]
            Logger.debug(
                f"Transforming {inpid} ({guessed_datatype.name()} -> {dt.name()}): {trans}"
            )
        except Exception as e:
            Logger.warn(
                message_prefix
                + f",\nbut Janis couldn't find a transformation between the guessed and expected type:"
                f" {guessed_datatype.name()} -> {dt.name()}: {str(e)}"
            )
            return value

        # maybe do some other things with respect to the path

        try:
            return self.try_get_outputs_for(
                inpid=inpid,
                wf=wf,
                inputs={wf.tool_inputs()[0].id(): value},
                output_dir=os.path.join(self.cache_dir, inpid),
                description=f"{guessed_datatype.name()} -> {dt.name()}",
            )

        except Exception as e:
            Logger.critical(
                f"An internal error occurred when performing the transformation for {inpid} "
                f"({guessed_datatype.name()} -> {dt.name()}): {str(e)}"
            )
            Logger.debug(traceback.format_exc())

            return value
    def do_bed_fasta_contig_check(tool: Tool, inputs: Dict[str, any]):
        from janis_bioinformatics.data_types import Fasta, Bed, BedTabix

        supported_bed_types = (Bed, BedTabix)

        beds_inputs = []
        refs = []

        for i in tool.tool_inputs():
            if isinstance(i.intype, supported_bed_types) or (
                    isinstance(i.intype, Array)
                    and isinstance(i.intype.subtype(), supported_bed_types)):
                beds_inputs.append(i)

            if (isinstance(i.intype, Fasta) and i.intype.secondary_files()
                    and ".fai" in i.intype.secondary_files()):
                refs.append(i)

        if len(refs) == 0:
            return
        if len(refs) > 1:
            Logger.info(
                "Skipping bioinformatics FASTA-BED file checks as there was more than one reference"
            )
            return

        for inp_ref in refs:
            value_ref = inputs[inp_ref.id()]
            if not value_ref:
                Logger.warn(
                    f"Skipping '{inp_ref.id()}' as no value was provided")
                continue

            ref_contigs = ContigChecker.get_list_of_contigs_from_fastafai(
                value_ref + ".fai")

            if not ref_contigs:
                Logger.debug(
                    f"Didn't get any contigs from ref {value_ref}.fai, skipping..."
                )
                continue

            for inp_bed in beds_inputs:
                value_bed = inputs[inp_bed.id()]
                is_array = isinstance(value_bed, list)
                beds = value_bed if is_array else [value_bed]
                for b_idx in range(len(beds)):
                    bed = beds[b_idx]

                    bed_contigs = ContigChecker.get_list_of_contigs_from_bed(
                        bed)

                    missing_contigs = bed_contigs - ref_contigs
                    if missing_contigs:
                        inpname = (f"{inp_bed.id()}.{b_idx}"
                                   if is_array else inp_bed.id())
                        contiglist = (", ".join(missing_contigs)
                                      if len(missing_contigs) < 5 else
                                      (", ".join(list(missing_contigs)[:3]) +
                                       "..."))
                        Logger.warn(
                            f"The BED file '{inpname}' contained {len(missing_contigs)} contigs ({contiglist}) that were missing from the reference: {value_ref}"
                        )
Example #25
def prepare_all_tools():
    JanisShed.hydrate(modules=[janis_unix, janis_bioinformatics])

    data_types = JanisShed.get_all_datatypes()
    tools = {
        ts[0].id(): {t.version(): t
                     for t in ts}
        for ts in JanisShed.get_all_tools()
    }

    Logger.info(f"Preparing documentation for {len(tools)} tools")
    Logger.info(f"Preparing documentation for {len(data_types)} data_types")

    tool_module_index = {}
    dt_module_index = {}
    ROOT_KEY = "root"

    if os.path.exists(tools_dir):
        rmtree(tools_dir)

    for toolname, toolsbyversion in tools.items():
        # tool = tool_vs[0][0]()
        tool_versions = sort_tool_versions(list(toolsbyversion.keys()))
        default_version = tool_versions[0]
        Logger.log(
            f"Preparing {toolname}, found {len(tool_versions)} version[s] ({','.join(tool_versions)})"
        )

        defaulttool = toolsbyversion[default_version]
        if isclass(defaulttool):
            defaulttool = defaulttool()
        try:
            tool_path_components = list(
                filter(
                    lambda a: bool(a),
                    [defaulttool.tool_module(),
                     defaulttool.tool_provider()],
                ))
        except Exception as e:
            Logger.critical(f"Failed to generate docs for {toolname}: {e}")
            continue

        # (toolURL, tool, isPrimary)
        toolurl_to_tool = [(toolname.lower(), defaulttool, True)] + [
            (get_tool_url(toolname, v), toolsbyversion[v], False)
            for v in tool_versions
        ]

        path_components = "/".join(tool_path_components)
        output_dir = f"{tools_dir}/{path_components}/".lower()
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for (toolurl, tool, isprimary) in toolurl_to_tool:
            output_str = prepare_tool(tool, tool_versions, not isprimary)
            output_filename = output_dir + toolurl + ".rst"
            if output_str is None:
                Logger.warn(f"Skipping {tool.id()}")
                continue
            with open(output_filename, "w+") as tool_file:
                tool_file.write(output_str)

        nested_keys_append_with_root(tool_module_index,
                                     tool_path_components,
                                     toolname,
                                     root_key=ROOT_KEY)

        Logger.log("Prepared " + toolname)

    for d in data_types:
        # tool = tool_vs[0][0]()
        if issubclass(d, Array):
            Logger.info("Skipping Array DataType")
            continue
        try:
            dt = d()
        except Exception:
            Logger.critical(d.__name__ + " failed to instantiate")
            continue
        did = dt.name().lower()
        Logger.log("Preparing " + dt.name())
        output_str = prepare_data_type(dt)

        dt_path_components = []
        # dt_path_components = list(filter(
        #     lambda a: bool(a),
        #     [, tool.tool_provider()]
        # ))

        path_components = "/".join(dt_path_components)
        output_dir = f"{dt_dir}{path_components}/"
        output_filename = output_dir + did + ".rst"

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        nested_keys_append_with_root(dt_module_index,
                                     dt_path_components,
                                     did,
                                     root_key=ROOT_KEY)

        with open(output_filename, "w+") as dt_file:
            dt_file.write(output_str)

        Logger.log("Prepared " + did)

    def prepare_modules_in_index(contents, title, dir, max_depth=1):
        module_filename = dir + "/index.rst"
        module_tools = sorted(
            set(contents[ROOT_KEY] if ROOT_KEY in contents else []))
        submodule_keys = sorted(m for m in contents.keys() if m != ROOT_KEY)
        indexed_submodules_tools = [m.lower() for m in submodule_keys]

        with open(module_filename, "w+") as module_file:
            module_file.write(
                get_tool_toc(
                    alltoolsmap=tools,
                    title=title,
                    intro_text=
                    f"Automatically generated index page for {title}:",
                    subpages=indexed_submodules_tools,
                    tools=module_tools,
                    max_depth=max_depth,
                ))

        for submodule in submodule_keys:
            prepare_modules_in_index(contents=contents[submodule],
                                     title=submodule,
                                     dir=f"{dir}/{submodule}/")

    def prepare_dtmodules_in_index(contents, title, dir, max_depth=1):
        module_filename = dir + "/index.rst"
        module_tools = sorted(
            set(contents[ROOT_KEY] if ROOT_KEY in contents else []))
        submodule_keys = sorted(m for m in contents.keys() if m != ROOT_KEY)
        indexed_submodules_tools = [
            m.lower() + "/index" for m in submodule_keys
        ]

        with open(module_filename, "w+") as module_file:
            module_file.write(
                get_toc(
                    title=title,
                    intro_text=
                    f"Automatically generated index page for {title}:",
                    subpages=indexed_submodules_tools + module_tools,
                    max_depth=max_depth,
                ))

        for submodule in submodule_keys:
            prepare_modules_in_index(contents=contents[submodule],
                                     title=submodule,
                                     dir=f"{dir}/{submodule}/")

    prepare_modules_in_index(tool_module_index, title="Tools", dir=tools_dir)
    prepare_dtmodules_in_index(dt_module_index,
                               title="Data Types",
                               dir=dt_dir,
                               max_depth=1)
def get_workflow_from_file(file, name, include_commandtools=False):
    # How to import a module given the full path
    # https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path
    import importlib.util

    try:
        import sys

        basefilename = os.path.basename(file)

        sys.path.append(os.path.dirname(file))
        spec = importlib.util.spec_from_file_location("module.name", file)
        foo = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(foo)
        ptypes = get_janis_from_module_spec(
            foo, include_commandtools=include_commandtools, name=name)

    except Exception as e:
        raise Exception(
            f"Unrecognised python file when getting workflow / command tool: {file} :: {e}"
        )

    # Per https://github.com/PMCC-BioinformaticsCore/janis-core/issues/31, we'll use the following process:
    # 	1. If a `name` is defined:
    # 	    - Force parse every token with a case-insensitive match
    # 	    - If a single item is returned from a case-sensitive match, then use that
    # 	2. If multiple workflows are defined in the same file, use the last defined workflow
    # 	   - This covers the existing _If a single workflow is defined, use that_ case
    # 	3. If no tools were found, raise an Exception
    # 	4. If multiple tools are defined in the file, use the last one:
    # 	   - If a name was defined, `warn` the user that the case-insensitive match returned no results and use the last one
    # 	   - Otherwise, just tell the user we'll use the last defined tool

    ptypes_casesensitive = [(k, v) for (k, v) in ptypes if k == name]

    if len(ptypes_casesensitive) == 1:
        return ptypes_casesensitive[0][1]

    if name is None:
        mains = [v for (k, v) in ptypes if k == "__JANIS_ENTRYPOINT"]
        if len(mains) > 0:
            Logger.debug(
                "Using workflow defined by '__JANIS_ENTRYPOINT' as no name was used"
            )
            return mains[0]

    wftypes = [
        t for t in ptypes
        if (issubclass(t[1], WorkflowBase
                       ) if isclass(t[1]) else isinstance(t[1], WorkflowBase))
    ]
    detected_tokens = ", ".join(f"'{x[0]}' ({x[1].__class__.__name__})"
                                for x in ptypes)

    if len(wftypes) > 0:
        if len(wftypes) > 1:
            if name:
                Logger.warn(
                    f"Providing the `--name` parameter performs a case-insensitive search for the tokens in "
                    f"'{basefilename}, and a case-sensitive search returned no results. You had {len(wftypes)} "
                    f"tokens that matched this search. Janis will use the last one, defined as "
                    f"'{ptypes[-1][0]}' from: {detected_tokens}")
            else:
                Logger.info(
                    f"Multiple workflows were found in '{basefilename}', using '{wftypes[-1][0]}'"
                )
        return wftypes[-1][1]

    if len(ptypes) == 0:
        raise Exception(
            f"There were no valid tools in '{file}', try running with the `--name YourToolName` parameter "
            f"to get more information (it might have abstract / unimplemented methods)."
        )
    if len(ptypes) > 1:

        if name:
            Logger.warn(
                f"Providing the `--name` parameter performs a case-insensitive search for the tokens in "
                f"'{basefilename}, and a case-sensitive search returned no results. You had {len(ptypes)} "
                f"tokens that matched this search. Janis will use the last one, defined as "
                f"'{ptypes[-1][0]}' from: {detected_tokens}")
        else:
            Logger.info(
                f"There were multiple tools (an no workflows) detected in {basefilename}, "
                f"Janis will use '{ptypes[-1][0]}' (the last defined)")

    return ptypes[-1][1]
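# A condensed, self-contained sketch (simplified, not the original implementation) of the
# resolution order documented in the comment block inside get_workflow_from_file, applied
# to (token_name, object) pairs: exact name match, then '__JANIS_ENTRYPOINT', then the
# last workflow, then the last tool.
def _resolve_sketch(ptypes, name=None, is_workflow=lambda obj: getattr(obj, "is_workflow", False)):
    exact = [v for k, v in ptypes if k == name]
    if len(exact) == 1:
        return exact[0]
    if name is None:
        mains = [v for k, v in ptypes if k == "__JANIS_ENTRYPOINT"]
        if mains:
            return mains[0]
    workflows = [v for _, v in ptypes if is_workflow(v)]
    if workflows:
        return workflows[-1]
    if not ptypes:
        raise Exception("There were no valid tools in the file")
    return ptypes[-1][1]

print(_resolve_sketch([("AlignTool", "tool_obj"), ("MyWf", "wf_obj")], name="MyWf"))  # wf_obj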
Example #27
    def insert_or_update_many(self, els: List[T]):
        if len(els) == 0:
            return
        queries: Dict[str, List[List[any]]] = {}
        update_separator = ",\n"
        tab = "\t"

        idkeys = set(self.get_id_keys())
        idkeys_ordered = list(idkeys)
        pkeys_ordered = self.get_primary_keys()
        existing_keys = set()  # (*pkeys_ordered)

        # get all primary keys

        dbalias_map: Dict[str, DatabaseObjectField] = {
            t.dbalias: t
            for t in self._base.keymap()
        }

        updates, inserts = self.filter_updates(els)

        def add_query(query, values):
            if query in queries:
                queries[query].append(values)
            else:
                queries[query] = [values]

        for job in updates:
            keys, values = job.prepare_insert()
            # el_pkeys = [getattr(job, dbalias_map[_k]) for _k in idkeys_ordered]

            keys_np, values_np = [], []
            for k, v in zip(keys, values):
                if k in idkeys:
                    continue

                keys_np.append(k)
                values_np.append(v)

            # problem is we want to update matching on some fields when they are NULL, our WHERE statement
            # should be something like:
            #   WHERE id1 = ? AND id2 = ? AND id3 is null AND id4 is null

            id_keyvalues = {
                pkey: prep_object_for_db(
                    getattr(job, dbalias_map[pkey].name),
                    encode=dbalias_map[pkey].encode,
                )
                for pkey in idkeys_ordered
            }
            id_withvalues_keyvalue_ordered = [
                (idkey, idvalue) for idkey, idvalue in id_keyvalues.items()
                if idvalue is not None
            ]
            id_withvalues_updater_keys = [
                f"{idkey} = ?" for idkey, _ in id_withvalues_keyvalue_ordered
            ]
            id_withvalues_updater_values = [
                idvalue for _, idvalue in id_withvalues_keyvalue_ordered
            ]
            id_novalues_updater_keys = [
                f"{idkey} is NULL" for idkey, idvalue in id_keyvalues.items()
                if idvalue is None
            ]

            prepared_statement = f"""
            UPDATE {self._tablename}
                SET {', '.join(f'{k} = ?' for k in keys_np)}
            WHERE
                {" AND ".join([*id_withvalues_updater_keys, *id_novalues_updater_keys])}
            """
            vtuple = (
                *values_np,
                *id_withvalues_updater_values,
            )

            add_query(prepared_statement, vtuple)

        for job in inserts:
            keys, values = job.prepare_insert()
            # el_pkeys = [getattr(job, dbalias_map[_k]) for _k in idkeys_ordered]
            prepared_statement = f"""
            INSERT INTO {self._tablename}
                ({', '.join(keys)})
            VALUES
                ({', '.join('?' for _ in keys)});
            """
            add_query(prepared_statement, values)

        Logger.log(
            f"DB {self._tablename}: Inserting {len(inserts)} and updating {len(updates)} rows"
        )
        with self.with_cursor() as cursor:
            start = DateUtil.now()
            if len(inserts) + len(updates) > 300:
                Logger.warn(
                    f"DB '{self._tablename}' is inserting {len(inserts)} and updating {len(updates)} rows, this might take a while"
                )
            for query, vvalues in queries.items():
                try:
                    Logger.log(
                        f"Running query: {query}\n\t: values: {vvalues}")
                    cursor.executemany(query, vvalues)
                except OperationalError as e:
                    Logger.log_ex(e)
            seconds = (DateUtil.now() - start).total_seconds()
            if seconds > 2:
                Logger.warn(
                    f"DB '{self._tablename}' took {second_formatter(seconds)} to insert {len(inserts)} and update {len(updates)} rows"
                )

        return True
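# A small, self-contained sketch (illustration only) of the NULL-aware WHERE clause the
# update branch above builds: id columns with values become 'col = ?', while id columns
# that are None become 'col IS NULL' so the statement still matches existing rows.
def _build_update_sketch(table, set_keys, id_keyvalues):
    with_values = [(k, v) for k, v in id_keyvalues.items() if v is not None]
    where = [f"{k} = ?" for k, _ in with_values] + [
        f"{k} IS NULL" for k, v in id_keyvalues.items() if v is None
    ]
    sql = (
        f"UPDATE {table} SET {', '.join(f'{k} = ?' for k in set_keys)} "
        f"WHERE {' AND '.join(where)}"
    )
    return sql, [v for _, v in with_values]

print(_build_update_sketch("jobs", ["status"], {"submission_id": "abc", "parent": None}))
# ('UPDATE jobs SET status = ? WHERE submission_id = ? AND parent IS NULL', ['abc'])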