def determine_appropriate_source_from_hints(
    self,
    tool: Tool,
    inpid: str,
    source: Union[str, List[str], Dict[str, Union[str, List[str]]]],
) -> Optional[Union[str, List[str]]]:
    if isinstance(source, (str, list)):
        return source
    if not isinstance(source, dict):
        Logger.critical(
            f"The input to the tool '{tool.id()}'.'{inpid}' did not have the correct format for doc.source, "
            f"expected Union[str, List[str], Dict[str, Union[str, List[str]]]], received '{type(source)}'"
        )
        return None

    tishj = ", ".join(source.keys())

    if not self.source_hints:
        Logger.warn(
            f"There were no source hints specified to find an input for {tool.id()}.{inpid}, expected one "
            f"or more of {tishj}. You can specify source hints with --source-hint (in janis prepare)."
        )
        return None

    for hint in self.source_hints:
        if hint in source:
            return source[hint]

    shj = ", ".join(self.source_hints)
    Logger.warn(
        f"Couldn't find any of the specified source_hints ({shj}) in the tool input "
        f"{tool.id()}.{inpid}'s source fields ({tishj})"
    )
    return None
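# Usage sketch (illustrative, hypothetical values; not from the source): the
# dict form of doc.source maps a hint (e.g. a reference-genome build) to one
# or more locations, and the first configured hint that matches wins:
#
#   source = {
#       "hg38": "https://example.org/references/hg38/reference.fasta",
#       "hg19": "https://example.org/references/hg19/reference.fasta",
#   }
#   # with source_hints == ["hg38"]   -> returns the hg38 URL
#   # with source_hints == []         -> warns, returns None
#   # with source_hints == ["mm10"]   -> warns (no hint matched), returns None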
def query_tasks(self, status, name) -> Dict[str, WorkflowModel]:
    rows: List[TaskRow] = self.get_lazy_db_connection().get_all_tasks()

    failed = []
    relevant = {}

    for row in rows:
        if not os.path.exists(row.outputdir):
            failed.append(row.wid)
            continue
        try:
            metadb = WorkflowManager.has(
                row.outputdir, wid=row.wid, name=name, status=status
            )
            if metadb:
                model = metadb.to_model()
                model.outdir = row.outputdir
                relevant[row.wid] = model
        except Exception as e:
            Logger.critical(f"Couldn't check workflow '{row.wid}': {e}")
            failed.append(row.wid)

    if failed:
        failedstr = ", ".join(failed)
        Logger.warn(
            f"Couldn't get information for tasks: {failedstr}, run "
            f"'janis cleanup' to clean up your tasks."
        )

    return relevant
def cleanup_missing_tasks(self):
    from tabulate import tabulate

    rows: List[TaskRow] = self.get_lazy_db_connection().get_all_tasks()

    failed = []
    for row in rows:
        if not os.path.exists(row.outputdir):
            failed.append((row.wid, row.outputdir))
            continue
        try:
            _ = WorkflowManager.from_path_with_wid(
                row.outputdir, row.wid, readonly=True
            )
        except Exception:
            failed.append((row.wid, row.outputdir))

    if failed:
        Logger.warn("Removing the following tasks:\n" + tabulate(failed))
        if "y" in str(input(f"Remove {len(failed)} tasks (Y / n)? ")).lower():
            self.get_lazy_db_connection().remove_by_ids([r[0] for r in failed])
            Logger.info("Cleaned up tasks")
        else:
            Logger.info("Skipping cleaning of tasks")
def unpickle_obj(obj):
    if obj is None:
        return None
    try:
        return pickle.loads(obj)
    except Exception as ex:
        Logger.warn(f"Couldn't unpickle {repr(obj)} as encountered {repr(ex)}")
        return None
def pickle_obj(obj):
    if obj is None:
        return None
    try:
        return pickle.dumps(obj, protocol=2)
    except Exception as ex:
        Logger.warn(f"Couldn't pickle {repr(obj)} as encountered {repr(ex)}")
        return None
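# Round-trip sketch for the two helpers above (illustrative values): both
# return None instead of raising, so callers can treat failures as missing
# values.
payload = pickle_obj({"wid": "abc123", "status": "completed"})
restored = unpickle_obj(payload)  # -> {"wid": "abc123", "status": "completed"}
assert restored == {"wid": "abc123", "status": "completed"}
assert unpickle_obj(b"not-a-pickle") is None  # warns, returns None
assert pickle_obj(None) is None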
def send_slack_notification(result: Dict, option: NotificationOption):
    Logger.info("Sending notification to Slack")

    if len(result["failed"]) == 0 and not result["execution_error"]:
        failed = False
        status = "Test Succeeded"
        icon = ":white_check_mark:"
    else:
        failed = True
        status = "Test Failed"
        icon = ":x:"

    test_description = ""
    if option.test_id:
        test_description = f" *{option.test_id}*"

    summary_block = {
        "type": "section",
        "text": {
            "type": "mrkdwn",
            "text": f"{icon} {status}{test_description}: {option.tool_name} - {option.test_case}",
        },
    }
    blocks = [summary_block]

    if failed and result["failed"]:
        failed_expected_output = [
            f":black_small_square: {f}" for f in result["failed"]
        ]
        failed_block = {
            "type": "section",
            "text": {"type": "mrkdwn", "text": "\n".join(failed_expected_output)},
        }
        blocks.append(failed_block)

    if result["execution_error"]:
        execution_error_block = {
            "type": "section",
            "text": {"type": "mrkdwn", "text": result["execution_error"]},
        }
        blocks.append(execution_error_block)

    request = {"blocks": blocks}
    resp = requests.post(url=option.url, json=request)

    if resp.status_code == requests.codes.ok:
        Logger.info("Notification sent")
    else:
        Logger.warn("Failed to send slack notification")
        Logger.warn(f"{resp.status_code}: {resp.text}")

    return resp.status_code, resp.text
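# Payload sketch (hypothetical values): the 'result' dict this function expects
# and the Slack Block Kit message it builds for a failing test case.
#
#   result = {"failed": ["output 'out' did not match expected"], "execution_error": ""}
#   option = NotificationOption(
#       url="https://hooks.slack.com/services/...",  # incoming-webhook URL
#       tool_name="samtoolsflagstat",
#       test_case="basic",
#       test_id=None,
#   )
#   send_slack_notification(result, option)
#   # -> POSTs {"blocks": [summary_block, failed_block]} where the summary text
#   #    renders as ":x: Test Failed: samtoolsflagstat - basic"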
def execute(args):
    output = None
    if args.output:
        output = ast.literal_eval(args.output)

    try:
        available_test_cases = find_test_cases(args.tool)
        if args.test_case:
            if args.test_case not in available_test_cases:
                raise TestCasesNotFound(
                    f"Test case with name `{args.test_case}` NOT found."
                )
            test_cases = [args.test_case]
        else:
            test_cases = available_test_cases
    except Exception as e:
        Logger.critical("Unexpected error occurred when searching for test cases")
        Logger.critical(str(e))
        exit()

    for tc_name in test_cases:
        result = run_test_case(
            tool_id=args.tool,
            test_case=tc_name,
            engine=args.engine,
            output=output,
            config=args.config,
        )
        result["test_case"] = tc_name
        cli_logging(result)

        # send output to test framework API
        try:
            if args.test_manager_url and args.test_manager_token:
                option = UpdateStatusOption(
                    url=args.test_manager_url, token=args.test_manager_token
                )
                update_status(result, option)
        except Exception as e:
            Logger.warn(
                f"Failed to update test status to {args.test_manager_url}: {repr(e)}"
            )

        # send notification to Slack
        try:
            if args.slack_notification_url:
                option = NotificationOption(
                    url=args.slack_notification_url,
                    tool_name=args.tool,
                    test_case=tc_name,
                    test_id=args.test_id,
                )
                send_slack_notification(result=result, option=option)
        except Exception as e:
            Logger.warn(
                f"Failed to send notifications to Slack {args.slack_notification_url}: {repr(e)}"
            )
def __init__(self, host, repository, image, tag, chash: str):
    self.host = host
    self.repository = repository
    self.image = image
    self.tag = tag
    self.chash = chash

    if image is None:
        Logger.warn(f"{str(self)} didn't have an image, so defaulting to 'ubuntu'")
        self.image = "ubuntu"
def create_task_base(self, wf: Workflow, job: PreparedJob):
    forbiddenids = set()
    if job.store_in_central_db:
        try:
            with self.with_cursor() as cursor:
                forbiddenids = set(
                    t[0] for t in cursor.execute("SELECT id FROM tasks").fetchall()
                )
        except sqlite3.OperationalError as e:
            if "no such column: id" in repr(e):
                from shutil import move

                dt = datetime.utcnow()
                np = f"{job.db_path}.original-{dt.strftime('%Y%m%d')}"
                Logger.warn(f"Moving old janis-db to '{np}'")
                move(job.db_path, np)
                self._taskDB = None
                return self.create_task_base(wf=wf, job=job)
            raise

    submission_id = generate_new_id(forbiddenids)

    output_dir = fully_qualify_filename(job.output_dir)

    if not job.execution_dir:
        job.execution_dir = os.path.join(output_dir, "janis")
        Logger.debug(
            f"No execution-dir was provided, constructed one from the output-dir: {job.execution_dir}"
        )
    job.execution_dir = fully_qualify_filename(job.execution_dir)

    Logger.info(
        f"Starting task with id = '{submission_id}' | "
        f"output dir: {job.output_dir} | execution dir: {job.execution_dir}"
    )

    row = TaskRow(
        submission_id, execution_dir=job.execution_dir, output_dir=output_dir
    )
    WorkflowManager.create_dir_structure(job.execution_dir)

    if job.store_in_central_db:
        self.get_lazy_db_connection().insert_task(row)
    else:
        Logger.info(
            f"Not storing task '{submission_id}' in database. To watch, use: 'janis watch {output_dir}'"
        )

    if self._connection:
        self._connection.commit()
        self._connection.close()
    self._taskDB = None
    self._connection = None

    return row
def get_tag_and_cleanup_prefix(
    prefix,
) -> Optional[Tuple[str, str, bool, Optional[DataType]]]:
    """
    :param prefix:
    :return: (raw_element, tag, has_equals, potential_type)
    """
    # cases:
    #   -a ADAPTER
    #   --adapter=ADAPTER
    #   --quality-cutoff=[5'CUTOFF,]3'CUTOFF
    el = prefix.lstrip()
    has_equals = False
    potential_type = None

    # if the prefix is split by ':' or '=', use the first part as the tag
    # and guess the type from the second
    if ":" in el or "=" in el:
        if ":" in el:
            parts = el.split(":")
        else:
            parts = el.split("=")
            has_equals = True

        if len(parts) > 2:
            Logger.warn(
                f"Unexpected number of components in the tag '{el}' to guess the type, "
                f"using '{parts[0]}' and skipping type inference"
            )
            el = parts[0]
        else:
            el, pt = parts[0], guess_type(parts[1])
            if pt:
                potential_type = pt

    if " " in el:
        el = el.split(" ")[0]

    titleComponents = [l.strip().lower() for l in el.split("-") if l]
    if len(titleComponents) == 0:
        Logger.critical(
            f"Title components for tag '{prefix}' do not have a component"
        )
        return None
    tag = "_".join(titleComponents)

    if tag.lower() in common_replacements:
        tag = common_replacements[tag.lower()]

    if tag.lower() == "outputfilename":
        potential_type = Filename

    return el, tag, has_equals, potential_type
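# Illustrative examples for the cases listed in the comment above (the exact
# fourth element depends on guess_type's inference rules, so it's shown loosely):
#
#   get_tag_and_cleanup_prefix("-a ADAPTER")
#   # -> ("-a", "a", False, None)
#   get_tag_and_cleanup_prefix("--adapter=ADAPTER")
#   # -> ("--adapter", "adapter", True, <type guessed from "ADAPTER">)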
def get_by_id(
    self, submission_id, allow_operational_errors=True
) -> Optional[SubmissionModel]:
    s = self.get(
        where=("id = ?", [submission_id]),
        allow_operational_errors=allow_operational_errors,
    )
    if s is None:
        return None
    if len(s) != 1:
        Logger.warn(
            f"Couldn't get submission with id={submission_id}, query returned {len(s)} results."
        )
        return None

    return s[0]
def copy_outputs_if_required(self):
    if self.database.progressDB.has(ProgressKeys.copiedOutputs):
        return Logger.debug(f"Workflow '{self.wid}' has copied outputs, skipping")

    if self.database.workflowmetadata.status != TaskStatus.COMPLETED:
        return Logger.warn(
            f"Skipping copying outputs as workflow "
            f"status was not completed ({self.database.workflowmetadata.status})"
        )

    wf_outputs = self.database.outputsDB.get_all()
    engine_outputs = self.get_engine().outputs_task(self.get_engine_wid())
    eoutkeys = engine_outputs.keys()
    fs = self.environment.filescheme

    for out in wf_outputs:
        eout = engine_outputs.get(out.tag)

        if eout is None:
            Logger.warn(
                f"Couldn't find expected output with tag {out.tag}, "
                f"found outputs ({', '.join(eoutkeys)})"
            )
            continue
        originalfile, newfilepath = self.copy_output(
            fs=fs,
            outputid=out.tag,
            prefix=out.prefix,
            tag=out.tags,
            secondaries=out.secondaries,
            extension=out.extension,
            engine_output=eout,
            iscopyable=out.iscopyable,
        )

        if isinstance(originalfile, list):
            originalfile = recursively_join(originalfile, "|")

        if isinstance(newfilepath, list):
            newfilepath = recursively_join(newfilepath, "|")

        self.database.outputsDB.update_paths(
            tag=out.tag, original_path=originalfile, new_path=newfilepath
        )

    self.database.progressDB.set(ProgressKeys.copiedOutputs)
    Logger.info(f"View the task outputs: file://{self.get_task_path()}")
def get_ids(self, db_path):
    try:
        with self.with_cursor() as cursor:
            return set(
                t[0] for t in cursor.execute("SELECT id FROM tasks").fetchall()
            )
    except sqlite3.OperationalError as e:
        if "no such column: id" in repr(e):
            from shutil import move

            dt = datetime.utcnow()
            np = f"{db_path}.original-{dt.strftime('%Y%m%d')}"
            Logger.warn(f"Moving old janis-db to '{np}'")
            move(db_path, np)
            self._taskDB = None
            return self.get_ids(db_path)
        raise
def parse(container: str):
    if "/" in container:
        matched = docker_string_regex.match(container)
        if not matched:
            raise Exception(f"Invalid docker container '{container}'")
        name, tag, chash = matched.groups()
    else:
        if "@" in container or ":" in container:
            if "@" in container:
                parts = container.split("@")
            else:
                parts = container.split(":")
            if len(parts) != 2:
                # This might happen if you use a library container with a tag AND
                # a hash on dockerhub - raise an issue if this happens
                raise Exception(
                    f"Unexpected format for container: {str(container)}. If you're using a library "
                    f"container with a tag AND a hash, please raise an issue on GitHub"
                )
            name, tagorhash = parts
            if ContainerInfo.validate_docker_digest(tagorhash) is False:
                tag, chash = tagorhash, None
            else:
                tag, chash = None, tagorhash
        else:
            name, tag, chash = container, None, None

    host, repo, image = ContainerInfo.deconstruct_image_name(name)

    has_hash = chash is not None
    final_tag = None
    if not has_hash:
        final_tag = "latest" if tag is None else tag
    else:
        if ContainerInfo.validate_docker_digest(chash) is False:
            Logger.warn(
                f"Invalid format for docker hash ({chash}) in container {container}"
            )
            return False
        # final_tag = chash if tag is None else f"{tag}@{chash}"

    return ContainerInfo(
        host=host, repository=repo, image=image, tag=final_tag, chash=chash
    )
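# Parsing sketch (hypothetical references): how a few common container strings
# decompose under the branches above.
#
#   parse("ubuntu")
#   # -> no tag or hash, so tag defaults to "latest"
#   parse("quay.io/biocontainers/samtools:1.9--h8571acd_11")
#   # -> host="quay.io", repository="biocontainers", image="samtools",
#   #    tag="1.9--h8571acd_11", chash=None
#   parse("ubuntu@sha256:45b23dee08...")
#   # -> a digest-only reference: tag stays None and chash holds the digest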
def evaluate_output_selector(self, selector, inputs: dict):
    if selector is None:
        return None

    if isinstance(selector, str):
        return selector

    if isinstance(selector, list):
        return [self.evaluate_output_selector(s, inputs) for s in selector]

    if isinstance(selector, InputSelector):
        if selector.input_to_select not in inputs:
            Logger.warn(f"Couldn't find the input {selector.input_to_select}")
            return None
        return inputs[selector.input_to_select]

    raise Exception(
        f"Janis assistant cannot evaluate selecting the output from a {type(selector).__name__} type"
    )
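# Evaluation sketch (hypothetical values): plain strings pass straight through,
# while InputSelectors are resolved against the provided inputs dict.
#
#   self.evaluate_output_selector("out.bam", {})
#   # -> "out.bam"
#   self.evaluate_output_selector(InputSelector("outputFilename"),
#                                 {"outputFilename": "sample1.bam"})
#   # -> "sample1.bam"
#   self.evaluate_output_selector(InputSelector("missing"), {})
#   # -> warns, returns None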
def filter_updates(
    self, jobs: List[RunJobModel], add_inserts_to_cache=True
) -> Tuple[List[RunJobModel], List[RunJobModel]]:
    # don't call super, it'll break because of the cache
    updates = []
    inserts = []

    if len(jobs) == 0:
        return updates, inserts

    self.populate_cache_if_required()

    idkeys = set(self.get_id_keys())
    idkeys_ordered = list(idkeys)
    dbalias_map = {t.dbalias: t.name for t in self._base.keymap()}

    skipped = 0
    for job in jobs:
        el_idkey = tuple(getattr(job, dbalias_map[_k]) for _k in idkeys_ordered)

        jstatus = self._cache_completed_ids.get(el_idkey)
        if jstatus is None:
            inserts.append(job)
        elif job.status.value != jstatus:
            updates.append(job)
        elif jstatus:
            skipped += 1
        self._cache_completed_ids[el_idkey] = job.status.value

    if skipped:
        Logger.log(
            f"Skipped updating {skipped} jobs as those jobs were already in a final state"
        )

    # getsizeof returns bytes, so this is KB (not MB as previously reported)
    memory = getsizeof(self._cache_completed_ids) // 1024
    if (
        self.job_cache_last_idx < len(self.job_cache_warnings)
        and memory > self.job_cache_warnings[self.job_cache_last_idx]
    ):
        Logger.warn(f"Job cache is using {memory} KB")
        self.job_cache_last_idx += 1

    return updates, inserts
def get_file_from_searchname(name, cwd):
    if cwd == ".":
        cwd = os.getcwd()
    Logger.log(f"Searching for a file called '{name}'")
    resolved = os.path.expanduser(name)
    if os.path.exists(resolved) and os.path.isfile(resolved):
        Logger.log(f"Found file called '{name}'")
        return resolved

    Logger.log(f"Searching for file '{name}' in the cwd, '{cwd}'")
    with Path(cwd):
        if os.path.exists(name) and os.path.isfile(name):
            Logger.log(f"Found file in '{cwd}' called '{name}'")
            return os.path.join(cwd, name)

    Logger.log(
        "Attempting to get search path $JANIS_SEARCHPATH from environment variables"
    )
    search_path = os.getenv("JANIS_SEARCHPATH")
    if search_path:
        Logger.log(
            f"Got value for env JANIS_SEARCHPATH '{search_path}', searching for file '{name}' here."
        )
        if os.path.exists(search_path):
            with Path(search_path):
                if os.path.exists(name) and os.path.isfile(name):
                    Logger.log(f"Found file in '{search_path}' called '{name}'")
                    return os.path.join(search_path, name)
        else:
            Logger.warn(
                f"Search path '{search_path}' (obtained from $JANIS_SEARCHPATH) does not exist"
            )
    else:
        Logger.log(
            "Couldn't find JANIS_SEARCHPATH in environment variables, skipping"
        )

    Logger.log(
        f"Couldn't find a file with filename '{name}' in any of the following: "
        f"full path, current working directory ({cwd}) or the search path."
    )
    return None
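# Resolution order sketch: the function tries, in order, (1) the name as a
# (user-expanded) path, (2) the name relative to the cwd, (3) the name under
# $JANIS_SEARCHPATH, and returns None if all three miss. E.g. (hypothetical):
#
#   # with JANIS_SEARCHPATH=/data/janis and no ./hello.py in the cwd:
#   get_file_from_searchname("hello.py", ".")
#   # -> "/data/janis/hello.py" if that file exists, else None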
def start_engine_if_required(self):
    # engine should be loaded from the DB
    engine = self.get_engine()
    self.environment.engine = engine

    is_allegedly_started = engine.test_connection()
    if is_allegedly_started:
        return

    if not isinstance(engine, Cromwell):
        engine.start_engine()
        return

    additional_cromwell_params = []
    if not engine.config:
        Logger.info("Skipping start database as Janis is not managing the config")
    else:
        dbconfig: JanisDatabaseConfigurationHelper = (
            self.database.workflowmetadata.dbconfig
        )
        dbtype = dbconfig.which_db_to_use()
        if dbtype == dbconfig.DatabaseTypeToUse.existing:
            engine.config.database = dbconfig.get_config_for_existing_config()
        elif dbtype == dbconfig.DatabaseTypeToUse.filebased:
            engine.config.database = dbconfig.get_config_for_filebased_db(
                path=self.get_path_for_component(self.WorkflowManagerPath.database)
                + "/cromwelldb"
            )
        elif dbtype == dbconfig.DatabaseTypeToUse.managed:
            cromwelldb_config = self.start_mysql_and_prepare_cromwell_config()
            additional_cromwell_params.append(
                "-Ddatabase.db.url=" + cromwelldb_config.db.url
            )
            engine.config.database = cromwelldb_config
        else:
            Logger.warn(
                "Skipping database config as '--no-database' option was provided."
            )

    engine.start_engine(additional_cromwell_options=additional_cromwell_params)

    # write the new engine details back into the database (e.g. PID, host and is_started)
    self.database.workflowmetadata.engine = engine
def process_single_input(self, key: str, dt: DataType, value):
    if value is None:
        return None

    if isinstance(value, list):
        if not isinstance(dt, Array):
            Logger.warn(f"{key} provided a list of values, but type was not an array")
            subtype = dt
        else:
            subtype = dt.subtype()
        return [
            self.process_single_input(f"{key}.{idx}", subtype, value[idx])
            for idx in range(len(value))
        ]

    if not isinstance(value, dict):
        return value

    if dt.is_base_type((File, Directory)):
        if "path" in value:
            return value["path"]
        Logger.warn(
            f"Couldn't unwrap dictionary for input {key} ('{value}') as it didn't provide a value for 'path'"
        )
    else:
        Logger.warn(
            f"Couldn't unwrap dictionary for input {key} ('{value}') as the input isn't expected to be a file"
        )

    return value
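# Unwrapping sketch (hypothetical CWL-style values): file inputs that arrive as
# dicts with a 'path' key are reduced to plain paths, recursively for arrays.
#
#   self.process_single_input("bam", File(), {"path": "/data/sample1.bam"})
#   # -> "/data/sample1.bam"
#   self.process_single_input("bams", Array(File()), [{"path": "/data/a.bam"}])
#   # -> ["/data/a.bam"]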
def check_extensions(inpid: str, datatype: DataType, path: str):
    """
    This method only WARNS about an incorrect extension
    """
    if not isinstance(datatype, File):
        return
    if not isinstance(path, str):
        Logger.warn(
            f"Expecting string type input '{inpid}' of type File, but received '{type(path)}'"
        )
        return

    # check the extension (and in future, secondaries)
    pre_extensions = [
        datatype.extension,
        *list(datatype.alternate_extensions or []),
    ]
    extensions = {ext for ext in pre_extensions if ext is not None}

    if len(extensions) == 0:
        # skip because there's no extension to check
        return

    has_extension = False
    for ext in extensions:
        if path.endswith(ext):
            has_extension = True
            break

    if has_extension:
        # looks like we're sweet
        Logger.debug(
            f"Validated that the input for {inpid} had the expected extension for {datatype.id()}"
        )
        return

    Logger.warn(
        f"The input for '{inpid}' ({datatype.name()}) did not have the expected extension "
        f"{' OR '.join(extensions)}: {path}."
    )
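# Warning-only sketch (hypothetical Fasta datatype with extension ".fasta" and
# alternate ".fa"): a mismatched extension is reported but never fatal.
#
#   check_extensions("reference", Fasta(), "ref.fasta")  # ok, debug-logged
#   check_extensions("reference", Fasta(), "ref.fa")     # ok if ".fa" is an alternate
#   check_extensions("reference", Fasta(), "ref.txt")    # warns, execution continues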
def guess_datatype_by_filename(filename: str):
    """
    We'll try to guess which datatype a file with name 'filename' is.
    Primarily, this will look at the extension, and whether the secondary files exist
    :param filename:
    :return:
    """
    dts = JanisShed.get_all_datatypes()
    fs = FileScheme.get_type_by_prefix(filename)()
    if not isinstance(fs, LocalFileScheme):
        Logger.warn(
            f"The filescheme detected by Janis for '{filename}' was not LOCAL. This guess datatype "
            f"process may rely on polling the {fs.id()} file system to check if related files exist. "
            f"This might have some financial cost involved."
        )

    file_exists_map = {}

    # each match has a score
    matches: List[Tuple[int, File]] = []

    for datatype in dts:
        if isclass(datatype):
            if not issubclass(datatype, File):
                continue
            datatype = get_instantiated_type(datatype)
        elif not isinstance(datatype, File):
            continue
        if not datatype.extension:
            continue
        datatype: File = datatype

        extensions = {datatype.extension, *(datatype.alternate_extensions or [])}

        matching_extension = None
        for ext in extensions:
            if filename.endswith(ext):
                matching_extension = ext
                break

        secondaries_match = True

        if datatype.secondary_files():
            for secondary in datatype.secondary_files():
                secondary_filename = apply_secondary_file_format_to_filename(
                    filename, secondary
                )
                if secondary not in file_exists_map:
                    file_exists_map[secondary] = fs.exists(secondary_filename)
                if not file_exists_map[secondary]:
                    secondaries_match = False
                    break
            if secondaries_match is False:
                continue

        # we got here, so we're good
        if matching_extension is not None and secondaries_match:
            extension_reward = len(matching_extension) * EXTENSION_REWARD_MULTIPLER
            secondaries_reward = (
                len(datatype.secondary_files() or []) * SECONDARIES_REWARD_MULTIPLER
            )
            score = extension_reward + secondaries_reward

            matches.append((score, datatype))

    if len(matches) == 0:
        return None
    elif len(matches) == 1:
        return matches[0][1]

    matches = sorted(matches, key=lambda a: a[0], reverse=True)
    matched_dt = matches[0][1]
    ranked = ", ".join(f"{match[1].name()} ({match[0]})" for match in matches[1:])
    Logger.debug(
        f"There were {len(matches)} matching datatypes. Using {matched_dt.name()} ({matches[0][0]}) "
        f"as it was the best match from: {ranked}"
    )
    return matched_dt
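# Scoring sketch: a candidate datatype only qualifies if its extension matches
# and every declared secondary file exists; the score is then
#
#   len(matching_extension) * EXTENSION_REWARD_MULTIPLER
#       + len(secondary_files) * SECONDARIES_REWARD_MULTIPLER
#
# so for "sample.fasta" with an existing "sample.fasta.fai" alongside it, an
# indexed-FASTA datatype would outrank a plain FASTA one (hypothetical types).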
def localise_inputs(
    self,
    inpid: str,
    inptype: DataType,
    dest_dir: str,
    source: Union[str, List[str]],
    localise_secondary_files: bool = True,
):
    if isinstance(source, list):
        return [
            self.localise_inputs(
                inpid, inptype, dest_dir, s, localise_secondary_files
            )
            for s in source
        ]

    fs = FileScheme.get_type_by_prefix(source)()
    if isinstance(fs, LocalFileScheme):
        return source

    out_path = self.generate_file_path(source, dest_dir)
    if os.path.exists(out_path):
        Logger.info(
            f"A file already exists when localising '{inpid}' at '{out_path}'. If this isn't the right file, "
            f"you'll need to manually remove this file before proceeding"
        )
    else:
        try:
            Logger.info(f"Downloading file from {source} -> {out_path}")
            fs.cp_from(source, out_path)
        except Exception as e:
            Logger.critical(
                f"Couldn't localise source from {source} -> {out_path}: {repr(e)}"
            )
            raise

    if localise_secondary_files:
        try:
            # handle normal input types and array input types
            secondary_files = inptype.secondary_files()
            if inptype.is_array():
                secondary_files = inptype.subtype().secondary_files()

            for sec in secondary_files or []:
                sec_source = apply_secondary_file_format_to_filename(source, sec)
                out_sec_path = apply_secondary_file_format_to_filename(out_path, sec)

                if os.path.exists(out_sec_path):
                    Logger.info(
                        f"The secondary file for {inpid} ({sec}) already exists when localising "
                        f"'{inpid}' at '{out_sec_path}'. If this isn't the right file, "
                        f"you'll need to manually remove this file before proceeding"
                    )
                elif not fs.exists(sec_source):
                    Logger.warn(
                        f"Couldn't find the secondary file for {inpid}, expected at {sec_source}, "
                        f"skipping for now"
                    )
                else:
                    fs.cp_from(sec_source, out_sec_path)

        except Exception as e:
            Logger.critical(f"Couldn't localise secondary file due to: {e}")

    return out_path
def check_input_for_correctness(self, inpid: str, dt: DataType, value: Any):
    if isinstance(dt, Array):
        if isinstance(value, list):
            return [
                self.check_input_for_correctness(f"{inpid}[{idx}]", dt.subtype(), v)
                for idx, v in enumerate(value)
            ]

    if not isinstance(dt, File):
        return value
    if not isinstance(value, str):
        Logger.warn(
            f"Expecting string type input '{inpid}' for type File, but received '{type(value)}'. "
            f"Janis won't transform this value, but you should confirm your inputs."
        )
        return value

    guessed_datatype = guess_datatype_by_filename(value)
    if not guessed_datatype:
        Logger.info(
            f"Couldn't guess datatype for {value}. Returning the value instead."
        )
        return value
    if dt.can_receive_from(guessed_datatype):
        Logger.debug(f"Input '{inpid}' had a compatible type")
        return value

    message_prefix = (
        f"The value for input '{inpid}' did not match the expected type {dt.name()} "
        f"through the extension and / or existence of secondary files"
    )

    try:
        transformation = JanisShed.get_transformation_graph().find_connection(
            guessed_datatype, dt
        )
        steps = (
            "".join(t.type1.name() + " -> " for t in transformation)
            + transformation[-1].type2.name()
        )
        Logger.warn(
            message_prefix
            + f",\nJanis guessed the actual datatype for '{inpid}' from data '{value}' to be "
            f"{guessed_datatype.id()}, and Janis was able to determine a transformation in "
            f"{len(transformation)} step(s): {steps}"
        )
        wf = JanisTransformation.convert_transformations_to_workflow(transformation)
        trans = wf.translate("wdl", to_console=False)[0]
        Logger.debug(
            f"Transforming {inpid} ({guessed_datatype.name()} -> {dt.name()}): {trans}"
        )
    except Exception as e:
        Logger.warn(
            message_prefix
            + f",\nbut Janis couldn't find a transformation between the guessed and expected type:"
            f" {guessed_datatype.name()} -> {dt.name()}: {str(e)}"
        )
        return value

    # maybe do some other things with respect to the path
    try:
        return self.try_get_outputs_for(
            inpid=inpid,
            wf=wf,
            inputs={wf.tool_inputs()[0].id(): value},
            output_dir=os.path.join(self.cache_dir, inpid),
            description=f"{guessed_datatype.name()} -> {dt.name()}",
        )
    except Exception as e:
        Logger.critical(
            f"An internal error occurred when performing the transformation for {inpid} "
            f"({guessed_datatype.name()} -> {dt.name()}): {str(e)}"
        )
        Logger.debug(traceback.format_exc())
        return value
def do_bed_fasta_contig_check(tool: Tool, inputs: Dict[str, Any]):
    from janis_bioinformatics.data_types import Fasta, Bed, BedTabix

    supported_bed_types = (Bed, BedTabix)

    beds_inputs = []
    refs = []

    for i in tool.tool_inputs():
        if isinstance(i.intype, supported_bed_types) or (
            isinstance(i.intype, Array)
            and isinstance(i.intype.subtype(), supported_bed_types)
        ):
            beds_inputs.append(i)

        if (
            isinstance(i.intype, Fasta)
            and i.intype.secondary_files()
            and ".fai" in i.intype.secondary_files()
        ):
            refs.append(i)

    if len(refs) == 0:
        return
    if len(refs) > 1:
        Logger.info(
            "Skipping bioinformatics FASTA-BED file checks as there was more than 1 reference"
        )
        return

    for inp_ref in refs:
        value_ref = inputs[inp_ref.id()]
        if not value_ref:
            Logger.warn(f"Skipping '{inp_ref.id()}' as no value was provided")
            continue

        ref_contigs = ContigChecker.get_list_of_contigs_from_fastafai(
            value_ref + ".fai"
        )

        if not ref_contigs:
            Logger.debug(
                f"Didn't get any contigs from ref {value_ref}.fai, skipping..."
            )
            continue

        for inp_bed in beds_inputs:
            value_bed = inputs[inp_bed.id()]
            is_array = isinstance(value_bed, list)
            beds = value_bed if is_array else [value_bed]

            for b_idx, bed in enumerate(beds):
                bed_contigs = ContigChecker.get_list_of_contigs_from_bed(bed)

                missing_contigs = bed_contigs - ref_contigs
                if missing_contigs:
                    inpname = f"{inp_bed.id()}.{b_idx}" if is_array else inp_bed.id()
                    contiglist = (
                        ", ".join(missing_contigs)
                        if len(missing_contigs) < 5
                        else (", ".join(list(missing_contigs)[:3]) + "...")
                    )
                    Logger.warn(
                        f"The BED file '{inpname}' contained {len(missing_contigs)} contigs "
                        f"({contiglist}) that were missing from the reference: {value_ref}"
                    )
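# Check sketch (hypothetical files): if reference.fasta.fai declares contigs
# {chr1, ..., chrY} and regions.bed contains an interval on "MT", the warning
# reads roughly:
#
#   The BED file 'regions' contained 1 contigs (MT) that were missing from
#   the reference: /data/reference.fasta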
def prepare_all_tools():
    JanisShed.hydrate(modules=[janis_unix, janis_bioinformatics])

    data_types = JanisShed.get_all_datatypes()
    tools = {
        ts[0].id(): {t.version(): t for t in ts}
        for ts in JanisShed.get_all_tools()
    }

    Logger.info(f"Preparing documentation for {len(tools)} tools")
    Logger.info(f"Preparing documentation for {len(data_types)} data_types")

    tool_module_index = {}
    dt_module_index = {}
    ROOT_KEY = "root"

    if os.path.exists(tools_dir):
        rmtree(tools_dir)

    for toolname, toolsbyversion in tools.items():
        tool_versions = sort_tool_versions(list(toolsbyversion.keys()))
        default_version = tool_versions[0]
        Logger.log(
            f"Preparing {toolname}, found {len(tool_versions)} version[s] "
            f"({','.join(tool_versions)})"
        )

        defaulttool = toolsbyversion[default_version]
        if isclass(defaulttool):
            defaulttool = defaulttool()
        try:
            tool_path_components = list(
                filter(
                    lambda a: bool(a),
                    [defaulttool.tool_module(), defaulttool.tool_provider()],
                )
            )
        except Exception as e:
            Logger.critical(f"Failed to generate docs for {toolname}: {e}")
            continue

        # (toolURL, tool, isPrimary)
        toolurl_to_tool = [(toolname.lower(), defaulttool, True)] + [
            (get_tool_url(toolname, v), toolsbyversion[v], False)
            for v in tool_versions
        ]

        path_components = "/".join(tool_path_components)
        output_dir = f"{tools_dir}/{path_components}/".lower()

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for toolurl, tool, isprimary in toolurl_to_tool:
            output_str = prepare_tool(tool, tool_versions, not isprimary)
            output_filename = output_dir + toolurl + ".rst"
            if output_str is None:
                Logger.warn(f"Skipping {tool.id()}")
                continue
            with open(output_filename, "w+") as tool_file:
                tool_file.write(output_str)

        nested_keys_append_with_root(
            tool_module_index, tool_path_components, toolname, root_key=ROOT_KEY
        )

        Logger.log("Prepared " + toolname)

    for d in data_types:
        if issubclass(d, Array):
            Logger.info("Skipping Array DataType")
            continue
        try:
            dt = d()
        except Exception:
            Logger.critical(d.__name__ + " failed to instantiate")
            continue
        did = dt.name().lower()
        Logger.log("Preparing " + dt.name())
        output_str = prepare_data_type(dt)

        dt_path_components = []

        path_components = "/".join(dt_path_components)
        output_dir = f"{dt_dir}{path_components}/"
        output_filename = output_dir + did + ".rst"

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        nested_keys_append_with_root(
            dt_module_index, dt_path_components, did, root_key=ROOT_KEY
        )

        with open(output_filename, "w+") as dt_file:
            dt_file.write(output_str)

        Logger.log("Prepared " + did)

    def prepare_modules_in_index(contents, title, dir, max_depth=1):
        module_filename = dir + "/index.rst"
        module_tools = sorted(set(contents[ROOT_KEY] if ROOT_KEY in contents else []))
        submodule_keys = sorted(m for m in contents.keys() if m != ROOT_KEY)
        indexed_submodules_tools = [m.lower() for m in submodule_keys]

        with open(module_filename, "w+") as module_file:
            module_file.write(
                get_tool_toc(
                    alltoolsmap=tools,
                    title=title,
                    intro_text=f"Automatically generated index page for {title}:",
                    subpages=indexed_submodules_tools,
                    tools=module_tools,
                    max_depth=max_depth,
                )
            )

        for submodule in submodule_keys:
            prepare_modules_in_index(
                contents=contents[submodule],
                title=submodule,
                dir=f"{dir}/{submodule}/",
            )

    def prepare_dtmodules_in_index(contents, title, dir, max_depth=1):
        module_filename = dir + "/index.rst"
        module_tools = sorted(set(contents[ROOT_KEY] if ROOT_KEY in contents else []))
        submodule_keys = sorted(m for m in contents.keys() if m != ROOT_KEY)
        indexed_submodules_tools = [m.lower() + "/index" for m in submodule_keys]

        with open(module_filename, "w+") as module_file:
            module_file.write(
                get_toc(
                    title=title,
                    intro_text=f"Automatically generated index page for {title}:",
                    subpages=indexed_submodules_tools + module_tools,
                    max_depth=max_depth,
                )
            )

        for submodule in submodule_keys:
            prepare_modules_in_index(
                contents=contents[submodule],
                title=submodule,
                dir=f"{dir}/{submodule}/",
            )

    prepare_modules_in_index(tool_module_index, title="Tools", dir=tools_dir)
    prepare_dtmodules_in_index(
        dt_module_index, title="Data Types", dir=dt_dir, max_depth=1
    )
def get_workflow_from_file(file, name, include_commandtools=False):
    # How to import a module given the full path
    # https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path
    import importlib.util

    try:
        import sys

        basefilename = os.path.basename(file)

        sys.path.append(os.path.dirname(file))
        spec = importlib.util.spec_from_file_location("module.name", file)
        foo = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(foo)
        ptypes = get_janis_from_module_spec(
            foo, include_commandtools=include_commandtools, name=name
        )
    except Exception as e:
        raise Exception(
            f"Unrecognised python file when getting workflow / command tool: {file} :: {e}"
        )

    # Per https://github.com/PMCC-BioinformaticsCore/janis-core/issues/31, we'll use the following process:
    #   1. If a `name` is defined:
    #       - Force parse every token with a case-insensitive match
    #       - If a single item is returned from a case-sensitive match, then use that
    #   2. If multiple workflows are defined in the same file, use the last defined workflow
    #       - This covers the existing "if a single workflow is defined, use that" case
    #   3. If no tools were found, raise an Exception
    #   4. If multiple tools are defined in the file, use the last one:
    #       - If a name was defined, `warn` the user that the case-insensitive match returned
    #         no results and use the last one
    #       - Otherwise, just tell the user we'll use the last defined tool

    ptypes_casesensitive = [(k, v) for (k, v) in ptypes if k == name]
    if len(ptypes_casesensitive) == 1:
        return ptypes_casesensitive[0][1]

    if name is None:
        mains = [v for (k, v) in ptypes if k == "__JANIS_ENTRYPOINT"]
        if len(mains) > 0:
            Logger.debug(
                "Using workflow defined by '__JANIS_ENTRYPOINT' as no name was used"
            )
            return mains[0]

    wftypes = [
        t
        for t in ptypes
        if (
            issubclass(t[1], WorkflowBase)
            if isclass(t[1])
            else isinstance(t[1], WorkflowBase)
        )
    ]
    detected_tokens = ", ".join(
        f"'{x[0]}' ({x[1].__class__.__name__})" for x in ptypes
    )

    if len(wftypes) > 0:
        if len(wftypes) > 1:
            if name:
                Logger.warn(
                    f"Providing the `--name` parameter performs a case-insensitive search for the tokens in "
                    f"'{basefilename}', and a case-sensitive search returned no results. You had {len(wftypes)} "
                    f"tokens that matched this search. Janis will use the last one, defined as "
                    f"'{wftypes[-1][0]}' from: {detected_tokens}"
                )
            else:
                Logger.info(
                    f"Multiple workflows were found in '{basefilename}', using '{wftypes[-1][0]}'"
                )

        return wftypes[-1][1]

    if len(ptypes) == 0:
        raise Exception(
            f"There were no valid tools in '{file}', try running with the `--name YourToolName` parameter "
            f"to get more information (it might have abstract / unimplemented methods)."
        )
    if len(ptypes) > 1:
        if name:
            Logger.warn(
                f"Providing the `--name` parameter performs a case-insensitive search for the tokens in "
                f"'{basefilename}', and a case-sensitive search returned no results. You had {len(ptypes)} "
                f"tokens that matched this search. Janis will use the last one, defined as "
                f"'{ptypes[-1][0]}' from: {detected_tokens}"
            )
        else:
            Logger.info(
                f"There were multiple tools (and no workflows) detected in {basefilename}, "
                f"Janis will use '{ptypes[-1][0]}' (the last defined)"
            )

    return ptypes[-1][1]
def insert_or_update_many(self, els: List[T]):
    if len(els) == 0:
        return
    queries: Dict[str, List[List[Any]]] = {}

    idkeys = set(self.get_id_keys())
    idkeys_ordered = list(idkeys)

    dbalias_map: Dict[str, DatabaseObjectField] = {
        t.dbalias: t for t in self._base.keymap()
    }

    updates, inserts = self.filter_updates(els)

    def add_query(query, values):
        if query in queries:
            queries[query].append(values)
        else:
            queries[query] = [values]

    for job in updates:
        keys, values = job.prepare_insert()

        keys_np, values_np = [], []
        for k, v in zip(keys, values):
            if k in idkeys:
                continue
            keys_np.append(k)
            values_np.append(v)

        # we want to match on the ID fields even when they are NULL, so the
        # WHERE statement should be something like:
        #   WHERE id1 = ? AND id2 = ? AND id3 is NULL AND id4 is NULL
        id_keyvalues = {
            pkey: prep_object_for_db(
                getattr(job, dbalias_map[pkey].name),
                encode=dbalias_map[pkey].encode,
            )
            for pkey in idkeys_ordered
        }
        id_withvalues_keyvalue_ordered = [
            (idkey, idvalue)
            for idkey, idvalue in id_keyvalues.items()
            if idvalue is not None
        ]
        id_withvalues_updater_keys = [
            f"{idkey} = ?" for idkey, _ in id_withvalues_keyvalue_ordered
        ]
        id_withvalues_updater_values = [
            idvalue for _, idvalue in id_withvalues_keyvalue_ordered
        ]
        id_novalues_updater_keys = [
            f"{idkey} is NULL"
            for idkey, idvalue in id_keyvalues.items()
            if idvalue is None
        ]

        prepared_statement = f"""
        UPDATE {self._tablename}
            SET {', '.join(f'{k} = ?' for k in keys_np)}
        WHERE
            {" AND ".join([*id_withvalues_updater_keys, *id_novalues_updater_keys])}
        """
        vtuple = (
            *values_np,
            *id_withvalues_updater_values,
        )

        add_query(prepared_statement, vtuple)

    for job in inserts:
        keys, values = job.prepare_insert()
        prepared_statement = f"""
        INSERT INTO {self._tablename}
            ({', '.join(keys)})
        VALUES
            ({', '.join('?' for _ in keys)});
        """
        add_query(prepared_statement, values)

    Logger.log(
        f"DB {self._tablename}: Inserting {len(inserts)} and updating {len(updates)} rows"
    )
    with self.with_cursor() as cursor:
        start = DateUtil.now()
        if len(inserts) + len(updates) > 300:
            Logger.warn(
                f"DB '{self._tablename}' is inserting {len(inserts)} and updating "
                f"{len(updates)} rows, this might take a while"
            )
        for query, vvalues in queries.items():
            try:
                Logger.log(f"Running query: {query}\n\t: values: {vvalues}")
                cursor.executemany(query, vvalues)
            except OperationalError as e:
                Logger.log_ex(e)
        seconds = (DateUtil.now() - start).total_seconds()
        if seconds > 2:
            Logger.warn(
                f"DB '{self._tablename}' took {second_formatter(seconds)} to insert "
                f"{len(inserts)} and update {len(updates)} rows"
            )

    return True
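# Statement sketch (hypothetical table/columns): for a row whose ID fields are
# (id1='abc', id2=NULL), the generated statements look like:
#
#   UPDATE jobs
#       SET status = ?, finish = ?
#   WHERE
#       id1 = ? AND id2 is NULL
#
#   INSERT INTO jobs
#       (id1, id2, status, finish)
#   VALUES
#       (?, ?, ?, ?);
#
# Identical statements are batched per-query and run via cursor.executemany.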