class ShippingAPI:
    """Class to communicate with the tool shipping.

    Shipping is used as a unified tool for deploying in the Clinical Genomics environments.
    """

    def __init__(self, config: Dict[str, str]):
        self.config = config
        self.host_config = config["host_config"]
        self.binary_path = config["binary_path"]
        self.process = Process(
            binary=str(self.binary_path),
            config=self.host_config,
            config_parameter="--host-config",
        )
        self.dry_run = False

    def set_dry_run(self, dry_run: bool) -> None:
        """Update dry run."""
        LOG.info("Set dry run to %s", dry_run)
        self.dry_run = dry_run

    def deploy(self, app_name: str, app_config: Path = None):
        """Deploy a tool according to the specifications in the config files."""
        LOG.info("Deploying the %s software", app_name)
        deploy_args = []
        if app_config:
            LOG.info("Use app config %s", app_config)
            deploy_args.extend(["--app-config", str(app_config)])
        else:
            deploy_args.extend(["--tool-name", app_name])
        deploy_args.append("deploy")
        self.process.run_command(deploy_args, dry_run=self.dry_run)
        for line in self.process.stderr_lines():
            LOG.info(line)
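
# A minimal usage sketch for ShippingAPI; the config keys mirror __init__,
# while the paths and the tool name below are placeholders.
shipping_config = {
    "host_config": "/path/to/host_config.yml",  # placeholder path
    "binary_path": "/usr/local/bin/shipping",  # placeholder path
}
shipping_api = ShippingAPI(config=shipping_config)
shipping_api.set_dry_run(True)  # log the deploy command instead of running it
shipping_api.deploy(app_name="scout")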
from tempfile import NamedTemporaryFile, mkdtemp
from typing import List


def run(madeline_process: Process, ped_stream: List[str]):
    """Run madeline and generate a file with the results."""
    output_dir = mkdtemp()
    output_prefix = f"{output_dir}/madeline"
    out_path = f"{output_prefix}.xml"

    # write the input to a temp file
    with NamedTemporaryFile("w") as in_file:
        madeline_content = "\n".join(ped_stream)
        in_file.write(madeline_content)
        in_file.flush()
        madeline_call = [
            "--color",
            "--nolabeltruncation",
            "--outputprefix",
            output_prefix,
            in_file.name,
        ]
        madeline_process.run_command(madeline_call)

    with open(out_path, "r") as output:
        svg_content = output.read()

    # strip away the script tag; str.replace returns a new string,
    # so the result must be assigned back
    script_tag = (
        '<script type="text/javascript" xlink:href='
        '"javascript/madeline.js"></script>'
    )
    svg_content = svg_content.replace(script_tag, "")

    with open(out_path, "w") as out_handle:
        out_handle.write(svg_content)

    return out_path
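
# Hypothetical invocation of run() above; the madeline binary path is a
# placeholder, and the pedigree rows follow the column layout that
# madeline expects (see MadelineAPI.make_ped further down).
madeline_process = Process(binary="/usr/local/bin/madeline2")  # placeholder path
ped_stream = [
    "FamilyId\tIndividualId\tGender\tFather\tMother\tDeceased\tProband\tAffected",
    "fam1\tproband\tM\t.\t.\t.\tY\tY",
]
svg_path = run(madeline_process, ped_stream)
print(f"madeline output written to {svg_path}")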
from subprocess import CalledProcessError

import pytest


def test_process_run_invalid_command():
    # GIVEN a binary whose command exits with a non-zero status
    binary = "false"
    process = Process(binary=binary)

    # WHEN running the command
    with pytest.raises(CalledProcessError):
        # THEN a CalledProcessError is raised
        process.run_command()
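
def test_process_run_valid_command():
    # Companion sketch to the test above: the happy path, assuming the
    # same Process wrapper. "true" is a binary that always exits 0, so
    # run_command() should complete without raising.
    process = Process(binary="true")
    process.run_command()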
class LoqusdbAPI:
    """API for loqusdb."""

    def __init__(self, config: dict, analysis_type: str = "wgs"):
        super(LoqusdbAPI, self).__init__()
        self.analysis_type = analysis_type

        self.loqusdb_config = config["loqusdb"]["config_path"]
        self.loqusdb_binary = config["loqusdb"]["binary_path"]

        if self.analysis_type == "wes":
            self.loqusdb_config = config["loqusdb-wes"]["config_path"]
            self.loqusdb_binary = config["loqusdb-wes"]["binary_path"]

        self.process = Process(self.loqusdb_binary, self.loqusdb_config)

    def load(
        self,
        family_id: str,
        ped_path: str,
        vcf_path: str,
        gbcf_path: str,
        vcf_sv_path: str = None,
    ) -> dict:
        """Add observations from a VCF."""
        load_call_parameters = [
            "load",
            "-c",
            family_id,
            "-f",
            ped_path,
            "--variant-file",
            vcf_path,
            "--check-profile",
            gbcf_path,
            "--hard-threshold",
            "0.95",
            "--soft-threshold",
            "0.90",
        ]
        if self.analysis_type == "wgs" and vcf_sv_path:
            load_call_parameters.extend(["--sv-variants", vcf_sv_path])

        nr_variants = 0
        self.process.run_command(load_call_parameters)
        for line in self.process.stderr_lines():
            line_content = line.split("INFO")[-1].strip()
            if "inserted" in line_content:
                nr_variants = int(line_content.split(":")[-1].strip())

        return dict(variants=nr_variants)

    def get_case(self, case_id: str) -> dict:
        """Find a case in the database by case id."""
        cases_parameters = ["cases", "-c", case_id, "--to-json"]
        self.process.run_command(cases_parameters)
        output = self.process.stdout

        # If the case is not in loqusdb, stdout of the loqusdb command will be empty.
        if not output:
            raise CaseNotFoundError(f"Case {case_id} not found in loqusdb")

        case_obj = json.loads(output)[0]
        return case_obj

    def get_duplicate(self, vcf_file: str) -> dict:
        """Find matching profiles in loqusdb."""
        ind_obj = {}
        duplicates_params = [
            "profile",
            "--check-vcf",
            vcf_file,
            "--profile-threshold",
            "0.95",
        ]
        try:
            self.process.run_command(duplicates_params)
        except CalledProcessError:
            # If CalledProcessError is raised, log and re-raise
            LOG.critical("Could not run profile command")
            raise

        output = self.process.stdout
        if not output:
            LOG.info("No duplicates found")
            return ind_obj

        ind_obj = json.loads(output)
        return ind_obj

    def __repr__(self):
        return (
            f"LoqusdbAPI(binary={self.loqusdb_binary}, "
            f"config={self.loqusdb_config})"
        )
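
# Sketch of a typical lookup, assuming LoqusdbAPI and CaseNotFoundError are
# in scope and a config dict with the same keys that __init__ reads; the
# paths and case id are placeholders.
loqusdb_config = {
    "loqusdb": {
        "binary_path": "/usr/local/bin/loqusdb",  # placeholder path
        "config_path": "/path/to/loqusdb.yml",  # placeholder path
    },
}
loqusdb_api = LoqusdbAPI(config=loqusdb_config, analysis_type="wgs")
try:
    case = loqusdb_api.get_case(case_id="case_1")
    print(f"found case: {case}")
except CaseNotFoundError:
    print("case_1 has no observations in loqusdb")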
class CrunchyAPI:
    """API for cram compression.

    Compresses BAM files into CRAM by submitting sbatch jobs to SLURM.
    """

    def __init__(self, config: dict):
        self.process = Process("sbatch")
        self.slurm_account = config["crunchy"]["slurm"]["account"]
        self.crunchy_env = config["crunchy"]["slurm"]["conda_env"]
        self.mail_user = config["crunchy"]["slurm"]["mail_user"]
        self.reference_path = config["crunchy"]["cram_reference"]

    def bam_to_cram(self, bam_path: Path, ntasks: int, mem: int, dry_run: bool = False):
        """Compress a BAM file into CRAM."""
        cram_path = self.get_cram_path_from_bam(bam_path)
        job_name = bam_path.name + "_bam_to_cram"
        flag_path = self.get_flag_path(file_path=cram_path)
        pending_path = self.get_pending_path(file_path=bam_path)
        log_dir = bam_path.parent

        sbatch_header = self._get_slurm_header(
            job_name=job_name,
            account=self.slurm_account,
            log_dir=log_dir,
            mail_user=self.mail_user,
            conda_env=self.crunchy_env,
            ntasks=ntasks,
            mem=mem,
        )

        sbatch_body = self._get_slurm_bam_to_cram(
            bam_path=bam_path,
            cram_path=cram_path,
            flag_path=flag_path,
            pending_path=pending_path,
            reference_path=self.reference_path,
        )

        sbatch_content = sbatch_header + "\n" + sbatch_body
        self._submit_sbatch(sbatch_content=sbatch_content, dry_run=dry_run)

    def _submit_sbatch(self, sbatch_content: str, dry_run: bool = False):
        """Submit a slurm job."""
        if not dry_run:
            with tempfile.NamedTemporaryFile(mode="w+t") as sbatch_file:
                sbatch_file.write(sbatch_content)
                sbatch_file.flush()
                sbatch_parameters = [sbatch_file.name]
                self.process.run_command(sbatch_parameters)
                LOG.info(self.process.stderr)
                LOG.info(self.process.stdout)
        else:
            LOG.info("Would submit following to slurm:\n\n%s", sbatch_content)

    def is_cram_compression_done(self, bam_path: Path) -> bool:
        """Check if CRAM compression is already done for a BAM file."""
        cram_path = self.get_cram_path_from_bam(bam_path)
        flag_path = self.get_flag_path(file_path=cram_path)

        if not cram_path.exists():
            LOG.info("No cram-file for %s", bam_path)
            return False

        index_paths = self.get_index_path(cram_path)
        index_single_suffix = index_paths["single_suffix"]
        index_double_suffix = index_paths["double_suffix"]
        if (not index_single_suffix.exists()) and (not index_double_suffix.exists()):
            LOG.info("No index-file for %s", cram_path)
            return False

        if not flag_path.exists():
            LOG.info("No %s file for %s", FLAG_PATH_SUFFIX, cram_path)
            return False

        return True

    def is_cram_compression_pending(self, bam_path: Path) -> bool:
        """Check if cram compression has started, but not yet finished."""
        pending_path = self.get_pending_path(file_path=bam_path)
        if pending_path.exists():
            LOG.info("Cram compression is pending for %s", bam_path)
            return True
        return False

    def is_bam_compression_possible(self, bam_path: Path) -> bool:
        """Check if CRAM compression is possible for a BAM file."""
        if bam_path is None or not bam_path.exists():
            LOG.warning("Could not find bam %s", bam_path)
            return False
        if self.is_cram_compression_done(bam_path=bam_path):
            LOG.info("cram compression already exists for %s", bam_path)
            return False
        return True

    @staticmethod
    def get_flag_path(file_path):
        """Get the path to the 'finished' flag."""
        return file_path.with_suffix(FLAG_PATH_SUFFIX)

    @staticmethod
    def get_pending_path(file_path):
        """Get the path to the 'pending' flag."""
        return file_path.with_suffix(PENDING_PATH_SUFFIX)

    @staticmethod
    def get_index_path(file_path):
        """Get possible paths for the index.

        Args:
            file_path (Path): path to BAM or CRAM

        Returns (dict): path with single_suffix, e.g. .bai,
            and path with double_suffix, e.g. .bam.bai
        """
        index_type = CRAM_INDEX_SUFFIX
        if file_path.suffix == BAM_SUFFIX:
            index_type = BAM_INDEX_SUFFIX
        with_single_suffix = file_path.with_suffix(index_type)
        with_double_suffix = file_path.with_suffix(file_path.suffix + index_type)
        return {
            "single_suffix": with_single_suffix,
            "double_suffix": with_double_suffix,
        }

    @staticmethod
    def get_cram_path_from_bam(bam_path: Path) -> Path:
        """Get the corresponding CRAM file path from a BAM file path."""
        if not bam_path.suffix == BAM_SUFFIX:
            LOG.error("%s does not end with %s", bam_path, BAM_SUFFIX)
            raise ValueError(f"{bam_path} does not end with {BAM_SUFFIX}")
        cram_path = bam_path.with_suffix(CRAM_SUFFIX)
        return cram_path

    @staticmethod
    def _get_slurm_header(
        job_name: str,
        log_dir: str,
        account: str,
        mail_user: str,
        conda_env: str,
        ntasks: int,
        mem: int,
    ) -> str:
        """Fill in the sbatch header template."""
        sbatch_header = SBATCH_HEADER_TEMPLATE.format(
            job_name=job_name,
            account=account,
            log_dir=log_dir,
            conda_env=conda_env,
            mail_user=mail_user,
            ntasks=ntasks,
            mem=mem,
        )
        return sbatch_header

    @staticmethod
    def _get_slurm_bam_to_cram(
        bam_path: str,
        cram_path: str,
        flag_path: str,
        pending_path: str,
        reference_path: str,
    ) -> str:
        """Fill in the sbatch body template for bam-to-cram conversion."""
        sbatch_body = SBATCH_BAM_TO_CRAM.format(
            bam_path=bam_path,
            cram_path=cram_path,
            flag_path=flag_path,
            pending_path=pending_path,
            reference_path=reference_path,
        )
        return sbatch_body
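
# Dry-run sketch for CrunchyAPI; the config keys mirror __init__, the BAM
# path is a placeholder, and the ntasks/mem values are illustrative only.
crunchy_config = {
    "crunchy": {
        "cram_reference": "/path/to/reference.fasta",  # placeholder path
        "slurm": {
            "account": "production",  # placeholder account
            "conda_env": "crunchy_env",  # placeholder env
            "mail_user": "user@example.com",  # placeholder address
        },
    }
}
crunchy_api = CrunchyAPI(config=crunchy_config)
bam_path = Path("/path/to/sample.bam")  # placeholder path
if crunchy_api.is_bam_compression_possible(bam_path=bam_path):
    # dry_run=True logs the sbatch script instead of submitting it
    crunchy_api.bam_to_cram(bam_path=bam_path, ntasks=12, mem=50, dry_run=True)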
class ChanjoAPI:
    """Interface to Chanjo, the coverage analysis tool."""

    def __init__(self, config: dict):
        self.chanjo_config = config["chanjo"]["config_path"]
        self.chanjo_binary = config["chanjo"]["binary_path"]
        self.process = Process(self.chanjo_binary, self.chanjo_config)

    def upload(
        self,
        sample_id: str,
        sample_name: str,
        group_id: str,
        group_name: str,
        bed_file: str,
    ):
        """Upload coverage for a sample."""
        load_parameters = [
            "load",
            "--sample",
            sample_id,
            "--name",
            sample_name,
            "--group",
            group_id,
            "--group-name",
            group_name,
            "--threshold",
            "10",
            bed_file,
        ]
        self.process.run_command(load_parameters)

    def sample(self, sample_id: str) -> dict:
        """Fetch a sample from the database."""
        sample_parameters = ["db", "samples", "-s", sample_id]
        self.process.run_command(sample_parameters)
        samples = json.loads(self.process.stdout)
        for sample in samples:
            if sample["id"] == sample_id:
                return sample
        return None

    def delete_sample(self, sample_id: str):
        """Delete a sample from the database."""
        delete_parameters = ["db", "remove", sample_id]
        self.process.run_command(delete_parameters)

    def omim_coverage(self, samples: List[dict]) -> dict:
        """Calculate OMIM coverage for samples."""
        omim_parameters = ["calculate", "coverage", "--omim"]
        for sample in samples:
            omim_parameters.extend(["-s", sample["id"]])
        self.process.run_command(omim_parameters)
        data = json.loads(self.process.stdout)
        return data

    def sample_coverage(self, sample_id: str, panel_genes: list) -> dict:
        """Calculate coverage for a sample over a list of panel genes."""
        with tempfile.NamedTemporaryFile(mode="w+t") as tmp_gene_file:
            tmp_gene_file.write("\n".join([str(gene) for gene in panel_genes]))
            tmp_gene_file.flush()
            coverage_parameters = [
                "calculate",
                "coverage",
                "-s",
                sample_id,
                "-f",
                tmp_gene_file.name,
            ]
            self.process.run_command(coverage_parameters)
        data = json.loads(self.process.stdout).get(sample_id)
        return data
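
# Sketch of a coverage upload followed by a lookup, assuming the config
# layout that ChanjoAPI.__init__ reads; ids and paths are placeholders.
chanjo_api = ChanjoAPI(
    config={
        "chanjo": {
            "binary_path": "/usr/local/bin/chanjo",  # placeholder path
            "config_path": "/path/to/chanjo.yml",  # placeholder path
        }
    }
)
chanjo_api.upload(
    sample_id="sample_1",
    sample_name="Sample One",
    group_id="group_1",
    group_name="Group One",
    bed_file="/path/to/coverage.bed",  # placeholder path
)
sample = chanjo_api.sample(sample_id="sample_1")  # None if the load failed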
class GisaidAPI:
    """Interface with the GISAID CLI upload."""

    def __init__(self, config: CGConfig):
        self.housekeeper_api: HousekeeperAPI = config.housekeeper_api
        self.lims_api: LimsAPI = config.lims_api
        self.status_db: Store = config.status_db
        self.gisaid_submitter: str = config.gisaid.submitter
        self.upload_password: str = config.gisaid.upload_password
        self.upload_cid: str = config.gisaid.upload_cid
        self.gisaid_binary: str = config.gisaid.binary_path
        self.gisaid_log_dir: str = config.gisaid.log_dir
        self.log_watch: str = config.gisaid.logwatch_email
        self.email_base_settings = config.email_base_settings
        self.mutant_root_dir = Path(config.mutant.root)

        self.process = Process(binary=self.gisaid_binary)

    def get_completion_file_from_hk(self, case_id: str) -> File:
        """Find the completion file in Housekeeper and return it."""
        completion_file: Optional[File] = self.housekeeper_api.find_file_in_latest_version(
            case_id=case_id, tags=["komplettering"]
        )
        if not completion_file:
            msg = f"completion file missing for bundle {case_id}"
            raise HousekeeperFileMissingError(message=msg)
        return completion_file

    def get_completion_dataframe(self, completion_file: File) -> pd.DataFrame:
        """Read the completion file into a dataframe, drop duplicates, and return the dataframe."""
        completion_df = pd.read_csv(completion_file.full_path, index_col=None, header=0)
        completion_df.drop_duplicates(inplace=True)
        completion_df = completion_df[completion_df["provnummer"].str.contains(SARS_COV_REGEX)]
        return completion_df

    def get_gisaid_sample_list(self, case_id: str) -> List[models.Sample]:
        """Get a list of Sample objects eligible for upload.

        The criterion is that the sample reached 20x coverage for >95% of bases;
        such samples are included in the completion file.
        """
        completion_file = self.get_completion_file_from_hk(case_id=case_id)
        completion_df = self.get_completion_dataframe(completion_file=completion_file)
        sample_names = list(completion_df["provnummer"].unique())
        return [
            self.status_db.get_sample_by_name(name=sample_name)
            for sample_name in sample_names
        ]

    def get_gisaid_fasta_path(self, case_id: str) -> Path:
        """Get the path to the gisaid fasta."""
        return Path(self.mutant_root_dir, case_id, "results", f"{case_id}.fasta")

    def get_gisaid_csv_path(self, case_id: str) -> Path:
        """Get the path to the gisaid csv."""
        return Path(self.mutant_root_dir, case_id, "results", f"{case_id}.csv")

    def get_gisaid_samples(self, case_id: str) -> List[GisaidSample]:
        """Get a list of GisaidSample objects."""
        samples: List[models.Sample] = self.get_gisaid_sample_list(case_id=case_id)
        gisaid_samples = []
        for sample in samples:
            sample_id: str = sample.internal_id
            LOG.info(f"Creating GisaidSample for {sample_id}")
            gisaid_sample = GisaidSample(
                case_id=case_id,
                cg_lims_id=sample_id,
                covv_subm_sample_id=sample.name,
                submitter=self.gisaid_submitter,
                fn=f"{case_id}.fasta",
                covv_collection_date=str(
                    self.lims_api.get_sample_attribute(lims_id=sample_id, key="collection_date")
                ),
                region=self.lims_api.get_sample_attribute(lims_id=sample_id, key="region"),
                region_code=self.lims_api.get_sample_attribute(lims_id=sample_id, key="region_code"),
                covv_orig_lab=self.lims_api.get_sample_attribute(lims_id=sample_id, key="original_lab"),
                covv_orig_lab_addr=self.lims_api.get_sample_attribute(
                    lims_id=sample_id, key="original_lab_address"
                ),
            )
            gisaid_samples.append(gisaid_sample)
        return gisaid_samples

    def create_gisaid_fasta(self, gisaid_samples: List[GisaidSample], case_id: str) -> None:
        """Write a new fasta with headers adjusted for the gisaid upload."""
        gisaid_fasta_file = self.housekeeper_api.find_file_in_latest_version(
            case_id=case_id, tags=["gisaid-fasta", case_id]
        )
        if gisaid_fasta_file:
            gisaid_fasta_path = gisaid_fasta_file.full_path
        else:
            gisaid_fasta_path: Path = self.get_gisaid_fasta_path(case_id=case_id)

        fasta_lines: List[str] = []
        for sample in gisaid_samples:
            fasta_file: File = self.housekeeper_api.find_file_in_latest_version(
                case_id=case_id, tags=[sample.cg_lims_id, "consensus-sample"]
            )
            if not fasta_file:
                raise HousekeeperFileMissingError(
                    message=f"No fasta file found for sample {sample.cg_lims_id}"
                )
            with open(str(fasta_file.full_path)) as handle:
                for line in handle.readlines():
                    if line[0] == ">":
                        fasta_lines.append(f">{sample.covv_virus_name}\n")
                    else:
                        fasta_lines.append(line)

        with open(gisaid_fasta_path, "w") as write_file_obj:
            write_file_obj.writelines(fasta_lines)

        if gisaid_fasta_file:
            return

        self.housekeeper_api.add_and_include_file_to_latest_version(
            case_id=case_id, file=gisaid_fasta_path, tags=["gisaid-fasta", case_id]
        )

    def create_gisaid_csv(self, gisaid_samples: List[GisaidSample], case_id: str) -> None:
        """Create a csv file for the gisaid upload."""
        samples_df = pd.DataFrame(
            data=[gisaid_sample.dict() for gisaid_sample in gisaid_samples],
            columns=HEADERS,
        )
        gisaid_csv_file = self.housekeeper_api.find_file_in_latest_version(
            case_id=case_id, tags=["gisaid-csv", case_id]
        )
        if gisaid_csv_file:
            LOG.info(f"GISAID CSV for case {case_id} exists, will be replaced")
            gisaid_csv_path = gisaid_csv_file.full_path
        else:
            gisaid_csv_path = self.get_gisaid_csv_path(case_id=case_id)

        samples_df.to_csv(gisaid_csv_path, sep=",", index=False)

        if gisaid_csv_file:
            return

        self.housekeeper_api.add_and_include_file_to_latest_version(
            case_id=case_id, file=gisaid_csv_path, tags=["gisaid-csv", case_id]
        )

    def create_gisaid_log_file(self, case_id: str) -> None:
        """Create the gisaid bundle log in Housekeeper, if missing."""
        gisaid_log_file = self.housekeeper_api.get_files(
            bundle=case_id, tags=["gisaid-log", case_id]
        ).first()
        if gisaid_log_file:
            LOG.info("GISAID log exists in case bundle in Housekeeper")
            return

        log_file_path = Path(self.gisaid_log_dir, case_id).with_suffix(".log")
        if not log_file_path.parent.exists():
            raise ValueError(f"Gisaid log dir: {self.gisaid_log_dir} doesn't exist")
        if not log_file_path.exists():
            log_file_path.touch()
        self.housekeeper_api.add_and_include_file_to_latest_version(
            case_id=case_id, file=log_file_path, tags=["gisaid-log", case_id]
        )

    def create_gisaid_files_in_housekeeper(self, case_id: str) -> None:
        """Create all gisaid files in Housekeeper, if needed."""
        gisaid_samples = self.get_gisaid_samples(case_id=case_id)
        self.create_gisaid_csv(gisaid_samples=gisaid_samples, case_id=case_id)
        self.create_gisaid_fasta(gisaid_samples=gisaid_samples, case_id=case_id)
        self.create_gisaid_log_file(case_id=case_id)

    def authenticate_gisaid(self):
        """Authenticate against the GISAID CLI."""
        load_call: list = [
            "CoV",
            "authenticate",
            "--cid",
            self.upload_cid,
            "--user",
            self.gisaid_submitter,
            "--pass",
            self.upload_password,
        ]
        self.process.run_command(parameters=load_call)

    def upload_results_to_gisaid(self, case_id: str) -> None:
        """Load batch data to GISAID using the gisaid CLI."""
        temp_log_file = tempfile.NamedTemporaryFile(
            dir=self.gisaid_log_dir, mode="w+", delete=False
        )
        gisaid_csv_path = self.housekeeper_api.find_file_in_latest_version(
            case_id=case_id, tags=["gisaid-csv", case_id]
        ).full_path
        gisaid_fasta_path = self.housekeeper_api.find_file_in_latest_version(
            case_id=case_id, tags=["gisaid-fasta", case_id]
        ).full_path
        gisaid_log_path = (
            self.housekeeper_api.get_files(bundle=case_id, tags=["gisaid-log", case_id])
            .first()
            .full_path
        )

        self.authenticate_gisaid()
        load_call: list = [
            "--logfile",
            temp_log_file.name,
            "CoV",
            "upload",
            "--csv",
            gisaid_csv_path,
            "--fasta",
            gisaid_fasta_path,
        ]
        self.process.run_command(parameters=load_call)
        self.append_log(temp_log=Path(temp_log_file.name), gisaid_log=Path(gisaid_log_path))
        temp_log_file.close()
        if self.process.stderr:
            LOG.info(f"gisaid stderr:\n{self.process.stderr}")
        if self.process.stdout:
            LOG.info(f"gisaid stdout:\n{self.process.stdout}")

    def append_log(self, temp_log: Path, gisaid_log: Path) -> None:
        """Append the temp log to the gisaid log and delete the temp file."""
        with open(str(temp_log.absolute()), "r") as open_temp_log:
            new_log_data: List = json.load(open_temp_log)
            if gisaid_log.stat().st_size != 0:
                with open(str(gisaid_log.absolute()), "r") as open_gisaid_log:
                    old_log_data: List = json.load(open_gisaid_log)
                    new_log_data.extend(old_log_data)

        with open(str(gisaid_log.absolute()), "w") as open_gisaid_log:
            json.dump(new_log_data, open_gisaid_log)
        temp_log.unlink()

    def get_accession_numbers(self, case_id: str) -> Dict[str, str]:
        """Parse accession numbers and sample ids from the log file."""
        LOG.info("Parsing accession numbers from log file")
        accession_numbers = {}
        log_file = Path(
            self.housekeeper_api.get_files(bundle=case_id, tags=["gisaid-log", case_id])
            .first()
            .full_path
        )
        if log_file.stat().st_size != 0:
            with open(str(log_file.absolute())) as log_handle:
                log_data = json.load(log_handle)
                for log in log_data:
                    if log.get("code") == "epi_isl_id":
                        log_message = log.get("msg")
                    elif log.get("code") == "validation_error" and "existing_ids" in log.get("msg"):
                        log_message = (
                            f'{log.get("msg").split(";")[0]}; '
                            f'{re.findall(UPLOADED_REGEX_MATCH, log.get("msg"))[0]}'
                        )
                    else:
                        continue
                    accession_obj = GisaidAccession(log_message=log_message)
                    accession_numbers[accession_obj.sample_id] = accession_obj.accession_nr
        return accession_numbers

    def update_completion_file(self, case_id: str) -> None:
        """Update the completion file with accession numbers."""
        completion_file = self.get_completion_file_from_hk(case_id=case_id)
        accession_dict = self.get_accession_numbers(case_id=case_id)
        completion_df = self.get_completion_dataframe(completion_file=completion_file)
        completion_df["GISAID_accession"] = completion_df["provnummer"].apply(
            lambda x: accession_dict[x]
        )
        completion_df.to_csv(
            completion_file.full_path,
            sep=",",
            index=False,
        )

    def upload(self, case_id: str) -> None:
        """Upload results to gisaid and save the accession numbers in the completion file."""
        completion_file = self.get_completion_file_from_hk(case_id=case_id)
        completion_df = self.get_completion_dataframe(completion_file=completion_file)
        if len(completion_df["GISAID_accession"].dropna()) == len(completion_df["provnummer"]):
            LOG.info("All samples already uploaded")
            return

        self.create_gisaid_files_in_housekeeper(case_id=case_id)
        self.upload_results_to_gisaid(case_id=case_id)
        self.update_completion_file(case_id=case_id)
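
# Sketch of the top-level GISAID flow; cg_config stands in for a fully
# populated CGConfig (Housekeeper, LIMS, status_db and gisaid settings)
# and the case id is a placeholder. upload() is a no-op when every sample
# in the completion file already has an accession number.
gisaid_api = GisaidAPI(config=cg_config)  # cg_config assumed in scope
gisaid_api.upload(case_id="case_1")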
class MadelineAPI:
    """Interface to madeline, a tool to generate pedigree pictures."""

    def __init__(self, config: dict):
        self.madeline_binary = str(pathlib.Path(config["madeline_exe"]).absolute())
        self.process = Process(self.madeline_binary)

    @staticmethod
    def make_ped(family_id: str, samples: List[dict]) -> Iterable[str]:
        """Yield lines that are used as madeline input."""
        columns = {
            "family": "FamilyId",
            "sample": "IndividualId",
            "sex": "Gender",
            "father": "Father",
            "mother": "Mother",
            "deceased": "Deceased",
            "proband": "Proband",
            "status": "Affected",
        }
        sex_gender = {"male": "M", "female": "F"}
        status_affected = {"affected": "Y", "unaffected": "N"}
        LOG.info("Generating madeline input lines")

        yield "\t".join(columns.values())

        for sample in samples:
            row = [
                family_id,
                sample["sample"],
                sex_gender.get(sample["sex"]) or ".",
                sample.get("father") or ".",
                sample.get("mother") or ".",
                "Y" if sample.get("deceased") else ".",
                "Y" if sample.get("proband") else ".",
                status_affected.get(sample.get("status")) or ".",
            ]
            yield "\t".join(row)

    @staticmethod
    def strip_script_tag(content: str) -> str:
        """Strip away a script tag from a string."""
        script_tag = (
            '<script type="text/javascript" xlink:href='
            '"javascript/madeline.js"></script>'
        )
        return content.replace(script_tag, "")

    def run(self, family_id: str, samples: List[dict], out_path: str = None) -> pathlib.Path:
        """Run madeline and generate a file with the results."""
        if out_path:
            out_path = pathlib.Path(out_path)
        else:
            output_dir = pathlib.Path(tempfile.mkdtemp())
            out_path = output_dir / "madeline.xml"

        output_prefix = str(out_path.with_suffix(""))
        LOG.info("Generate madeline output to %s", out_path)

        ped_stream = self.make_ped(family_id, samples)

        # write the input to a temp file
        with tempfile.NamedTemporaryFile("w") as in_file:
            madeline_content = "\n".join(ped_stream)
            in_file.write(madeline_content)
            in_file.flush()
            madeline_call = [
                "--color",
                "--nolabeltruncation",
                "--outputprefix",
                output_prefix,
                in_file.name,
            ]
            self.process.run_command(parameters=madeline_call)

        with open(out_path, "r") as output:
            svg_content = output.read()

        svg_content = self.strip_script_tag(svg_content)

        with open(out_path, "w") as out_handle:
            out_handle.write(svg_content)

        return out_path

    def __repr__(self):
        return f"MadelineAPI({self.madeline_binary})"
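
# Usage sketch for MadelineAPI; the binary path is a placeholder and the
# sample dicts carry the keys that make_ped reads.
madeline_api = MadelineAPI(config={"madeline_exe": "/usr/local/bin/madeline2"})
trio = [
    {"sample": "proband", "sex": "male", "father": "father", "mother": "mother",
     "proband": True, "status": "affected"},
    {"sample": "father", "sex": "male", "status": "unaffected"},
    {"sample": "mother", "sex": "female", "status": "unaffected"},
]
svg_path = madeline_api.run(family_id="fam1", samples=trio)
print(f"pedigree picture written to {svg_path}")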