def download_entrez_ftp(ftp_url, output_file, attempt=1):
    """
    Stream a compressed file from a remote URL and recode it on the fly into a bgzip file.

    A failed download is retried until `ENTREZ_MAX_ATTEMPTS` attempts have been made; the final failure is
    reported through `print_error`, together with whatever the shell pipeline wrote to stderr.

    :param ftp_url: location of the remote compressed file
    :param output_file: path of the bgzip file to write
    :param attempt: number of the current attempt (incremented on every retry)
    """
    # imported locally so the block does not rely on a module-level import being present
    import shlex

    if get_smk_config().get("debug"):
        print(ftp_url)

    # set bash strict mode so a failure anywhere in the pipeline is reflected in the exit code
    strict_mode = "set -euo pipefail; "

    # stream the file from the remote server and recode on the fly into bgzip format
    # (both arguments are shell-quoted so spaces or metacharacters cannot break the command)
    cmd = f"bgzip -cd {shlex.quote(str(ftp_url))} | bgzip -c > {shlex.quote(str(output_file))}"

    # run the command, capturing stderr so it can be included in the error message
    proc = subprocess.Popen(
        strict_mode + cmd,
        shell=True,
        executable="/bin/bash",
        stderr=subprocess.PIPE,
        universal_newlines=True,
        errors="replace",
    )

    # wait for the pipeline to finish and collect its error output
    _, err = proc.communicate()

    # bail if something went wrong
    if proc.returncode != 0:
        if attempt < ENTREZ_MAX_ATTEMPTS:
            # try downloading it again
            download_entrez_ftp(ftp_url, output_file, attempt + 1)
        else:
            print_error(f"Unable to download assembly {ftp_url}\n{err}")
def __init__(self):
    """
    Parse the top-level command, load the config files and dispatch to the method named by the command.

    When the command method returns, the process exits with its return value as the exit code. A
    `ValidationError` raised by the command method is reported through `print_error` instead.
    """
    # imported locally so the block does not rely on a module-level import being present
    import sys

    # NOTE(review): the usage text below is reproduced exactly as it appears in the source; it looks like
    # it was meant to span several lines - confirm the intended layout.
    parser = argparse.ArgumentParser(
        usage="""haystac <command> [<args>] The haystac commands are: config Configuration options database Build a database of target species sample Prepare a sample for analysis analyse Analyse a sample against a database """,
    )
    parser.add_argument("command", choices=COMMANDS, help="Command to run")

    # get the CLI arguments
    args = self._parse_args(parser, level=1)

    # load the config files
    self._load_config()

    try:
        # call the class method with the name given by `command`
        reval = getattr(self, args.command)()
    except ValidationError as error:
        print_error(f"{error}")
    else:
        # `sys.exit` rather than the interactive-only `exit` helper injected by the `site` module
        sys.exit(reval)
def entrez_find_replacement_accession(accession):
    """
    Find the updated version of a superseded accession, if possible.

    A candidate accession is built by dropping the last character of `accession` and appending seven zeros.
    Its GenBank record is fetched from nuccore, and it is accepted only when the record carries the "WGS"
    keyword.

    :param accession: the accession code that could not be fetched
    :return: the accession-version string of the replacement record
    """
    # try getting a new accession for the master record
    accession_new = accession[:-1] + "0000000"

    try:
        # send a request and see if we get back an xml result
        r = entrez_request(
            "efetch.fcgi",
            {"db": "nuccore", "id": accession_new, "rettype": "gb", "retmode": "xml"},
        )
    except requests.exceptions.HTTPError:
        # NOTE(review): the code below assumes `print_error` does not return - confirm
        print_error(f"Could not find either the GenBank record for '{accession}' or an alternative accession")

    # noinspection PyUnboundLocalVariable
    etree = ElementTree.XML(r.text)

    replacement = etree.find(".//GBSeq_accession-version")

    # check that the new accession is a WGS project (empty keyword elements have `text` set to None)
    keywords = [(keyword.text or "").lower() for keyword in etree.findall(".//GBKeyword")]

    # an empty accession element is as useless as a missing one
    if replacement is not None and replacement.text and "wgs" in keywords:
        print_warning(f"Replacing the superseded WGS accession '{accession}' with '{replacement.text}'")
        return replacement.text
    else:
        print_error(
            f"Could not find either the GenBank record for '{accession}' or a valid alternative accession. "
            f"Please consider using the `--exclude-accessions` flag to remove accession '{accession}' from this query."
        )
def entrez_request(action, params=None, attempt=1):
    """
    Helper function to ensure that we never exceed the rate limit.

    Sends a POST request to `ENTREZ_URL + action`, then sleeps for `ENTREZ_WAIT_TIME` whether or not the
    request succeeded. A failed request is retried while its status code is in `ENTREZ_ERRORS` and fewer
    than `ENTREZ_MAX_ATTEMPTS` attempts have been made.

    :param action: name of the Entrez endpoint, appended to `ENTREZ_URL`
    :param params: request parameters; the mapping passed in is not modified
    :param attempt: number of the current attempt (incremented on every retry)
    :return: the successful response object
    :raises requests.exceptions.HTTPError: when the request fails and is not retried
    """
    # work on a copy so the caller's dict is not polluted with the fields added below
    params = dict(params) if params else {}

    # tell NCBI which application is making these requests
    params["tool"] = ENTREZ_TOOL
    params["email"] = ENTREZ_EMAIL

    config = get_smk_config()

    if config.get("api_key"):
        # append the user specified api_key
        params["api_key"] = config["api_key"]

    url = ENTREZ_URL + action

    # count the IDs themselves: a string holds comma separated IDs and an int is a single ID
    ids = params.get("id", [])
    if isinstance(ids, str):
        id_count = len(ids.split(","))
    elif isinstance(ids, int):
        id_count = 1
    else:
        id_count = len(ids)

    if id_count > ENTREZ_MAX_UID:
        print_error(f"List of Entrez IDs exceeds the maximum: {ENTREZ_MAX_UID}")

    if config.get("debug"):
        # turn into a get request
        print(params.items())
        get = {
            key: value if isinstance(value, (str, int)) else ",".join(str(val) for val in value)
            for key, value in params.items()
        }
        print(f"{url}?{urlencode(get)}")

    # make the request
    r = requests.post(url, params)

    # enforce the rate limit (even when the request failed)
    time.sleep(ENTREZ_WAIT_TIME)

    if not r.ok:
        if r.status_code in ENTREZ_ERRORS and attempt < ENTREZ_MAX_ATTEMPTS:
            return entrez_request(action, params, attempt + 1)
        else:
            r.raise_for_status()

    return r
def entrez_nuccore_query(query, output_file):
    """
    Run a query against the NCBI nuccore database and save the metadata of the matching sequence
    accessions as a tab delimited file.

    An error is reported through `print_error` when the search finds no records.
    """
    # run the search first
    key, webenv, id_list = entrez_esearch("nuccore", query)

    # there is nothing to fetch when the search came back empty
    if not id_list:
        print_error(f"The --query '{query}' returned no results")

    # retrieve the summaries and flatten them into a list of dicts
    records = entrez_xml_to_dict(entrez_esummary("nuccore", key, webenv))

    with open(output_file, "w") as fout:
        # the column names are taken from the first record
        writer = csv.DictWriter(fout, fieldnames=records[0].keys(), delimiter="\t")
        writer.writeheader()
        writer.writerows(records)
def entrez_range_accessions(accession, first, last):
    """
    Expand a pair of accession codes into the full list of codes between them, inclusive
    (e.g. ABC001 -> ABC005).

    `accession` is only used in the error message reported when the differing suffix is not numeric.
    """
    # both codes must have the same length and be given in ascending order
    assert len(first) == len(last) and first < last

    # everything before the first mismatching character is the shared prefix
    mismatches = [pos for pos, (left, right) in enumerate(zip(first, last)) if left != right]
    idx = mismatches[0]
    prefix = first[:idx]

    # the numeric suffix is zero padded back to its original width
    pad = len(first) - idx

    try:
        start = int(first[idx:])
        stop = int(last[idx:])
    except ValueError:
        print_error(f"Could not resolve the accession range '{first}-{last}' for master record '{accession}'")
    else:
        return [f"{prefix}{str(number).zfill(pad)}" for number in range(start, stop + 1)]
def calculate_bt2_idx_chunks(mem_resources, mem_rescale_factor, fasta_files, output_tsv, output_txt):
    """Calculate the number of chunks that the db sequences are going to be split into.

    Files are assigned to chunks in the order they are listed; a new chunk is started whenever adding the
    next file would reach the chunk size (`mem_resources / mem_rescale_factor`). File sizes are measured
    in MiB (bytes / 1024 ** 2), so the memory values are presumably expressed in MB - confirm with callers.

    :param mem_resources: available memory (number or numeric string)
    :param mem_rescale_factor: divisor applied to `mem_resources` to get the chunk size
    :param fasta_files: path of a text file listing one fasta path per line
    :param output_tsv: path of the headerless TSV of (chunk number, fasta path) rows to write
    :param output_txt: path of the text file that receives the highest chunk number
    """
    # the limits may arrive as strings, so convert them once and compare numbers with numbers
    mem_limit = float(mem_resources)
    chunk_size = mem_limit / float(mem_rescale_factor)

    # blank lines are skipped so they are never passed to os.stat()
    with open(str(fasta_files), "r") as fin:
        fasta_paths = [line.strip() for line in fin if line.strip()]

    total_size = 0.0
    chunks = 1
    chunk_rows = []

    for fasta_file in fasta_paths:
        file_size = os.stat(fasta_file).st_size / (1024 ** 2)

        if file_size >= mem_limit or file_size >= chunk_size:
            print_error(
                f"Fasta file {fasta_file} is bigger than the RAM resources provided. "
                f"Unfortunately an index cannot be built."
            )

        if total_size + file_size >= chunk_size:
            # this file does not fit any more, so it opens the next chunk
            total_size = file_size
            chunks += 1
        else:
            total_size += file_size

        chunk_rows.append([chunks, fasta_file])

    chunk_df = pd.DataFrame(chunk_rows, columns=["chunk", "path"])
    chunk_df.to_csv(output_tsv, sep="\t", index=False, header=False)

    idx_chunk_total = chunk_df["chunk"].max()

    with open(output_txt, "w") as outfile:
        print(idx_chunk_total, file=outfile)
def _run_snakemake(self, module, args, config, target_list):
    """
    Print a summary of the run, save the run-time config and execute the snakemake workflow.

    Returns 0 when snakemake reports success and 1 otherwise, for use as a shell exit code.
    """
    debug = args.debug

    print(f"HAYSTAC v {__version__}\n")
    print(f"Date: {datetime.datetime.now()}\n")

    config["module"] = module
    config["workflow_dir"] = os.path.join(CODE_DIR, "workflow")

    # in debug mode every config entry is listed, otherwise only the CLI arguments with a truthy value
    print("Config parameters:\n")
    shown = config if debug else vars(args)
    for name, setting in shown.items():
        if setting or debug:
            print(f" {name}: {setting}")
    print("\n")

    if debug:
        print("Target files:\n")
        for target in target_list:
            print(" " + target)
        print("\n")

    try:
        os.makedirs(SNAKE_DIR, exist_ok=True)
    except PermissionError:
        print_error("Cannot write to the current working directory.")

    # the run-time config is written before the snakemake/batch entries are removed below
    with open(CONFIG_RUNTIME, "w") as fout:
        yaml.safe_dump(config, fout, default_flow_style=False)

    # extra parameters given via the --snakemake flag
    smk_params = config.pop("snakemake") or {}

    # convert any rule target for batching into a compatible object and drop it from the config
    batch_spec = config.get("batch")
    if batch_spec:
        exec_batch = Batch(batch_spec[0], batch_spec[1], batch_spec[2])
        config.pop("batch")
    else:
        exec_batch = None

    options = {
        "snakefile": os.path.join(CODE_DIR, "workflow/workflow.smk"),
        "config": config,
        "targets": target_list,
        "batch": exec_batch,
        "cores": int(args.cores),
        "resources": {"entrez_api": self.max_entrez_requests, "mem_mb": int(args.mem)},
        "force_incomplete": True,
        "scheduler": "greedy",
        # rule-specific conda environments
        "use_conda": config["use_conda"],
        "conda_frontend": "mamba",
        "conda_prefix": os.path.join(config["cache"], "conda") if config["use_conda"] else None,
        # debugging flags
        "printreason": debug,
        "printshellcmds": debug,
        "show_failed_logs": debug,
        "verbose": debug,
        "keep_incomplete": debug,
        "restart_times": 0 if debug else RESTART_TIMES,
        "keepgoing": (not debug),
        "unlock": args.unlock,
    }

    # a key repeated in the --snakemake parameters raises a TypeError, as with explicit keywords
    success = snakemake.snakemake(**options, **smk_params)

    # remove the snakemake metadata after a successful run that left no locks behind
    if success and not os.listdir(os.path.join(SNAKE_DIR, "locks")):
        shutil.rmtree(SNAKE_DIR)

    # translate "success" into a shell exit code
    if success:
        return 0
    return 1
def entrez_assembly_ftp(accession, force=False):
    """
    Get an NCBI ftp url from the assembly database.

    :param accession: accession code to look up in the assembly database
    :param force: when True, pick the largest assembly ID if several are found and only warn about
        assemblies marked as anomalous; when False both situations are reported through `print_error`
    :return: url of the `_genomic.fna.gz` file of the assembly, or "" when there is no assembly record
        or the record has neither a RefSeq nor a GenBank ftp path
    """
    # find out if we are looking for a virus to apply the correct filter
    filter_condition = ' AND "latest"[filter] NOT suppressed*'

    config = get_smk_config()

    if config.get("refseq_rep") == "viruses":
        # append the virus filter
        filter_condition += " AND viruses[filter] "

    # query the assembly database to get the latest assembly for this accession code
    _, _, id_list = entrez_esearch("assembly", accession + filter_condition)

    if len(id_list) > 1:
        # should never happen, but...
        msg = f"Multiple assembly accessions found for '{accession}': {id_list}. "

        if force:
            # pick the largest int value, assuming it is also the latest
            newest_id = max(int(id_num) for id_num in id_list)
            msg += f"Using assembly: id pair '{accession}': '{newest_id}'"
            id_list = [str(newest_id)]
            print_warning(msg)
        else:
            msg += (
                f"Either consider using the `--force-accessions` flag for the largest ID to be picked, "
                f"or the `--exclude-accessions` flag to remove accession '{accession}' from this query."
            )
            print_error(msg)

    elif len(id_list) == 0:
        # no entry in the assembly database for this accession code
        return ""

    # fetch the summary record for the assembly
    r = entrez_request("esummary.fcgi", {"db": "assembly", "id": id_list})

    # parse the XML result
    etree = ElementTree.XML(r.text)

    # check if the assembly is anomalous
    anomalous = [reason.text for reason in etree.findall(".//Anomalous/Property")]

    if anomalous:
        message = (
            f"Assembly '{accession}' has been marked as anomalous for the following reasons: "
            f"'{'; '.join(anomalous)}'"
        )
        if force:
            print_warning(message)
        else:
            print_error(message)

    refseq = etree.find(".//FtpPath_RefSeq")
    genbank = etree.find(".//FtpPath_GenBank")

    # preference RefSeq URLs over GenBank URLs
    if refseq is not None and refseq.text:
        ftp_stub = refseq.text
    elif genbank is not None and genbank.text:
        ftp_stub = genbank.text
    else:
        return ""

    # append the fasta filename; URLs always use "/" whatever the local OS path separator is, and a
    # trailing slash on the stub must not leave the filename without its assembly-name prefix
    ftp_stub = ftp_stub.rstrip("/")
    assembly_name = ftp_stub.rsplit("/", 1)[-1]

    return f"{ftp_stub}/{assembly_name}_genomic.fna.gz"
def entrez_download_sequence(accession, output_file, force=False, mtdna=False):
    """
    Fetch the Entrez fasta record for a nuccore accession.

    The sources are tried in this order:
      1. the assembly FTP url (skipped when `mtdna` is True)
      2. the fasta record of the accession in nuccore
      3. the range of accessions listed in the GenBank record of a master record
    If the GenBank record cannot be fetched either, the download is repeated with a replacement accession.

    :param accession: nuccore accession code to download
    :param output_file: path of the bgzip file to write
    :param force: passed on to `entrez_assembly_ftp`
    :param mtdna: when True the assembly database is not consulted
    """
    # query the assembly database to see if there is an FTP url we can use
    ftp_url = entrez_assembly_ftp(accession, force) if not mtdna else ""

    try:
        if ftp_url:
            download_entrez_ftp(ftp_url, output_file)
            return
    except urllib.error.URLError:
        pass

    try:
        # fetch the fasta record from nuccore
        r = entrez_request(
            "efetch.fcgi",
            {"db": "nuccore", "id": accession, "rettype": "fasta", "retmode": "text"},
        )

        # the fasta may be empty if this is a "master record" containing multiple other records (NZ_APLR00000000.1)
        if len(r.text.strip()) > 1:
            with bgzf.open(output_file, "w") as fout:
                print(r.text, file=fout)
            return
    except requests.exceptions.HTTPError:
        pass

    try:
        # get the full GenBank XML record
        r = entrez_request(
            "efetch.fcgi",
            {"db": "nuccore", "id": accession, "rettype": "gb", "retmode": "xml"},
        )
    except requests.exceptions.HTTPError:
        # check for a replacement accession (there may be a newer version if this a WGS project)
        updated_accession = entrez_find_replacement_accession(accession)

        # download the updated accession instead, with the same options as the original request
        entrez_download_sequence(updated_accession, output_file, force, mtdna)
        return

    # parse the XML result
    etree = ElementTree.XML(r.text)

    # get the first and last accession codes for this master record
    first = etree.find(".//GBAltSeqItem_first-accn")
    last = etree.find(".//GBAltSeqItem_last-accn")

    if first is None or last is None:
        print_error(
            f"Could not download the fasta file for {accession}. Please consider using the `--exclude-accessions` "
            f"flag to remove accession '{accession}' from this query."
        )

    # get all the related accession codes
    accessions = entrez_range_accessions(accession, first.text, last.text)

    try:
        with bgzf.open(output_file, "w") as fout:
            # fetch all the accessions in batches
            for id_list in chunker(accessions, ENTREZ_MAX_UID):
                r = entrez_request(
                    "efetch.fcgi",
                    {"db": "nuccore", "id": id_list, "rettype": "fasta", "retmode": "text"},
                )

                # write the fasta data to our bgzip file
                print(r.text, file=fout)

    except requests.exceptions.HTTPError:
        print_error(
            f"Could not download the accession range '{first.text}-{last.text}' for master record '{accession}'. "
            f"Please consider using the `--exclude-accessions` flag to remove accession '{accession}' from this query."
        )
def entrez_refseq_virus_create_files(
    config,
    input_file,
    viral_genomes_out,
):
    """Function to parse the refseq genomes report for viruses.

    Keeps the first row of every organism name, assigns it the accession of its first listed segment,
    drops organisms without an accession as well as species already supplied by the user, and writes
    the remaining (species, AccessionVersion) pairs as a tab delimited file.

    :param config: mapping that may hold the paths of the user supplied "sequences" and "accessions" files
    :param input_file: path of the tab delimited genomes report
    :param viral_genomes_out: path of the tab delimited file to write
    """
    # read the file
    refseq_viruses = pd.read_csv(input_file, sep="\t")

    # drop duplicate species/strains
    # ("Segmemts" is the column name this code reads from the report - do not "correct" the spelling)
    is_duplicate = refseq_viruses["#Organism/Name"].duplicated(keep="first")
    viruses = refseq_viruses.loc[~is_duplicate, ["#Organism/Name", "Segmemts"]].copy()

    # assign a segment accession code to a species.
    # 1 segment acc should be enough as all of them will be fetched through the assembly database.
    # (built as a whole column: values written to the rows yielded by iterrows() may never reach the frame)
    viruses["Accession"] = [_first_segment_accession(segments) for segments in viruses["Segmemts"]]

    # rename columns
    viruses = viruses[["#Organism/Name", "Accession"]].rename(
        columns={"#Organism/Name": "species", "Accession": "AccessionVersion"}
    )

    # drop rows that have no accessions
    viruses = viruses[viruses["AccessionVersion"] != ""].copy()

    # regex for species name
    viruses["species"] = viruses["species"].replace(REGEX_BLACKLIST, "_", regex=True)

    # check for duplicates from user input (either file may be unset or missing)
    user_files = (
        ("sequences", ["species", "accession", "path"]),
        ("accessions", ["species", "accession"]),
    )
    for key, columns in user_files:
        path = config.get(key)
        if path and os.path.isfile(path):
            user_df = pd.read_csv(path, sep="\t", header=None, names=columns)
            viruses = viruses[~viruses["species"].isin(user_df["species"])]

    # print the output to csv
    header = ["species", "AccessionVersion"]
    viruses.to_csv(viral_genomes_out, sep="\t", header=header, index=False)


def _first_segment_accession(segments):
    """Return the accession of the first entry of a "; " separated segment list, or "" if there is none."""
    if not isinstance(segments, str):
        # missing values are read by pandas as NaN
        return ""

    first = segments.split("; ")[0]

    if first == "-":
        # placeholder used when no accession is listed
        return ""

    if ":" in first:
        # drop the label in front of the accession
        first = first.split(":")[1]

    # keep only the part before any "/"
    return first.split("/")[0]