def download_entrez_ftp(ftp_url, output_file, attempt=1):
    """
    Stream the file from the remote FTP server, decompress it, and re-compress it into our bgzip output file
    """
    if get_smk_config().get("debug"):
        print(ftp_url)

    # set bash strict mode in case the bgzip command fails
    strict_mode = "set -euo pipefail; "

    # stream the file from the remote FTP server and recode on the fly into bgzip format
    cmd = f"bgzip -cd '{ftp_url}' | bgzip -c > {output_file}"

    # run the command
    proc = subprocess.Popen(strict_mode + cmd, shell=True, executable="/bin/bash", stderr=subprocess.PIPE)

    # fetch any output and error
    (out, err) = proc.communicate()

    # bail if something went wrong
    if proc.returncode != 0:
        if attempt < ENTREZ_MAX_ATTEMPTS:
            # try downloading it again
            return download_entrez_ftp(ftp_url, output_file, attempt + 1)
        else:
            print_error(f"Unable to download assembly {ftp_url}\n{err.decode()}")
Example #2
    def __init__(self):
        parser = argparse.ArgumentParser(usage="""haystac <command> [<args>]

The haystac commands are:
   config         Configuration options
   database       Build a database of target species
   sample         Prepare a sample for analysis
   analyse        Analyse a sample against a database
   
""", )
        parser.add_argument("command", choices=COMMANDS, help="Command to run")

        # get the CLI arguments
        args = self._parse_args(parser, level=1)

        # load the config files
        self._load_config()

        try:
            # call the class method with the name given by `command`
            retval = getattr(self, args.command)()
            exit(retval)

        except ValidationError as error:
            print_error(f"{error}")
Example #3
def entrez_find_replacement_accession(accession):
    """
    Find the updated version of an assembly accession, if possible.
    """

    # try getting a new accession for the master record
    accession_new = accession[:-1] + "0000000"

    try:
        # send a request and see if we get back an xml result
        r = entrez_request(
            "efetch.fcgi",
            {
                "db": "nuccore",
                "id": accession_new,
                "rettype": "gb",
                "retmode": "xml"
            },
        )

    except requests.exceptions.HTTPError:
        print_error(
            f"Could not find either the GenBank record for '{accession}' or an alternative accession"
        )

    # noinspection PyUnboundLocalVariable
    etree = ElementTree.XML(r.text)
    replacement = etree.find(".//GBSeq_accession-version")

    # check that the new accession is a WGS project
    keywords = [
        keyword.text.lower() for keyword in etree.findall(".//GBKeyword")
    ]

    if replacement is not None and "wgs" in keywords:
        print_warning(
            f"Replacing the superseded WGS accession '{accession}' with '{replacement.text}'"
        )
        return replacement.text
    else:
        print_error(
            f"Could not find either the GenBank record for '{accession}' or a valid alternative accession. "
            f"Please consider using the `--exclude-accessions` flag to remove accession '{accession}' from this query."
        )
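A hedged usage sketch; the accession below is purely illustrative, and the call assumes the module helpers used above (entrez_request, print_warning, print_error) are importable.

# look for a replacement accession for an illustrative superseded WGS record
replacement = entrez_find_replacement_accession("ABCD01000000")
print(replacement)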
Example #4
def entrez_request(action, params=None, attempt=1):
    """
    Helper function to ensure that we never exceed the rate limit.
    """
    params = params or dict()

    # tell NCBI which application is making these requests
    params["tool"] = ENTREZ_TOOL
    params["email"] = ENTREZ_EMAIL

    config = get_smk_config()

    if config.get("api_key"):
        # append the user specified api_key
        params["api_key"] = config["api_key"]

    url = ENTREZ_URL + action

    if len(params.get("id", [])) > ENTREZ_MAX_UID:
        print_error(
            f"List of Entrez IDs exceeds the maximum: {ENTREZ_MAX_UID}")

    if config.get("debug"):
        # turn into a get request
        print(params.items())
        get = dict((key, value if isinstance(value, (str, int)) else ",".join(
            str(val) for val in value)) for key, value in params.items())
        print(f"{url}?{urlencode(get)}")

    # make the request
    r = requests.post(url, params)

    # enforce the rate limit (even when the request failed)
    time.sleep(ENTREZ_WAIT_TIME)

    if not r.ok:
        if r.status_code in ENTREZ_ERRORS and attempt < ENTREZ_MAX_ATTEMPTS:
            return entrez_request(action, params, attempt + 1)
        else:
            r.raise_for_status()

    return r
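A minimal call sketch, mirroring how this helper is used elsewhere in the module; the accession is illustrative only.

# fetch a fasta record from nuccore for an illustrative accession
r = entrez_request(
    "efetch.fcgi",
    {"db": "nuccore", "id": "NC_000913.3", "rettype": "fasta", "retmode": "text"},
)
print(r.text[:80])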
Example #5
def entrez_nuccore_query(query, output_file):
    """
    Query the NCBI nuccore database to get a list of sequence accessions and their metadata.
    """
    # execute the search
    key, webenv, id_list = entrez_esearch("nuccore", query)

    # check that there was at least one record found
    if len(id_list) == 0:
        print_error(f"The --query '{query}' returned no results")

    # fetch the results
    etree = entrez_esummary("nuccore", key, webenv)

    # convert the ElementTree into a list of dicts
    data = entrez_xml_to_dict(etree)

    with open(output_file, "w") as fout:
        w = csv.DictWriter(fout, data[0].keys(), delimiter="\t")
        w.writeheader()
        w.writerows(data)
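A usage sketch, assuming the esearch/esummary helpers used above are importable; the query string and output path are illustrative placeholders.

# write a TSV of nuccore metadata for an illustrative query
entrez_nuccore_query('"Yersinia pestis"[Organism] AND complete genome[Title]', "nuccore_metadata.tsv")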
Example #6
def entrez_range_accessions(accession, first, last):
    """
    Return a range between two accession codes (e.g. ABC001 -> ABC005)
    """
    # sanity check that the items are equal length and in order
    assert len(first) == len(last) and first < last

    # find the index of the first difference
    idx = [i for i in range(len(first)) if first[i] != last[i]][0]
    pad = len(first) - idx

    try:
        # return the range
        return [
            f"{first[:idx]}{str(item).zfill(pad)}"
            for item in range(int(first[idx:]),
                              int(last[idx:]) + 1)
        ]
    except ValueError:
        print_error(
            f"Could not resolve the accession range '{first}-{last}' for master record '{accession}'"
        )
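A worked example of the range expansion, using the accession pair from the docstring (the master accession argument is only used for error reporting here):

# expand the contiguous range between the first and last accession codes
print(entrez_range_accessions("ABC000000", "ABC001", "ABC005"))
# ['ABC001', 'ABC002', 'ABC003', 'ABC004', 'ABC005']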
def calculate_bt2_idx_chunks(mem_resources, mem_rescale_factor, fasta_files, output_tsv, output_txt):
    """Calculate the number of chunks that the db sequences are going to be split into"""

    fasta_paths_random = []
    with open(str(fasta_files), "r") as fin:
        for line in fin:
            fasta_paths_random.append(line.strip())

    chunk_size = float(mem_resources) / float(mem_rescale_factor)
    total_size = 0.0
    chunks = 1

    chunk_df_list = []

    for fasta_file in fasta_paths_random:
        file_size = os.stat(fasta_file).st_size / (1024 ** 2)

        if file_size >= mem_resources or file_size >= chunk_size:
            print_error(
                f"Fasta file {fasta_file} is bigger than the RAM resources provided. "
                f"Unfortunately an index cannot be built."
            )

        if total_size + file_size >= chunk_size:
            total_size = file_size
            chunks += 1
        else:
            total_size += file_size

        chunk_df_list.append([chunks, fasta_file])

    chunk_df = pd.DataFrame(chunk_df_list, columns=["chunk", "path"])

    chunk_df.to_csv(output_tsv, sep="\t", index=False, header=False)

    idx_chunk_total = chunk_df["chunk"].max()

    with open(output_txt, "w") as outfile:
        print(idx_chunk_total, file=outfile)
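A hedged invocation sketch; the memory limit, rescale factor, and file paths below are hypothetical placeholders.

# split the database fasta files into chunks that fit within ~8000 MB of RAM
calculate_bt2_idx_chunks(
    mem_resources=8000,
    mem_rescale_factor=3,
    fasta_files="db_fasta_paths.txt",  # one fasta path per line (placeholder)
    output_tsv="bt2_idx_chunks.tsv",
    output_txt="bt2_idx_chunk_total.txt",
)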
Example #8
    def _run_snakemake(self, module, args, config, target_list):
        """
        Helper function for running the snakemake workflow
        """
        print(f"HAYSTAC v {__version__}\n")
        print(f"Date: {datetime.datetime.now()}\n")

        config["module"] = module
        config["workflow_dir"] = os.path.join(CODE_DIR, "workflow")

        print("Config parameters:\n")
        params = config if args.debug else vars(args)

        for key, value in params.items():
            if value or args.debug:
                print(f" {key}: {value}")
        print("\n")

        if args.debug:
            print("Target files:\n")
            for target in target_list:
                print(" " + target)
            print("\n")

        try:
            os.makedirs(SNAKE_DIR, exist_ok=True)
        except PermissionError:
            print_error("Cannot write to the current working directory.")

        # save the run-time config file
        with open(CONFIG_RUNTIME, "w") as fout:
            yaml.safe_dump(config, fout, default_flow_style=False)

        # get any extra snakemake params
        smk_params = config.pop("snakemake") or {}

        # get any rule targets for batching and convert them into a compatible object
        exec_batch = None
        if config.get("batch"):
            batch = config.pop("batch")
            exec_batch = Batch(batch[0], batch[1], batch[2])

        success = snakemake.snakemake(
            snakefile=os.path.join(CODE_DIR, "workflow/workflow.smk"),
            config=config,
            targets=target_list,
            batch=exec_batch,
            cores=int(args.cores),
            resources={
                "entrez_api": self.max_entrez_requests,
                "mem_mb": int(args.mem)
            },
            force_incomplete=True,
            scheduler="greedy",
            # handle the rule-specific conda environments
            use_conda=config["use_conda"],
            conda_frontend="mamba",
            conda_prefix=os.path.join(config["cache"], "conda")
            if config["use_conda"] else None,
            # set all the debugging flags
            printreason=args.debug,
            printshellcmds=args.debug,
            show_failed_logs=args.debug,
            verbose=args.debug,
            keep_incomplete=args.debug,
            restart_times=0 if args.debug else RESTART_TIMES,
            keepgoing=(not args.debug),
            unlock=args.unlock,
            # pass on any CLI arguments from the --snakemake flag
            **smk_params,
        )

        # tidy up all the snakemake metadata
        if success and (len(os.listdir(os.path.join(SNAKE_DIR, "locks")))
                        == 0):
            shutil.rmtree(SNAKE_DIR)

        # translate "success" into shell exit code of 0
        return 0 if success else 1
Example #9
def entrez_assembly_ftp(accession, force=False):
    """
    Get an NCBI ftp url from the assembly database.
    """

    # find out if we are looking for a virus to apply the correct filter
    filter_condition = ' AND "latest"[filter] NOT suppressed*'

    config = get_smk_config()

    if config.get("refseq_rep") and config["refseq_rep"] == "viruses":
        # append the virus filter
        filter_condition += " AND viruses[filter] "

    # query the assembly database to get the latest assembly for this accession code
    key, webenv, id_list = entrez_esearch(
        "assembly",
        accession + filter_condition,
    )

    if len(id_list) > 1:
        # should never happen, but...
        msg = f"Multiple assembly accessions found for '{accession}': {id_list}. "

        # if --force-accessions is set, pick the largest ID value, assuming it is also the latest
        if force:
            msg += f"Using assembly: id pair '{accession}': '{max([int(id_num) for id_num in id_list])}'"
            id_list = [str(max([int(id_num) for id_num in id_list]))]
            print_warning(msg)

        # if not raise an error
        else:
            msg += (
                f"Either consider using the `--force-accessions` flag for the largest ID to be picked, "
                f"or the `--exclude-accessions` flag to remove accession '{accession}' from this query."
            )
            print_error(msg)

    elif len(id_list) == 0:
        # no entry in the assembly database for this accession code
        return ""

    # fetch the summary record for the assembly
    r = entrez_request("esummary.fcgi", {"db": "assembly", "id": id_list})

    # parse the XML result
    etree = ElementTree.XML(r.text)

    # check if the assembly is anomalous
    anomalous = [
        reason.text for reason in etree.findall(".//Anomalous/Property")
    ]

    if anomalous:
        message = (
            f"Assembly '{accession}' has been marked as anomalous for the following reasons: "
            f"'{'; '.join(anomalous)}'")

        if force:
            print_warning(message)
        else:
            print_error(message)

    refseq = etree.find(".//FtpPath_RefSeq")
    genbank = etree.find(".//FtpPath_GenBank")

    # prefer RefSeq URLs over GenBank URLs
    if refseq is not None and refseq.text not in ["", None]:
        ftp_stub = refseq.text
    elif genbank is not None and genbank.text not in ["", None]:
        ftp_stub = genbank.text
    else:
        return ""

    # append the fasta filename
    ftp_url = os.path.join(ftp_stub,
                           os.path.basename(ftp_stub) + "_genomic.fna.gz")

    return ftp_url
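A minimal usage sketch (the accession is illustrative); an empty string is returned when the assembly database has no usable FTP entry for the accession.

# look up the genomic fasta URL for an illustrative accession
ftp_url = entrez_assembly_ftp("NZ_CP012345.1")
if ftp_url:
    print(ftp_url)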
def entrez_download_sequence(accession, output_file, force=False, mtdna=False):
    """
    Fetch the Entrez fasta record for a nuccore accession.
    """

    # query the assembly database to see if there is an FTP url we can use
    ftp_url = entrez_assembly_ftp(accession, force) if not mtdna else ""

    try:
        if ftp_url:
            download_entrez_ftp(ftp_url, output_file)
            return

    except urllib.error.URLError:
        pass

    try:
        # fetch the fasta record from nuccore
        r = entrez_request("efetch.fcgi", {"db": "nuccore", "id": accession, "rettype": "fasta", "retmode": "text"})

        # the fasta may be empty if this is a "master record" containing multiple other records (NZ_APLR00000000.1)
        if len(r.text.strip()) > 1:
            with bgzf.open(output_file, "w") as fout:
                print(r.text, file=fout)

            return

    except requests.exceptions.HTTPError:
        pass

    try:
        # get the full GenBank XML record
        r = entrez_request("efetch.fcgi", {"db": "nuccore", "id": accession, "rettype": "gb", "retmode": "xml"})

    except requests.exceptions.HTTPError:
        # check for a replacement accession (there may be a newer version if this a WGS project)
        updated_accession = entrez_find_replacement_accession(accession)

        # download the updated accession instead
        entrez_download_sequence(updated_accession, output_file, force)

        return

    # parse the XML result
    etree = ElementTree.XML(r.text)

    # get the first and last accession codes for this master record
    first = etree.find(".//GBAltSeqItem_first-accn")
    last = etree.find(".//GBAltSeqItem_last-accn")

    if first is None or last is None:
        print_error(
            f"Could not download the fasta file for {accession}. Please consider using the `--exclude-accessions` "
            f"flag to remove accession '{accession}' from this query."
        )

    # get all the related accession codes
    accessions = entrez_range_accessions(accession, first.text, last.text)

    try:
        with bgzf.open(output_file, "w") as fout:
            # fetch all the accessions in batches
            for id_list in chunker(accessions, ENTREZ_MAX_UID):
                r = entrez_request(
                    "efetch.fcgi",
                    {"db": "nuccore", "id": id_list, "rettype": "fasta", "retmode": "text"},
                )

                # write the fasta data to our bgzip file
                print(r.text, file=fout)

    except requests.exceptions.HTTPError:
        print_error(
            f"Could not download the accession range '{first.text}-{last.text}' for master record '{accession}'. "
            f"Please consider using the `--exclude-accessions` flag to remove accession '{accession}' from this query."
        )
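A usage sketch that ties the pieces together; the output path is a placeholder and the accession is the master record mentioned in the comments above.

# download a master record into a bgzip-compressed fasta (paths are placeholders)
entrez_download_sequence("NZ_APLR00000000.1", "NZ_APLR00000000.fasta.gz", force=False)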
Example #11
def entrez_refseq_virus_create_files(
    config,
    input_file,
    viral_genomes_out,
):
    """Function to parse the refseq genomes report for viruses."""

    # read the file

    refseq_viruses = pd.read_csv(input_file, sep="\t")

    # drop duplicate species/strains

    refseq_viruses_rmdup = refseq_viruses[~refseq_viruses["#Organism/Name"].duplicated(keep="first")]

    viruses_unique = refseq_viruses_rmdup[["#Organism/Name", "Segmemts"]].copy()
    viruses_unique["Accession"] = ""

    # assign a segment accession code to a species.
    # 1 segment acc should be enough as all of them will be fetched through the assembly database.

    for index, row in viruses_unique.iterrows():

        seq_list = row["Segmemts"].split("; ")

        # note: assign via .at[] because the `row` returned by iterrows() is a
        # copy, so writing to it would not modify the underlying dataframe
        if len(seq_list) == 1 and ":" not in seq_list[0]:
            if "/" in seq_list[0]:
                viruses_unique.at[index, "Accession"] = seq_list[0].split("/")[0]
            elif seq_list[0] == "-":
                continue
            else:
                viruses_unique.at[index, "Accession"] = seq_list[0]
        elif len(seq_list) == 1 and ":" in seq_list[0]:
            viruses_unique.at[index, "Accession"] = seq_list[0].split(":")[1].split("/")[0]
        elif len(seq_list) > 1:
            if "/" in seq_list[0].split(":")[1]:
                viruses_unique.at[index, "Accession"] = seq_list[0].split(":")[1].split("/")[0]
            else:
                viruses_unique.at[index, "Accession"] = seq_list[0].split(":")[1]
        else:
            print_error(seq_list)
            break

    # rename columns

    viruses = viruses_unique[["#Organism/Name", "Accession"]].copy()
    viruses.rename(columns={"#Organism/Name": "species", "Accession": "AccessionVersion"}, inplace=True)

    # drop rows that have no accessions

    viruses = viruses[viruses["AccessionVersion"] != ""]

    # regex for species name

    viruses["species"] = viruses["species"].replace(REGEX_BLACKLIST,
                                                    "_",
                                                    regex=True)

    # check for duplicates from user input

    if config["sequences"] or config["accessions"]:
        user_inputs = []
        if os.path.isfile(config["sequences"]):
            custom_fasta_paths = pd.read_csv(
                config["sequences"],
                sep="\t",
                header=None,
                names=["species", "accession", "path"],
            )
            user_inputs.append(custom_fasta_paths)
        if os.path.isfile(config["accessions"]):
            custom_accessions = pd.read_csv(
                config["accessions"],
                sep="\t",
                header=None,
                names=["species", "accession"],
            )
            user_inputs.append(custom_accessions)

        for user_df in user_inputs:
            viruses = viruses[(~viruses["species"].isin(user_df["species"]))]

    # print the output to csv

    header = ["species", "AccessionVersion"]
    viruses.to_csv(viral_genomes_out, sep="\t", header=header, index=False)
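A hedged invocation sketch; the config keys mirror the ones the function reads, and the file paths are placeholders.

# minimal config with the keys the function inspects (values are placeholders)
config = {"sequences": "", "accessions": ""}

entrez_refseq_virus_create_files(
    config,
    input_file="viruses.txt",          # NCBI refseq genomes report for viruses
    viral_genomes_out="refseq_viruses.tsv",
)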