def fetch_assembly_meta_xml(accession):
    """Fetch the assembly metadata XML for an accession from ENA.

    Requests ``<ENA_API>/xml/<accession>`` through ``tofetch.fetch_url``
    and hands back whatever that helper returns, unmodified.
    """
    return tofetch.fetch_url("{}/xml/{}".format(ENA_API, accession))
def fetch_goat_data(taxon_id):
    """Fetch taxon metadata from GoaT.

    Args:
        taxon_id: Taxon identifier interpolated into the GoaT ``record``
            query as ``taxon_id-<taxon_id>``.

    Returns:
        The ``record`` member of the first entry in the response's
        ``records`` list.

    Exits the process with status 1, after logging an error, when the
    fetch returns ``None`` or the parsed response contains no record entry.
    """
    LOGGER.info("Fetching taxon metadata")
    url = "%s/record?recordId=taxon_id-%s&result=taxon" % (GOAT_API, taxon_id)
    result = tofetch.fetch_url(url)
    if result is not None:
        data = ujson.loads(result)
        try:
            return data["records"][0]["record"]
        except (KeyError, IndexError, TypeError):
            # The response parsed but holds no record (e.g. an empty
            # "records" list); fall through to the shared error path
            # rather than crashing with a bare lookup error.
            pass
    LOGGER.error("Unable to fetch taxon metadata for '%s' from GoaT",
                 taxon_id)
    sys.exit(1)
Beispiel #3
0
def fetch_accession(bioproject):
    """Fetch a GCA accession for a bioproject.

    Queries the ENA assembly search for ``study_accession="<bioproject>"``
    in TSV format and reads the first data row (the line after the header).

    Args:
        bioproject: Study accession interpolated into the search query.

    Returns:
        ``"<accession>.<version>"`` built from the first two columns of the
        first data row, or ``None`` when the response is empty, contains no
        data row, or that row is not tab-separated.
    """
    LOGGER.info("Fetching GCA accession for bioproject %s", bioproject)
    url = "%s/search?result=assembly&query=study_accession%%3D%%22%s%%22&fields=accession%%2Cversion&format=tsv" % (
        ENA_API,
        bioproject,
    )
    result = tofetch.fetch_url(url)
    if not result:
        return None
    rows = result.splitlines()
    # rows[0] is the TSV header; a header-only response has no accession.
    if len(rows) < 2:
        return None
    fields = rows[1].split("\t")
    if len(fields) < 2:
        return None
    return "%s.%s" % (fields[0], fields[1])
def fetch_bioproject_children(bioproject,
                              *,
                              projects=None,
                              children=None,
                              new_projects=None):
    """
    Fetch children of a bioproject, recursing into nested projects.

    Queries the ENA study search for studies whose ``parent_study`` is
    ``bioproject`` and walks the TSV rows after the header. Each row is
    read as ``<child accession>\\t<description>``.

    Args:
        bioproject: Parent study accession to look up.
        projects: Container of accessions to skip; rows whose accession is
            in it are ignored. Defaults to an empty list.
        children: List that collected child accessions are appended to
            (mutated in place and shared across the recursion). Defaults
            to a new list.
        new_projects: Dict mapping each collected child accession to the
            bioproject it was found under (mutated in place). Defaults to
            a new dict.

    Returns:
        A ``(children, new_projects)`` tuple.
    """
    if projects is None:
        projects = []
    if new_projects is None:
        new_projects = {}
    if children is None:
        children = []
    url = "%s/search?result=study&query=parent_study%%3D%%22%s%%22&format=tsv" % (
        ENA_API,
        bioproject,
    )
    result = tofetch.fetch_url(url)
    if not result:
        return children, new_projects
    # Skip the TSV header row.
    for line in result.split("\n")[1:]:
        if "\t" not in line:
            continue
        # Split on the first tab only so a description that itself contains
        # tabs (or extra trailing columns) cannot break the unpacking.
        child_accession, description = line.split("\t", 1)
        if child_accession in projects:
            continue
        if "genome assembly" in description:
            # Assembly projects are collected, except alternate haplotypes.
            if "alternate haplotype" not in description:
                children.append(child_accession)
                new_projects[child_accession] = bioproject
            continue
        # Not an assembly project: descend into it. The pause precedes each
        # nested request (presumably to avoid hammering the API - confirm).
        sleep(0.5)
        LOGGER.info("Fetching nested accessions under bioproject %s",
                    child_accession)
        fetch_bioproject_children(
            child_accession,
            projects=projects,
            children=children,
            new_projects=new_projects,
        )
    return children, new_projects
def fetch_busco_lineages(busco_sets, buscodir):
    """Fetch BUSCO lineages that are not already present under ``buscodir``.

    A lineage counts as present when ``<buscodir>/lineages/<lineage>`` is a
    directory. For the remaining lineages the directory listing at
    ``BUSCO_URL`` is fetched and parsed to map lineage names to archive
    URLs, and each archive is passed to ``tofetch.fetch_tar``.

    Args:
        busco_sets: Iterable of lineage names; nothing is done when it is
            empty or ``None``.
        buscodir: Directory checked for existing lineages and passed to
            ``tofetch.fetch_tar`` as the destination.

    Returns:
        None.

    Exits the process with status 1, after logging an error, when the
    directory listing cannot be fetched or a required lineage is not in it.
    """
    if not busco_sets:
        return
    lineages_to_fetch = [
        lineage for lineage in busco_sets
        if not os.path.isdir("%s/lineages/%s" % (buscodir, lineage))
    ]
    if not lineages_to_fetch:
        return
    LOGGER.info("Fetching BUSCO lineage directory listing")
    listing = tofetch.fetch_url("%s/" % BUSCO_URL)
    if listing is None:
        # Local import keeps this block self-contained.
        import sys
        LOGGER.error("Unable to fetch BUSCO lineage directory listing from %s",
                     BUSCO_URL)
        sys.exit(1)
    lineage_urls = {}
    for entry in listing.split("\n"):
        # Listing rows of interest split into exactly 8 parts on quotes and
        # whitespace; parts[2] is the archive file name.
        parts = re.split(r"[\"\s]+", entry)
        if len(parts) == 8:
            # Lineage name is the file name up to its first dot.
            busco_set = re.sub(r"\..+$", "", parts[2])
            lineage_urls[busco_set] = "%s/%s" % (BUSCO_URL, parts[2])
    for lineage in lineages_to_fetch:
        if lineage not in lineage_urls:
            import sys
            LOGGER.error("BUSCO lineage %s not found in directory listing",
                         lineage)
            sys.exit(1)
        LOGGER.info("Fetching BUSCO lineage %s", lineage)
        tofetch.fetch_tar(lineage_urls[lineage], buscodir)
def fetch_read_info(accession, per_platform):
    """Fetch read info for an accession and group the runs by platform.

    Requests the ENA ``filereport`` for ``accession`` as tab-separated text.
    The first non-empty line is the header; every later non-empty line
    becomes a dict keyed by header name and is appended to a list in
    ``per_platform``.

    The platform key is the row's ``instrument_platform`` value, overridden
    with ``"ILLUMINA_XTEN"`` when ``experiment_title`` is exactly
    ``"HiSeq X Ten paired end sequencing"``. Rows whose platform is not a
    key of ``per_platform`` (or that have no platform column) go under
    ``"OTHER"``.

    Args:
        accession: Accession interpolated into the filereport query.
        per_platform: Mapping of platform name to list, mutated in place.
            It must contain an ``"OTHER"`` entry for the fallback case.

    Returns:
        None. Nothing is added when the fetch returns ``None``.
    """
    portal = "https://www.ebi.ac.uk/ena/portal/api"
    url = (
        "%s/filereport?accession=%s&result=read_run&fields=run_accession,fastq_bytes,base_count,library_strategy,library_selection,library_layout,instrument_platform,experiment_title,fastq_ftp"
        % (portal, accession))
    data = tofetch.fetch_url(url)
    if data is None:
        return
    header = None
    for line in data.split("\n"):
        if not line:
            continue
        fields = line.split("\t")
        if header is None:
            header = fields
            continue
        # zip() tolerates rows shorter than the header instead of raising
        # IndexError; the missing trailing columns are simply absent.
        values = dict(zip(header, fields))
        platform = values.get("instrument_platform", "OTHER")
        if values.get("experiment_title") == "HiSeq X Ten paired end sequencing":
            platform = "ILLUMINA_XTEN"
        # A missing, empty or non-numeric base_count is counted as 0.
        try:
            values["base_count"] = int(values["base_count"])
        except (KeyError, ValueError):
            values["base_count"] = 0
        try:
            per_platform[platform].append(values)
        except KeyError:
            per_platform["OTHER"].append(values)