def bulk_query_ncbi(accessions, args):
    """Post a batch of protein accessions to NCBI and return the history tags.

    :param accessions: list of GenBank protein accessions; a single ``str``
        (one accession, or an already comma-separated string) is posted as-is
    :param args: cmd-line args parser, ``args.retries`` is passed to entrez_retry

    Return webenv and query key
    """
    if isinstance(accessions, str):
        # joining a str would put a comma between every single character
        accessions_string = accessions
    else:
        try:
            accessions_string = ",".join(accessions)
        except TypeError:
            # not an iterable of str: hand the value to Entrez unchanged
            accessions_string = accessions

    # A RuntimeError raised by the post is left for the caller to handle.
    # The handle is scoped with ``with`` like the other Entrez calls in this file.
    with entrez_retry(
            args.retries,
            Entrez.epost,
            db="Protein",
            id=accessions_string,
    ) as handle:
        epost_result = Entrez.read(handle)

    # web environment and query key identify the posted ID set in later calls
    return epost_result["WebEnv"], epost_result["QueryKey"]
def get_ncbi_counts(args):
    """Search the NCBI Assembly db once per kingdom and collect the 'Count' of each search.

    :param args: cmd-line args parser

    Return dict {kingdom: count}
    """
    assemblies_per_kingdom = {}

    for kingdom_name in KINGDOMS:
        # keyword arguments forwarded to Entrez.esearch by entrez_retry
        search_kwargs = {
            "db": "Assembly",
            "term": kingdom_name,
            "retmode": "xml",
        }
        with entrez_retry(args.retries, Entrez.esearch,
                          **search_kwargs) as search_handle:
            search_result = Entrez.read(search_handle, validate=False)

        assemblies_per_kingdom[kingdom_name] = search_result['Count']

    return assemblies_per_kingdom
Example #3
0
def link_nucleotide_ids_individually(accessions_to_parse, no_accession_logger,
                                     args):
    """Use Entrez.elink, one protein at a time, to get the IDs of linked NCBI.Nucleotide records.

    In the resulting dict, the protein record ID is used to group nucleotide records that are
    linked to the same protein record, so that the caller can later decide which of them
    (the longest) should be parsed for this protein.

    :param accessions_to_parse: list of protein accessions
    :param no_accession_logger: path to log file for logging protein accessions for which no
        linked nucleotide record was retrieved (opened in append mode, one line per failure)
    :param args: cmd-line args parser

    Return dict {protein_record_id: {nucleotide records IDs}}
    no_nucleotides: set of protein accessions for which no nucleotide record could be retrieved
    """
    logger = logging.getLogger(__name__)

    nucleotide_ids = {}  # {protein_record_id: {nucleotide_ids}}
    no_nucleotides = set()  # protein accessions with no retrievable nucleotide record

    def _log_missing(protein_accession, err):
        """Warn, append to the log file and remember a protein with no linked record."""
        message = f"Could not retrieve linked Nucletoide record for {protein_accession}"
        logger.warning(message)
        with open(no_accession_logger, "a") as fh:
            # newline-terminated so every failure is its own line in the log file
            fh.write(f"{message}\tError: {err}\n")
        no_nucleotides.add(protein_accession)

    for protein_accession in tqdm(accessions_to_parse,
                                  "Retrieving Nucleotide IDs individually"):
        with entrez_retry(
                args.retries,
                Entrez.elink,
                dbfrom="Protein",
                db="nuccore",
                id=protein_accession,
        ) as handle:
            try:
                batch_nuccore = Entrez.read(handle)
            except Exception as err:
                _log_missing(protein_accession, err)
                continue

        for record in tqdm(
                batch_nuccore,
                desc="Retrieving Nucleotide db records IDs from nuccore"):
            # Linked db records are contained in the 'LinkSetDb' field,
            # multiple linked records may be retrieved
            if not record['LinkSetDb']:
                _log_missing(
                    protein_accession,
                    "No data containiend in 'LinkSetDb' field from eLink output",
                )
                continue

            protein_record_id = record['IdList'][0]

            # record['LinkSetDb'] = [{'Link': [{'Id': '1127815138'}],
            #                         'DbTo': 'nuccore', 'LinkName': 'protein_nuccore'}]
            record_nucleotide_ids = {
                link['Id']
                for link_dict in record['LinkSetDb']
                for link in link_dict['Link']
            }

            # merge with IDs already collected for the same protein record
            nucleotide_ids.setdefault(protein_record_id,
                                      set()).update(record_nucleotide_ids)

    return nucleotide_ids, no_nucleotides
Example #4
0
def parse_longest_record_individually(nucleotide_record_ids,
                                      retrieved_proteins, gbk_accessions,
                                      args):
    """Fetch Nucleotide records one by one, pick the longest, and extract its protein accessions.

    The "longest" record is the one with the most ``protein_id`` qualifiers in its
    feature table; it is interpreted as the most complete record. When several
    records tie, the one parsed first is used.

    :param nucleotide_record_ids: set, NCBI.Nucleotide records IDs retrieved for one Protein record
    :param retrieved_proteins: dict, {protein_accession: {nucleotide record accessions}},
        updated in place
    :param gbk_accessions: list of protein GenBank accessions from the local CAZyme database
    :param args: cmd-line args parser

    Return retrieved_proteins (dict)
        newly_retrieved_proteins: set of CAZyme protein accessions retrieved from parsed records
        bool: True if at least one Nucleotide record was parsed, False if none could be retrieved
    """
    logger = logging.getLogger(__name__)
    newly_retrieved_proteins = set()

    # {Nucleotide record accession: {'length': number of protein_id features, 'record': record}}
    record_lengths = {}

    for nucleotide_id in tqdm(nucleotide_record_ids,
                              desc="Parsing nucleotide records individually"):
        with entrez_retry(
                args.retries,
                Entrez.efetch,
                db="Nucleotide",
                id=nucleotide_id,
                retmode="xml",
        ) as handle:
            try:
                nucleotide_records = Entrez.read(handle)
            except Exception as err:
                # skip this ID rather than re-using the result of a previous fetch
                logger.warning(
                    f"Failed to parse Nucleotide record {nucleotide_id}: {err}"
                )
                continue

        for record in nucleotide_records:
            number_of_features = sum(
                1
                for feature_dict in record['GBSeq_feature-table']
                # not every feature carries qualifiers
                for feature_qual in feature_dict.get('GBFeature_quals', [])
                if feature_qual['GBQualifier_name'] == 'protein_id'
            )
            record_lengths[record['GBSeq_accession-version']] = {
                'length': number_of_features,
                'record': record,
            }

    if not record_lengths:
        # nothing could be fetched/parsed for any of the IDs
        return retrieved_proteins, newly_retrieved_proteins, False

    # max() returns the first record holding the highest count
    nucleotide_accession = max(
        record_lengths,
        key=lambda accession: record_lengths[accession]['length'],
    )
    longest_record = record_lengths[nucleotide_accession]['record']

    # extract protein accessions for CAZymes in the local CAZyme database
    # from the feature table of the longest record only
    for feature_dict in longest_record['GBSeq_feature-table']:
        for feature_qual in feature_dict.get('GBFeature_quals', []):
            if feature_qual['GBQualifier_name'] != 'protein_id':
                continue
            protein_accession = feature_qual['GBQualifier_value']
            if protein_accession in gbk_accessions:
                retrieved_proteins.setdefault(protein_accession,
                                              set()).add(nucleotide_accession)
                newly_retrieved_proteins.add(protein_accession)

    return retrieved_proteins, newly_retrieved_proteins, True
Example #5
0
def parse_longest_record(nucleotide_record_ids, retrieved_proteins,
                         gbk_accessions, args):
    """Batch-fetch Nucleotide records, pick the longest, and extract its protein accessions.

    The IDs are posted to Entrez and fetched in one query via the returned
    QueryKey/WebEnv. The "longest" record is the one with the most ``protein_id``
    qualifiers in its feature table; it is interpreted as the most complete
    record. When several records tie, the first one in the fetch result is used.

    :param nucleotide_record_ids: set, NCBI.Nucleotide records IDs retrieved for one Protein record
    :param retrieved_proteins: dict, {protein_accession: {nucleotide record accessions}},
        updated in place
    :param gbk_accessions: list of protein GenBank accessions from the local CAZyme database
    :param args: cmd-line args parser

    Return retrieved_proteins (dict)
        newly_retrieved_proteins: set of CAZyme protein accessions retrieved from parsed records
        bool: True if records were fetched and parsed, False if the fetch could not be
            parsed or returned no records
    """
    logger = logging.getLogger(__name__)
    newly_retrieved_proteins = set()

    batch_query_ids = ",".join(list(nucleotide_record_ids))
    with entrez_retry(
            args.retries,
            Entrez.epost,
            "Nucleotide",
            id=batch_query_ids,
    ) as handle:
        batch_post = Entrez.read(handle)
    # progress markers go to the logger, not stdout, so they do not break tqdm bars
    logger.debug("posted")

    with entrez_retry(
            args.retries,
            Entrez.efetch,
            db="Nucleotide",
            query_key=batch_post['QueryKey'],
            WebEnv=batch_post['WebEnv'],
            retmode="xml",
    ) as handle:
        try:
            batch_nucleotide = Entrez.read(handle)
        except Exception as err:
            logger.warning(
                f"Failed Entrez connection for fetching Nucleotide records: {err}"
            )
            return retrieved_proteins, newly_retrieved_proteins, False
    logger.debug("fetched")

    # {Nucleotide record accession: {'length': number of protein_id features, 'record': record}}
    record_lengths = {}

    for record in tqdm(batch_nucleotide,
                       desc="Selecting longest Nucleotide record"):
        number_of_features = sum(
            1
            for feature_dict in record['GBSeq_feature-table']
            # not every feature carries qualifiers
            for feature_qual in feature_dict.get('GBFeature_quals', [])
            if feature_qual['GBQualifier_name'] == 'protein_id'
        )
        record_lengths[record['GBSeq_accession-version']] = {
            'length': number_of_features,
            'record': record,
        }

    if not record_lengths:
        # the fetch returned no records at all
        return retrieved_proteins, newly_retrieved_proteins, False

    # max() returns the first record holding the highest count
    nucleotide_accession = max(
        record_lengths,
        key=lambda accession: record_lengths[accession]['length'],
    )
    longest_record = record_lengths[nucleotide_accession]['record']

    # extract protein accessions for CAZymes in the local CAZyme database
    # from the feature table of the longest record only
    for feature_dict in longest_record['GBSeq_feature-table']:
        for feature_qual in feature_dict.get('GBFeature_quals', []):
            if feature_qual['GBQualifier_name'] != 'protein_id':
                continue
            protein_accession = feature_qual['GBQualifier_value']
            if protein_accession in gbk_accessions:
                retrieved_proteins.setdefault(protein_accession,
                                              set()).add(nucleotide_accession)
                newly_retrieved_proteins.add(protein_accession)

    return retrieved_proteins, newly_retrieved_proteins, True
Example #6
0
def extract_protein_accessions_individually(single_nucleotide_ids,
                                            retrieved_proteins, gbk_accessions,
                                            args):
    """Fetch Nucleotide db records one at a time and extract the protein accessions they contain.

    Used for NCBI.Protein records from which only one NCBI.Nucleotide db record ID
    was retrieved. An ID whose record cannot be parsed is logged and skipped.

    :param single_nucleotide_ids: list of nucleotide record IDs
    :param retrieved_proteins: dict, {protein_accession: {nucleotide record accessions}},
        updated in place
    :param gbk_accessions: list of protein GenBank accessions from the local CAZyme database
    :param args: cmd-line args parser

    Return retrieved_proteins (dict)
        newly_retrieved_proteins: set of CAZyme protein accessions retrieved from parsed records
    """
    logger = logging.getLogger(__name__)

    newly_retrieved_proteins = set()

    for nucleotide_id in tqdm(single_nucleotide_ids,
                              desc="Parsing nucelotide records individually"):
        with entrez_retry(
                args.retries,
                Entrez.efetch,
                db="Nucleotide",
                id=nucleotide_id,
                retmode="xml",
        ) as handle:
            try:
                batch_nucleotide = Entrez.read(handle)
            except Exception as err:
                logger.warning(
                    f"Failed Entrez connection for fetching Nucleotide records: {err}"
                )
                # skip this ID: there is no result to parse, and falling through
                # would re-parse the records of the previous ID
                continue

        for record in tqdm(batch_nucleotide,
                           desc="Extracting data from Nucleotide records"):
            nucleotide_accession = record['GBSeq_accession-version']

            # feature-table contains a list of features, one feature is one feature_dict
            for feature_dict in record['GBSeq_feature-table']:
                # feature_quals contains a list of dicts (not every feature has one);
                # looking for the dict containing the protein accession (protein_id)
                # e.g. {'GBQualifier_name': 'protein_id', 'GBQualifier_value': 'APS93952.1'}
                for feature_qual in feature_dict.get('GBFeature_quals', []):
                    if feature_qual['GBQualifier_name'] != 'protein_id':
                        continue

                    protein_accession = feature_qual['GBQualifier_value']
                    if protein_accession in gbk_accessions:
                        # protein is in the local CAZyme database
                        retrieved_proteins.setdefault(
                            protein_accession, set()).add(nucleotide_accession)
                        newly_retrieved_proteins.add(protein_accession)

    return retrieved_proteins, newly_retrieved_proteins
def get_genomic_accessions(nucleotide_accessions_dict, no_accession_logger,
                           args):
    """Retrieve genomic accessions for the genomic assemblies

    For every species the assembly names are queried against the NCBI Assembly
    db in batches of ``args.batch_size``. For each returned record the document
    summaries of its IDs are fetched, the highest-sorting assembly accession is
    taken as the latest one, and the proteins stored under the matching
    assembly name are re-keyed by that accession.

    :param nucleotide_accessions_dict: dict
        {kingdom: {genus: {species: {nucleotide record accession: {protein_accessions},},},},}
    :param no_accession_logger: Path, path to log file to write out assembly names for which no
        genomic accession was retrieved
    :param args: cmd-line args parser (uses ``args.batch_size`` and ``args.retries``)

    Return dict,
    {kingdom: {genus: {species: {genomic_accession: {proteins: {protein_accessions}, count=int},},},},}
    """
    logger = logging.getLogger(__name__)

    genomic_accession_dict = {}
    # {kingdom: {genus: {species: {genomic_accession: {proteins: {protein_accessions}, count=int},},},},}

    for kingdom in tqdm(nucleotide_accessions_dict,
                        desc='Retrieving genomic accessions per kingdom'):
        genera = nucleotide_accessions_dict[kingdom]
        for genus in genera:
            organisms = genera[genus]
            for species in organisms:
                # retrieve all genomic assembly names for the given species
                assembly_names = list(organisms[species].keys())

                # break up the list into a series of smaller lists that can be batch queried
                # (list first, chunk size second, like the other get_chunks_list calls)
                batch_queries = get_chunks_list(assembly_names,
                                                args.batch_size)

                for batch_query in tqdm(
                        batch_queries,
                        desc=f"Batch querying for {genus} {species}"):
                    # every batch is queried and parsed, not only the last one
                    batch_query_ids = ",".join(batch_query)

                    # retrieve the records IDs for the assembly names
                    # NOTE(review): Bio.Entrez.esearch is documented as esearch(db, term, ...)
                    # and only returns QueryKey/WebEnv with usehistory="y"; this call passes
                    # neither. Confirm which E-utility and arguments are intended here.
                    with entrez_retry(
                            args.retries,
                            Entrez.esearch,
                            "Assembly",
                            id=batch_query_ids,
                    ) as handle:
                        batch_post = Entrez.read(handle)

                    with entrez_retry(
                            args.retries,
                            Entrez.efetch,
                            db="Assembly",
                            query_key=batch_post['QueryKey'],
                            WebEnv=batch_post['WebEnv'],
                            retmode="xml",
                    ) as handle:
                        batch_fetch = Entrez.read(handle)

                    # {genomic accession: assembly name}
                    # NOTE(review): shared by all records of the batch, as in the original;
                    # confirm whether it should be reset for every genome_record.
                    genomic_accessions = {}

                    for genome_record in tqdm(batch_fetch,
                                              desc="Retrieving assembly IDs"):
                        for assembly_id in genome_record['IdList']:
                            with entrez_retry(
                                    args.retries,
                                    Entrez.efetch,
                                    db="Assembly",
                                    id=assembly_id,
                                    retmode="xml",
                                    rettype="docsum",
                            ) as handle:
                                result = Entrez.read(handle)
                            summary = result['DocumentSummarySet'][
                                'DocumentSummary'][0]
                            genomic_accessions[summary[
                                'AssemblyAccession']] = summary['AssemblyName']

                        if not genomic_accessions:
                            # no document summary retrieved, nothing to select from
                            continue

                        # the accession sorting last is treated as the latest
                        latest_accession = max(genomic_accessions)
                        latest_assembly_name = genomic_accessions[
                            latest_accession]

                        # replace assembly name for genomic accession
                        try:
                            protein_accessions = nucleotide_accessions_dict[
                                kingdom][genus][species][latest_assembly_name]
                        except KeyError:
                            logger.warning(
                                f"Retrieved assembly name {latest_assembly_name}, but not retrieved previously"
                            )
                            with open(no_accession_logger, 'a') as fh:
                                fh.write(
                                    f"{latest_assembly_name}\tRetrieved assembly name, but not retrieved previously\t"
                                    f"{latest_accession}\t{genus} {species}\n")
                            continue

                        # create missing levels on demand and store the entry under the
                        # accession that belongs to latest_assembly_name, also when the
                        # species already holds other accessions
                        species_accessions = genomic_accession_dict.setdefault(
                            kingdom, {}).setdefault(genus, {}).setdefault(
                                species, {})
                        species_accessions[latest_accession] = {
                            'proteins': protein_accessions,
                            'count': len(protein_accessions),
                        }

    return genomic_accession_dict
def get_nucleotide_accessions(genbank_kingdom_dict, no_accession_logger,
                              cache_dir, args):
    """Retrieve the NCBI Nucleotide db records ID's containing the GenBank protein accessions.

    Protein accessions are processed in batches of ``args.batch_size``: each batch is
    posted to Entrez, linked to nuccore via elink, and the linked Nucleotide records
    are parsed for protein accessions that are in the local database. Proteins with
    several linked Nucleotide records are resolved by parsing the longest record.

    :param genbank_kingdom_dict: dict of Genbank and Kingdom records from db
        {kingdom: {genus: {species: {protein_accessions}}}
    :param no_accession_logger: Path, path to log file to save protein accessions for which no
        genomic accession was retrieved
    :param cache_dir: Path to cache directory (not used by this function body)
    :param args: cmd-line args parser (uses ``args.gbk_organisms_relationships``,
        ``args.batch_size`` and ``args.retries``)

    Return dict {kingdom: {genus: {species: {nucleotide_accession: {protein_accessions},},},},}

    Terminates the program (exit code 1) when ``args.gbk_organisms_relationships`` is
    given but the file does not exist.
    """
    logger = logging.getLogger(__name__)

    # convert the structure of genbank_kingdom_dict
    # { protein_accession: {species: str, genus: str, kingdom} }
    if args.gbk_organisms_relationships is not None:
        logger.warning(
            f"Retriving protein-organisms relationships from file: {args.gbk_organisms_relationships}"
        )
        try:
            with open(args.gbk_organisms_relationships, "r") as fh:
                gbk_organism_dict = json.load(fh)
        except FileNotFoundError:
            logger.warning(
                f"Could not find JSON file containing protein-organisms relationships at: {args.gbk_organisms_relationships}\n"
                "Check the correct path was given.\n"
                "Terminating program")
            sys.exit(1)
    else:
        logger.warning(
            "Retrieving protein-organism relationships from the local database"
        )
        gbk_organism_dict = get_gbk_organism_relationships(
            genbank_kingdom_dict)

    # create dict to store the nucleotide records accessions
    nucleotide_accessions_dict = {}
    # {kingdom: {genus: {species: {nucleotide record accession: {protein_accessions},},},},}

    # keeps track of protein accessions for which a nucleotide record was retrieved,
    # to prevent retrieval of the same record multiple times
    retrieved_proteins = {}  # {protein_accession: {nucleotide record accessions}}

    # protein accessions for which no nucleotide ID was retrieved
    no_retrieved_nucleotide_ids = set()

    gbk_accessions = list(gbk_organism_dict.keys())

    # gbk accessions waiting for a nucleotide id to be retrieved
    # (a set: accessions are dict keys, hence unique, and removal is O(1))
    remaining_accessions = set(gbk_accessions)

    # break up the list into batches
    gbk_batches = get_chunks_list(gbk_accessions, args.batch_size)

    # batches that failed at the epost/elink stage, stored as tuples so they are hashable
    failed_batches = set()

    for batch in tqdm(gbk_batches,
                      desc="Batching quering NCBI for nucleotide IDs"):
        # used at the end of the iteration to detect a batch that retrieved nothing new
        starting_loop_length = len(remaining_accessions)
        # accessions of this batch for which no linked nucleotide record exists
        no_nucleotides = set()

        # eLink Protein to Nuccore db and retrieve IDs of linked nucleotide records
        batch_query_ids = ",".join(batch)

        try:
            with entrez_retry(
                    args.retries,
                    Entrez.epost,
                    "Protein",
                    id=batch_query_ids,
            ) as handle:
                epost_result = Entrez.read(handle)
        except RuntimeError:
            logger.warning(
                "Runtime error raised when batch quering\n"
                "Possible result of a accessions not being in NCBI\n"
                "Attempt identification of the causal accession later\n")
            failed_batches.add(tuple(batch))
            continue

        epost_webenv = epost_result["WebEnv"]
        epost_query_key = epost_result["QueryKey"]

        try:
            with entrez_retry(
                    args.retries,
                    Entrez.elink,
                    dbfrom="Protein",
                    db="nuccore",
                    # each history tag goes to its matching parameter
                    query_key=epost_query_key,
                    WebEnv=epost_webenv,
                    linkname='protein_nuccore',
            ) as handle:
                batch_nuccore = Entrez.read(handle)
        except Exception as err:
            logger.warning(f"Failed Entrez connection: {err}\n"
                           "No nucletoide IDs retrieved for batch query\n"
                           "Will try again later")
            failed_batches.add(tuple(batch))
            continue

        # parse the query result from nuccore
        # {protein record ID: {nucleotide records IDs}}
        nucleotide_ids = entrez.get_linked_nucleotide_record_ids(batch_nuccore)

        if nucleotide_ids is None:
            # issue with at least one accession in the batch
            # e.g. it is not longer stored in NCBI
            # pass individually to find/parse the bad accession(s)
            nucleotide_ids, no_nucleotides = entrez.link_nucleotide_ids_individually(
                batch,
                no_accession_logger,
                args,
            )

            # no nucleotide IDs retrieved for at least one protein
            if no_nucleotides:
                logger.warning(
                    f"{len(no_nucleotides)} proteins found with no linked Nucleotide records"
                )
                for protein_accession in no_nucleotides:
                    # already logged in link_nucleotide_ids_individually()
                    if protein_accession in remaining_accessions:
                        remaining_accessions.discard(protein_accession)
                        no_retrieved_nucleotide_ids.add(protein_accession)

        if not nucleotide_ids:
            # no linked Nucleotide records retrieved for the current batch of protein accessions
            for protein_accession in batch:
                logger.warning(
                    f"Could not reitreve linked Nucleotide record for {protein_accession}"
                )
                no_retrieved_nucleotide_ids.add(protein_accession)
                # do not try to retrieve record again
                remaining_accessions.discard(protein_accession)
            continue

        # nucleotide record IDs of protein records with exactly one linked nucleotide ID
        single_nucleotide_ids = set()

        # protein record IDs of protein records with several linked nucleotide IDs
        protein_records_multi_nuc = set()

        for protein_record_id in tqdm(
                nucleotide_ids,
                desc=
                "Idenitfying proteins with multiple linked nucleotide records"
        ):
            linked_ids = nucleotide_ids[protein_record_id]
            if len(linked_ids) == 1:
                single_nucleotide_ids.add(list(linked_ids)[0])
            else:
                logger.warning(
                    f"Found {len(linked_ids)} linked nucletoide records "
                    f"for protein record {protein_record_id}")
                protein_records_multi_nuc.add(protein_record_id)

        # batch query to fetch nucleotide records for protein records
        # from which only a single nucleotide ID was retrieved
        if single_nucleotide_ids:
            retrieved_proteins, newly_retrieved_proteins, success = entrez.extract_protein_accessions(
                single_nucleotide_ids,
                retrieved_proteins,
                gbk_accessions,
                args,
            )
            if success is False:
                # issue with at least one accession in the batch
                # e.g. it is not longer stored in NCBI
                # pass individually to find/parse the bad accession(s)
                retrieved_proteins, newly_retrieved_proteins = entrez.extract_protein_accessions_individually(
                    single_nucleotide_ids,
                    retrieved_proteins,
                    gbk_accessions,
                    args,
                )

            # add the nucleotide accessions to the nucleotide_accessions_dict
            # NOTE(review): `kingdom` is not defined anywhere in this function, so this call
            # raises NameError when reached. The intended value cannot be determined from
            # this function alone - confirm against add_nucleotide_accessions().
            nucleotide_accessions_dict = add_nucleotide_accessions(
                nucleotide_accessions_dict,
                gbk_organism_dict,
                retrieved_proteins,
                newly_retrieved_proteins,
                kingdom,
                no_accession_logger,
            )

        # for Protein records for which multiple Nucleotide record IDs were retrieved
        # identify the longest Nucleotide record and retrieve protein accessions from it
        # The longest record is most likely to be the most complete record
        for protein_record_id in tqdm(
                protein_records_multi_nuc,
                "Parsing protein records with multiple linked Nucletide records"
        ):
            nucleotide_record_ids = nucleotide_ids[protein_record_id]

            retrieved_proteins, newly_retrieved_proteins, success = entrez.parse_longest_record(
                nucleotide_record_ids,
                retrieved_proteins,
                gbk_accessions,
                args,
            )

            if success is False:
                # issue with at least one accession in the batch
                # e.g. it is not longer stored in NCBI
                # pass individually to find/parse the bad accession(s)
                retrieved_proteins, newly_retrieved_proteins, success = entrez.parse_longest_record_individually(
                    nucleotide_record_ids,
                    retrieved_proteins,
                    gbk_accessions,
                    args,
                )

            # NOTE(review): `kingdom` is undefined here as well (see the note above).
            nucleotide_accessions_dict = add_nucleotide_accessions(
                nucleotide_accessions_dict,
                gbk_organism_dict,
                retrieved_proteins,
                newly_retrieved_proteins,
                kingdom,
                no_accession_logger,
            )

        # remove protein accessions from remaining_accessions because the linked Nucleotide
        # record ID has already been retrieved
        remaining_accessions.difference_update(retrieved_proteins)

        if starting_loop_length == len(
                remaining_accessions) and remaining_accessions:
            # this batch retrieved nothing new: link its accessions individually
            nucleotide_ids, no_nucleotides = entrez.link_nucleotide_ids_individually(
                batch,
                no_accession_logger,
                args,
            )

            if nucleotide_ids is None:
                # give up on the accessions of this batch only; accessions of
                # batches that have not been queried yet stay in remaining_accessions
                for protein_accession in batch:
                    logger.warning(
                        f"Could not retrieve  Nucletoide record for {protein_accession}"
                    )
                    remaining_accessions.discard(protein_accession)
                continue

        if not nucleotide_ids:
            # no linked Nucleotide records retrieved for the current batch of protein accessions
            for protein_accession in batch:
                logger.warning(
                    f"Could not reitreve linked Nucleotide record for {protein_accession}"
                )
                # do not try to retrieve record again
                remaining_accessions.discard(protein_accession)

        # already logged in link_nucleotide_ids_individually()
        remaining_accessions.difference_update(no_nucleotides)

    return nucleotide_accessions_dict
def get_sequences(genbank_accessions, args, retry=False):
    """Retrieve protein sequences from Entrez.

    The accessions are split into batches of args.batch_size. Each batch is posted to NCBI
    (bulk_query_ncbi) and the sequences are fetched in FASTA format. When retry is False, batches
    that raised a RuntimeError or IncompleteRead are split in half and each half is passed to
    retry_failed_queries().

    :param genbank_accessions: list, GenBank accessions
    :param args: cmd-line args parser
    :param retry: bool, default False, if get_sequences is being called for retrying a previously
        failed query

    Return dict keyed by GenBank accession and valued by Seq instance, and a list of all GenBank
    accessions for which no record from NCBI was retrieved.
    Return (None, None) if retry is True and a batch query fails.
    """
    logger = logging.getLogger(__name__)

    seq_dict = {}  # {gbk_accession: Seq}

    # the list of accessions is too long, break down into smaller chunks for batch querying
    all_queries = get_chunks_list(genbank_accessions, args.batch_size)

    # batches which raised an error, likely because they contain an accession not in NCBI
    failed_queries = []

    # split record IDs from which no known GenBank accession could be identified
    irregular_accessions = []

    success_accessions = set()  # accessions for which seqs were retrieved

    # build the lookup once: membership is tested for every part of every returned record ID
    known_accessions = set(genbank_accessions)

    for query_list in tqdm(all_queries, desc="Batch querying NCBI.Entrez"):

        try:
            epost_webenv, epost_query_key = bulk_query_ncbi(query_list, args)
        except RuntimeError:
            logger.warning(
                "Runtime error raised when batch quering\n"
                "Possible result of a accessions not being in NCBI\n"
                "Attempt identification of the causal accession later\n")

            if retry:
                return None, None

            failed_queries.append(query_list)
            continue

        try:
            # retrieve the protein sequences
            with entrez_retry(
                    args.retries,
                    Entrez.efetch,
                    db="Protein",
                    query_key=epost_query_key,
                    WebEnv=epost_webenv,
                    rettype="fasta",
                    retmode="text",
            ) as seq_handle:
                for record in SeqIO.parse(seq_handle, "fasta"):
                    # the record ID may contain multiple items separated by '|'
                    id_parts = record.id.split("|")
                    retrieved_accession = None

                    for id_part in id_parts:
                        # store the stripped value so it matches the caller's accession exactly
                        candidate = id_part.strip()
                        if candidate in known_accessions:
                            retrieved_accession = candidate

                    if retrieved_accession is None:  # if could not retrieve GenBank accession from the record
                        logger.error(
                            "Could not retrieve a GenBank protein accession matching an accession from the local database from:\n"
                            f"{record.id}\n"
                            "The sequence from this record will not be added to the db"
                        )
                        # NOTE(review): this stores the list of ID parts, not a single string,
                        # so no_seq may contain lists -- confirm callers expect this
                        irregular_accessions.append(id_parts)
                        continue

                    seq_dict[retrieved_accession] = record.seq

                    success_accessions.add(retrieved_accession)

        except IncompleteRead as err:
            logger.warning("IncompleteRead error raised:\n"
                           f"{err}\n"
                           "Will reattempt NCBI query later")

            if retry:
                return None, None

            # only the batch that failed is retried, not the full list of batches
            failed_queries.append(query_list)
            continue

    # list of GenBank accessions for which no protein sequence was retrieved
    no_seq = [
        acc for acc in genbank_accessions if acc not in success_accessions
    ]
    no_seq += irregular_accessions

    if retry:
        return seq_dict, no_seq

    if failed_queries:
        # NOTE(review): accessions from failed batches are already in no_seq at this point, so
        # entries added below can be duplicates and accessions recovered on retry are not
        # removed -- confirm what callers expect before changing
        for failed_query in tqdm(failed_queries,
                                 desc="Reparsing failed queries"):
            midpoint = len(failed_query) // 2

            for half in (failed_query[:midpoint], failed_query[midpoint:]):
                if not half:
                    # a one-element batch has an empty first half: nothing to query
                    continue

                seq_dict, success_accessions, failed_accessions = retry_failed_queries(
                    half,
                    seq_dict,
                    success_accessions,
                    args,
                )

                no_seq += failed_accessions

    logger.warning(
        f"Retrieved sequences for {len(success_accessions)} proteins")

    return seq_dict, no_seq
Example #10
0
def replace_multiple_tax(cazy_data, genbank_accessions, replaced_taxa_logger, args, invalid_ids):
    """Identify GenBank accessions which have multiple source organisms listed in CAZy. Replace with
    the latest source organism from NCBI.

    The accessions are posted to NCBI in a single Entrez.epost call. If the post succeeds, the
    result is parsed by get_ncbi_tax(). If Entrez returns no record, the first organism from CAZy
    is selected instead. If a RuntimeError is raised, the work is delegated once to
    replace_multiple_tax_with_invalid_ids().

    :param cazy_data: dict of CAZy data
    :param genbank_accessions: list of genbank accessions with multiple taxa in CAZy
    :param replaced_taxa_logger: logger, used for logging GenBank accessions with multiple taxa in CAZy,
        and the data this replaced and what is replaced by
    :param args: cmd-line args parser
    :param invalid_ids: boolean, potential presence of invalid GenBank accessions
        Set as true when func is called by replace_multiple_tax_with_invalid_ids()

    Return dict updated cazy_data dict and boolean, if multiple taxa were replaced by single taxa
    """
    logger = logging.getLogger(__name__)

    id_post_list = ",".join(genbank_accessions)

    try:
        epost_results = Entrez.read(
            entrez_retry(
                args.retries,
                Entrez.epost,
                db="Protein",
                id=id_post_list,
            )
        )

    except (TypeError, AttributeError):  # if no record is returned from call to Entrez
        logger.error(
            "Entrez failed to post assembly IDs.\n"
            "Not retrieving taxonomy classification from NCBI.\n"
            "Selecting the first organism retrieved from CAZy as the source organism"
        )
        # NOTE(review): select_first_organism is assumed to accept the logger as its third
        # argument, matching the fallback call further down -- confirm against its definition
        cazy_data = select_first_organism(cazy_data, genbank_accessions, replaced_taxa_logger)
        # return here: there are no epost results to pass to get_ncbi_tax()
        return cazy_data, True

    except RuntimeError:
        logger.warning("Found GenBank accessions in CAZy data that are no longer in NCBI")

        if invalid_ids:
            # replace_multiple_tax was called by replace_multiple_tax_with_invalid_ids
            # return results, don't use recursive programming
            return cazy_data, False

        # first time replace_multiple_tax was called
        cazy_data, success = replace_multiple_tax_with_invalid_ids(cazy_data, args)

        if success is False:
            logger.error(
                "Could not retrieve taxonomy data from NCBI,\n"
                 "Using the first source organism retrieved from CAZy for each GenBank accession"
            )

            cazy_data = select_first_organism(cazy_data, genbank_accessions, replaced_taxa_logger)
            success = True

        # no epost results exist in this scope, so get_ncbi_tax() must not be called here
        return cazy_data, success

    logger.info("Parsing data retrieved from NCBI")
    cazy_data = get_ncbi_tax(epost_results, cazy_data, replaced_taxa_logger, args)

    return cazy_data, True
Example #11
0
def get_ncbi_tax(epost_results, cazy_data, replaced_taxa_logger, args):
    """Parse the ePost output from Entrez and add the NCBI tax classifications to the CAZy data

    Fetches the protein records referenced by the ePost web environment and query key, then
    overwrites the 'kingdom' and 'organism' entries in cazy_data for each returned accession
    with a single-item set holding the values from NCBI.

    :param epost_results: Entrez ePost output
    :param cazy_data: dict, data retrieved from CAZy
    :param replaced_taxa_logger: logger, used for logging GenBank accessions with multiple taxa in CAZy
    :param args: cmd-line args parser

    Return cazy_data (dict). Returned unchanged if Entrez returns no record.
    """
    logger = logging.getLogger(__name__)

    # Retrieve web environment and query key from Entrez epost
    epost_webenv = epost_results["WebEnv"]
    epost_query_key = epost_results["QueryKey"]

    try:
        with entrez_retry(
            args.retries,
            Entrez.efetch,
            db="Protein",
            query_key=epost_query_key,
            WebEnv=epost_webenv,
            retmode="xml",
        ) as record_handle:
            protein_records = Entrez.read(record_handle, validate=False)

    # if no record is returned from call to Entrez
    except (TypeError, AttributeError):
        logger.error(
            "Entrez failed to retrieve the protein records.\n"
            "Not replacing the taxonomy data, returning the CAZy data unchanged"
        )
        # there are no records to parse
        return cazy_data

    for protein in tqdm(protein_records, desc="Retrieving organism from NCBI"):
        # retrieve NCBI taxonomy data
        accession = protein['GBSeq_accession-version']
        organism = protein['GBSeq_organism']
        kingdom = protein['GBSeq_taxonomy'].split(';')[0]  # first item of the lineage

        # retrieve CAZy taxonomy data
        try:
            cazy_record = cazy_data[accession]
        except KeyError:
            err = f'GenBank accession {accession} retrieved from NCBI, but it is not present in CAZy'
            logger.error(err)

            # no CAZy data exists for this accession, so placeholders keep the columns aligned
            replaced_taxa_logger.warning(f"{accession}\tNA: NA\t{err}")
            continue

        cazy_kingdom_str = ",".join(cazy_record["kingdom"])
        cazy_organism_str = ",".join(cazy_record["organism"])

        cazy_record['kingdom'] = {kingdom}
        cazy_record['organism'] = {organism}

        # log the difference
        replaced_taxa_logger.warning(
            f"{accession}\t{cazy_kingdom_str}: {cazy_organism_str}\t{kingdom}: {organism}"
        )

    return cazy_data