Code Example #1
File: get_data.py  Project: cdshaffer/pdm_utils
def retrieve_records(accession_dict, ncbi_folder, batch_size=200):
    """Retrieve GenBank records."""
    print("\n\nRetrieving records from NCBI")
    genome_folder = pathlib.Path(ncbi_folder, GENOME_FOLDER)
    genome_folder.mkdir()
    retrieval_errors = []
    results = []
    tickets_list = []
    accessions = list(accession_dict.keys())
    mod_accessions = [accession + "[ACCN]" for accession in accessions]

    # When retrieving in batch sizes, first create the list of
    # (start, stop) index pairs indicating which slice of the accessions
    # should be used to create each batch.
    # For instance, five accessions with a batch size of two produce
    # batches starting at indices 0, 2, and 4.

    batch_indices = basic.create_indices(mod_accessions, batch_size)
    print(f"There are {len(mod_accessions)} GenBank accession(s) to check.")
    for indices in batch_indices:
        start = indices[0]
        stop = indices[1]
        print(f"Checking accessions {start + 1} to {stop}...")
        esearch_term = " | ".join(mod_accessions[start:stop])

        # First use esearch to verify the accessions in this batch are valid.
        search_record = ncbi.run_esearch(db="nucleotide",
                                         term=esearch_term,
                                         usehistory="y")
        search_count = int(search_record["Count"])
        search_webenv = search_record["WebEnv"]
        search_query_key = search_record["QueryKey"]

        # Keep track of the accessions that failed to be located in NCBI
        # Each accession in the error list is formatted "accession[ACCN]"
        current_batch_size = stop - start
        if search_count < current_batch_size:
            search_failure = search_record["ErrorList"]["PhraseNotFound"]
            for accession in search_failure:
                retrieval_errors.append(accession[:-6])

        # Now get summaries for these records using esummary
        summary_records = ncbi.get_summaries(db="nucleotide",
                                             query_key=search_query_key,
                                             webenv=search_webenv)
        results_tuple = get_accessions_to_retrieve(summary_records,
                                                   accession_dict)
        accessions_to_retrieve = results_tuple[0]
        results.extend(results_tuple[1])

        if len(accessions_to_retrieve) > 0:
            # Use efetch to retrieve the record.
            output_list = ncbi.get_records(accessions_to_retrieve,
                                           db="nucleotide",
                                           rettype="gb",
                                           retmode="text")

            # TODO: check_record_date may be redundant. It checks the date
            # within the record, but the docsum date has already been checked
            # earlier in the pipeline. If the docsum date is identical to the
            # date in the record, this check is redundant.
            tup = check_record_date(output_list, accession_dict)
            new_record_list = tup[0]
            # list of results dictionaries
            results.extend(tup[1])

            if len(new_record_list) > 0:
                tickets = save_and_tickets(new_record_list, accession_dict,
                                           genome_folder)
                tickets_list.extend(tickets)

    if len(tickets_list) > 0:
        create_ticket_table(tickets_list, ncbi_folder)

    # Remove genome folder if empty.
    if len(basic.identify_contents(genome_folder, kind=None)) == 0:
        genome_folder.rmdir()

    # Report the genomes that could not be retrieved.
    failed = process_failed_retrieval(retrieval_errors, accession_dict)
    results.extend(failed)

    return results
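
All four examples on this page rely on basic.create_indices to split the accession list into batches. That helper is not shown here; the sketch below is a minimal reconstruction inferred from how the examples consume its output as (start, stop) pairs, and the actual pdm_utils.functions.basic implementation may differ.

def create_indices(input_list, batch_size):
    # Sketch only: return (start, stop) slice boundaries covering the list.
    # Inferred from the usage above, not the actual pdm_utils code.
    index_tuples = []
    for start in range(0, len(input_list), batch_size):
        stop = min(start + batch_size, len(input_list))
        index_tuples.append((start, stop))
    return index_tuples

# Five items with a batch size of two -> [(0, 2), (2, 4), (4, 5)]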
Code Example #2
File: get_gb_records.py  Project: stjacqrm/pdm_utils
def get_data(output_folder, acc_id_dict, ncbi_cred_dict={}, batch_size=200):
    """Retrieve genomes from GenBank.

    output_folder = Path to where files will be saved.
    acc_id_dict = Dictionary where key = Accession and value = List[PhageIDs]
    """

    # More setup variables if NCBI updates are desired. The NCBI Bookshelf
    # resource "The E-utilities In-Depth: Parameters, Syntax and More", by
    # Dr. Eric Sayers, recommends that a single request contain no more than
    # about 200 UIDs, so that is used as the batch size. All Entrez requests
    # must include the user's email address and tool name.
    ncbi.set_entrez_credentials(tool=ncbi_cred_dict["ncbi_tool"],
                                email=ncbi_cred_dict["ncbi_email"],
                                api_key=ncbi_cred_dict["ncbi_api_key"])

    # Use esearch to verify the accessions are valid and efetch to retrieve
    # the record
    # Create batches of accessions
    unique_accession_list = list(acc_id_dict.keys())

    # Add [ACCN] field to each accession number
    appended_accessions = \
        [accession + "[ACCN]" for accession in unique_accession_list]

    # When retrieving in batch sizes, first create the list of
    # (start, stop) index pairs indicating which slice of the
    # unique_accession_list should be used to create each batch.
    # For instance, five accessions with a batch size of two produce
    # batches starting at indices 0, 2, and 4.
    batch_indices = basic.create_indices(unique_accession_list, batch_size)
    print(
        f"There are {len(unique_accession_list)} GenBank accessions to check.")
    for indices in batch_indices:
        start = indices[0]
        stop = indices[1]
        print(f"Checking accessions {start + 1} to {stop}...")

        delimiter = " | "
        esearch_term = delimiter.join(appended_accessions[start:stop])

        # Use esearch to verify the accessions in this batch are valid.
        search_record = ncbi.run_esearch(db="nucleotide",
                                         term=esearch_term,
                                         usehistory="y")
        search_count = int(search_record["Count"])
        search_webenv = search_record["WebEnv"]
        search_query_key = search_record["QueryKey"]
        summary_records = ncbi.get_summaries(db="nucleotide",
                                             query_key=search_query_key,
                                             webenv=search_webenv)

        accessions_to_retrieve = ncbi.get_accessions_to_retrieve(
            summary_records)
        if len(accessions_to_retrieve) > 0:
            records = ncbi.get_records(accessions_to_retrieve,
                                       db="nucleotide",
                                       rettype="gb",
                                       retmode="text")
            for record in records:
                output_data(record, acc_id_dict, output_folder)
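
ncbi.set_entrez_credentials configures Biopython's Entrez module before any requests are made. A minimal sketch of such a wrapper, assuming it simply assigns the standard Bio.Entrez module attributes (the real pdm_utils.functions.ncbi implementation may differ):

from Bio import Entrez

def set_entrez_credentials(tool=None, email=None, api_key=None):
    # Sketch only: map the supplied credentials onto Bio.Entrez
    # module attributes, skipping any that were not provided.
    if tool is not None:
        Entrez.tool = tool
    if email is not None:
        Entrez.email = email
    if api_key is not None:
        Entrez.api_key = api_key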
Code Example #3
def retrieve_records(accession_dict, batch_size=200):
    """Retrieve GenBank records."""
    # First use esearch to verify the accessions are valid.
    # Second use efetch to retrieve the record.
    print("\n\nRetrieving records from NCBI")
    retrieved_records = []  # GenBank records that have been retrieved.
    retrieval_errors = []
    tally_not_new = 0  # Tally of records whose docsum date is not new.
    results = []  # Summary of retrieval results.
    accessions = list(accession_dict.keys())
    mod_accessions = [accession + "[ACCN]" for accession in accessions]

    # When retrieving in batch sizes, first create the list of
    # (start, stop) index pairs indicating which slice of the accessions
    # should be used to create each batch.
    # For instance, five accessions with a batch size of two produce
    # batches starting at indices 0, 2, and 4.

    batch_indices = basic.create_indices(mod_accessions, batch_size)
    print(f"There are {len(mod_accessions)} GenBank accession(s) to check.")
    for indices in batch_indices:
        start = indices[0]
        stop = indices[1]
        print(f"Checking accessions {start + 1} to {stop}...")
        delimiter = " | "
        esearch_term = delimiter.join(mod_accessions[start:stop])

        # Use esearch to verify the accessions in this batch are valid.
        search_record = ncbi.run_esearch(db="nucleotide",
                                         term=esearch_term,
                                         usehistory="y")
        search_count = int(search_record["Count"])
        search_webenv = search_record["WebEnv"]
        search_query_key = search_record["QueryKey"]

        # Keep track of the accessions that failed to be located in NCBI
        # Each accession in the error list is formatted "accession[ACCN]"
        current_batch_size = stop - start
        if search_count < current_batch_size:
            search_failure = search_record["ErrorList"]["PhraseNotFound"]
            for accession in search_failure:
                retrieval_errors.append(accession[:-6])

        # Now get summaries for these records using esummary
        summary_records = ncbi.get_summaries(db="nucleotide",
                                             query_key=search_query_key,
                                             webenv=search_webenv)

        results_tuple = get_accessions_to_retrieve(summary_records,
                                                   accession_dict)
        accessions_to_retrieve = results_tuple[0]
        results.extend(results_tuple[1])
        tally_not_new += len(summary_records) - len(accessions_to_retrieve)

        if len(accessions_to_retrieve) > 0:
            output_list = ncbi.get_records(accessions_to_retrieve,
                                           db="nucleotide",
                                           rettype="gb",
                                           retmode="text")
            retrieved_records.extend(output_list)

    return (tally_not_new, retrieved_records, retrieval_errors, results)
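
Unlike Example #1, this variant performs no file I/O; everything is handed back to the caller as a tuple. A hypothetical call site (build_accession_dict and the batch size are illustrative only, not part of pdm_utils):

# Hypothetical helper: keys are GenBank accessions; the value structure is
# whatever get_accessions_to_retrieve expects (not shown on this page).
accession_dict = build_accession_dict()

tally_not_new, records, errors, results = retrieve_records(accession_dict,
                                                           batch_size=100)
print(f"Retrieved {len(records)} record(s); {len(errors)} lookup error(s).")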
Code Example #4
def get_genbank_data(output_folder, accession_set, ncbi_cred_dict={}):
    """Retrieve genomes from GenBank."""

    batch_size = 200

    # More setup variables if NCBI updates are desired. The NCBI Bookshelf
    # resource "The E-utilities In-Depth: Parameters, Syntax and More", by
    # Dr. Eric Sayers, recommends that a single request contain no more than
    # about 200 UIDs, so that is used as the batch size. All Entrez requests
    # must include the user's email address and tool name.
    ncbi.set_entrez_credentials(tool=ncbi_cred_dict["ncbi_tool"],
                                email=ncbi_cred_dict["ncbi_email"],
                                api_key=ncbi_cred_dict["ncbi_api_key"])

    # Use esearch to verify the accessions are valid and efetch to retrieve
    # the record
    # Create batches of accessions
    unique_accession_list = list(accession_set)

    # Add [ACCN] field to each accession number
    appended_accessions = \
        [accession + "[ACCN]" for accession in unique_accession_list]

    # When retrieving in batch sizes, first create the list of
    # (start, stop) index pairs indicating which slice of the
    # unique_accession_list should be used to create each batch.
    # For instance, five accessions with a batch size of two produce
    # batches starting at indices 0, 2, and 4.
    batch_indices = basic.create_indices(unique_accession_list, batch_size)
    print(
        f"There are {len(unique_accession_list)} GenBank accessions to check.")
    for indices in batch_indices:
        batch_index_start = indices[0]
        batch_index_stop = indices[1]
        print("Checking accessions "
              f"{batch_index_start + 1} to {batch_index_stop}...")
        current_batch_size = batch_index_stop - batch_index_start
        delimiter = " | "
        esearch_term = delimiter.join(
            appended_accessions[batch_index_start:batch_index_stop])

        # Use esearch for each accession
        search_record = ncbi.run_esearch(db="nucleotide",
                                         term=esearch_term,
                                         usehistory="y")
        search_count = int(search_record["Count"])
        search_webenv = search_record["WebEnv"]
        search_query_key = search_record["QueryKey"]
        summary_records = ncbi.get_summaries(db="nucleotide",
                                             query_key=search_query_key,
                                             webenv=search_webenv)

        accessions_to_retrieve = []
        for doc_sum in summary_records:
            doc_sum_accession = doc_sum["Caption"]
            accessions_to_retrieve.append(doc_sum_accession)

        if len(accessions_to_retrieve) > 0:
            output_list = ncbi.get_records(accessions_to_retrieve,
                                           db="nucleotide",
                                           rettype="gb",
                                           retmode="text")
            for retrieved_record in output_list:
                ncbi_filename = f"{retrieved_record.name}.gb"
                flatfile_path = pathlib.Path(output_folder, ncbi_filename)
                SeqIO.write(retrieved_record, str(flatfile_path), "genbank")
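
As a quick sanity check, the flatfiles written by the loop above can be parsed back with Biopython. This follow-up snippet is illustrative only (it assumes the same output_folder passed to get_genbank_data) and is not part of the original example:

from Bio import SeqIO

# Re-read each .gb flatfile written above to confirm it parses cleanly.
for flatfile in pathlib.Path(output_folder).glob("*.gb"):
    record = SeqIO.read(str(flatfile), "genbank")
    print(record.id, len(record.seq))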