def get_final_data(output_folder, matched_genomes):
    """Run sub-pipeline to retrieve 'final' genomes from PhagesDB."""
    print("\n\nDownloading genome(s) from PhagesDB.")
    phagesdb_folder = pathlib.Path(output_folder, PHAGESDB_FOLDER)
    phagesdb_folder.mkdir()
    genome_folder = pathlib.Path(phagesdb_folder, GENOME_FOLDER)
    genome_folder.mkdir()
    import_tickets = []
    failed_list = []

    # Iterate through each phage in the MySQL database.
    for gnm_pair in matched_genomes:
        mysqldb_gnm = gnm_pair.genome1
        phagesdb_gnm = gnm_pair.genome2

        # Not all phages have associated GenBank-formatted files available
        # on PhagesDB. Check whether there is a flatfile for this phage.
        # Download the flatfile only if there is a date tag, and only if
        # that date is more recent than the date stored in the MySQL
        # database for that genome. The tagged date only reflects when the
        # file was uploaded to PhagesDB. The date the actual GenBank record
        # was created is stored within the file, and this too could be less
        # recent than the current version in the MySQL database; however,
        # that is checked during the import stage.
        set_phagesdb_gnm_date(phagesdb_gnm)
        set_phagesdb_gnm_file(phagesdb_gnm)
        if (phagesdb_gnm.filename != "" and
                phagesdb_gnm.date > mysqldb_gnm.date):
            # Save the file on the hard drive with the same name as
            # stored on PhagesDB.
            flatfile_data = phagesdb.retrieve_url_data(phagesdb_gnm.filename)
            if flatfile_data == "":
                failed_list.append(mysqldb_gnm.id)
            else:
                save_phagesdb_file(flatfile_data, phagesdb_gnm, genome_folder)
                tkt = create_phagesdb_ticket(mysqldb_gnm.id)
                import_tickets.append(tkt)

    if len(import_tickets) > 0:
        print(f"\n\n{len(import_tickets)} genome(s) "
              "were retrieved from PhagesDB.")
        create_ticket_table(import_tickets, phagesdb_folder)

    if len(failed_list) > 0:
        print(f"{len(failed_list)} genome(s) failed to be retrieved:")
        for element in failed_list:
            print(element)
        input("\n\nPress ENTER to continue.")

    # Now remove empty folders.
    if len(basic.identify_contents(genome_folder, kind=None)) == 0:
        genome_folder.rmdir()
    if len(basic.identify_contents(phagesdb_folder, kind=None)) == 0:
        phagesdb_folder.rmdir()
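# Hedged sketch of a retrieve_url_data-style helper built on the standard
# library. The actual phagesdb.retrieve_url_data may differ; returning ""
# on failure matches how get_final_data above tests the result.
import urllib.error
import urllib.request

def retrieve_url_text(url):
    """Return the decoded body of url, or "" if retrieval fails."""
    try:
        with urllib.request.urlopen(url) as response:
            return response.read().decode("utf-8")
    except (urllib.error.URLError, UnicodeDecodeError):
        return ""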
def get_files(directory, file, ignore):
    """Get the list of file(s) that need to be uploaded.

    :param directory: (optional) directory containing files for upload
    :type directory: pathlib.Path
    :param file: (optional) file to upload
    :type file: pathlib.Path
    :param ignore: file(s) to ignore during the upload process
    :type ignore: set
    :return: file_list
    """
    file_list = []
    if directory is not None:
        directory = basic.set_path(directory, kind="dir", expect=True)
        folder_files = basic.identify_contents(directory, kind="file",
                                               ignore_set=ignore)
        file_list.extend(folder_files)
    if file is not None:
        file = basic.set_path(file, kind="file", expect=True)
        file_list.append(file)
    return file_list
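# Minimal usage sketch for get_files; the paths below are hypothetical and
# assume the pdm_utils `basic` module is importable in this context.
import pathlib

upload_dir = pathlib.Path("./uploads")           # hypothetical directory
extra_file = pathlib.Path("./Actino_Draft.sql")  # hypothetical file
files = get_files(upload_dir, extra_file, ignore={".DS_Store"})
for f in files:
    print(f.name)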
def test_identify_contents_7(self):
    """Verify None is returned due to incorrect kind."""
    Path(self.base_dir, "new_dir1").mkdir()
    Path(self.base_dir, "new_dir2").mkdir()
    Path(self.base_dir, "file1.txt").touch()
    Path(self.base_dir, ".DS_Store").touch()
    list_of_items = basic.identify_contents(self.base_dir, kind="invalid")
    self.assertIsNone(list_of_items)
def test_identify_contents_1(self):
    """Verify the correct number of files are returned when
    no ignore set is provided."""
    Path(self.base_dir, "new_dir").mkdir()
    Path(self.base_dir, "file1.txt").touch()
    Path(self.base_dir, ".DS_Store").touch()
    list_of_items = basic.identify_contents(self.base_dir, kind="file")
    exp_num_items = 2
    self.assertEqual(len(list_of_items), exp_num_items)
def main(unparsed_args_list):
    """Run the push_db pipeline."""
    args = parse_args(unparsed_args_list)
    file_list = []
    if args.directory is not None:
        args.directory = basic.set_path(args.directory, kind="dir",
                                        expect=True)
        folder_files = basic.identify_contents(args.directory, kind="file",
                                               ignore_set=set([".DS_Store"]))
        file_list.extend(folder_files)
    if args.file is not None:
        args.file = basic.set_path(args.file, kind="file", expect=True)
        file_list.append(args.file)

    status = True
    if len(file_list) == 0:
        print("There are no files to upload.")
        status = False

    if status:
        server.set_log_file(str(args.log_file))
        transport = server.get_transport(constants.DB_HOST)
        if transport is None:
            status = False

    if status:
        sftp = server.setup_sftp_conn(transport, attempts=3)
        if sftp is None:
            status = False

    success = []
    fail = []
    if status:
        for local_filepath in file_list:
            print(f"Uploading {local_filepath.name}...")
            remote_filepath = pathlib.Path(constants.DB_HOST_DIR,
                                           local_filepath.name)
            result = server.upload_file(sftp, str(local_filepath),
                                        str(remote_filepath))
            if result:
                success.append(local_filepath.name)
            else:
                fail.append(local_filepath.name)
        sftp.close()
        transport.close()

    if len(fail) > 0:
        print("The following files were not uploaded:")
        for file in fail:
            print(file)
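# Hedged sketch of the raw paramiko calls that the `server` helpers above
# presumably wrap. The host, credentials, and paths are placeholders; the
# actual pdm_utils server module may manage these differently.
import paramiko

transport = paramiko.Transport(("server.example.com", 22))  # placeholder host
transport.connect(username="user", password="secret")       # placeholder creds
sftp = paramiko.SFTPClient.from_transport(transport)
sftp.put("./Actino_Draft.sql", "/data/Actino_Draft.sql")    # local -> remote
sftp.close()
transport.close()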
def test_identify_contents_6(self):
    """Verify the correct number of files and folders are returned when
    an ignore set is provided."""
    Path(self.base_dir, "new_dir1").mkdir()
    Path(self.base_dir, "new_dir2").mkdir()
    Path(self.base_dir, "file1.txt").touch()
    Path(self.base_dir, ".DS_Store").touch()
    ignore_set = set(["new_dir2"])
    list_of_items = basic.identify_contents(self.base_dir, kind=None,
                                            ignore_set=ignore_set)
    exp_num_items = 3
    self.assertEqual(len(list_of_items), exp_num_items)
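# Sketch of identify_contents-style semantics consistent with the tests
# above (an assumption, not the actual pdm_utils implementation):
# kind="file" returns files, kind="dir" returns directories, kind=None
# returns both, and any other kind returns None; ignore_set names are
# skipped.
def identify_contents_sketch(path, kind=None, ignore_set=None):
    if kind not in (None, "file", "dir"):
        return None
    ignore = ignore_set if ignore_set is not None else set()
    contents = []
    for item in sorted(path.iterdir()):
        if item.name in ignore:
            continue
        if kind == "file" and not item.is_file():
            continue
        if kind == "dir" and not item.is_dir():
            continue
        contents.append(item)
    return contents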
def get_genbank_data(output_folder, genome_dict, ncbi_cred_dict={},
                     genbank_results=False, force=False):
    """Run sub-pipeline to retrieve genomes from GenBank."""
    # Flow of the NCBI record retrieval process:
    # 1. Create the list of phages to check for updates at NCBI
    #    (completed above).
    # 2. Using esearch, verify which accessions are valid.
    # 3. Using esummary, get the update date for each valid accession.
    # 4. Using efetch, retrieve flat files for NCBI records newer than
    #    the MySQL database date.
    # 5. Save new records in a folder and create an import table for them.
    print("\n\nDownloading genome(s) from GenBank.")

    # Create output folder.
    ncbi_folder = pathlib.Path(output_folder, GENBANK_FOLDER)
    ncbi_folder.mkdir()
    ncbi_results_list = []

    # Iterate through each phage in the MySQL database.
    tup1 = sort_by_accession(genome_dict, force=force)
    ncbi_results_list.extend(tup1[0])
    accession_dict = tup1[1]

    # More setup variables if NCBI updates are desired. The NCBI Bookshelf
    # resource "The E-utilities In-Depth: Parameters, Syntax and More", by
    # Dr. Eric Sayers, recommends that a single request contain no more
    # than about 200 UIDs, so that is used as the batch size. All Entrez
    # requests must include the user's email address and tool name.
    ncbi.set_entrez_credentials(tool=ncbi_cred_dict["tool"],
                                email=ncbi_cred_dict["email"],
                                api_key=ncbi_cred_dict["api_key"])
    results = retrieve_records(accession_dict, ncbi_folder, batch_size=200)
    ncbi_results_list.extend(results)

    # Record retrieval results for all phages.
    if genbank_results:
        output_genbank_summary(ncbi_folder, ncbi_results_list)

    # Print a summary of the script.
    tallies = compute_genbank_tallies(ncbi_results_list)
    print_genbank_tallies(tallies)

    # Now remove empty folders.
    if len(basic.identify_contents(ncbi_folder, kind=None)) == 0:
        ncbi_folder.rmdir()
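# Hedged sketch of the esearch -> esummary -> efetch sequence that the
# ncbi helpers above wrap, written directly against Biopython's Bio.Entrez
# (the credentials and the accession are placeholders).
from Bio import Entrez

Entrez.email = "user@example.com"  # placeholder
Entrez.tool = "example_tool"       # placeholder

# 1. esearch: verify the accession and post it to the history server.
handle = Entrez.esearch(db="nucleotide", term="L31754[ACCN]",
                        usehistory="y")
search = Entrez.read(handle)
handle.close()

# 2. esummary: get the document summary, which carries the update date.
handle = Entrez.esummary(db="nucleotide", query_key=search["QueryKey"],
                         webenv=search["WebEnv"])
summaries = Entrez.read(handle)
handle.close()

# 3. efetch: retrieve the GenBank-formatted flat file.
handle = Entrez.efetch(db="nucleotide", id=search["IdList"],
                       rettype="gb", retmode="text")
flatfile_text = handle.read()
handle.close()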
def get_update_data(output_folder, matched_genomes):
    """Run sub-pipeline to retrieve field updates from PhagesDB."""
    updates_folder = pathlib.Path(output_folder, UPDATES_FOLDER)
    updates_folder.mkdir()
    update_tickets = []
    for gnm_pair in matched_genomes:
        tkt_list = compare_data(gnm_pair)
        update_tickets.extend(tkt_list)

    # Field updates
    if len(update_tickets) > 0:
        print(f"\n\n{len(update_tickets)} field updates are available.")
        filepath = pathlib.Path(updates_folder, UPDATE_TABLE)
        fileio.export_data_dict(update_tickets, filepath, UPDATE_COLUMNS,
                                include_headers=True)
    else:
        print("\n\nNo field updates.")

    # Now remove empty folders.
    if len(basic.identify_contents(updates_folder, kind=None)) == 0:
        updates_folder.rmdir()
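# Minimal sketch of an export_data_dict-style writer using the standard
# library (assumed behavior; the actual fileio.export_data_dict may
# differ).
import csv

def export_dicts_to_csv(data_dicts, filepath, headers, include_headers=True):
    """Write a list of dicts to a CSV file at the given pathlib.Path."""
    with filepath.open("w", newline="") as fh:
        writer = csv.DictWriter(fh, fieldnames=headers)
        if include_headers:
            writer.writeheader()
        for row in data_dicts:
            writer.writerow(row)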
def retrieve_records(accession_dict, ncbi_folder, batch_size=200):
    """Retrieve GenBank records."""
    print("\n\nRetrieving records from NCBI")
    genome_folder = pathlib.Path(ncbi_folder, GENOME_FOLDER)
    genome_folder.mkdir()
    retrieval_errors = []
    results = []
    tickets_list = []
    accessions = list(accession_dict.keys())
    mod_accessions = [accession + "[ACCN]" for accession in accessions]

    # When retrieving records in batches, first create the list of values
    # indicating which indices of the accessions should be used to create
    # each batch. For instance, five accessions with a batch size of two
    # produce start indices 0, 2, and 4.
    batch_indices = basic.create_indices(mod_accessions, batch_size)
    print(f"There are {len(mod_accessions)} GenBank accession(s) to check.")
    for indices in batch_indices:
        start = indices[0]
        stop = indices[1]
        print(f"Checking accessions {start + 1} to {stop}...")
        esearch_term = " | ".join(mod_accessions[start:stop])

        # First use esearch to verify that the accessions are valid.
        search_record = ncbi.run_esearch(db="nucleotide", term=esearch_term,
                                         usehistory="y")
        search_count = int(search_record["Count"])
        search_webenv = search_record["WebEnv"]
        search_query_key = search_record["QueryKey"]

        # Keep track of the accessions that failed to be located in NCBI.
        # Each accession in the error list is formatted "accession[ACCN]".
        current_batch_size = stop - start
        if search_count < current_batch_size:
            search_failure = search_record["ErrorList"]["PhraseNotFound"]
            for accession in search_failure:
                retrieval_errors.append(accession[:-6])

        # Now get summaries for these records using esummary.
        summary_records = ncbi.get_summaries(db="nucleotide",
                                             query_key=search_query_key,
                                             webenv=search_webenv)
        results_tuple = get_accessions_to_retrieve(summary_records,
                                                   accession_dict)
        accessions_to_retrieve = results_tuple[0]
        results.extend(results_tuple[1])

        if len(accessions_to_retrieve) > 0:
            # Use efetch to retrieve the records.
            output_list = ncbi.get_records(accessions_to_retrieve,
                                           db="nucleotide",
                                           rettype="gb", retmode="text")

            # TODO check_record_date may be redundant. It checks the date
            # within the record. Earlier in the pipeline, the docsum date
            # has already been checked, so if the docsum date is identical
            # to the date in the record, this check is redundant.
            tup = check_record_date(output_list, accession_dict)
            new_record_list = tup[0]  # List of results dictionaries.
            results.extend(tup[1])

            if len(new_record_list) > 0:
                tickets = save_and_tickets(new_record_list, accession_dict,
                                           genome_folder)
                tickets_list.extend(tickets)

    if len(tickets_list) > 0:
        create_ticket_table(tickets_list, ncbi_folder)

    # Remove the genome folder if it is empty.
    if len(basic.identify_contents(genome_folder, kind=None)) == 0:
        genome_folder.rmdir()

    # Report the genomes that could not be retrieved.
    failed = process_failed_retrieval(retrieval_errors, accession_dict)
    results.extend(failed)
    return results
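# Standalone sketch of the batching described above; basic.create_indices
# is assumed to yield (start, stop) pairs that slice the accession list
# into batches, e.g. five items with a batch size of two -> (0, 2),
# (2, 4), (4, 5).
def create_batch_indices(values, batch_size):
    indices = []
    for start in range(0, len(values), batch_size):
        stop = min(start + batch_size, len(values))
        indices.append((start, stop))
    return indices

print(create_batch_indices(["a", "b", "c", "d", "e"], 2))
# [(0, 2), (2, 4), (4, 5)]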
def get_genbank_data(output_folder, genome_dict, ncbi_cred_dict={},
                     genbank_results=False):
    """Run sub-pipeline to retrieve genomes from GenBank."""
    # Flow of the NCBI record retrieval process:
    # 1. Create the list of phages to check for updates at NCBI
    #    (completed above).
    # 2. Using esearch, verify which accessions are valid.
    # 3. Using esummary, get the update date for each valid accession.
    # 4. Using efetch, retrieve flat files for NCBI records newer than
    #    the MySQL database date.
    # 5. Save new records in a folder and create an import table for them.

    # Create output folder.
    ncbi_folder = pathlib.Path(output_folder, "genbank")
    ncbi_folder.mkdir()
    ncbi_results_list = []
    tallies = {}
    tallies["total"] = len(genome_dict.keys())

    # Iterate through each phage in the MySQL database.
    result_tuple1 = sort_by_accession(genome_dict)
    tallies["not_auto_updated"] = result_tuple1[0]
    tallies["no_accession"] = result_tuple1[1]
    tallies["duplicate_accession"] = result_tuple1[2]
    ncbi_results_list.extend(result_tuple1[3])
    unique_accession_dict = result_tuple1[4]

    # More setup variables if NCBI updates are desired. The NCBI Bookshelf
    # resource "The E-utilities In-Depth: Parameters, Syntax and More", by
    # Dr. Eric Sayers, recommends that a single request contain no more
    # than about 200 UIDs, so that is used as the batch size. All Entrez
    # requests must include the user's email address and tool name.
    ncbi.set_entrez_credentials(tool=ncbi_cred_dict["ncbi_tool"],
                                email=ncbi_cred_dict["ncbi_email"],
                                api_key=ncbi_cred_dict["ncbi_api_key"])
    results_tuple2 = retrieve_records(unique_accession_dict, batch_size=200)
    tallies["docsum_not_new"] = results_tuple2[0]
    retrieved_record_list = results_tuple2[1]
    retrieval_error_list = results_tuple2[2]
    ncbi_results_list.extend(results_tuple2[3])

    # Report the genomes that could not be retrieved.
    results3 = process_failed_retrieval(retrieval_error_list,
                                        unique_accession_dict)
    ncbi_results_list.extend(results3)
    tallies["retrieval_failure"] = len(retrieval_error_list)

    results_tuple4 = check_record_date(retrieved_record_list,
                                       unique_accession_dict)
    new_record_list = results_tuple4[0]
    ncbi_results_list.extend(results_tuple4[1])
    tallies["retrieved_for_import"] = len(new_record_list)
    tallies["record_not_new"] = (len(retrieved_record_list)
                                 - len(new_record_list))

    if len(new_record_list) > 0:
        save_files_and_tkts(new_record_list, unique_accession_dict,
                            ncbi_folder)

    # Record retrieval results for all phages.
    if genbank_results:
        filepath3 = basic.prepare_filepath(ncbi_folder,
                                           "genbank_results.csv")
        basic.export_data_dict(ncbi_results_list, filepath3,
                               NCBI_RESULTS_COLUMNS, include_headers=True)

    # Print a summary of the script.
    tallies["auto_updated"] = tallies["total"] - tallies["not_auto_updated"]
    tallies["accession"] = (tallies["auto_updated"]
                            - tallies["no_accession"]
                            - tallies["duplicate_accession"])
    print("\n\n\nSummary of GenBank data retrieval:")
    print("Of the genomes in the MySQL database:")
    print(f"{tallies['total']:>6}: total")
    print(f"{tallies['not_auto_updated']:>6}: not auto-updated")
    print(f"{tallies['auto_updated']:>6}: auto-updated")
    print("\nOf the auto-updated genomes:")
    print(f"{tallies['no_accession']:>6}: no accession")
    print(f"{tallies['duplicate_accession']:>6}: duplicated accession")
    print(f"{tallies['accession']:>6}: unique accession")
    print("\nOf the auto-updated genomes with unique accessions:")
    print(f"{tallies['retrieval_failure']:>6}: could not be retrieved")
    print(f"{tallies['docsum_not_new']:>6}: retrieved but docsum not new")
    print(f"{tallies['record_not_new']:>6}: retrieved but record not new")
    print(f"{tallies['retrieved_for_import']:>6}: retrieved for import")

    # Now remove empty folders.
    if len(basic.identify_contents(ncbi_folder, kind=None)) == 0:
        ncbi_folder.rmdir()
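# Worked example of the tally arithmetic above, with illustrative numbers:
tallies = {"total": 100, "not_auto_updated": 20,
           "no_accession": 5, "duplicate_accession": 2}
tallies["auto_updated"] = tallies["total"] - tallies["not_auto_updated"]
tallies["accession"] = (tallies["auto_updated"]
                        - tallies["no_accession"]
                        - tallies["duplicate_accession"])
print(tallies["auto_updated"], tallies["accession"])  # 80 73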
def get_final_data(output_folder, matched_genomes):
    """Run sub-pipeline to retrieve 'final' genomes from PhagesDB."""
    phagesdb_folder = pathlib.Path(output_folder, "phagesdb")
    phagesdb_folder.mkdir()
    genome_folder = pathlib.Path(phagesdb_folder, GENOMES_DIR)
    genome_folder.mkdir()
    import_tickets = []
    failed_list = []

    # Iterate through each phage in the MySQL database.
    for gnm_pair in matched_genomes:
        mysqldb_gnm = gnm_pair.genome1
        phagesdb_gnm = gnm_pair.genome2

        # Not all phages have associated GenBank-formatted files available
        # on PhagesDB. Check whether there is a flatfile for this phage.
        # Download the flatfile only if there is a date tag, and only if
        # that date is more recent than the date stored in the MySQL
        # database for that genome. The tagged date only reflects when the
        # file was uploaded to PhagesDB. The date the actual GenBank record
        # was created is stored within the file, and this too could be less
        # recent than the current version in the MySQL database; however,
        # that is checked during the import stage.
        set_phagesdb_gnm_date(phagesdb_gnm)
        set_phagesdb_gnm_file(phagesdb_gnm)
        if (phagesdb_gnm.filename != "" and
                phagesdb_gnm.date > mysqldb_gnm.date):
            # Save the file on the hard drive with the same name as
            # stored on PhagesDB.
            flatfile_data = phagesdb.retrieve_url_data(phagesdb_gnm.filename)
            if flatfile_data == "":
                failed_list.append(mysqldb_gnm.id)
            else:
                flatfile_filename = phagesdb_gnm.filename.split("/")[-1]
                flatfile_path = pathlib.Path(genome_folder,
                                             flatfile_filename)
                with flatfile_path.open("w") as fh:
                    fh.write(flatfile_data)

                # Create the new import ticket. Since the PhagesDB phage
                # has been matched to the MySQL database phage, the
                # AnnotationAuthor field could be assigned from the
                # current mysqldb author variable. However, since this
                # GenBank-formatted file is acquired through PhagesDB,
                # the annotation status is expected to be 'final' and
                # the annotation author is expected to be 'hatfull'.
                tkt = ticket.ImportTicket()
                tkt.type = "replace"
                tkt.phage_id = mysqldb_gnm.id
                tkt.data_dict["host_genus"] = "retrieve"
                tkt.data_dict["cluster"] = "retrieve"
                tkt.data_dict["subcluster"] = "retrieve"
                tkt.data_dict["annotation_status"] = "final"
                tkt.data_dict["annotation_author"] = 1
                tkt.description_field = "product"
                tkt.data_dict["accession"] = "retrieve"
                tkt.eval_mode = "final"
                # TODO secondary_phage_id data is for the old ticket format.
                tkt.data_dict["secondary_phage_id"] = mysqldb_gnm.id
                tkt.data_dict["retrieve_record"] = 1
                import_tickets.append(tkt)

    count1 = len(import_tickets)
    if count1 > 0:
        print(f"\n\n{count1} phage(s) were retrieved from PhagesDB.")
        filepath = basic.prepare_filepath(phagesdb_folder,
                                          "legacy_import_table.csv")
        import_tickets1 = convert_tickets_to_dict(import_tickets,
                                                  old_format=True)
        basic.export_data_dict(import_tickets1, filepath, IMPORT_COLUMNS1)

        # TODO new dictwriter. Use this block instead of the one above
        # once the new import script is functioning.
        if BOTH:
            filepath2 = basic.prepare_filepath(phagesdb_folder,
                                               "import_table.csv")
            import_tickets2 = convert_tickets_to_dict(import_tickets)
            basic.export_data_dict(import_tickets2, filepath2,
                                   IMPORT_COLUMNS2, include_headers=True)

    if len(failed_list) > 0:
        print(f"{len(failed_list)} phage(s) failed to be retrieved:")
        for element in failed_list:
            print(element)
        input("\n\nPress ENTER to continue.")

    # Now remove empty folders.
    if len(basic.identify_contents(genome_folder, kind=None)) == 0:
        genome_folder.rmdir()
    if len(basic.identify_contents(phagesdb_folder, kind=None)) == 0:
        phagesdb_folder.rmdir()
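# Hedged sketch of flattening an ImportTicket into a dict for the CSV
# writer; the field selection is an assumption, and the actual
# convert_tickets_to_dict may map columns differently.
def ticket_to_dict_sketch(tkt):
    row = {"type": tkt.type,
           "phage_id": tkt.phage_id,
           "description_field": tkt.description_field,
           "eval_mode": tkt.eval_mode}
    row.update(tkt.data_dict)
    return row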