Example #1
def get_final_data(output_folder, matched_genomes):
    """Run sub-pipeline to retrieve 'final' genomes from PhagesDB."""

    print(f"\n\nDownloading genome(s) from PhagesDB.")
    phagesdb_folder = pathlib.Path(output_folder, PHAGESDB_FOLDER)
    phagesdb_folder.mkdir()
    genome_folder = pathlib.Path(phagesdb_folder, GENOME_FOLDER)
    genome_folder.mkdir()
    import_tickets = []
    failed_list = []

    # Iterate through each phage in the MySQL database
    for gnm_pair in matched_genomes:
        mysqldb_gnm = gnm_pair.genome1
        phagesdb_gnm = gnm_pair.genome2

        # Not all phages have an associated GenBank-formatted file
        # available on PhagesDB. Check whether there is a flatfile for
        # this phage. Download the flatfile only if there is a date tag,
        # and only if that date is more recent than the date stored in
        # the MySQL database for that genome. The tagged date only
        # reflects when the file was uploaded to PhagesDB. The creation
        # date of the actual GenBank record is stored within the file,
        # and it too could be older than the current version in the
        # MySQL database; however, that is checked during the import
        # stage.
        set_phagesdb_gnm_date(phagesdb_gnm)
        set_phagesdb_gnm_file(phagesdb_gnm)
        if (phagesdb_gnm.filename != ""
                and phagesdb_gnm.date > mysqldb_gnm.date):
            # Save the file on the hard drive with the same name as
            # stored on PhagesDB
            flatfile_data = phagesdb.retrieve_url_data(phagesdb_gnm.filename)
            if flatfile_data == "":
                failed_list.append(mysqldb_gnm.id)
            else:
                save_phagesdb_file(flatfile_data, phagesdb_gnm, genome_folder)
                tkt = create_phagesdb_ticket(mysqldb_gnm.id)
                import_tickets.append(tkt)

    if len(import_tickets) > 0:
        print(f"\n\n{len(import_tickets)} genome(s) "
              "were retrieved from PhagesDB.")
        create_ticket_table(import_tickets, phagesdb_folder)

    if len(failed_list) > 0:
        print(f"{len(failed_list)} genome(s) failed to be retrieved:")
        for element in failed_list:
            print(element)
        input("\n\nPress ENTER to continue.")

    # Now remove empty folders.
    if len(basic.identify_contents(genome_folder, kind=None)) == 0:
        genome_folder.rmdir()
    if len(basic.identify_contents(phagesdb_folder, kind=None)) == 0:
        phagesdb_folder.rmdir()
Example #2
def get_files(directory, file, ignore):
    """
    Get the list of file(s) that need to be uploaded.

    :param directory: (optional) directory containing files for upload
    :type directory: pathlib.Path
    :param file: (optional) file to upload
    :type file: pathlib.Path
    :param ignore: file(s) to ignore during upload process
    :type ignore: set
    :return: list of the file(s) to be uploaded
    :rtype: list
    """
    file_list = []

    if directory is not None:
        directory = basic.set_path(directory, kind="dir", expect=True)
        folder_files = basic.identify_contents(directory,
                                               kind="file",
                                               ignore_set=ignore)
        file_list.extend(folder_files)

    if file is not None:
        file = basic.set_path(file, kind="file", expect=True)
        file_list.append(file)

    return file_list
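
A minimal usage sketch for get_files follows; the paths are placeholders, and basic.set_path is assumed to resolve and validate them as elsewhere in the pipeline:

import pathlib

# Hypothetical invocation: collect every file in ./to_upload except
# macOS Finder metadata, plus one standalone file.
upload_dir = pathlib.Path("./to_upload")
extra_file = pathlib.Path("./extra/database.sql")
file_list = get_files(upload_dir, extra_file, ignore={".DS_Store"})
for path in file_list:
    print(path.name)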
Example #3
def test_identify_contents_7(self):
    """Verify None is returned due to incorrect kind."""
    Path(self.base_dir, "new_dir1").mkdir()
    Path(self.base_dir, "new_dir2").mkdir()
    Path(self.base_dir, "file1.txt").touch()
    Path(self.base_dir, ".DS_Store").touch()
    list_of_items = basic.identify_contents(self.base_dir, kind="invalid")
    self.assertIsNone(list_of_items)
Example #4
def test_identify_contents_1(self):
    """Verify the correct number of files are returned when
    no ignore set is provided."""
    Path(self.base_dir, "new_dir").mkdir()
    Path(self.base_dir, "file1.txt").touch()
    Path(self.base_dir, ".DS_Store").touch()
    list_of_items = basic.identify_contents(self.base_dir, kind="file")
    exp_num_items = 2
    self.assertEqual(len(list_of_items), exp_num_items)
Example #5
def main(unparsed_args_list):
    """Run the push_db pipeline."""
    args = parse_args(unparsed_args_list)

    file_list = []
    if args.directory is not None:
        args.directory = basic.set_path(args.directory,
                                        kind="dir",
                                        expect=True)
        folder_files = basic.identify_contents(args.directory,
                                               kind="file",
                                               ignore_set=set([".DS_Store"]))
        file_list.extend(folder_files)
    if args.file is not None:
        args.file = basic.set_path(args.file, kind="file", expect=True)
        file_list.append(args.file)

    status = True
    if len(file_list) == 0:
        print("There are no files to upload.")
        status = False

    if status:
        server.set_log_file(str(args.log_file))
        transport = server.get_transport(constants.DB_HOST)
        if transport is None:
            status = False

    if status:
        sftp = server.setup_sftp_conn(transport, attempts=3)
        if sftp is None:
            status = False

    success = []
    fail = []
    if status:
        for local_filepath in file_list:
            print(f"Uploading {local_filepath.name}...")
            remote_filepath = pathlib.Path(constants.DB_HOST_DIR,
                                           local_filepath.name)
            result = server.upload_file(sftp, str(local_filepath),
                                        str(remote_filepath))
            if result:
                success.append(local_filepath.name)
            else:
                fail.append(local_filepath.name)
        sftp.close()
        transport.close()

    if len(fail) > 0:
        print("The following files were not uploaded:")
        for file in fail:
            print(file)
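
The server helpers are not shown here, but the transport/SFTP pattern above maps naturally onto paramiko. A sketch of the core steps under that assumption (host, credentials, and the boolean return convention are placeholders, not pdm_utils's actual API):

import paramiko

def upload_file_sketch(sftp, local_filepath, remote_filepath):
    """Sketch: upload one file over an established SFTP session.
    Returns True on success, False on failure."""
    try:
        sftp.put(local_filepath, remote_filepath)
        return True
    except OSError:
        return False

# Establishing the session (placeholder host and credentials):
# transport = paramiko.Transport(("server.example.org", 22))
# transport.connect(username="user", password="secret")
# sftp = paramiko.SFTPClient.from_transport(transport)
# ...upload files...
# sftp.close()
# transport.close()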
Example #6
def test_identify_contents_6(self):
    """Verify the correct number of files and folders are returned when
    an ignore set is provided."""
    Path(self.base_dir, "new_dir1").mkdir()
    Path(self.base_dir, "new_dir2").mkdir()
    Path(self.base_dir, "file1.txt").touch()
    Path(self.base_dir, ".DS_Store").touch()
    ignore_set = set(["new_dir2"])
    list_of_items = basic.identify_contents(self.base_dir, kind=None,
                                            ignore_set=ignore_set)
    exp_num_items = 3
    self.assertEqual(len(list_of_items), exp_num_items)
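
Taken together, the tests in Examples 3, 4, and 6 pin down the observable behavior of basic.identify_contents. The following is a reconstruction inferred from those tests, not the library's actual code:

import pathlib

def identify_contents_sketch(path, kind=None, ignore_set=None):
    """Sketch: return the items in path filtered by kind ('file', 'dir',
    or None for both), skipping any names in ignore_set; return None
    for any other kind value (see Example 3)."""
    if kind not in (None, "file", "dir"):
        return None
    if ignore_set is None:
        ignore_set = set()
    contents = []
    for item in sorted(path.iterdir()):
        if item.name in ignore_set:
            continue
        if kind is None:
            contents.append(item)
        elif kind == "file" and item.is_file():
            contents.append(item)
        elif kind == "dir" and item.is_dir():
            contents.append(item)
    return contents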
Example #7
def get_genbank_data(output_folder,
                     genome_dict,
                     ncbi_cred_dict={},
                     genbank_results=False,
                     force=False):
    """Run sub-pipeline to retrieve genomes from GenBank."""
    # Flow of the NCBI record retrieval process:
    # 1 Create list of phages to check for updates at NCBI (completed above)
    # 2 Using esearch, verify which accessions are valid
    # 3 Using esummary, get update date for each valid accession
    # 4 Using efetch, retrieve flat files for NCBI records newer than
    # the MySQL database date
    # 5 Save new records in a folder and create an import table for them

    print(f"\n\nDownloading genome(s) from GenBank.")
    # Create output folder
    ncbi_folder = pathlib.Path(output_folder, GENBANK_FOLDER)
    ncbi_folder.mkdir()
    ncbi_results_list = []

    # Iterate through each phage in the MySQL database
    tup1 = sort_by_accession(genome_dict, force=force)
    ncbi_results_list.extend(tup1[0])
    accession_dict = tup1[1]

    # More setup is needed if NCBI updates are desired. The NCBI Bookshelf
    # resource "The E-utilities In-Depth: Parameters, Syntax and More", by
    # Dr. Eric Sayers, recommends that a single request contain no more than
    # about 200 UIDs, so that is used as the batch size. All Entrez requests
    # must include the user's email address and tool name. (A sketch of this
    # credential step follows this example.)
    ncbi.set_entrez_credentials(tool=ncbi_cred_dict["tool"],
                                email=ncbi_cred_dict["email"],
                                api_key=ncbi_cred_dict["api_key"])

    results = retrieve_records(accession_dict, ncbi_folder, batch_size=200)
    ncbi_results_list.extend(results)

    # Record retrieval results for all phages.
    if genbank_results:
        output_genbank_summary(ncbi_folder, ncbi_results_list)

    # Print summary of script
    tallies = compute_genbank_tallies(ncbi_results_list)
    print_genbank_tallies(tallies)

    # Now remove empty folders.
    if len(basic.identify_contents(ncbi_folder, kind=None)) == 0:
        ncbi_folder.rmdir()
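
ncbi.set_entrez_credentials presumably wraps Biopython's Entrez module, which stores these values as module-level attributes. A sketch of what the call amounts to, assuming Biopython (not the pdm_utils source):

from Bio import Entrez

def set_entrez_credentials_sketch(tool=None, email=None, api_key=None):
    """Sketch: register identifying information with NCBI before
    making any E-utilities request."""
    if tool is not None:
        Entrez.tool = tool
    if email is not None:
        Entrez.email = email
    if api_key is not None:
        Entrez.api_key = api_key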
Example #8
def get_update_data(output_folder, matched_genomes):
    """Run sub-pipeline to retrieve field updates from PhagesDB."""
    updates_folder = pathlib.Path(output_folder, UPDATES_FOLDER)
    updates_folder.mkdir()
    update_tickets = []
    for gnm_pair in matched_genomes:
        tkt_list = compare_data(gnm_pair)
        update_tickets.extend(tkt_list)

    # Field updates
    if len(update_tickets) > 0:
        print(f"\n\n{len(update_tickets)} field updates are available.")
        filepath = pathlib.Path(updates_folder, UPDATE_TABLE)
        fileio.export_data_dict(update_tickets,
                                filepath,
                                UPDATE_COLUMNS,
                                include_headers=True)
    else:
        print("\n\nNo field updates.")

    # Now remove empty folders.
    if len(basic.identify_contents(updates_folder, kind=None)) == 0:
        updates_folder.rmdir()
Example #9
def retrieve_records(accession_dict, ncbi_folder, batch_size=200):
    """Retrieve GenBank records."""
    print("\n\nRetrieving records from NCBI")
    genome_folder = pathlib.Path(ncbi_folder, GENOME_FOLDER)
    genome_folder.mkdir()
    retrieval_errors = []
    results = []
    tickets_list = []
    accessions = list(accession_dict.keys())
    mod_accessions = [accession + "[ACCN]" for accession in accessions]

    # When retrieving in batches, first create the list of (start, stop)
    # index pairs indicating which slice of the accessions forms each batch.
    # For instance, five accessions with a batch size of two produce
    # batches starting at indices 0, 2, and 4. (A sketch of this batching
    # helper follows this example.)

    batch_indices = basic.create_indices(mod_accessions, batch_size)
    print(f"There are {len(mod_accessions)} GenBank accession(s) to check.")
    for indices in batch_indices:
        start = indices[0]
        stop = indices[1]
        print(f"Checking accessions {start + 1} to {stop}...")
        esearch_term = " | ".join(mod_accessions[start:stop])

        # Use esearch for each accession
        # First use esearch to verify the accessions are valid.
        search_record = ncbi.run_esearch(db="nucleotide",
                                         term=esearch_term,
                                         usehistory="y")
        search_count = int(search_record["Count"])
        search_webenv = search_record["WebEnv"]
        search_query_key = search_record["QueryKey"]

        # Keep track of the accessions that failed to be located in NCBI
        # Each accession in the error list is formatted "accession[ACCN]"
        current_batch_size = stop - start
        if search_count < current_batch_size:
            search_failure = search_record["ErrorList"]["PhraseNotFound"]
            for accession in search_failure:
                retrieval_errors.append(accession[:-6])

        # Now get summaries for these records using esummary
        summary_records = ncbi.get_summaries(db="nucleotide",
                                             query_key=search_query_key,
                                             webenv=search_webenv)
        results_tuple = get_accessions_to_retrieve(summary_records,
                                                   accession_dict)
        accessions_to_retrieve = results_tuple[0]
        results.extend(results_tuple[1])

        if len(accessions_to_retrieve) > 0:
            # Use efetch to retrieve the record.
            output_list = ncbi.get_records(accessions_to_retrieve,
                                           db="nucleotide",
                                           rettype="gb",
                                           retmode="text")

            # TODO check_record_date may be redundant. It checks date within the
            # record. Earlier in the pipeline, the docsum date has already been
            # checked though. So if docsum date is identical to date in the
            # record, this is redundant.
            tup = check_record_date(output_list, accession_dict)
            new_record_list = tup[0]
            # list of results dictionaries
            results.extend(tup[1])

            if len(new_record_list) > 0:
                tickets = save_and_tickets(new_record_list, accession_dict,
                                           genome_folder)
                tickets_list.extend(tickets)

    if len(tickets_list) > 0:
        create_ticket_table(tickets_list, ncbi_folder)

    # Remove genome folder if empty.
    if len(basic.identify_contents(genome_folder, kind=None)) == 0:
        genome_folder.rmdir()

    # Report the genomes that could not be retrieved.
    failed = process_failed_retrieval(retrieval_errors, accession_dict)
    results.extend(failed)

    return results
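
basic.create_indices is used above but not shown. A hypothetical reconstruction, consistent with the comment's example (five accessions and a batch size of two yield batches starting at 0, 2, and 4):

def create_indices_sketch(input_list, batch_size):
    """Sketch: return (start, stop) slice bounds that cover input_list
    in chunks of at most batch_size items."""
    indices = []
    for start in range(0, len(input_list), batch_size):
        indices.append((start, min(start + batch_size, len(input_list))))
    return indices

# Five items with batch_size=2 -> [(0, 2), (2, 4), (4, 5)]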
Example #10
def get_genbank_data(output_folder,
                     genome_dict,
                     ncbi_cred_dict={},
                     genbank_results=False):
    """Run sub-pipeline to retrieve genomes from GenBank."""
    # Flow of the NCBI record retrieval process:
    # 1 Create list of phages to check for updates at NCBI (completed above)
    # 2 Using esearch, verify which accessions are valid
    # 3 Using esummary, get update date for each valid accession
    # 4 Using efetch, retrieve flat files for NCBI records newer than
    # the MySQL database date
    # 5 Save new records in a folder and create an import table for them

    # Create output folder
    ncbi_folder = pathlib.Path(output_folder, "genbank")
    ncbi_folder.mkdir()

    ncbi_results_list = []
    tallies = {}
    tallies["total"] = len(genome_dict.keys())

    # Iterate through each phage in the MySQL database
    result_tuple1 = sort_by_accession(genome_dict)
    tallies["not_auto_updated"] = result_tuple1[0]
    tallies["no_accession"] = result_tuple1[1]
    tallies["duplicate_accession"] = result_tuple1[2]
    ncbi_results_list.extend(result_tuple1[3])
    unique_accession_dict = result_tuple1[4]

    # More setup is needed if NCBI updates are desired. The NCBI Bookshelf
    # resource "The E-utilities In-Depth: Parameters, Syntax and More", by
    # Dr. Eric Sayers, recommends that a single request contain no more than
    # about 200 UIDs, so that is used as the batch size. All Entrez requests
    # must include the user's email address and tool name.
    ncbi.set_entrez_credentials(tool=ncbi_cred_dict["ncbi_tool"],
                                email=ncbi_cred_dict["ncbi_email"],
                                api_key=ncbi_cred_dict["ncbi_api_key"])

    results_tuple2 = retrieve_records(unique_accession_dict, batch_size=200)
    tallies["docsum_not_new"] = results_tuple2[0]
    retrieved_record_list = results_tuple2[1]
    retrieval_error_list = results_tuple2[2]
    ncbi_results_list.extend(results_tuple2[3])

    # Report the genomes that could not be retrieved.
    results3 = process_failed_retrieval(retrieval_error_list,
                                        unique_accession_dict)
    ncbi_results_list.extend(results3)
    tallies["retrieval_failure"] = len(retrieval_error_list)

    results_tuple4 = check_record_date(retrieved_record_list,
                                       unique_accession_dict)
    new_record_list = results_tuple4[0]
    ncbi_results_list.extend(results_tuple4[1])

    tallies["retrieved_for_import"] = len(new_record_list)
    tallies["record_not_new"] = (len(retrieved_record_list) -
                                 len(new_record_list))

    if len(new_record_list) > 0:
        save_files_and_tkts(new_record_list, unique_accession_dict,
                            ncbi_folder)

    # Record retrieval results for all phages.
    if genbank_results:
        filepath3 = basic.prepare_filepath(ncbi_folder, "genbank_results.csv")
        basic.export_data_dict(ncbi_results_list,
                               filepath3,
                               NCBI_RESULTS_COLUMNS,
                               include_headers=True)

    # Print summary of script
    tallies["auto_updated"] = tallies["total"] - tallies["not_auto_updated"]
    tallies["accession"] = (tallies["auto_updated"] - tallies["no_accession"] -
                            tallies["duplicate_accession"])

    print("\n\n\nSummary of GenBank data retrieval:")
    print("Of the genomes in the MySQL database:")
    print(f"{tallies['total']:>6}: total")
    print(f"{tallies['not_auto_updated']:>6}: not auto-updated")
    print(f"{tallies['auto_updated']:>6}: auto-updated")

    print("\nOf the auto-updated genomes:")
    print(f"{tallies['no_accession']:>6}: no accession")
    print(f"{tallies['duplicate_accession']:>6}: duplicated accession")
    print(f"{tallies['accession']:>6}: unique accession")

    print("\nOf the auto-updated genomes with unique accessions:")
    print(f"{tallies['retrieval_failure']:>6}: could not be retrieved")
    print(f"{tallies['docsum_not_new']:>6}: retrieved but docsum not new")
    print(f"{tallies['record_not_new']:>6}: retrieved but record not new")
    print(f"{tallies['retrieved_for_import']:>6}: retrieved for import")

    # Now remove empty folders.
    if len(basic.identify_contents(ncbi_folder, kind=None)) == 0:
        ncbi_folder.rmdir()
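
To make the tally arithmetic concrete, a short worked example with invented numbers:

tallies = {"total": 100, "not_auto_updated": 20,
           "no_accession": 5, "duplicate_accession": 3}
tallies["auto_updated"] = tallies["total"] - tallies["not_auto_updated"]
# auto_updated = 100 - 20 = 80
tallies["accession"] = (tallies["auto_updated"] - tallies["no_accession"] -
                        tallies["duplicate_accession"])
# unique accessions = 80 - 5 - 3 = 72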
Example #11
def get_final_data(output_folder, matched_genomes):
    """Run sub-pipeline to retrieve 'final' genomes from PhagesDB."""

    phagesdb_folder = pathlib.Path(output_folder, "phagesdb")
    phagesdb_folder.mkdir()
    genome_folder = pathlib.Path(phagesdb_folder, GENOMES_DIR)
    genome_folder.mkdir()
    import_tickets = []
    failed_list = []

    # Iterate through each phage in the MySQL database
    for gnm_pair in matched_genomes:
        mysqldb_gnm = gnm_pair.genome1
        phagesdb_gnm = gnm_pair.genome2

        # Not all phages have an associated GenBank-formatted file
        # available on PhagesDB. Check whether there is a flatfile for
        # this phage. Download the flatfile only if there is a date tag,
        # and only if that date is more recent than the date stored in
        # the MySQL database for that genome. The tagged date only
        # reflects when the file was uploaded to PhagesDB. The creation
        # date of the actual GenBank record is stored within the file,
        # and it too could be older than the current version in the
        # MySQL database; however, that is checked during the import
        # stage.
        set_phagesdb_gnm_date(phagesdb_gnm)
        set_phagesdb_gnm_file(phagesdb_gnm)
        if (phagesdb_gnm.filename != ""
                and phagesdb_gnm.date > mysqldb_gnm.date):
            # Save the file on the hard drive with the same name as
            # stored on PhagesDB
            flatfile_data = phagesdb.retrieve_url_data(phagesdb_gnm.filename)
            if flatfile_data == "":
                failed_list.append(mysqldb_gnm.id)
            else:
                flatfile_filename = phagesdb_gnm.filename.split("/")[-1]
                flatfile_path = pathlib.Path(genome_folder, flatfile_filename)
                with flatfile_path.open("w") as fh:
                    fh.write(flatfile_data)
                # Create the new import ticket.
                # Since the PhagesDB phage has been matched to the
                # MySQL database phage, the AnnotationAuthor field
                # could be assigned from the current mysqldb author
                # variable. However, since this GenBank-formatted
                # file is acquired through PhagesDB, the annotation
                # status is expected to be 'final' and the annotation
                # author is expected to be 'hatfull'.
                tkt = ticket.ImportTicket()
                tkt.type = "replace"
                tkt.phage_id = mysqldb_gnm.id
                tkt.data_dict["host_genus"] = "retrieve"
                tkt.data_dict["cluster"] = "retrieve"
                tkt.data_dict["subcluster"] = "retrieve"
                tkt.data_dict["annotation_status"] = "final"
                tkt.data_dict["annotation_author"] = 1
                tkt.description_field = "product"
                tkt.data_dict["accession"] = "retrieve"
                tkt.eval_mode = "final"
                # TODO secondary_phage_id data is for old ticket format.
                tkt.data_dict["secondary_phage_id"] = mysqldb_gnm.id
                tkt.data_dict["retrieve_record"] = 1
                import_tickets.append(tkt)

    count1 = len(import_tickets)
    if count1 > 0:
        print(f"\n\n{count1} phage(s) were retrieved from PhagesDB.")
        filepath = basic.prepare_filepath(phagesdb_folder,
                                          "legacy_import_table.csv")
        import_tickets1 = convert_tickets_to_dict(import_tickets,
                                                  old_format=True)
        basic.export_data_dict(import_tickets1, filepath, IMPORT_COLUMNS1)

        # TODO new dictwriter. Use this block instead of above once the
        # new import script is functioning.
        if BOTH:
            filepath2 = basic.prepare_filepath(phagesdb_folder,
                                               "import_table.csv")
            import_tickets2 = convert_tickets_to_dict(import_tickets)
            basic.export_data_dict(import_tickets2,
                                   filepath2,
                                   IMPORT_COLUMNS2,
                                   include_headers=True)

    if len(failed_list) > 0:
        print(f"{len(failed_list)} phage(s) failed to be retrieved:")
        for element in failed_list:
            print(element)
        input("\n\nPress ENTER to continue.")

    # Now remove empty folders.
    if len(basic.identify_contents(genome_folder, kind=None)) == 0:
        genome_folder.rmdir()
    if len(basic.identify_contents(phagesdb_folder, kind=None)) == 0:
        phagesdb_folder.rmdir()
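
Example #1 calls a save_phagesdb_file helper where this version writes the file inline. A reconstruction consistent with both versions (assumed, not the actual helper):

import pathlib

def save_phagesdb_file_sketch(flatfile_data, phagesdb_gnm, genome_folder):
    """Sketch: write retrieved flatfile text to disk, keeping the same
    basename the file has on PhagesDB."""
    flatfile_filename = phagesdb_gnm.filename.split("/")[-1]
    flatfile_path = pathlib.Path(genome_folder, flatfile_filename)
    with flatfile_path.open("w") as fh:
        fh.write(flatfile_data)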