Esempio n. 1
0
def retrieve_drafts(output_folder, phage_list):
    """Download auto-annotated 'draft' genomes from PECAAN.

    The URL for each phage is built by appending its name to
    constants.PECAAN_PREFIX. An empty response is treated as a failed
    retrieval; otherwise the data is saved into the genome folder and
    an import ticket is created for the phage.

    Args:
        output_folder: Directory in which the GENOME_FOLDER sub-folder
            is created and which is passed to create_ticket_table().
        phage_list: Iterable of phage names to retrieve.

    Raises:
        FileExistsError: Propagated from mkdir() if the genome folder
            already exists.
    """
    drafts_dir = pathlib.Path(output_folder) / GENOME_FOLDER
    drafts_dir.mkdir()
    draft_tickets = []
    not_retrieved = []

    for phage_name in phage_list:
        url = constants.PECAAN_PREFIX + phage_name
        draft_data = phagesdb.retrieve_url_data(url)

        # An empty string means no data could be retrieved.
        if draft_data == "":
            print(f"Error: unable to retrieve {phage_name} draft genome.")
            print(url)
            not_retrieved.append(phage_name)
            continue

        save_pecaan_file(draft_data, phage_name, drafts_dir)
        draft_tickets.append(create_draft_ticket(phage_name))
        print(f"{phage_name} retrieved from PECAAN.")

    # The ticket table is only written if something was retrieved.
    if draft_tickets:
        create_ticket_table(draft_tickets, output_folder)
        print(f"{len(draft_tickets)} phage(s) were successfully retrieved")

    if not_retrieved:
        print(f"{len(not_retrieved)} phage(s) failed to be retrieved:")
        print(*not_retrieved, sep="\n")
        # Block until the user acknowledges the failures.
        input("\n\nPress ENTER to continue.")
Esempio n. 2
0
    def test_retrieve_url_data_2(self):
        """Verify an empty string is returned when no fasta is retrieved."""
        # This URL is expected not to resolve to a fasta file.
        bad_url = "https://phagesdb.org/media/fastas/L5_x.fasta"
        retrieved = phagesdb.retrieve_url_data(bad_url)
        self.assertEqual(retrieved, "")
Esempio n. 3
0
    def test_retrieve_url_data_1(self):
        """Verify fasta data is retrieved from a PhagesDB fasta URL."""
        l5_url = "https://phagesdb.org/media/fastas/L5.fasta"
        expected_header = ">Mycobacterium phage L5"
        retrieved = phagesdb.retrieve_url_data(l5_url)
        # Only the leading characters of the retrieved data are compared
        # against the expected start of the fasta header.
        self.assertEqual(retrieved[:len(expected_header)], expected_header)
Esempio n. 4
0
def get_final_data(output_folder, matched_genomes):
    """Run the sub-pipeline that retrieves 'final' genomes from PhagesDB.

    Creates a PHAGESDB_FOLDER directory (with a GENOME_FOLDER directory
    inside it) under output_folder, downloads flat files for genomes
    that qualify, writes a ticket table for the downloaded genomes, and
    finally removes whichever of the two directories ended up empty.

    Args:
        output_folder: Directory in which the PhagesDB folder is created.
        matched_genomes: Iterable of genome pairs; genome1 is the MySQL
            database genome and genome2 is the matched PhagesDB genome.

    Raises:
        FileExistsError: Propagated from mkdir() if either directory
            already exists.
    """
    print("\n\nDownloading genome(s) from PhagesDB.")
    phagesdb_dir = pathlib.Path(output_folder) / PHAGESDB_FOLDER
    phagesdb_dir.mkdir()
    flatfile_dir = phagesdb_dir / GENOME_FOLDER
    flatfile_dir.mkdir()
    tickets = []
    not_retrieved = []

    for pair in matched_genomes:
        local_gnm = pair.genome1
        remote_gnm = pair.genome2

        # Not every phage has a Genbank-formatted file on PhagesDB, so
        # first determine the file and its date tag. The file is only
        # downloaded when a file exists and its date tag is more recent
        # than the date stored in the MySQL database. The date tag only
        # reflects when the file was uploaded to PhagesDB; the date of
        # the Genbank record itself is stored within the file and may
        # still be older than the MySQL database version, but that is
        # checked later during the import stage.
        set_phagesdb_gnm_date(remote_gnm)
        set_phagesdb_gnm_file(remote_gnm)

        if remote_gnm.filename == "":
            continue
        if not (remote_gnm.date > local_gnm.date):
            continue

        flatfile = phagesdb.retrieve_url_data(remote_gnm.filename)
        if flatfile == "":
            # An empty string means the download failed.
            not_retrieved.append(local_gnm.id)
            continue

        # The file is saved locally and a ticket is created for the
        # MySQL genome id.
        save_phagesdb_file(flatfile, remote_gnm, flatfile_dir)
        tickets.append(create_phagesdb_ticket(local_gnm.id))

    if tickets:
        print(f"\n\n{len(tickets)} genome(s) were retrieved from PhagesDB.")
        create_ticket_table(tickets, phagesdb_dir)

    if not_retrieved:
        print(f"{len(not_retrieved)} genome(s) failed to be retrieved:")
        print(*not_retrieved, sep="\n")
        # Block until the user acknowledges the failures.
        input("\n\nPress ENTER to continue.")

    # Remove empty folders, inner folder first so that the outer folder
    # can become empty as a result.
    for folder in (flatfile_dir, phagesdb_dir):
        if len(basic.identify_contents(folder, kind=None)) == 0:
            folder.rmdir()
Esempio n. 5
0
def retrieve_drafts(output_folder, phage_list):
    """Download auto-annotated 'draft' genomes from PECAAN.

    Each successfully retrieved genome is written to
    '<phage name>.txt' inside the GENOMES_DIR sub-folder of
    output_folder, and an 'add' import ticket is created for it. The
    tickets are then exported as import table(s) in output_folder.

    Args:
        output_folder: Directory in which the genome folder and the
            import table(s) are created.
        phage_list: Sized collection of phage names; each name is
            appended to constants.PECAAN_PREFIX to form the URL.

    Raises:
        FileExistsError: Propagated from mkdir() if the genome folder
            already exists.
    """
    print(f"\n\nRetrieving {len(phage_list)} new phages from PECAAN")
    drafts_dir = pathlib.Path(output_folder) / GENOMES_DIR
    drafts_dir.mkdir()

    import_tickets = []
    not_retrieved = []

    for phage_name in phage_list:
        url = constants.PECAAN_PREFIX + phage_name
        draft_data = phagesdb.retrieve_url_data(url)

        # An empty string means no data could be retrieved.
        if draft_data == "":
            print(f"Error: unable to retrieve {phage_name} draft genome.")
            print(url)
            not_retrieved.append(phage_name)
            continue

        with (drafts_dir / f"{phage_name}.txt").open("w") as handle:
            handle.write(draft_data)

        draft_tkt = ticket.ImportTicket()
        draft_tkt.type = "add"
        draft_tkt.phage_id = phage_name
        draft_tkt.description_field = "product"
        draft_tkt.eval_mode = "draft"
        draft_tkt.data_dict.update({
            "host_genus": "retrieve",
            "cluster": "retrieve",
            "subcluster": "retrieve",
            "annotation_status": "draft",
            "annotation_author": 1,
            "accession": "none",
            # TODO secondary_phage_id data is for old ticket format.
            "secondary_phage_id": "none",
            "retrieve_record": 1,
        })
        import_tickets.append(draft_tkt)

        print(f"{phage_name} retrieved from PECAAN.")

    # Exactly one ticket is created per retrieved genome, so the number
    # of tickets is also the number of retrieved genomes.
    retrieved_count = len(import_tickets)

    # Now make the import table.
    if retrieved_count > 0:
        legacy_path = basic.prepare_filepath(output_folder,
                                             "legacy_import_table.csv")
        legacy_rows = convert_tickets_to_dict(import_tickets,
                                              old_format=True)
        basic.export_data_dict(legacy_rows, legacy_path, IMPORT_COLUMNS1)

        # TODO new dictwriter. Use this block instead of above once the
        # new import script is functioning.
        if BOTH:
            table_path = basic.prepare_filepath(output_folder,
                                                "import_table.csv")
            table_rows = convert_tickets_to_dict(import_tickets)
            basic.export_data_dict(table_rows,
                                   table_path,
                                   IMPORT_COLUMNS2,
                                   include_headers=True)

    # Report results
    if retrieved_count > 0:
        print(f"{retrieved_count} phage(s) were successfully retrieved")

    if not_retrieved:
        print(f"{len(not_retrieved)} phage(s) failed to be retrieved:")
        print(*not_retrieved, sep="\n")
        # Block until the user acknowledges the failures.
        input("\n\nPress ENTER to continue.")
Esempio n. 6
0
def get_final_data(output_folder, matched_genomes):
    """Run the sub-pipeline that retrieves 'final' genomes from PhagesDB.

    Creates a 'phagesdb' directory (with a GENOMES_DIR directory inside
    it) under output_folder, downloads flat files for genomes that
    qualify, creates a 'replace' import ticket for each downloaded
    genome, exports the tickets as import table(s), and finally removes
    whichever of the two directories ended up empty.

    Args:
        output_folder: Directory in which the 'phagesdb' folder is
            created.
        matched_genomes: Iterable of genome pairs; genome1 is the MySQL
            database genome and genome2 is the matched PhagesDB genome.

    Raises:
        FileExistsError: Propagated from mkdir() if either directory
            already exists.
    """
    phagesdb_dir = pathlib.Path(output_folder) / "phagesdb"
    phagesdb_dir.mkdir()
    flatfile_dir = phagesdb_dir / GENOMES_DIR
    flatfile_dir.mkdir()
    import_tickets = []
    not_retrieved = []

    for pair in matched_genomes:
        local_gnm = pair.genome1
        remote_gnm = pair.genome2

        # Not every phage has a Genbank-formatted file on PhagesDB, so
        # first determine the file and its date tag. The file is only
        # downloaded when a file exists and its date tag is more recent
        # than the date stored in the MySQL database. The date tag only
        # reflects when the file was uploaded to PhagesDB; the date of
        # the Genbank record itself is stored within the file and may
        # still be older than the MySQL database version, but that is
        # checked later during the import stage.
        set_phagesdb_gnm_date(remote_gnm)
        set_phagesdb_gnm_file(remote_gnm)

        if remote_gnm.filename == "":
            continue
        if not (remote_gnm.date > local_gnm.date):
            continue

        flatfile = phagesdb.retrieve_url_data(remote_gnm.filename)
        if flatfile == "":
            # An empty string means the download failed.
            not_retrieved.append(local_gnm.id)
            continue

        # Save the file locally under the same name it has on PhagesDB
        # (the last component of the URL).
        local_name = remote_gnm.filename.split("/")[-1]
        with (flatfile_dir / local_name).open("w") as handle:
            handle.write(flatfile)

        # Create the new import ticket. Because the PhagesDB phage has
        # been matched to the MySQL database phage, the annotation
        # author could be taken from the current MySQL record. However,
        # since this Genbank-formatted file comes from PhagesDB, the
        # annotation status is expected to be 'final' and the
        # annotation author is expected to be 'hatfull'.
        final_tkt = ticket.ImportTicket()
        final_tkt.type = "replace"
        final_tkt.phage_id = local_gnm.id
        final_tkt.description_field = "product"
        final_tkt.eval_mode = "final"
        final_tkt.data_dict.update({
            "host_genus": "retrieve",
            "cluster": "retrieve",
            "subcluster": "retrieve",
            "annotation_status": "final",
            "annotation_author": 1,
            "accession": "retrieve",
            # TODO secondary_phage_id data is for old ticket format.
            "secondary_phage_id": local_gnm.id,
            "retrieve_record": 1,
        })
        import_tickets.append(final_tkt)

    if import_tickets:
        print(f"\n\n{len(import_tickets)} phage(s) were retrieved "
              "from PhagesDB.")
        legacy_path = basic.prepare_filepath(phagesdb_dir,
                                             "legacy_import_table.csv")
        legacy_rows = convert_tickets_to_dict(import_tickets,
                                              old_format=True)
        basic.export_data_dict(legacy_rows, legacy_path, IMPORT_COLUMNS1)

        # TODO new dictwriter. Use this block instead of above once the
        # new import script is functioning.
        if BOTH:
            table_path = basic.prepare_filepath(phagesdb_dir,
                                                "import_table.csv")
            table_rows = convert_tickets_to_dict(import_tickets)
            basic.export_data_dict(table_rows,
                                   table_path,
                                   IMPORT_COLUMNS2,
                                   include_headers=True)

    if not_retrieved:
        print(f"{len(not_retrieved)} phage(s) failed to be retrieved:")
        print(*not_retrieved, sep="\n")
        # Block until the user acknowledges the failures.
        input("\n\nPress ENTER to continue.")

    # Remove empty folders, inner folder first so that the outer folder
    # can become empty as a result.
    for folder in (flatfile_dir, phagesdb_dir):
        if len(basic.identify_contents(folder, kind=None)) == 0:
            folder.rmdir()