Example #1
def output_genbank_summary(output_folder, results):
    """Save summary of GenBank retrieval results to file."""
    filepath = pathlib.Path(output_folder, GENBANK_RESULTS_TABLE)
    basic.export_data_dict(results,
                           filepath,
                           NCBI_RESULTS_COLUMNS,
                           include_headers=True)
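Every example in this section delegates the actual file writing to basic.export_data_dict, which is not shown in these excerpts. As a rough orientation only, a minimal stand-in built on csv.DictWriter might look like the sketch below; the name export_data_dict_sketch and its exact keyword handling are illustrative assumptions, not the pdm_utils implementation.

import csv
import pathlib

def export_data_dict_sketch(data_dicts, file_path, headers, include_headers=False):
    """Hypothetical minimal equivalent: write a list of dicts to a csv file."""
    # Assumes each dict's keys are a superset of `headers`; extra keys are ignored.
    file_path = pathlib.Path(file_path)
    with file_path.open("w", newline="") as fh:
        writer = csv.DictWriter(fh, fieldnames=headers, extrasaction="ignore")
        if include_headers:
            writer.writeheader()
        for data_dict in data_dicts:
            writer.writerow(data_dict)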
Example #2
def create_ticket_table(tickets, output_folder):
    """Save tickets associated with retrieved from GenBank files."""
    filepath = pathlib.Path(output_folder, IMPORT_TABLE)
    tickets = convert_tickets_to_dict(tickets)
    basic.export_data_dict(tickets,
                           filepath,
                           IMPORT_COLUMNS,
                           include_headers=True)
Example #3
def execute_csv_export(db_filter,
                       export_path,
                       folder_path,
                       columns,
                       csv_name,
                       sort=[],
                       raw_bytes=False,
                       verbose=False):
    """Executes csv export of a MySQL database table with select columns.

    :param db_filter: A connected and fully built Filter object.
    :type db_filter: Filter
    :param export_path: Path to a dir for file creation.
    :type export_path: Path
    :param folder_path: Path to a top-level dir.
    :type folder_path: Path
    :param columns: SQLAlchemy Column objects to select and export.
    :type columns: list[Column]
    :param csv_name: Name for the csv file to be written.
    :type csv_name: str
    :param sort: A list of SQLAlchemy Columns to sort by.
    :type sort: list[Column]
    :param raw_bytes: A boolean value to toggle decoding of raw byte values.
    :type raw_bytes: bool
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    """
    if verbose:
        relative_path = str(export_path.relative_to(folder_path))
        print(f"Preparing {csv_name} export for '{relative_path}'...")

    headers = [db_filter._key.name]
    for column in columns:
        if column.name != db_filter._key.name:
            headers.append(column.name)

    results = db_filter.select(columns)

    if not raw_bytes:
        decode_results(results, columns, verbose=verbose)

    if len(results) == 0:
        print(f"No database entries received for {csv_name}.")
        export_path.rmdir()

    else:
        if verbose:
            print(f"...Writing csv {csv_name}.csv in '{export_path.name}'...")
            print("......Database entries retrieved: {len(results)}")

        file_path = export_path.joinpath(f"{csv_name}.csv")
        basic.export_data_dict(results,
                               file_path,
                               headers,
                               include_headers=True)
Example #4
    def setUpClass(self):
        test_db_utils.create_filled_test_db()

        self.test_dir = Path(TEST_DIR)
        if self.test_dir.is_dir():
            shutil.rmtree(TEST_DIR)

        self.test_dir.mkdir()
        self.resubmit_form = self.test_dir.joinpath("resubmit_form.txt")

        basic.export_data_dict(TEST_DATA, self.resubmit_form, PF_HEADER,
                               include_headers=True)
Example #5
def save_files_and_tkts(record_list, accession_dict, output_folder):
    """Save flat files retrieved from GenBank and create import tickets."""
    import_tickets = []
    genome_folder = pathlib.Path(output_folder, GENOMES_DIR)
    genome_folder.mkdir()
    for record in record_list:
        accession = record.name
        accession = accession.split('.')[0]
        gnm = accession_dict[accession]
        ncbi_filename = f"{gnm.name.lower()}__{accession}.gb"
        flatfile_path = pathlib.Path(genome_folder, ncbi_filename)
        SeqIO.write(record, str(flatfile_path), "genbank")

        tkt = ticket.ImportTicket()
        tkt.type = "replace"
        tkt.phage_id = gnm.id
        tkt.data_dict["host_genus"] = gnm.host_genus
        tkt.data_dict["cluster"] = gnm.cluster
        tkt.data_dict["subcluster"] = gnm.subcluster
        tkt.data_dict["annotation_status"] = gnm.annotation_status
        tkt.data_dict["annotation_author"] = gnm.annotation_author
        tkt.description_field = "product"
        # Accession is set to 'parse' to ensure that during import,
        # the file's accession is directly compared to the database
        # record's accession.
        # tkt.data_dict["accession"] = gnm.accession
        tkt.data_dict["accession"] = "parse"
        tkt.eval_mode = "auto"
        # TODO secondary_phage_id data is for old ticket format.
        tkt.data_dict["secondary_phage_id"] = gnm.id
        tkt.data_dict["retrieve_record"] = 1
        import_tickets.append(tkt)

    # Now make the import table.
    if len(import_tickets) > 0:
        filepath = basic.prepare_filepath(output_folder,
                                          "legacy_import_table.csv")
        import_tickets1 = convert_tickets_to_dict(import_tickets,
                                                  old_format=True)
        basic.export_data_dict(import_tickets1, filepath, IMPORT_COLUMNS1)

        # TODO new dictwriter. Use this block instead of above once the
        # new import script is functioning.
        if BOTH:
            filepath2 = basic.prepare_filepath(output_folder,
                                               "import_table.csv")
            import_tickets2 = convert_tickets_to_dict(import_tickets)
            basic.export_data_dict(import_tickets2,
                                   filepath2,
                                   IMPORT_COLUMNS2,
                                   include_headers=True)
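Examples #2, #5, #11, and #13 also rely on convert_tickets_to_dict, which is likewise not shown in these excerpts. A heavily simplified, hypothetical sketch of the idea (flattening ImportTicket objects into csv-ready dictionaries) could look like this; the field names mirror the ticket attributes set above and are assumptions, not the actual pdm_utils column sets.

def convert_tickets_to_dict_sketch(tickets, old_format=False):
    """Hypothetical sketch: flatten ImportTicket objects into csv-ready dicts."""
    dict_list = []
    for tkt in tickets:
        tkt_dict = {"type": tkt.type,
                    "phage_id": tkt.phage_id,
                    "description_field": tkt.description_field,
                    "eval_mode": tkt.eval_mode}
        # host_genus, cluster, subcluster, accession, etc. live in data_dict.
        tkt_dict.update(tkt.data_dict)
        if old_format:
            # The legacy table presumably uses different column names/order;
            # that mapping is omitted from this sketch.
            pass
        dict_list.append(tkt_dict)
    return dict_list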
Example #6
def write_report(data, export_path, header, csv_name="PhamReport",
                                            verbose=False):
    """Outputs a csv file

    """
    if not export_path.is_dir():
        print("Passed in path is not a directory.")
        sys.exit(1)

    file_path = export_path.joinpath(f"{csv_name}.csv")
    if verbose:
        print(f"Writing {file_path.name} in {export_path.name}...")

    basic.export_data_dict(data, file_path, header, include_headers=True)
Example #7
    def test_export_data_dict_1(self):
        """Verify data is exported correctly."""

        list_of_data = [self.tkt_dict1, self.tkt_dict2]
        headers = ["type", "phage_id", "host_genus", "cluster"]
        basic.export_data_dict(list_of_data,
                               self.export_file,
                               headers,
                               include_headers=True)

        exp_success_tkts = []
        with open(self.export_file, 'r') as file:
            file_reader = csv.DictReader(file)
            for row in file_reader:
                exp_success_tkts.append(row)

        with self.subTest():
            self.assertEqual(len(exp_success_tkts), 2)
        with self.subTest():
            self.assertEqual(set(exp_success_tkts[0].keys()), set(headers))
Example #8
def execute_csv_export(alchemist,
                       export_path,
                       table="phage",
                       values=[],
                       verbose=False):
    remove_fields = {
        "phage": ["Sequence"],
        "gene": ["Translation"],
        "domain": [],
        "gene_domain": [],
        "pham": [],
        "pham_color": [],
        "trna": ["Sequence"],
        "tmrna": [],
        "trna_structures": []
    }

    table_obj = alchemist.get_table(table)

    select_columns = []
    headers = []
    for column in table_obj.columns:
        if column.name not in remove_fields[table]:
            select_columns.append(column)
            headers.append(column.name)

    for column in table_obj.primary_key.columns:
        primary_key = column

    query = querying.build_select(alchemist.graph, select_columns)

    if values:
        query = query.where(primary_key.in_(values))

    results = alchemist.execute(query)

    file_path = export_path.joinpath(f"{table}.csv")
    basic.export_data_dict(results, file_path, headers, include_headers=True)
Example #9
def get_update_data(output_folder, matched_genomes):
    """Run sub-pipeline to retrieve field updates from PhagesDB."""
    updates_folder = pathlib.Path(output_folder, UPDATES_FOLDER)
    updates_folder.mkdir()
    update_tickets = []
    for gnm_pair in matched_genomes:
        tkt_list = compare_data(gnm_pair)
        update_tickets.extend(tkt_list)

    # Field updates
    if len(update_tickets) > 0:
        print(f"\n\n{len(update_tickets)} field updates are available.")
        filepath = pathlib.Path(updates_folder, UPDATE_TABLE)
        basic.export_data_dict(update_tickets,
                               filepath,
                               UPDATE_COLUMNS,
                               include_headers=True)
    else:
        print("\n\nNo field updates.")

    # Now remove empty folders.
    if len(basic.identify_contents(updates_folder, kind=None)) == 0:
        updates_folder.rmdir()
Example #10
def execute_resubmit(alchemist,
                     revisions_data_dicts,
                     folder_path,
                     folder_name,
                     filters="",
                     groups=[],
                     verbose=False):
    """Executes the entirety of the genbank resubmit pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param revisions_data_dicts: Data dictionaries containing pham/notes data.
    :type revisions_data_dicts: list[dict]
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param filters: A conditional string used to filter the selected data.
    :type filters: str
    :param groups: Column names used to group and nest the export folders.
    :type groups: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    """
    db_filter = Filter(alchemist=alchemist)
    db_filter.key = "gene.PhamID"
    db_filter.add(BASE_CONDITIONALS)

    if filters != "":
        try:
            db_filter.add(filters)
        except Exception:
            print("Please check your syntax for the conditional string:\n"
                  f"{filters}")

    resubmit_columns = db_filter.get_columns(RESUBMIT_COLUMNS)

    phams = []
    for data_dict in revisions_data_dicts:
        phams.append(data_dict["Pham"])

    db_filter.values = phams

    if verbose:
        print("Creating export folder...")
    export_path = folder_path.joinpath(folder_name)
    export_path = basic.make_new_dir(folder_path, export_path, attempt=50)

    conditionals_map = {}
    export_db.build_groups_map(db_filter,
                               export_path,
                               conditionals_map,
                               groups=groups,
                               verbose=verbose)

    if verbose:
        print("Prepared query and path structure, beginning review export...")

    for mapped_path in conditionals_map.keys():
        if verbose:
            print("Retreiving phage data for pham revisions...")
        export_dicts = []
        for data_dict in revisions_data_dicts:
            if verbose:
                print(f"...Retrieving data for pham {data_dict['Pham']}...")

            conditionals = conditionals_map[mapped_path]

            final_call = data_dict["Final Call"]
            if final_call == "Hypothetical Protein":
                final_call = ""
            conditionals.append(
                querying.build_where_clause(alchemist.graph,
                                            f"gene.Notes!={final_call}"))

            query = querying.build_select(alchemist.graph,
                                          resubmit_columns,
                                          where=conditionals)

            results = querying.execute(alchemist.engine,
                                       query,
                                       in_column=db_filter.key,
                                       values=[data_dict["Pham"]])

            for result in results:
                format_resubmit_data(result, data_dict["Final Call"])
                export_dicts.append(result)

        if not export_dicts:
            if verbose:
                print("'{mapped_path.name}' data selected for resubmision "
                      "matches selected call; no resubmision exported...")

            mapped_path.rmdir()
            continue

        export_dicts = sorted(export_dicts,
                              key=lambda export_dict: export_dict["Phage"])

        if verbose:
            print(f"Writing {CSV_NAME} in {mapped_path.name}...")
        file_path = mapped_path.joinpath(CSV_NAME)
        basic.export_data_dict(export_dicts,
                               file_path,
                               RESUBMIT_HEADER,
                               include_headers=True)
Example #11
def retrieve_drafts(output_folder, phage_list):
    """Retrieve auto-annotated 'draft' genomes from PECAAN."""

    print(f"\n\nRetrieving {len(phage_list)} new phages from PECAAN")
    genome_folder = pathlib.Path(output_folder, GENOMES_DIR)
    genome_folder.mkdir()

    # Keep track of how many genomes were retrieved from PECAAN
    retrieved_tally = 0
    failed_list = []
    import_tickets = []

    # Iterate through each row in the file
    for new_phage in phage_list:
        pecaan_link = constants.PECAAN_PREFIX + new_phage
        response = phagesdb.retrieve_url_data(pecaan_link)
        if response == "":
            print(f"Error: unable to retrieve {new_phage} draft genome.")
            print(pecaan_link)
            failed_list.append(new_phage)
        else:
            pecaan_filename = f"{new_phage}.txt"
            pecaan_filepath = pathlib.Path(genome_folder, pecaan_filename)
            with pecaan_filepath.open("w") as fh:
                fh.write(response)

            tkt = ticket.ImportTicket()
            tkt.type = "add"
            tkt.phage_id = new_phage
            tkt.data_dict["host_genus"] = "retrieve"
            tkt.data_dict["cluster"] = "retrieve"
            tkt.data_dict["subcluster"] = "retrieve"
            tkt.data_dict["annotation_status"] = "draft"
            tkt.data_dict["annotation_author"] = 1
            tkt.description_field = "product"
            tkt.data_dict["accession"] = "none"
            tkt.eval_mode = "draft"
            # TODO secondary_phage_id data is for old ticket format.
            tkt.data_dict["secondary_phage_id"] = "none"
            tkt.data_dict["retrieve_record"] = 1
            import_tickets.append(tkt)

            print(f"{new_phage} retrieved from PECAAN.")
            retrieved_tally += 1

    # Now make the import table.
    if len(import_tickets) > 0:
        filepath = basic.prepare_filepath(output_folder,
                                          "legacy_import_table.csv")
        import_tickets1 = convert_tickets_to_dict(import_tickets,
                                                  old_format=True)
        basic.export_data_dict(import_tickets1, filepath, IMPORT_COLUMNS1)

        # TODO new dictwriter. Use this block instead of above once the
        # new import script is functioning.
        if BOTH:
            filepath2 = basic.prepare_filepath(output_folder,
                                               "import_table.csv")
            import_tickets2 = convert_tickets_to_dict(import_tickets)
            basic.export_data_dict(import_tickets2,
                                   filepath2,
                                   IMPORT_COLUMNS2,
                                   include_headers=True)

    # Report results
    if retrieved_tally > 0:
        print(f"{retrieved_tally} phage(s) were successfully retrieved")

    if len(failed_list) > 0:
        print(f"{len(failed_list)} phage(s) failed to be retrieved:")
        for element in failed_list:
            print(element)
        input("\n\nPress ENTER to continue.")
Example #12
def get_genbank_data(output_folder,
                     genome_dict,
                     ncbi_cred_dict={},
                     genbank_results=False):
    """Run sub-pipeline to retrieve genomes from GenBank."""
    # Flow of the NCBI record retrieval process:
    # 1 Create list of phages to check for updates at NCBI (completed above)
    # 2 Using esearch, verify which accessions are valid
    # 3 Using esummary, get update date for each valid accession
    # 4 Using efetch, retrieve flat files for NCBI records newer than
    # the MySQL database date
    # 5 Save new records in a folder and create an import table for them

    # Create output folder
    ncbi_folder = pathlib.Path(output_folder, "genbank")
    ncbi_folder.mkdir()

    ncbi_results_list = []
    tallies = {}
    tallies["total"] = len(genome_dict.keys())

    # Iterate through each phage in the MySQL database
    result_tuple1 = sort_by_accession(genome_dict)
    tallies["not_auto_updated"] = result_tuple1[0]
    tallies["no_accession"] = result_tuple1[1]
    tallies["duplicate_accession"] = result_tuple1[2]
    ncbi_results_list.extend(result_tuple1[3])
    unique_accession_dict = result_tuple1[4]

    # More setup variables if NCBI updates are desired.  NCBI Bookshelf resource
    # "The E-utilities In-Depth: Parameters, Syntax and More", by Dr. Eric
    # Sayers, recommends that a single request not contain more than about 200
    # UIDS so we will use that as our batch size, and all Entrez requests must
    # include the user's email address and tool name.
    ncbi.set_entrez_credentials(tool=ncbi_cred_dict["ncbi_tool"],
                                email=ncbi_cred_dict["ncbi_email"],
                                api_key=ncbi_cred_dict["ncbi_api_key"])

    results_tuple2 = retrieve_records(unique_accession_dict, batch_size=200)
    tallies["docsum_not_new"] = results_tuple2[0]
    retrieved_record_list = results_tuple2[1]
    retrieval_error_list = results_tuple2[2]
    ncbi_results_list.extend(results_tuple2[3])

    # Report the genomes that could not be retrieved.
    results3 = process_failed_retrieval(retrieval_error_list,
                                        unique_accession_dict)
    ncbi_results_list.extend(results3)
    tallies["retrieval_failure"] = len(retrieval_error_list)

    results_tuple4 = check_record_date(retrieved_record_list,
                                       unique_accession_dict)
    new_record_list = results_tuple4[0]
    ncbi_results_list.extend(results_tuple4[1])

    tallies["retrieved_for_import"] = len(new_record_list)
    tallies["record_not_new"] = (len(retrieved_record_list) -
                                 len(new_record_list))

    if len(new_record_list) > 0:
        save_files_and_tkts(new_record_list, unique_accession_dict,
                            ncbi_folder)

    # Record retrieval results for all phages.
    if genbank_results:
        filepath3 = basic.prepare_filepath(ncbi_folder, "genbank_results.csv")
        basic.export_data_dict(ncbi_results_list,
                               filepath3,
                               NCBI_RESULTS_COLUMNS,
                               include_headers=True)

    # Print summary of script
    tallies["auto_updated"] = tallies["total"] - tallies["not_auto_updated"]
    tallies["accession"] = (tallies["auto_updated"] - tallies["no_accession"] -
                            tallies["duplicate_accession"])

    print("\n\n\nSummary of GenBank data retrieval:")
    print("Of the genomes in the MySQL database:")
    print(f"{tallies['total']:>6}: total")
    print(f"{tallies['not_auto_updated']:>6}: not auto-updated")
    print(f"{tallies['auto_updated']:>6}: auto-updated")

    print("\nOf the auto-updated genomes:")
    print(f"{tallies['no_accession']:>6}: no accession")
    print(f"{tallies['duplicate_accession']:>6}: duplicated accession")
    print(f"{tallies['accession']:>6}: unique accession")

    print("\nOf the auto-updated genomes with unique accessions:")
    print(f"{tallies['retrieval_failure']:>6}: could not be retrieved")
    print(f"{tallies['docsum_not_new']:>6}: retrieved but docsum not new")
    print(f"{tallies['record_not_new']:>6}: retrieved but record not new")
    print(f"{tallies['retrieved_for_import']:>6}: retrieved for import")

    # Now remove empty folders.
    if len(basic.identify_contents(ncbi_folder, kind=None)) == 0:
        ncbi_folder.rmdir()
Example #13
def get_final_data(output_folder, matched_genomes):
    """Run sub-pipeline to retrieve 'final' genomes from PhagesDB."""

    phagesdb_folder = pathlib.Path(output_folder, "phagesdb")
    phagesdb_folder.mkdir()
    genome_folder = pathlib.Path(phagesdb_folder, GENOMES_DIR)
    genome_folder.mkdir()
    import_tickets = []
    failed_list = []

    # Iterate through each phage in the MySQL database
    for gnm_pair in matched_genomes:
        mysqldb_gnm = gnm_pair.genome1
        phagesdb_gnm = gnm_pair.genome2

        # Not all phages have associated Genbank-formatted files
        # available on PhagesDB. Check to see if there is a flatfile for
        # this phage. Download the flatfile only if there is a date tag,
        # and only if that date is more recent than the date stored in
        # the MySQL database for that genome. The tagged date only reflects when
        # the file was uploaded into PhagesDB. The date the actual
        # Genbank record was created is stored within the file,
        # and this too could be less recent than the current version in
        # the MySQL database; however, this part gets checked during the import
        # stage.
        set_phagesdb_gnm_date(phagesdb_gnm)
        set_phagesdb_gnm_file(phagesdb_gnm)
        if (phagesdb_gnm.filename != ""
                and phagesdb_gnm.date > mysqldb_gnm.date):
            # Save the file on the hard drive with the same name as
            # stored on PhagesDB
            flatfile_data = phagesdb.retrieve_url_data(phagesdb_gnm.filename)
            if flatfile_data == "":
                failed_list.append(mysqldb_gnm.id)
            else:
                flatfile_filename = phagesdb_gnm.filename.split("/")[-1]
                flatfile_path = pathlib.Path(genome_folder, flatfile_filename)
                with flatfile_path.open("w") as fh:
                    fh.write(flatfile_data)
                # Create the new import ticket
                # Since the PhagesDB phage has been matched to
                # the MySQL database phage, the AnnotationAuthor field
                # could be assigned from the current mysqldb author
                # variable. However, since this genbank-formatted
                # file is acquired through PhagesDB, the annotation
                # status is expected to be 'final' and the annotation
                # author is expected to be 'hatfull'.
                tkt = ticket.ImportTicket()
                tkt.type = "replace"
                tkt.phage_id = mysqldb_gnm.id
                tkt.data_dict["host_genus"] = "retrieve"
                tkt.data_dict["cluster"] = "retrieve"
                tkt.data_dict["subcluster"] = "retrieve"
                tkt.data_dict["annotation_status"] = "final"
                tkt.data_dict["annotation_author"] = 1
                tkt.description_field = "product"
                tkt.data_dict["accession"] = "retrieve"
                tkt.eval_mode = "final"
                # TODO secondary_phage_id data is for old ticket format.
                tkt.data_dict["secondary_phage_id"] = mysqldb_gnm.id
                tkt.data_dict["retrieve_record"] = 1
                import_tickets.append(tkt)

    count1 = len(import_tickets)
    if count1 > 0:
        print(f"\n\n{count1} phage(s) were retrieved from PhagesDB.")
        filepath = basic.prepare_filepath(phagesdb_folder,
                                          "legacy_import_table.csv")
        import_tickets1 = convert_tickets_to_dict(import_tickets,
                                                  old_format=True)
        basic.export_data_dict(import_tickets1, filepath, IMPORT_COLUMNS1)

        # TODO new dictwriter. Use this block instead of above once the
        # new import script is functioning.
        if BOTH:
            filepath2 = basic.prepare_filepath(phagesdb_folder,
                                               "import_table.csv")
            import_tickets2 = convert_tickets_to_dict(import_tickets)
            basic.export_data_dict(import_tickets2,
                                   filepath2,
                                   IMPORT_COLUMNS2,
                                   include_headers=True)

    if len(failed_list) > 0:
        print(f"{len(failed_list)} phage(s) failed to be retrieved:")
        for element in failed_list:
            print(element)
        input("\n\nPress ENTER to continue.")

    # Now remove empty folders.
    if len(basic.identify_contents(genome_folder, kind=None)) == 0:
        genome_folder.rmdir()
    if len(basic.identify_contents(phagesdb_folder, kind=None)) == 0:
        phagesdb_folder.rmdir()
Example #14
def get_update_data(output_folder, matched_genomes):
    """Run sub-pipeline to retrieve field updates from PhagesDB."""
    update_tickets = []
    for gnm_pair in matched_genomes:
        mysqldb_gnm = gnm_pair.genome1
        phagesdb_gnm = gnm_pair.genome2

        # Compare Cluster
        if mysqldb_gnm.cluster != phagesdb_gnm.cluster:
            result1 = {
                "table": "phage",
                "field": "Cluster",
                "value": phagesdb_gnm.cluster,
                "key_name": "PhageID",
                "key_value": mysqldb_gnm.id
            }
            update_tickets.append(result1)

        # Compare Subcluster
        if mysqldb_gnm.subcluster != phagesdb_gnm.subcluster:
            result3 = {
                "table": "phage",
                "field": "Subcluster",
                "value": phagesdb_gnm.subcluster,
                "key_name": "PhageID",
                "key_value": mysqldb_gnm.id
            }
            update_tickets.append(result3)

        # Compare Host genus
        if mysqldb_gnm.host_genus != phagesdb_gnm.host_genus:
            result5 = {
                "table": "phage",
                "field": "HostGenus",
                "value": phagesdb_gnm.host_genus,
                "key_name": "PhageID",
                "key_value": mysqldb_gnm.id
            }
            update_tickets.append(result5)

        # Compare Accession
        # If the genome author is not "hatfull", then don't worry about
        # updating the accession. This used to be determined with
        # the status field, but now it is determined with the
        # AnnotationAuthor field.
        if (mysqldb_gnm.accession != phagesdb_gnm.accession and
                mysqldb_gnm.annotation_author == 1):
            result6 = {
                "table": "phage",
                "field": "Accession",
                "value": phagesdb_gnm.accession,
                "key_name": "PhageID",
                "key_value": mysqldb_gnm.id
            }
            update_tickets.append(result6)

    # Field updates
    if len(update_tickets) > 0:
        print("\n\nNew field updates are available.")
        filepath = basic.prepare_filepath(output_folder,
                                          "update_table.csv",
                                          folder_name="updates")
        basic.export_data_dict(update_tickets,
                               filepath,
                               UPDATE_COLUMNS,
                               include_headers=True)
    else:
        print("\n\nNo field updates found.")