Example #1
def output_genbank_summary(output_folder, results):
    """Save summary of GenBank retrieval results to file."""
    filepath = pathlib.Path(output_folder, GENBANK_RESULTS_TABLE)
    fileio.export_data_dict(results,
                            filepath,
                            NCBI_RESULTS_COLUMNS,
                            include_headers=True)
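Every example on this page routes its rows through fileio.export_data_dict(list_of_data, filepath, headers, include_headers=...). The pdm_utils implementation is not reproduced here; the following is a minimal sketch of an equivalent helper built on csv.DictWriter, only to make the shared contract concrete (a list of dicts, a target path, and an ordered list of column names). Treat the body as an assumption, not the library's actual code.

import csv

def export_data_dict_sketch(list_of_data, file_path, headers,
                            include_headers=False):
    """Hypothetical stand-in for pdm_utils' fileio.export_data_dict."""
    with open(file_path, "w", newline="") as file_handle:
        writer = csv.DictWriter(file_handle, fieldnames=headers)
        if include_headers:
            writer.writeheader()
        for data_dict in list_of_data:
            # Assumes each dict's keys match the given headers.
            writer.writerow(data_dict)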
Example #2
def write_clustering_update_ticket(working_dir, scheme_alterations,
                                   field="Cluster", filename=None):
    """Write an update ticket csv from clustering scheme alterations."""
    if filename is None:
        filename = working_dir.with_suffix(".csv").name
    update_dicts = []

    for cluster, diff_data in scheme_alterations.items():
        if cluster is None:
            cluster = "NULL"

        for data_dict in diff_data:
            update_data = ("phage", field, cluster, "PhageID", data_dict["id"])
            # Map each ticket field to its column name.
            update_dict = dict(zip(TICKET_HEADER, update_data))
            update_dicts.append(update_dict)

    if not update_dicts:
        return False

    filepath = working_dir.joinpath(filename)
    pdm_fileio.export_data_dict(update_dicts, filepath, TICKET_HEADER,
                                include_headers=True)

    return True
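The expected shape of scheme_alterations is implicit above: a mapping from a (possibly None) cluster name to a list of per-phage diff dicts that carry at least an "id" key. A hypothetical call with made-up values:

from pathlib import Path

# Hypothetical input; a None cluster is written out as "NULL".
scheme_alterations = {
    "A1": [{"id": "Trixie"}, {"id": "D29"}],
    None: [{"id": "Sparky"}],
}
write_clustering_update_ticket(Path("/tmp/clustering_run"),
                               scheme_alterations)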
Example #3
def create_ticket_table(tickets, output_folder):
    """Save tickets associated with retrieved from GenBank files."""
    filepath = pathlib.Path(output_folder, IMPORT_TABLE)
    tickets = convert_tickets_to_dict(tickets)
    fileio.export_data_dict(tickets,
                            filepath,
                            IMPORT_COLUMNS,
                            include_headers=True)
Example #4
def execute_csv_export(db_filter, export_path, folder_path, columns, csv_name,
                       data_cache=None, sort=[], raw_bytes=False,
                       verbose=False, dump=False):
    """Executes csv export of a MySQL database table with select columns.

    :param db_filter: A connected and fully built Filter object.
    :type db_filter: Filter
    :param export_path: Path to a dir for file creation.
    :type export_path: Path
    :param folder_path: Path to a top-level dir.
    :type folder_path: Path
    :param columns: A list of SQLAlchemy Columns to select for export.
    :type columns: list[Column]
    :param csv_name: Name for the csv file to be written.
    :type csv_name: str
    :param data_cache: Optional dictionary for caching retrieved data.
    :type data_cache: dict
    :param sort: A list of SQLAlchemy Columns to sort by.
    :type sort: list[Column]
    :param raw_bytes: A boolean value to toggle leaving bytes undecoded.
    :type raw_bytes: bool
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param dump: A boolean value to toggle dump in current working dir.
    :type dump: bool
    """
    if data_cache is None:
        data_cache = {}

    if verbose:
        relative_path = str(export_path.relative_to(folder_path))
        print(f"Preparing {csv_name} export for '{relative_path}'...")

    headers = [db_filter._key.name]
    for column in columns:
        if column.name != db_filter._key.name:
            headers.append(column.name)

    results = db_filter.select(columns)

    if not raw_bytes:
        decode_results(results, columns, verbose=verbose)

    if len(results) == 0:
        print(f"No database entries received for {csv_name}.")
        if not dump:
            export_path.rmdir()
    else:
        if dump:
            if export_path != folder_path:
                export_path.rmdir()
                export_path = export_path.parent
        if verbose:
            print(f"...Writing csv {csv_name}.csv in '{export_path.name}'...")

        file_path = export_path.joinpath(f"{csv_name}.csv")
        fileio.export_data_dict(results, file_path, headers,
                                include_headers=True)
Example #5
def write_report(data, export_path, header, csv_name="Report", verbose=False):
    """Outputs a csv file
    """
    if not export_path.is_dir():
        print("Passed in path is not a directory.")
        sys.exit(1)

    file_path = export_path.joinpath(f"{csv_name}.csv")
    if verbose:
        print(f"Writing {file_path.name} in {export_path.name}...")

    fileio.export_data_dict(data, file_path, header, include_headers=True)
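Because write_report only validates the directory and delegates to export_data_dict, calling it needs nothing beyond rows whose keys match the header. A hypothetical invocation:

from pathlib import Path

# Hypothetical header and rows; any keys matching the header work.
header = ["PhageID", "Cluster"]
data = [{"PhageID": "Trixie", "Cluster": "A1"},
        {"PhageID": "D29", "Cluster": "A"}]
write_report(data, Path("/tmp/reports"), header, csv_name="ClusterReport")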
Example #6
    @classmethod
    def setUpClass(cls):
        test_db_utils.create_filled_test_db()

        cls.test_dir = Path(TEST_DIR)
        if cls.test_dir.is_dir():
            shutil.rmtree(TEST_DIR)

        cls.test_dir.mkdir()
        cls.revise_form = cls.test_dir.joinpath("revise_form.txt")

        fileio.export_data_dict(TEST_FR_DATA,
                                cls.revise_form,
                                REVIEW_HEADER,
                                include_headers=True)
Example #7
def write_cluster_analysis(intracluster_edges, working_dir, file_name=None):
    """Write intracluster edge data to a csv file."""
    data_dicts = []
    for edge in intracluster_edges:
        # Map each positional edge field to its column name.
        data_dict = dict(zip(CLUSTER_ANALYSIS_HEADER, edge))
        data_dicts.append(data_dict)

    if file_name is None:
        file_name = working_dir.name

    filepath = working_dir.joinpath(file_name).with_suffix(".csv")
    pdm_fileio.export_data_dict(data_dicts, filepath, CLUSTER_ANALYSIS_HEADER,
                                include_headers=True)
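intracluster_edges is assumed to be an iterable of tuples whose positions line up with CLUSTER_ANALYSIS_HEADER. A hypothetical call (field values invented for illustration):

from pathlib import Path

# Hypothetical edges; each tuple is ordered like CLUSTER_ANALYSIS_HEADER.
intracluster_edges = [
    ("Trixie", "A1", "D29", "A1", "92.5"),
    ("L5", "A2", "RedRock", "A2", "88.1"),
]
write_cluster_analysis(intracluster_edges, Path("/tmp/cluster_run"))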
Example #8
    def setUp(self):
        self.alchemist = AlchemyHandler()
        self.alchemist.username = USER
        self.alchemist.password = PWD
        self.alchemist.database = DB
        self.alchemist.connect(ask_database=True, login_attempts=0)

        self.revise_test_dir = self.test_dir.joinpath("revise_test_dir")
        self.fr_input_file_path = self.test_dir.joinpath("FunctionReport.csv")
        self.csv_input_file_path = self.revise_test_dir.joinpath("gene.csv")

        fileio.export_data_dict(TEST_FR_DATA,
                                self.fr_input_file_path,
                                REVIEW_HEADER,
                                include_headers=True)

        self.assertTrue(self.fr_input_file_path.is_file())
Example #9
def write_revise_file(data_dicts,
                      output_path,
                      file_format="p_curation",
                      file_name=CURATION_NAME,
                      verbose=False):
    """Writes a revision csv in the desired file format with necessary changes.

    :param data_dicts: List of data dictionaries to convert to curation format.
    :type data_dicts: list[dict]
    :param output_path: Path to a dir for file creation.
    :type output_path: Path
    :param file_format: Format of the csv to be written.
    :type file_format: str
    :param file_name: Name of the file to write curation data to.
    :type file_name: str
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    """
    data_dicts = sorted(data_dicts, key=lambda data_dict: data_dict["PhageID"])

    if file_format == "p_curation":
        format_data = format_curation_data
    elif file_format == "ticket":
        format_data = format_update_ticket_data
    else:
        raise ValueError(f"File format '{file_format}' is not "
                         "recognized for revise output.")

    for d in data_dicts:
        format_data(d)

    if verbose:
        print(f"Writing {file_name} in {output_path.name}...")
    file_path = output_path.joinpath(file_name)

    if file_format == "p_curation":
        header = CURATION_HEADER
    elif file_format == "ticket":
        header = TICKET_HEADER

    fileio.export_data_dict(data_dicts,
                            file_path,
                            header,
                            include_headers=True)
Example #10
    def test_export_data_dict_1(self):
        """Verify data is exported correctly."""

        list_of_data = [self.tkt_dict1, self.tkt_dict2]
        headers = ["type", "phage_id", "host_genus", "cluster"]
        fileio.export_data_dict(list_of_data,
                                self.data_dict_file,
                                headers,
                                include_headers=True)

        exp_success_tkts = []
        with open(self.data_dict_file, 'r') as file:
            file_reader = csv.DictReader(file)
            for row in file_reader:
                exp_success_tkts.append(row)

        with self.subTest():
            self.assertEqual(len(exp_success_tkts), 2)
        with self.subTest():
            self.assertEqual(set(exp_success_tkts[0].keys()), set(headers))
Example #11
def get_update_data(output_folder, matched_genomes):
    """Run sub-pipeline to retrieve field updates from PhagesDB."""
    updates_folder = pathlib.Path(output_folder, UPDATES_FOLDER)
    updates_folder.mkdir()
    update_tickets = []
    for gnm_pair in matched_genomes:
        tkt_list = compare_data(gnm_pair)
        update_tickets.extend(tkt_list)

    # Field updates
    if len(update_tickets) > 0:
        print(f"\n\n{len(update_tickets)} field updates are available.")
        filepath = pathlib.Path(updates_folder, UPDATE_TABLE)
        fileio.export_data_dict(update_tickets,
                                filepath,
                                UPDATE_COLUMNS,
                                include_headers=True)
    else:
        print("\n\nNo field updates.")

    # Now remove empty folders.
    if len(basic.identify_contents(updates_folder, kind=None)) == 0:
        updates_folder.rmdir()
Example #12
def execute_remote_revise(alchemist,
                          folder_path=None,
                          folder_name=DEFAULT_FOLDER_NAME,
                          config=None,
                          output_type="p_curation",
                          values=None,
                          filters="",
                          groups=[],
                          verbose=False,
                          force=False):
    ncbi_creds = {}
    if config is not None:
        ncbi_creds = config["ncbi"]

    db_filter = pipelines_basic.build_filter(alchemist,
                                             "phage",
                                             filters,
                                             values=values,
                                             verbose=verbose)
    db_filter.add(BASE_CONDITIONALS)

    revise_path = pipelines_basic.create_working_path(folder_path,
                                                      folder_name,
                                                      force=force)

    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        revise_path,
                                                        groups=groups,
                                                        verbose=verbose,
                                                        force=force)

    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if db_filter.hits() == 0:
            print(f"No database entries received for '{mapped_path}'.")
            continue

        pipelines_basic.create_working_dir(mapped_path, force=force)
        build_revise_log_file(mapped_path)

        logger.info(f"pdm_utils version: {VERSION}")
        logger.info(f"Revise run date: {CURRENT_DATE}")
        logger.info(f"Connected to database: {alchemist.database}")

        accession_data = db_filter.select(["phage.PhageID", "phage.Accession"])

        acc_id_dict = {}
        for data_dict in accession_data:
            accession = data_dict["Accession"]
            if not (accession is None or accession == ""):
                acc_id_dict[accession] = data_dict["PhageID"]

        tbl_records = get_tbl_records(acc_id_dict, ncbi_cred_dict=ncbi_creds)

        validated_phages = []
        for tbl_record in tbl_records:
            validated_phages.append(tbl_record.name)

        id_record_map = build_id_record_map(alchemist, validated_phages)

        if output_type == "tbl":
            revised_records = revise_seqrecords(id_record_map,
                                                tbl_records,
                                                verbose=verbose)

            if not revised_records:
                print("No discrepancies detected between "
                      f"local data and GenBank data for '{mapped_path}'.")
                continue

        elif output_type == "p_curation":
            curation_data_dicts = find_product_discrepancies(id_record_map,
                                                             tbl_records,
                                                             verbose=verbose)

            if not curation_data_dicts:
                print("No discrepancies detected between "
                      f"local data and GenBank data for '{mapped_path}'.")
                continue

        if output_type == "tbl":
            fileio.write_feature_table(revised_records,
                                       mapped_path,
                                       verbose=verbose)
        elif output_type == "p_curation":
            file_path = mapped_path.joinpath("revise.csv")
            fileio.export_data_dict(curation_data_dicts,
                                    file_path,
                                    CURATION_HEADER,
                                    include_headers=True)
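Driving this pipeline end-to-end mirrors the AlchemyHandler setup shown in Example #8; a hypothetical driver with made-up credentials and paths:

from pathlib import Path

# Hypothetical credentials; connection steps follow Example #8.
alchemist = AlchemyHandler()
alchemist.username = "user"
alchemist.password = "secret"
alchemist.database = "Actinobacteriophage"
alchemist.connect()

execute_remote_revise(alchemist,
                      folder_path=Path("/tmp"),
                      folder_name="revise_run",
                      output_type="p_curation",
                      verbose=True)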
Example #13
def execute_pham_finder(alchemist, folder_path, folder_name, 
                        adatabase, bdatabase, values=None,
                        filters="", groups=[], sort=[],
                        show_per=False, use_locus=False, verbose=False):
    """Executes the entirety of the file export pipeline.

    :param alchemist: A connected and fully build AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param adatabase: Name of reference database to source phams from.
    :type adatabase: str
    :param bdatabase: Name of database to find corresponding phams for.
    :type bdatabase: str
    :param values: List of values to filter database results.
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param filters: A list of lists with filter values, grouped by ORs.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param sort: A list of supported MySQL column names to sort by.
    :type sort: list[str]
    :param show_per: Toggles display of gene coverage of corresponding phams.
    :type show_per: bool
    :param use_locus: Toggles matching of phams using LocusTag instead of
        gene identifiers.
    :type use_locus: bool
    """
    if not (adatabase in alchemist.databases and
            bdatabase in alchemist.databases):
        print("User credentials do not have access to both "
             f"databases {adatabase} and {bdatabase}.\n"
              "Please check your database access and try again.")
        sys.exit(1)

    alchemist.database = adatabase
    alchemist.connect()
    a_filter = pipelines_basic.build_filter(alchemist, "gene.PhamID", filters,
                                            values=values, verbose=verbose)

    alchemist.database = bdatabase
    alchemist.connect()
    if use_locus:
        b_filter = pipelines_basic.build_filter(alchemist, "gene.LocusTag", "")
    else:
        b_filter = pipelines_basic.build_filter(alchemist, "gene", "")

    if sort:
        try:
            a_filter.sort(sort)
        except Exception:
            print("Please check your syntax for sorting columns:\n"
                 f"{', '.join(sort)}")
            sys.exit(1)

    if verbose:
        print("Creating pham_finder folder...")
    export_path = folder_path.joinpath(folder_name)
    export_path = basic.make_new_dir(folder_path, export_path, attempt=50)

    conditionals_map = {}
    pipelines_basic.build_groups_map(a_filter, export_path,
                                     conditionals_map,
                                     groups=groups,
                                     verbose=verbose)

    if verbose:
        print("Prepared query and path structure, beginning export...")

    values = a_filter.values
    for mapped_path in conditionals_map.keys():
        a_filter.reset()
        a_filter.values = values

        conditionals = conditionals_map[mapped_path]
        a_filter.values = a_filter.build_values(where=conditionals)
        
        if a_filter.hits() == 0:
            print("No database entries received from gene.PhamID "
                 f"for '{mapped_path}'.")
            shutil.rmtree(mapped_path)
            continue

        if sort:
            sort_columns = get_sort_columns(alchemist, sort)
            a_filter.sort(sort_columns)

        mapped_phams = find_phams(a_filter, b_filter, show_per=show_per)
        if not mapped_phams:
            print("Phams are consistent between the two databases "
                 f"for '{mapped_path}'.")
            shutil.rmtree(mapped_path)
            continue

        out_data_dicts = []
        for ref_pham, corr_phams in mapped_phams.items():
            data_dict = {}
            data_dict[PHAM_FINDER_HEADER[0]] = ref_pham
            data_dict[PHAM_FINDER_HEADER[1]] = corr_phams
            out_data_dicts.append(data_dict)

        file_path = mapped_path.joinpath("PhamMap.csv")
        fileio.export_data_dict(out_data_dicts, file_path, PHAM_FINDER_HEADER,
                                include_headers=True)