Example #1
def execute_make_db(alchemist, db_type, values=None, folder_path=None,
                    folder_name=DEFAULT_FOLDER_NAME, verbose=False, filters="",
                    groups=[], db_name=None, threads=1, use_mpi=False,
                    mol_type=None, hash_index=False, parse_seqids=True,
                    gi_mask=False, mask_data=None, mask_id=None, logfile=None,
                    tax_id=None, tax_id_map=None):
    if db_name is None:
        db_name = alchemist.database

    if verbose:
        print("Retrieving database version...")
    db_version = mysqldb_basic.get_first_row_data(alchemist.engine, "version")

    db_filter = pipelines_basic.build_filter(alchemist, "pham", filters,
                                             values=values,
                                             verbose=verbose)

    working_path = pipelines_basic.create_working_path(folder_path,
                                                       folder_name)

    conditionals_map = pipelines_basic.build_groups_map(
                                            db_filter, working_path,
                                            groups=groups, verbose=verbose)

    data_cache = {}
    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if db_filter.hits() == 0:
            print(f"No database entries received for '{mapped_path}'.")
            continue

        pipelines_basic.create_working_dir(mapped_path)

        if db_type == "hhsuite":
            execute_make_hhsuite_database(alchemist, db_filter.values,
                                          mapped_path, db_name, db_version,
                                          data_cache=data_cache,
                                          threads=threads, verbose=verbose,
                                          use_mpi=use_mpi)
        elif db_type == "blast":
            execute_make_blast_database(
                    alchemist, db_filter.values, mapped_path, db_name,
                    db_version, data_cache=data_cache, verbose=verbose,
                    hash_index=hash_index, parse_seqids=parse_seqids,
                    gi_mask=gi_mask, mask_data=mask_data, mask_id=mask_id,
                    logfile=logfile, tax_id=tax_id, tax_id_map=tax_id_map)
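
# Usage sketch (illustrative only; the import path, database name, and
# connection behavior are assumptions, not taken from this module).
from pdm_utils.classes.alchemyhandler import AlchemyHandler

alchemist = AlchemyHandler()
alchemist.database = "Actino_Draft"  # hypothetical database name
alchemist.connect()  # assumed to prompt for any missing MySQL credentials

# Build one BLAST database per phage cluster, forwarding makeblastdb options.
execute_make_db(alchemist, "blast", groups=["phage.Cluster"],
                folder_name="blast_dbs", threads=4, verbose=True)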
Example #2
def execute_sql_export(alchemist, export_path, folder_path, db_version,
                       db_name=None, dump=False, force=False, phams_out=False,
                       threads=1, verbose=False):
    pipelines_basic.create_working_dir(export_path, dump=dump, force=force)

    if phams_out:
        temp_dir = Path(TEMP_DIR)
        if temp_dir.is_dir():
            shutil.rmtree(temp_dir)
        temp_dir.mkdir()

        phams_out_fasta_dir = temp_dir.joinpath("fastas")
        pipelines_basic.create_working_dir(phams_out_fasta_dir,
                                           dump=dump, force=force)
        phams_out_aln_dir = temp_dir.joinpath("alns")
        pipelines_basic.create_working_dir(phams_out_aln_dir,
                                           dump=dump, force=force)

        phams_dict = pham_alignment.get_all_pham_gene_translations(alchemist)

        if verbose:
            print("...Writing and aligning pham fasta files...")
        pham_alignment.write_phams(phams_out_fasta_dir, phams_out_aln_dir,
                                   phams_dict, cores=threads, verbose=verbose)

        pham_fastas_zip = export_path.joinpath("fastas.zip")
        pham_alns_zip = export_path.joinpath("alns.zip")

        shutil.make_archive(pham_fastas_zip.with_suffix(""), "zip",
                            temp_dir, phams_out_fasta_dir.name)
        shutil.make_archive(pham_alns_zip.with_suffix(""), "zip",
                            temp_dir, phams_out_aln_dir.name)

    if verbose:
        print("Writing SQL database file...")

    fileio.write_database(alchemist, db_version["Version"], export_path,
                          db_name=db_name)
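
# Illustrative call (hypothetical paths and version data). `alchemist` is
# assumed to be a connected pdm_utils AlchemyHandler, and `db_version` mirrors
# the first row of the MySQL "version" table as retrieved by this helper's
# callers (see execute_export below).
from pathlib import Path

export_path = Path("./export/Actino_Draft")  # hypothetical export target
db_version = {"Version": 404, "SchemaVersion": 11}  # hypothetical row data
execute_sql_export(alchemist, export_path, export_path.parent, db_version,
                   phams_out=True, threads=4, verbose=True)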
Example #3
def execute_pham_review(alchemist,
                        folder_path=None,
                        folder_name=DEFAULT_FOLDER_NAME,
                        no_review=False,
                        values=[],
                        filters="",
                        groups=[],
                        sort=[],
                        s_report=False,
                        gr_reports=False,
                        psr_reports=False,
                        production=False,
                        verbose=False,
                        force=False):
    """Executes the entirety of the pham review pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param no_review: A boolean to skip filtering of phams by discrepancies.
    :type no_review: bool
    :param values: List of values to filter database results.
    :type values: list[str]
    :param force: A boolean to toggle aggressive building of directories.
    :type force: bool
    :param filters: A MySQL formatted WHERE clause string to filter results.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param sort: A list of supported MySQL column names to sort by.
    :type sort: list[str]
    :param s_report: A boolean to toggle export of a summary report.
    :type s_report: bool
    :param gr_reports: A boolean to toggle export of pham gene reports.
    :type gr_reports: bool
    :param psr_reports: A boolean to toggle export of pham summary reports.
    :type psr_reports: bool
    :param production: Toggles additional filters for production-level review.
    :type production: bool
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    """
    db_filter = pipelines_basic.build_filter(alchemist,
                                             "gene.PhamID",
                                             filters,
                                             values=values,
                                             verbose=verbose)
    if production:
        db_filter.add(BASE_CONDITIONALS)
        db_filter.update()
    else:
        conditionals = db_filter.build_where_clauses()
        db_filter.values = db_filter.build_values(where=conditionals)

    if not db_filter.values:
        print("Current settings produced no database hits.")
        sys.exit(1)
    else:
        if verbose:
            print(f"Identified {db_filter.hits()} phams to review...")

    if not no_review:
        review_phams(db_filter, verbose=verbose)

    if sort:
        db_filter.sort(sort)

    if verbose:
        print("Creating export folder...")
    export_path = pipelines_basic.create_working_path(folder_path,
                                                      folder_name,
                                                      force=force)

    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        export_path,
                                                        groups=groups,
                                                        verbose=verbose,
                                                        force=force)

    if verbose:
        print("Prepared query and path structure, beginning review export...")
    original_phams = db_filter.values
    gr_data_cache = {}
    psr_data_cache = {}
    for mapped_path in conditionals_map.keys():
        conditionals = conditionals_map[mapped_path]
        db_filter.values = original_phams
        db_filter.values = db_filter.build_values(where=conditionals)

        pipelines_basic.create_working_dir(mapped_path, force=force)

        review_data = get_review_data(alchemist, db_filter, verbose=verbose)
        write_report(review_data,
                     mapped_path,
                     REVIEW_HEADER,
                     csv_name="FunctionReport",
                     verbose=verbose)

        if s_report:
            summary_data = get_summary_data(alchemist, db_filter)
            write_summary_report(alchemist,
                                 summary_data,
                                 mapped_path,
                                 verbose=verbose)

        if gr_reports or psr_reports:
            execute_pham_report_export(alchemist,
                                       db_filter,
                                       mapped_path,
                                       gr_reports=gr_reports,
                                       psr_reports=psr_reports,
                                       gr_data_cache=gr_data_cache,
                                       psr_data_cache=psr_data_cache,
                                       verbose=verbose)
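
# Usage sketch (illustrative; database name is hypothetical). Reviews phams
# with discrepant annotations, grouped by cluster, writing FunctionReport csv
# files plus an optional summary report.
from pdm_utils.classes.alchemyhandler import AlchemyHandler

alchemist = AlchemyHandler()
alchemist.database = "Actino_Draft"  # hypothetical database name
alchemist.connect()

execute_pham_review(alchemist, groups=["phage.Cluster"], s_report=True,
                    production=True, verbose=True)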
Example #4
def execute_build_pan(alchemist,
                      hhdb_path=None,
                      pan_name=None,
                      folder_path=None,
                      folder_name=DEFAULT_FOLDER_NAME,
                      values=None,
                      verbose=False,
                      filters="",
                      groups=[],
                      threads=1,
                      M=50,
                      aD=75,
                      mD=65,
                      B=0.2,
                      PANgraph_out=None):
    db_filter = pipelines_basic.build_filter(alchemist,
                                             "pham",
                                             filters,
                                             values=values)

    working_path = pipelines_basic.create_working_path(folder_path,
                                                       folder_name)

    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        working_path,
                                                        groups=groups,
                                                        verbose=verbose)

    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if db_filter.hits() == 0:
            print(f"No database entries received for '{mapped_path}'.")
            continue

        if pan_name is None:
            pan_name = folder_name

        pipelines_basic.create_working_dir(mapped_path)
        pan_path = mapped_path.joinpath(".".join([pan_name, "sqlite"]))

        pan_alchemist = pan_handling.build_pan(pan_path)
        pan_alchemist.expire_on_commit = True

        pham_data_dir = mapped_path.joinpath("pham_alns")
        pham_data_dir.mkdir()
        data_maps_tuple = create_pham_alns(alchemist.engine,
                                           db_filter.values,
                                           pham_data_dir,
                                           threads=threads,
                                           M=M,
                                           verbose=verbose)

        build_pan_nodes(pan_alchemist,
                        db_filter.values,
                        data_maps_tuple,
                        threads=threads,
                        verbose=verbose)

        cent_data_dir = mapped_path.joinpath("cent_alns")
        cent_data_dir.mkdir()
        build_pan_neighborhoods(alchemist,
                                pan_alchemist,
                                db_filter.values,
                                cent_data_dir,
                                data_maps_tuple,
                                aD=aD,
                                mD=mD,
                                B=B,
                                threads=threads,
                                verbose=verbose)

        hmm_data_dir = mapped_path.joinpath("pham_hhrs")
        hmm_data_dir.mkdir()

        if hhdb_path is not None:
            # PAN town building is not yet supported; the code below sketches
            # the intended workflow but is unreachable until this raise is
            # removed.
            raise NotImplementedError(
                "PAN town building is not implemented yet.")
            if verbose:
                print("...Calculating pham HMM profiles...")
            hmm_path_map = alignment.create_hmms(data_maps_tuple[0],
                                                 name=True,
                                                 M=M,
                                                 threads=threads,
                                                 verbose=verbose)
            build_pan_towns(alchemist,
                            pan_alchemist,
                            hhdb_path,
                            hmm_data_dir,
                            hmm_path_map,
                            threads=threads,
                            verbose=verbose)

        if PANgraph_out is not None:
            pan_graph = pan_handling.to_networkx(pan_alchemist)
            pde_fileio.write_graph(pan_graph,
                                   PANgraph_out,
                                   mapped_path,
                                   pan_name,
                                   edge_weights=PAN_GRAPH_EDGEWEIGHTS)

        shutil.rmtree(Path(TEMP_DIR))
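
# Usage sketch (illustrative; parameter values are hypothetical). Builds a
# PAN SQLite database per group from pham alignments; hhdb_path must stay
# None until town building is implemented.
from pdm_utils.classes.alchemyhandler import AlchemyHandler

alchemist = AlchemyHandler()
alchemist.database = "Actino_Draft"  # hypothetical database name
alchemist.connect()

execute_build_pan(alchemist, pan_name="ActinoPAN", threads=8, M=50,
                  aD=75, mD=65, B=0.2, verbose=True)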
Example #5
def execute_export(alchemist, pipeline, folder_path=None,
                   folder_name=DEFAULT_FOLDER_NAME, values=None, verbose=False,
                   dump=False, force=False, table=DEFAULT_TABLE, filters="",
                   groups=[], sort=[], include_columns=[], exclude_columns=[],
                   sequence_columns=False, raw_bytes=False, concatenate=False,
                   db_name=None, phams_out=False, threads=1):
    """Executes the entirety of the file export pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param pipeline: File type that dictates data processing.
    :type pipeline: str
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param force: A boolean to toggle aggressive building of directories.
    :type force: bool
    :param values: List of values to filter database results.
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param dump: A boolean value to toggle dump in current working dir.
    :type dump: bool
    :param table: MySQL table name.
    :type table: str
    :param filters: A MySQL formatted WHERE clause string to filter results.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param sort: A list of supported MySQL column names to sort by.
    :type sort: list[str]
    :param include_columns: A csv export column selection parameter.
    :type include_columns: list[str]
    :param exclude_columns: A csv export column selection parameter.
    :type exclude_columns: list[str]
    :param sequence_columns: A boolean to toggle inclusion of sequence data.
    :type sequence_columns: bool
    :param concatenate: A boolean to toggle concatenation of SeqRecords.
    :type concatenate: bool
    :param raw_bytes: A boolean to toggle handling of raw byte-type data
        during csv export.
    :type raw_bytes: bool
    :param db_name: A name for the exported SQL database file.
    :type db_name: str
    :param phams_out: A boolean to toggle export of pham fasta and alignment
        archives alongside the SQL file.
    :type phams_out: bool
    :param threads: Number of processes/threads to spawn during the pipeline.
    :type threads: int
    """
    if verbose:
        print("Retrieving database version...")
    db_version = mysqldb_basic.get_first_row_data(alchemist.engine, "version")

    if pipeline == "csv":
        if verbose:
            print("Processing columns for csv export...")
        csv_columns = filter_csv_columns(alchemist, table,
                                         include_columns=include_columns,
                                         exclude_columns=exclude_columns,
                                         sequence_columns=sequence_columns)

    if pipeline in FILTERABLE_PIPELINES:
        db_filter = pipelines_basic.build_filter(alchemist, table, filters,
                                                 values=values,
                                                 verbose=verbose)
        if sort:
            pipelines_basic.add_sort_columns(db_filter, sort, verbose=verbose)

    if verbose:
        print("Creating export folder...")
    export_path = pipelines_basic.create_working_path(folder_path, folder_name,
                                                      dump=dump, force=force)

    data_cache = {}
    if pipeline == "sql":
        execute_sql_export(alchemist, export_path, folder_path, db_version,
                           db_name=db_name, dump=dump, force=force,
                           phams_out=phams_out, threads=threads,
                           verbose=verbose)
    elif pipeline in FILTERABLE_PIPELINES:
        conditionals_map = pipelines_basic.build_groups_map(
                                                db_filter, export_path,
                                                groups=groups,
                                                verbose=verbose, force=force)

        if verbose:
            print("Prepared query and path structure, beginning export...")

        values = db_filter.values
        for mapped_path in conditionals_map.keys():
            db_filter.reset()
            db_filter.values = values

            conditionals = conditionals_map[mapped_path]
            db_filter.values = db_filter.build_values(where=conditionals)

            if db_filter.hits() == 0:
                print(f"No database entries received from {table} "
                      f"for '{mapped_path}'.")
                continue

            if sort:
                sort_columns = get_sort_columns(alchemist, sort)
                db_filter.sort(sort_columns)

            export_name = None
            if dump:
                if mapped_path == export_path:
                    export_name = folder_name

            pipelines_basic.create_working_dir(mapped_path, dump=dump,
                                               force=force)

            if pipeline in BIOPYTHON_PIPELINES + ["tbl"]:
                execute_ffx_export(alchemist, mapped_path, export_path,
                                   db_filter.values, pipeline, db_version,
                                   table, concatenate=concatenate,
                                   data_cache=data_cache,
                                   export_name=export_name, threads=threads,
                                   verbose=verbose, dump=dump)
            elif pipeline == "csv":
                execute_csv_export(db_filter, mapped_path, export_path,
                                   csv_columns, table, raw_bytes=raw_bytes,
                                   data_cache=data_cache,
                                   verbose=verbose, dump=dump)
    else:
        print("Unrecognized export pipeline, aborting export")
        sys.exit(1)
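
# Usage sketch (illustrative; database name and filter string are
# hypothetical). Exports GenBank-formatted flat files for cluster A phages,
# one subdirectory per host genus.
from pdm_utils.classes.alchemyhandler import AlchemyHandler

alchemist = AlchemyHandler()
alchemist.database = "Actino_Draft"  # hypothetical database name
alchemist.connect()

execute_export(alchemist, "gb", table="phage", filters="phage.Cluster=A",
               groups=["phage.HostGenus"], threads=4, verbose=True)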
Example #6
def ani_subcluster(working_dir, sketch_path_map, cluster_scheme,
                   cluster_lookup, cluster_seqid_map,
                   subcluster_lookup, cores=1, verbose=False,
                   ani=DEFAULT_SETTINGS["ani"],
                   animax=DEFAULT_SETTINGS["animax"], evaluate=False):

    for cluster, cluster_members in cluster_scheme.items():
        if cluster is None:
            continue

        cluster_members_set = set(cluster_members)
        old_cluster_members = set(cluster_seqid_map.get(cluster, list()))
        noncluster_members = list(cluster_members_set.difference(
                                                        old_cluster_members))

        old_cluster_members = list(old_cluster_members.intersection(
                                                    cluster_members_set))

        old_subclusters = set()
        subcluster_seqid_map = {}
        altered_subcluster_lookup = {}
        for member in old_cluster_members:
            subcluster = subcluster_lookup[member]
            altered_subcluster_lookup[member] = subcluster
            old_subclusters.add(subcluster)

            seqids = subcluster_seqid_map.get(subcluster, list())
            seqids.append(member)
            subcluster_seqid_map[subcluster] = seqids

        for nonmember in noncluster_members:
            nonmember_cluster = cluster_lookup[nonmember]
            altered_subcluster_lookup[nonmember] = nonmember_cluster

            seqids = subcluster_seqid_map.get(None, list())
            seqids.append(nonmember)
            subcluster_seqid_map[None] = seqids

        if verbose:
            print(f"Subclustering {cluster}...")

        ani_matrix = calculate_ani_matrix(cluster_members, sketch_path_map,
                                          cores=cores, verbose=verbose)

        subcluster_scheme = cluster_db(ani_matrix, ani, emax=animax,
                                       cores=cores, verbose=verbose,
                                       is_distance=False)

        subcluster_redistributions = get_cluster_redistributions(
                                            subcluster_scheme,
                                            altered_subcluster_lookup,
                                            old_subclusters)

        subcluster_scheme = assign_cluster_names(
                                            subcluster_scheme,
                                            subcluster_redistributions,
                                            verbose=verbose,
                                            subcluster=cluster)

        scheme_alterations = diff_cluster_schemes(subcluster_scheme,
                                                  altered_subcluster_lookup)

        subcluster_dir = working_dir.joinpath(str(cluster))
        pipelines_basic.create_working_dir(subcluster_dir)

        wrote_ticket = write_clustering_update_ticket(
                                subcluster_dir, scheme_alterations,
                                field="Subcluster")

        wrote_eval = False
        if evaluate:
            new_matrix_cache = dict()

            if verbose:
                print(f"...Evaluating cluster {cluster} "
                      "subclustering scheme...")
            scheme_metadata = evaluate_clustering_scheme(
                                ani_matrix, subcluster_scheme,
                                matrix_cache=new_matrix_cache,
                                cores=cores, verbose=verbose)

            old_scheme_metadata = evaluate_clustering_scheme(
                                ani_matrix, subcluster_seqid_map,
                                cores=cores)

            alteration_metadata = evaluate_scheme_alteration(
                                ani_matrix, subcluster_scheme,
                                subcluster_seqid_map, scheme_alterations)

            write_clustering_evaluation(
                                subcluster_dir, scheme_metadata,
                                old_scheme_metadata, alteration_metadata,
                                "average nucleotide identity")
            wrote_eval = True

        wrote = wrote_eval or wrote_ticket

        if not wrote:
            shutil.rmtree(subcluster_dir)
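
# Illustrative call (hypothetical data chosen to show the expected argument
# shapes, which follow from the lookups above): cluster_scheme maps new
# cluster names to member ids, cluster_lookup/subcluster_lookup map member
# ids to their old assignments, cluster_seqid_map maps old cluster names to
# member ids, and sketch_path_map points members at their sketch files.
from pathlib import Path

working_dir = Path("./subclustering")
sketch_path_map = {"Trixie": Path("./sketches/Trixie.msh"),
                   "D29": Path("./sketches/D29.msh")}  # hypothetical paths
cluster_scheme = {"A": ["Trixie", "D29"]}
cluster_lookup = {"Trixie": "A", "D29": "B"}
cluster_seqid_map = {"A": ["Trixie"], "B": ["D29"]}
subcluster_lookup = {"Trixie": "A1"}
ani_subcluster(working_dir, sketch_path_map, cluster_scheme, cluster_lookup,
               cluster_seqid_map, subcluster_lookup, cores=2, verbose=True)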
Example #7
def execute_cluster_db(
                alchemist, folder_path=None,
                folder_name=DEFAULT_FOLDER_NAME, values=None, verbose=False,
                filters="", groups=[], threads=1,
                kmer=DEFAULT_SETTINGS["kmer"],
                sketch=DEFAULT_SETTINGS["sketch"],
                gcs=DEFAULT_SETTINGS["gcs"], ani=DEFAULT_SETTINGS["ani"],
                gcsmax=DEFAULT_SETTINGS["gcsmax"],
                animax=DEFAULT_SETTINGS["animax"],
                gcsS=DEFAULT_SETTINGS["gcsS"], gcsM=DEFAULT_SETTINGS["gcsM"],
                aniS=DEFAULT_SETTINGS["aniS"], aniM=DEFAULT_SETTINGS["aniM"],
                mat_out=False, evaluate=False, subcluster=False,
                cluster_prefix=None):
    db_filter = pipelines_basic.build_filter(alchemist, "phage", filters,
                                             values=values)

    working_path = pipelines_basic.create_working_path(
                                            folder_path, folder_name)
    temp_dir = create_temp_path(TEMP_DIR)
    conditionals_map = pipelines_basic.build_groups_map(
                                            db_filter, working_path,
                                            groups=groups, verbose=verbose)

    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if verbose:
            print("Querying MySQL database for clustering metadata...")
        cluster_metadata = query_cluster_metadata(db_filter)

        gcs_matrix = calculate_gcs_matrix(alchemist, db_filter.values,
                                          verbose=verbose, cores=threads)

        pipelines_basic.create_working_dir(mapped_path)

        if verbose:
            print("Clustering database genomes...")
        cluster_scheme = gcs_cluster(
                                mapped_path, gcs_matrix,
                                cluster_metadata[0], cluster_metadata[1],
                                gcs=gcs, gcsmax=gcsmax, S=gcsS, M=gcsM,
                                evaluate=evaluate, cores=threads,
                                verbose=verbose,
                                cluster_prefix=cluster_prefix)

        if subcluster:
            sketch_path_map = sketch_genomes(db_filter, temp_dir,
                                             verbose=verbose)

            if verbose:
                print("Subclustering database genomes...")
            ani_subcluster(mapped_path, sketch_path_map, cluster_scheme,
                           cluster_metadata[0], cluster_metadata[1],
                           cluster_metadata[2], cores=threads,
                           verbose=verbose, ani=ani, animax=animax,
                           evaluate=evaluate)

            # Remove the working directory if subclustering wrote nothing.
            if not any(mapped_path.iterdir()):
                shutil.rmtree(mapped_path)
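
# Usage sketch (illustrative; database name is hypothetical). Reclusters all
# genomes by gene content similarity, then subclusters each cluster by
# average nucleotide identity.
from pdm_utils.classes.alchemyhandler import AlchemyHandler

alchemist = AlchemyHandler()
alchemist.database = "Actino_Draft"  # hypothetical database name
alchemist.connect()

execute_cluster_db(alchemist, threads=8, subcluster=True, evaluate=True,
                   verbose=True)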
Example #8
def execute_pham_align(alchemist,
                       folder_path=None,
                       folder_name=DEFAULT_FOLDER_NAME,
                       values=None,
                       filters="",
                       groups=[],
                       file_type="fasta",
                       mat_out=False,
                       tree_out=False,
                       threads=1,
                       verbose=False,
                       dump=False,
                       force=False):
    """Executes the entirety of the pham align pipeline.
    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for working dir creation.
    :type folder_path: Path
    :param folder_name: A name for the working directory.
    :type folder_name: str
    :param force: A boolean to toggle aggressive building of directories.
    :type force: bool
    :param values: List of values to filter database results.
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param dump: A boolean value to toggle dump in current working dir.
    :type dump: bool
    :param filters: A MySQL formatted WHERE clause string
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param file_type: Format type of sequence alignment file to export.
    :type file_type: str
    :param mat_out: A boolean to toggle distance matrix file generation.
    :type mat_out: bool
    :param tree_out: A boolean to toggle guidetree file generation.
    :type tree_out: bool
    :param threads: Number of processes to spawn during alignment workflow.
    :type threads: int
    """
    db_filter = pipelines_basic.build_filter(alchemist,
                                             "pham",
                                             filters,
                                             values=values,
                                             verbose=verbose)
    working_path = pipelines_basic.create_working_path(folder_path,
                                                       folder_name,
                                                       dump=dump,
                                                       force=force)

    data_cache = {}
    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        working_path,
                                                        groups=groups,
                                                        verbose=verbose,
                                                        force=force)
    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if db_filter.hits() == 0:
            print(f"No database entries received for '{mapped_path}'")
            continue

        pipelines_basic.create_working_dir(mapped_path, dump=dump, force=force)

        # Run the alignment for each group within its own working directory.
        execute_pham_MSA_alignment(alchemist,
                                   mapped_path,
                                   db_filter.values,
                                   data_cache=data_cache,
                                   file_type=file_type,
                                   mat_out=mat_out,
                                   tree_out=tree_out,
                                   threads=threads,
                                   verbose=verbose)
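
# Usage sketch (illustrative; filter string and file type are hypothetical).
# Aligns the selected phams and writes alignment files with distance
# matrices and guidetrees.
from pdm_utils.classes.alchemyhandler import AlchemyHandler

alchemist = AlchemyHandler()
alchemist.database = "Actino_Draft"  # hypothetical database name
alchemist.connect()

execute_pham_align(alchemist, filters="phage.Cluster=A", file_type="fasta",
                   mat_out=True, tree_out=True, threads=4, verbose=True)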
Example #9
def execute_get_gb_records(alchemist, file_type, folder_path=None,
                           folder_name=DEFAULT_FOLDER_NAME,
                           config=None, values=None, verbose=False,
                           force=False, filters="", groups=[]):
    """Executes the entirety of the get_gb_records pipeline

    :param alchemist: A connected and fully build AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param file_type: File type to be exported.
    :type file_type: str
    :param config: ConfigParser object containing NCBI credentials.
    :type config: ConfigParser
    :param force: A boolean to toggle aggresive building of directories.
    :type force: bool
    :param values: List of values to filter database results.
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statemtns.
    :type verbose: bool
    :param filters: A List of lists with filter value,grouped by ORs.
    :type filter: str
    :param groups: A list of supported MySQL column names to goup by.
    :type groups: list[str]
    """
    ncbi_creds = {}
    if config is not None:
        ncbi_creds = config["ncbi"]

    db_filter = pipelines_basic.build_filter(alchemist, FILTER_KEY, filters,
                                             values=values, verbose=verbose)

    if verbose:
        print("Creating records folder...")
    records_path = pipelines_basic.create_working_path(folder_path,
                                                       folder_name,
                                                       force=force)

    conditionals_map = pipelines_basic.build_groups_map(
                                                db_filter, records_path,
                                                groups=groups, verbose=verbose,
                                                force=force)

    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        # Create data sets
        if verbose:
            print("Retrieving accessions from the database...")
        accession_data = db_filter.select(["phage.PhageID", "phage.Accession"])

        acc_id_dict = {}
        for data_dict in accession_data:
            accession = data_dict["Accession"]
            if not (accession is None or accession == ""):
                acc_id_dict[accession] = data_dict["PhageID"]

        pipelines_basic.create_working_dir(mapped_path, force=force)
        if acc_id_dict:
            ncbi_handle = ncbi.get_verified_data_handle(
                                                     acc_id_dict,
                                                     ncbi_cred_dict=ncbi_creds,
                                                     file_type=file_type)

            copy_gb_data(ncbi_handle, acc_id_dict, mapped_path, file_type,
                         verbose=verbose)
        else:
            print(f"There are no records to retrieve for '{mapped_path}'.")
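
# Usage sketch (illustrative; database name is hypothetical, and config may
# carry NCBI credentials). Pulls GenBank flat files for every phage with an
# accession, grouped by cluster.
from pdm_utils.classes.alchemyhandler import AlchemyHandler

alchemist = AlchemyHandler()
alchemist.database = "Actino_Draft"  # hypothetical database name
alchemist.connect()

execute_get_gb_records(alchemist, "gb", folder_name="gb_records",
                       groups=["phage.Cluster"], verbose=True)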
Example #10
def execute_remote_revise(alchemist,
                          folder_path=None,
                          folder_name=DEFAULT_FOLDER_NAME,
                          config=None,
                          output_type="p_curation",
                          values=None,
                          filters="",
                          groups=[],
                          verbose=False,
                          force=False):
    ncbi_creds = {}
    if config is not None:
        ncbi_creds = config["ncbi"]

    db_filter = pipelines_basic.build_filter(alchemist,
                                             "phage",
                                             filters,
                                             values=values,
                                             verbose=verbose)
    db_filter.add(BASE_CONDITIONALS)

    revise_path = pipelines_basic.create_working_path(folder_path,
                                                      folder_name,
                                                      force=force)

    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        revise_path,
                                                        groups=groups,
                                                        verbose=verbose,
                                                        force=force)

    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if db_filter.hits() == 0:
            print(f"No database entries received for '{mapped_path}'.")
            continue

        pipelines_basic.create_working_dir(mapped_path, force=force)
        build_revise_log_file(mapped_path)

        logger.info(f"pdm_utils version: {VERSION}")
        logger.info(f"Revise run date: {CURRENT_DATE}")
        logger.info(f"Connected to database: {alchemist.database}")

        accession_data = db_filter.select(["phage.PhageID", "phage.Accession"])

        acc_id_dict = {}
        for data_dict in accession_data:
            accession = data_dict["Accession"]
            if not (accession is None or accession == ""):
                acc_id_dict[accession] = data_dict["PhageID"]

        tbl_records = get_tbl_records(acc_id_dict, ncbi_cred_dict=ncbi_creds)

        validated_phages = []
        for tbl_record in tbl_records:
            validated_phages.append(tbl_record.name)

        id_record_map = build_id_record_map(alchemist, validated_phages)

        if output_type == "tbl":
            revised_records = revise_seqrecords(id_record_map,
                                                tbl_records,
                                                verbose=verbose)

            if not revised_records:
                print("No discrepancies detected between "
                      f"local data and GenBank data for '{mapped_path}'.")
                continue

        elif output_type == "p_curation":
            curation_data_dicts = find_product_discrepancies(id_record_map,
                                                             tbl_records,
                                                             verbose=verbose)

            if not curation_data_dicts:
                print("No discrepancies detected between "
                      f"local data and GenBank data for '{mapped_path}'.")
                continue

        if output_type == "tbl":
            fileio.write_feature_table(revised_records,
                                       mapped_path,
                                       verbose=verbose)
        elif output_type == "p_curation":
            file_path = mapped_path.joinpath("revise.csv")
            fileio.export_data_dict(curation_data_dicts,
                                    file_path,
                                    CURATION_HEADER,
                                    include_headers=True)
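
# Usage sketch (illustrative; database name is hypothetical). Compares local
# annotations against GenBank feature tables and writes a product curation
# csv per group.
from pdm_utils.classes.alchemyhandler import AlchemyHandler

alchemist = AlchemyHandler()
alchemist.database = "Actino_Draft"  # hypothetical database name
alchemist.connect()

execute_remote_revise(alchemist, output_type="p_curation",
                      groups=["phage.Cluster"], verbose=True)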
Example #11
def execute_local_revise(alchemist,
                         revisions_file_path,
                         folder_path=None,
                         folder_name=DEFAULT_FOLDER_NAME,
                         config=None,
                         input_type="function_report",
                         output_type="p_curation",
                         production=False,
                         filters="",
                         groups=[],
                         force=False,
                         verbose=False):
    """Executes the entirety of the genbank local revise pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param revisions_file_path: Path to a file containing pham/notes
        revision data.
    :type revisions_file_path: Path
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param input_type: Specifies the file format of the input file
    :type input_type: str
    :param output_type: Specifies the file format of the outputted file
    :type output_type: str
    :param production: Toggles additional filters for production-level revision
    :type production: bool
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param force: A boolean to toggle aggressive building of directories.
    :type force: bool
    :param filters: A MySQL formatted WHERE clause string to filter results.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    """
    keys = INPUT_FILE_KEYS.get(input_type)
    if keys is None:
        raise ValueError(f"Revision input type {input_type} is not supported.")

    revisions_data_dicts = fileio.retrieve_data_dict(revisions_file_path)

    values = []
    for data_dict in revisions_data_dicts:
        values.append(data_dict[keys['data_key']])

    db_filter = pipelines_basic.build_filter(alchemist,
                                             keys['filter_key'],
                                             filters,
                                             values=values,
                                             verbose=verbose)

    if production:
        db_filter.add(BASE_CONDITIONALS)

    revise_columns = db_filter.get_columns(REVISION_COLUMNS)

    if verbose:
        print("Creating export folder...")
    export_path = pipelines_basic.create_working_path(folder_path,
                                                      folder_name,
                                                      force=force)

    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        export_path,
                                                        force=force,
                                                        groups=groups,
                                                        verbose=verbose)

    if verbose:
        print("Prepared query and path structure, beginning revise export...")

    for mapped_path in conditionals_map.keys():
        conditionals = conditionals_map[mapped_path]

        if input_type == "function_report":
            export_dicts = use_function_report_data(db_filter,
                                                    revisions_data_dicts,
                                                    revise_columns,
                                                    conditionals,
                                                    verbose=verbose)
        elif input_type == "csv":
            export_dicts = use_csv_data(db_filter,
                                        revisions_data_dicts,
                                        revise_columns,
                                        conditionals,
                                        verbose=verbose)

        if not export_dicts:
            if verbose:
                print(f"'{mapped_path.name}' data selected does not require "
                      "revision; no file exported...")

            continue

        pipelines_basic.create_working_dir(mapped_path, force=force)

        write_revise_file(export_dicts,
                          mapped_path,
                          file_format=output_type,
                          verbose=verbose)
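
# Usage sketch (illustrative; the input csv path is hypothetical). Consumes a
# FunctionReport csv (as written by the pham review pipeline above) and
# exports curation-formatted revision files.
from pathlib import Path

from pdm_utils.classes.alchemyhandler import AlchemyHandler

alchemist = AlchemyHandler()
alchemist.database = "Actino_Draft"  # hypothetical database name
alchemist.connect()

execute_local_revise(alchemist, Path("./FunctionReport.csv"),
                     input_type="function_report", output_type="p_curation",
                     production=True, verbose=True)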
Example #12
def execute_get_server_db(alchemist,
                          database,
                          url,
                          folder_path=None,
                          folder_name=RESULTS_FOLDER,
                          db_name=None,
                          config_file=None,
                          verbose=False,
                          subdirectory=None,
                          download_only=False,
                          get_fastas=False,
                          get_alns=False,
                          force_pull=False,
                          get_version=False,
                          schema_version=None):

    if subdirectory:
        url = "".join([url, str(subdirectory), "/"])

    pool = url_basic.create_pool(pipeline=True)
    if database is None:
        print("Loading get_db interactive environment...")
        cmd = pipeline_shells.GetDBCMD(url, name=alchemist.username, pool=pool)
        cmd.cmdloop(intro=pipeline_shells.GET_DB_CMD_INTRO)

        if cmd.selected is None:
            return

        database = cmd.selected.name
        pkg_url = "".join([cmd.selected.get_abs_path(), "/"])
    else:
        response = url_basic.pool_request(url, pool=pool, pipeline=True)
        directory_listing = url_basic.get_url_listing_dirs(response)

        if database not in directory_listing:
            print("Requested database is not at the specified url.\n"
                  "Please check the database availability.")
            return

        response.close()
        pkg_url = "".join([url, database, "/"])

    if db_name is None:
        db_name = database

    pkg_response = url_basic.pool_request(pkg_url, pool=pool, pipeline=True)

    sql_file_listing = url_basic.get_url_listing_files(pkg_response, "sql")
    version_file_listing = url_basic.get_url_listing_files(
        pkg_response, "version")

    if not sql_file_listing:
        print("Requested database file package does not have a SQL file.\n"
              "Please check SQL file availability at the specified url.")
        return
    database_filename = sql_file_listing[0]

    if not version_file_listing:
        if get_version:
            print("Requested database file package does not have"
                  "a version file.\nPlease check version file availability "
                  "at the specified url.")
            return
        else:
            version_filename = None
    else:
        version_filename = version_file_listing[0]

    if folder_path is None:
        output_path = pipelines_basic.create_working_path(
            pathlib.Path(DEFAULT_OUTPUT_FOLDER), folder_name)
        if output_path.is_dir():
            shutil.rmtree(output_path)
        pipelines_basic.create_working_dir(output_path, force=True)
    else:
        output_path = pipelines_basic.create_working_path(
            folder_path, folder_name)
        pipelines_basic.create_working_dir(output_path)

    # Only parse the version file if one was found.
    if version_filename is not None:
        version_filepath, status1 = prepare_download(output_path, pkg_url,
                                                     version_filename,
                                                     "version")
        with version_filepath.open(mode="r") as version_filehandle:
            version = int(version_filehandle.readline().rstrip())
    else:
        status1 = True
        version = 0

    if (not force_pull) and (version > 0):
        if db_name in alchemist.databases:
            alchemist.database = db_name
            alchemist.build_engine()

            curr_schema_version = mysqldb.get_schema_version(alchemist.engine)
            if curr_schema_version > 2:
                curr_version_data = mysqldb_basic.get_first_row_data(
                    alchemist.engine, "version")
                curr_version = int(curr_version_data.get("Version", 0))
                if curr_version >= version:
                    print(f"Current database version of {db_name} "
                          "is greater than or equal to the database version "
                          "at the specified listing.\nPlease use "
                          "the --force_pull flag if you would like to "
                          "indiscriminately pull and install a database.")
                    return

    db_filepath, status2 = prepare_download(output_path,
                                            pkg_url,
                                            database_filename,
                                            "sql",
                                            verbose=verbose)
    if not status1 or not status2:
        print("Unable to download data from server.\n Aborting pipeline.")
        return

    # If downloading from server, user may have selected to not
    # install the database file.
    if (not download_only) and (not get_fastas) and (not get_alns):
        install_db(alchemist,
                   db_name,
                   db_filepath=db_filepath,
                   config_file=config_file,
                   schema_version=schema_version,
                   verbose=verbose)

        # The output folder was only created for downloading from server.
        print("Removing downloaded data.")
        shutil.rmtree(output_path)
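
# Usage sketch (illustrative; the listing url is hypothetical). Downloads the
# named database package from a server listing and installs it locally.
from pdm_utils.classes.alchemyhandler import AlchemyHandler

alchemist = AlchemyHandler()
alchemist.connect()  # the database itself is selected/installed below

execute_get_server_db(alchemist, "Actino_Draft",
                      "https://databases.example.org/",  # hypothetical url
                      verbose=True)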
Example #13
def execute_find_primers(alchemist,
                         folder_path=None,
                         folder_name=DEFAULT_FOLDER_NAME,
                         values=None,
                         filters="",
                         groups=[],
                         verbose=False,
                         threads=4,
                         prc=0.7,
                         dev_net=0,
                         len_oligomer=20,
                         minD=900,
                         maxD=1100,
                         tm_min=52.0,
                         tm_max=58.0,
                         hpn_min=-2000,
                         ho_min=-5000,
                         GC_max=60.0,
                         het_min=-5000,
                         tm_gap=5.0,
                         ta_min=48.0,
                         fwd_in=None,
                         rvs_in=None,
                         ta_max=68.0,
                         mode=0,
                         full_genome=False,
                         soft_cap=None,
                         phams_in=[]):
    """Executes the entirety of the file export pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for working dir creation.
    :type folder_path: Path
    :param folder_name: A name for the working directory folder
    :type folder_name: str
    :param values: List of values to filter database results
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param filters: A pseudo-SQL WHERE clause string to filter values.
    :type filters: str
    :param groups: A list of supported MySQL column names to group values by.
    :type groups: list[str]
    :param threads: Number of child process workers to utilize
    :type threads: int
    :param prc: Percentage of genomes a pham must exist in to pass prefiltering
    :type prc: float
    :param dev_net: Allowance for the primer positions to pass prefiltering
    :type dev_net: int
    :param len_oligomer: Length of the oligomers used to create the primers
    :type len_oligomer: int
    :param minD: Minimum primer product length to pass primer testing
    :type minD: int
    :param maxD: Maximum primer product length to pass primer testing
    :type maxD: int
    :param tm_min: Minimum primer melting temperature to pass primer testing
    :type tm_min: float
    :param tm_max: Maximum primer melting temperature to pass primer testing
    :type tm_max: float
    :param hpn_min: Minimum hairpin Gibbs free energy to pass primer testing
    :type hpn_min: int
    :param ho_min: Minimum homodimer Gibbs free energy to pass primer testing
    :type ho_min: int
    :param GC_max: Maximum GC content percentage allowed for an oligomer
    :type GC_max: float
    :param het_min: Minimum heterodimer Gibbs free energy to pass testing
    :type het_min: int
    :param tm_gap: Maximum allowed melting temperature gap between oligomers
    :type tm_gap: float
    :param ta_min: Minimum allowed optimal annealing temperature
    :type ta_min: float
    :param ta_max: Maximum allowed optimal annealing temperature
    :type ta_max: float
    :param fwd_in: Fixed forward sequence to find primer pairs for
    :type fwd_in: str
    :param rvs_in: Fixed reverse sequence to find primer pairs for
    :type rvs_in: str
    :param mode: Run mode for find primers analysis
    :type mode: int
    :param soft_cap: Cap limit on number of pairs evaluated after testing
    :type soft_cap: int
    :param phams_in: Phams to evaluate during count min sketch eval of kmers
    :type phams_in: list[str]
    """
    db_filter = pipelines_basic.build_filter(alchemist,
                                             "phage",
                                             filters,
                                             values=values)

    working_path = pipelines_basic.create_working_path(folder_path,
                                                       folder_name)

    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        working_path,
                                                        groups=groups,
                                                        verbose=verbose)

    if verbose:
        print("Prepared query and path structure, beginning primer search...")

    if not TEMP_DIR.is_dir():
        TEMP_DIR.mkdir()
    pickled_results_file = TEMP_DIR.joinpath(PICKLED_FILE_NAME)
    if pickled_results_file.is_file():
        pickled_results_file.unlink()

    values = db_filter.values
    results_map = {}
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.key = "phage"
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]

        db_filter.values = db_filter.build_values(where=conditionals)

        if db_filter.hits() == 0:
            print("No database entries received from phage "
                  f"for '{mapped_path.name}'.")
            continue

        genome_map = {}
        for genome_id in db_filter.values:
            export_db.get_single_genome(alchemist,
                                        genome_id,
                                        data_cache=genome_map)

        if verbose:
            print(f"...Identifying primer pairs for '{mapped_path}'...")

        if full_genome:
            F_results, R_results = find_full_genome_oligomers(
                genome_map,
                verbose=verbose,
                threads=threads,
                prc=prc,
                minD=minD,
                maxD=maxD,
                len_oligomer=len_oligomer,
                tm_min=tm_min,
                tm_max=tm_max,
                hpn_min=hpn_min,
                ho_min=ho_min,
                GC_max=GC_max)
        else:
            pham_gene_map = build_pham_gene_map(db_filter,
                                                conditionals,
                                                phams_in=phams_in,
                                                verbose=verbose)
            if not pham_gene_map:
                print(f"No valid phams found for '{mapped_path}' with current "
                      "settings.")
                continue

            F_results, R_results = find_oligomers(alchemist,
                                                  pham_gene_map,
                                                  genome_map,
                                                  verbose=verbose,
                                                  threads=threads,
                                                  prc=prc,
                                                  minD=minD,
                                                  maxD=maxD,
                                                  len_oligomer=len_oligomer,
                                                  tm_min=tm_min,
                                                  tm_max=tm_max,
                                                  hpn_min=hpn_min,
                                                  ho_min=ho_min,
                                                  GC_max=GC_max,
                                                  fwd_in=fwd_in,
                                                  rvs_in=rvs_in)

        if (not F_results) or (not R_results):
            if verbose:
                print(f"No valid oligomers found for '{mapped_path.name}'")
            continue

        if verbose:
            print("...Matching oligomers to create primer pairs...")

        primer_pairs = match_oligomers(F_results,
                                       R_results,
                                       minD=minD,
                                       maxD=maxD,
                                       dev_net=dev_net,
                                       threads=threads)

        if not primer_pairs:
            print(f"No valid primer pairs found for '{mapped_path}' with "
                  "current parameters...")
            continue

        if verbose:
            print(f"...Identified {len(primer_pairs)} valid primer pairs.")

        if verbose:
            print(f"...Testing primer pairs for '{mapped_path}'...")
        primer_pairs = test_primer_pairs(primer_pairs,
                                         genome_map,
                                         threads=threads,
                                         verbose=verbose,
                                         minD=minD,
                                         maxD=maxD,
                                         het_min=het_min,
                                         ta_min=ta_min,
                                         ta_max=ta_max,
                                         tm_gap_max=tm_gap)

        if verbose:
            print(f"...{len(primer_pairs)} passed primer testing.")

        if soft_cap is not None:
            if len(primer_pairs) > soft_cap:
                primer_pairs = primer_pairs[:soft_cap]

        if pickled_results_file.is_file():
            with pickled_results_file.open(mode="rb") as filehandle:
                results_map = pickle.load(filehandle)

        if primer_pairs:
            results_map[mapped_path] = (primer_pairs, genome_map)

        with pickled_results_file.open(mode="wb") as filehandle:
            pickle.dump(results_map, filehandle)

    if pickled_results_file.is_file():
        pickled_results_file.unlink()

    if not results_map:
        print("No primer pairs found with current parameters...")
        return

    results_map = select_primer_pairs(results_map,
                                      verbose=verbose,
                                      mode=mode,
                                      het_min=het_min)

    for mapped_path, primer_pairs in results_map.items():
        pipelines_basic.create_working_dir(mapped_path)
        file_path = mapped_path.joinpath("primer.txt")
        fileio.write_primer_txt_file(primer_pairs[0][0], file_path)
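
# Usage sketch (illustrative; filter string and thresholds are hypothetical).
# Searches conserved phams of cluster A genomes for primer pairs and writes
# a primer.txt file per group that yields results.
from pdm_utils.classes.alchemyhandler import AlchemyHandler

alchemist = AlchemyHandler()
alchemist.database = "Actino_Draft"  # hypothetical database name
alchemist.connect()

execute_find_primers(alchemist, filters="phage.Cluster=A", threads=4,
                     prc=0.8, minD=900, maxD=1100, verbose=True)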