Exemple #1
0
def cblaster(
    query_file=None,
    query_ids=None,
    mode=None,
    json_db=None,
    database=None,
    gap=20000,
    unique=3,
    min_hits=3,
    min_identity=30,
    min_coverage=50,
    max_evalue=0.01,
    entrez_query=None,
    output=None,
    output_hide_headers=False,
    output_delimiter=None,
    output_decimals=4,
    binary=None,
    binary_hide_headers=True,
    binary_delimiter=None,
    binary_key=len,
    binary_attr="identity",
    binary_decimals=4,
    rid=None,
    require=None,
    session_file=None,
    indent=None,
    plot=False,
    recompute=False,
    blast_file=None,
    ipg_file=None,
    hitlist_size=None,
):
    """Run cblaster.

    This function is the central workflow for the entire cblaster package.
    If every path in ``session_file`` exists, the session is loaded from
    those files (and optionally re-filtered); otherwise a new search is run
    in the requested ``mode`` and, when ``session_file`` is given, saved to
    its first entry. The summary (and optional binary table / plot) is then
    produced from the resulting session.

    Arguments:
        query_file (str): Path to query file
        query_ids (list): NCBI protein sequence identifiers
        mode (str): Search mode ('local' or 'remote')
        json_db (str): JSON database created with cblaster makedb
        database (str): Search database (NCBI if remote, DIAMOND if local)
        gap (int): Maximum gap between cluster hits
        unique (int): Minimum number of query sequences with hits in clusters
        min_hits (int): Minimum number of hits in clusters
        min_identity (float): Minimum identity (%) cutoff
        min_coverage (float): Minimum coverage (%) cutoff
        max_evalue (float): Maximum e-value threshold
        entrez_query (str): NCBI Entrez query to filter search database
        output (str): Path to cblaster summary output file; the summary is
            written to stdout when this is not given
        output_hide_headers (bool): Hide headers in summary table
        output_delimiter (str): Delimiter used in summary table
        output_decimals (int): Total decimal places in hit scores in summary table
        binary (str): Path to cblaster binary output file
        binary_hide_headers (bool): Hide headers in binary table
        binary_delimiter (str): Delimiter used in binary table
        binary_key (callable): Key function used in binary table (len, max or sum)
        binary_attr (str): Hit attribute used for calculating cell values in binary table
        binary_decimals (int): Total decimal places in cell values in binary table
        rid (str): NCBI BLAST search request identifier (RID)
        require (list): Query sequences that must be in hit clusters
        session_file (list): Paths to cblaster session JSON files. When a new
            search is run, only the first path is written.
        indent (int): Total spaces to indent JSON files
        plot (bool or str): True to plot with the default output of
            plot_session, or a path passed to plot_session as ``output``
        recompute (bool or str): True to only re-filter a loaded session, or
            a path to which the re-filtered session JSON is also written
        blast_file (str): Passed through to the local/remote search as
            ``blast_file``
        ipg_file (str): Passed through to the genomic context search as
            ``ipg_file``
        hitlist_size (int): Passed through to the remote search as
            ``hitlist_size``
    Returns:
        Session: cblaster search Session object
    Raises:
        ValueError: If a new search is required and ``mode`` is neither
            'local' nor 'remote'.
    """

    if session_file and all(Path(sf).exists() for sf in session_file):
        LOG.info("Loading session(s) %s", session_file)
        session = Session.from_files(session_file)

        if recompute:
            LOG.info("Filtering session with new thresholds")
            context.filter_session(
                session,
                min_identity,
                min_coverage,
                max_evalue,
                gap,
                unique,
                min_hits,
                require,
            )
            # recompute=True means "filter only"; any other truthy value is
            # treated as the path to write the filtered session to.
            if recompute is not True:
                LOG.info("Writing recomputed session to %s", recompute)
                with open(recompute, "w") as fp:
                    session.to_json(fp, indent=indent)
    else:
        session = Session(
            queries=query_ids if query_ids else [],
            sequences=helpers.get_sequences(
                query_file=query_file,
                query_ids=query_ids,
            ),
            params={
                "mode": mode,
                "database": database,
                "min_identity": min_identity,
                "min_coverage": min_coverage,
                "max_evalue": max_evalue,
            },
        )

        if query_file:
            # get_sequences() returns OrderedDict, so save keys to
            # preserve query order
            session.queries = list(session.sequences)
            session.params["query_file"] = query_file

        if json_db:
            session.params["json_db"] = json_db

        if mode == "local":
            LOG.info("Starting cblaster in local mode")
            results = local.search(
                database,
                sequences=session.sequences,
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                blast_file=blast_file,
            )
        elif mode == "remote":
            LOG.info("Starting cblaster in remote mode")
            if entrez_query:
                session.params["entrez_query"] = entrez_query
            rid, results = remote.search(
                sequences=session.sequences,
                rid=rid,
                database=database,
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                entrez_query=entrez_query,
                blast_file=blast_file,
                hitlist_size=hitlist_size,
            )
            session.params["rid"] = rid
        else:
            # Without this, execution fell through to an unbound `results`
            # below and failed with an uninformative UnboundLocalError.
            raise ValueError(
                "Unsupported search mode {!r}; expected 'local' or "
                "'remote'".format(mode)
            )

        LOG.info("Found %i hits meeting score thresholds", len(results))
        LOG.info("Fetching genomic context of hits")

        # Query order is only passed on for GenBank/EMBL style query files.
        # query_file is None when searching by query_ids, so guard before
        # inspecting its extension.
        ordered_extensions = (
            ".gbk", ".gb", ".genbank", ".gbff", ".embl", ".emb"
        )
        if query_file and query_file.endswith(ordered_extensions):
            query_sequence_order = list(session.sequences)
        else:
            query_sequence_order = None

        session.organisms = context.search(
            results,
            unique=unique,
            min_hits=min_hits,
            gap=gap,
            require=require,
            json_db=json_db,
            ipg_file=ipg_file,
            query_sequence_order=query_sequence_order,
        )

        if session_file:
            LOG.info("Writing current search session to %s", session_file[0])
            if len(session_file) > 1:
                LOG.warning("Multiple session files specified, using first")
            with open(session_file[0], "w") as fp:
                session.to_json(fp, indent=indent)

    if binary:
        LOG.info("Writing binary summary table to %s", binary)
        # Context manager guarantees the table is flushed and the handle
        # closed, even if formatting raises.
        with open(binary, "w") as fp:
            session.format(
                "binary",
                fp,
                hide_headers=binary_hide_headers,
                delimiter=binary_delimiter,
                key=binary_key,
                attr=binary_attr,
                decimals=binary_decimals,
            )

    LOG.info("Writing summary to %s", output if output else "stdout")
    summary_options = {
        "hide_headers": output_hide_headers,
        "delimiter": output_delimiter,
        "decimals": output_decimals,
    }
    if output:
        with open(output, "w") as fp:
            session.format("summary", fp=fp, **summary_options)
    else:
        # stdout is not ours to close, so it is passed as-is.
        session.format("summary", fp=sys.stdout, **summary_options)

    if plot:
        # plot=True selects plot_session's default output (output=None).
        plot = None if plot is True else plot
        plot_session(session, output=plot)

    LOG.info("Done.")
    return session
Exemple #2
0
def cblaster(
    query_file=None,
    query_ids=None,
    query_profiles=None,
    mode=None,
    databases=None,
    database_pfam=None,
    gap=20000,
    unique=3,
    min_hits=3,
    min_identity=30,
    min_coverage=50,
    max_evalue=0.01,
    percentage=None,
    entrez_query=None,
    output=None,
    output_hide_headers=False,
    output_delimiter=None,
    output_decimals=4,
    output_sort_clusters=False,
    binary=None,
    binary_hide_headers=True,
    binary_delimiter=None,
    binary_key=len,
    binary_attr="identity",
    binary_decimals=4,
    rid=None,
    require=None,
    session_file=None,
    indent=None,
    plot=False,
    max_plot_clusters=50,
    recompute=False,
    blast_file=None,
    ipg_file=None,
    hitlist_size=None,
    cpus=None,
    intermediate_genes=False,
    intermediate_gene_distance=5000,
    intermediate_max_clusters=100,
    testing=False,
):
    """Run cblaster.

    This function is the central workflow for the entire cblaster package.
    If every path in ``session_file`` exists, the session is loaded from
    those files (and optionally re-filtered); otherwise a new search is run
    in the requested ``mode`` and, when ``session_file`` is given, saved to
    its first entry. The summary (and optional binary table / plot) is then
    produced from the resulting session.

    Arguments:
        query_file (str): Path to query file
        query_ids (list): NCBI protein sequence identifiers
        query_profiles (list): Pfam profile identifiers
        mode (str): Search mode ('local', 'remote', 'hmm', 'combi_local'
            or 'combi_remote')
        databases (list): Search databases. The first entry is used by the
            HMM search (as FASTA) and by plain local/remote searches; the
            'combi_' modes expect exactly two entries (HMM database first,
            then the local/remote database). The caller's list is not
            modified.
        database_pfam (str): Path to pfam db or where to download it
        gap (int): Maximum gap between cluster hits
        unique (int): Minimum number of query sequences with hits in clusters
        min_hits (int): Minimum number of hits in clusters
        min_identity (float): Minimum identity (%) cutoff
        min_coverage (float): Minimum coverage (%) cutoff
        max_evalue (float): Maximum e-value threshold
        percentage (int): % of query genes needed to be present in cluster
        entrez_query (str): NCBI Entrez query to filter search database
        output (str): Path to cblaster summary output file; the summary is
            written to stdout when this is not given
        output_hide_headers (bool): Hide headers in summary table
        output_delimiter (str): Delimiter used in summary table
        output_decimals (int): Total decimal places in hit scores in summary table
        output_sort_clusters (bool): If the clusters in the final summary table need to be sorted
        binary (str): Path to cblaster binary output file
        binary_hide_headers (bool): Hide headers in binary table
        binary_delimiter (str): Delimiter used in binary table
        binary_key (callable): Key function used in binary table (len, max or sum)
        binary_attr (str): Hit attribute used for calculating cell values in binary table
        binary_decimals (int): Total decimal places in cell values in binary table
        rid (str): NCBI BLAST search request identifier (RID)
        require (list): Query sequences that must be in hit clusters
        session_file (list): Paths to cblaster session JSON files. When a new
            search is run, only the first path is written.
        indent (int): Total spaces to indent JSON files
        plot (bool or str): True to plot with the default output of
            plot_session, or a path passed to plot_session as ``output``
        max_plot_clusters (int): maximum clusters that are plotted when -osc (sort on score ) argument is used
        recompute (bool or str): True to only re-filter a loaded session, or
            a path to which the re-filtered session JSON is also written
        blast_file (str): path to file to save blast output
        ipg_file (str): path to file to save ipg output
        hitlist_size (int): Number of database sequences to keep
        cpus (int): number of cpu's to use when blasting.
        intermediate_genes (bool): Signifies if intermediate genes have to be shown
        intermediate_gene_distance (int): the maximum allowed distance between the
         edge of a cluster and an intermediate gene.
        intermediate_max_clusters (int): the maximum amount of clusters for which intermediate
         genes will be fetched, since this can become expensive for remote searches
        testing (bool): flag to make sure certain code does not run when testing

    Returns:
        Session: cblaster search Session object

    Raises:
        ValueError: If a new search is required but ``mode`` is None.
        RuntimeError: If a 'combi_' mode is used without exactly two
            databases.
    """
    if session_file and all(Path(sf).exists() for sf in session_file):
        LOG.info("Loading session(s) %s", session_file)
        session = Session.from_files(session_file)

        if recompute:
            LOG.info("Filtering session with new thresholds")
            context.filter_session(
                session,
                min_identity,
                min_coverage,
                max_evalue,
                gap,
                unique,
                min_hits,
                require,
                percentage,
            )

            if intermediate_genes:
                find_intermediate_genes(session, intermediate_gene_distance,
                                        intermediate_max_clusters)

            # recompute=True means "filter only"; any other truthy value is
            # treated as the path to write the filtered session to.
            if recompute is not True:
                LOG.info("Writing recomputed session to %s", recompute)
                session.params["min_identity"] = min_identity
                session.params["min_coverage"] = min_coverage
                session.params["max_evalue"] = max_evalue
                session.params["require"] = require
                with open(recompute, "w") as fp:
                    session.to_json(fp, indent=indent)
    else:
        if mode is None:
            # Previously surfaced as "TypeError: argument of type 'NoneType'
            # is not iterable" from the '"combi" in mode' check below.
            raise ValueError("A search mode is required to run a new search")

        if databases is not None:
            # Work on a private copy: combined modes drop the first entry
            # below, which must not leak into the caller's list.
            databases = list(databases)

        # Create a cblaster Cluster object from query input
        query = helpers.parse_query_sequences(
            query_file=query_file,
            query_ids=query_ids,
            query_profiles=query_profiles,
        )

        # Create a cblaster Session
        session = Session(
            query=query,
            queries=query.names,
            params={
                "mode": mode,
                "database": databases,
                "min_identity": min_identity,
                "min_coverage": min_coverage,
                "max_evalue": max_evalue,
                "require": require,
            },
        )

        if query_file:
            # Record which query file this session was generated from
            session.params["query_file"] = query_file

        sqlite_db = None
        session.params["rid"] = rid

        def _collect_clusters(hits, **search_kwargs):
            """Fetch genomic context for hits and add it to the session."""
            LOG.info("Fetching genomic context of hits")
            organisms = context.search(
                hits,
                unique=unique,
                min_hits=min_hits,
                gap=gap,
                require=require,
                ipg_file=ipg_file,
                query_sequence_order=session.queries,
                percentage=percentage,
                **search_kwargs
            )
            session.organisms.extend(organisms)

        if "combi" in mode and len(databases) != 2:
            raise RuntimeError("Expected two databases for 'combi_' modes")

        if mode in ("hmm", "combi_local", "combi_remote"):
            sqlite_db = helpers.find_sqlite_db(databases[0])
            results = hmm_search.perform_hmmer(fasta=databases[0],
                                               query_profiles=query_profiles,
                                               pfam=database_pfam,
                                               session=session)

            # Delete first (FASTA) database when doing combined searches
            # Expect .dmnd/NCBI database name for local/remote, respectively
            if "combi" in mode:
                del databases[0]

            LOG.info("Found %i hits meeting score thresholds for hmm search",
                     len(results))
            _collect_clusters(results, sqlite_db=sqlite_db)

        # When running combined modes, run local/remote search right after HMM search
        if mode == "combi_local":
            mode = "local"
        elif mode == "combi_remote":
            mode = "remote"

        if mode == "local":
            LOG.info("Starting cblaster in local mode")
            sqlite_db = helpers.find_sqlite_db(databases[0])
            results = local.search(
                databases[0],
                sequences=session.query.sequences,
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                blast_file=blast_file,
                cpus=cpus,
            )
            LOG.info("Found %i hits meeting score thresholds for local search",
                     len(results))
            _collect_clusters(results, sqlite_db=sqlite_db)

        elif mode == "remote":
            LOG.info("Starting cblaster in remote mode")

            if entrez_query:
                session.params["entrez_query"] = entrez_query
            rid, results = remote.search(
                sequences=session.query.sequences,
                rid=rid,
                database=databases[0],
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                entrez_query=entrez_query,
                blast_file=blast_file,
                hitlist_size=hitlist_size,
            )
            session.params["rid"] = rid
            LOG.info(
                "Found %i hits meeting score thresholds for remote search",
                len(results))
            # Remote searches do not use a local SQLite database
            _collect_clusters(results)

        if sqlite_db:
            session.params["sqlite_db"] = str(sqlite_db)

        if intermediate_genes:
            find_intermediate_genes(session, intermediate_gene_distance,
                                    intermediate_max_clusters)

        if session_file:
            LOG.info("Writing current search session to %s", session_file[0])
            if len(session_file) > 1:
                LOG.warning("Multiple session files specified, using first")
            with open(session_file[0], "w") as fp:
                session.to_json(fp, indent=indent)

    if binary:
        LOG.info("Writing binary summary table to %s", binary)
        # Context manager guarantees the table is flushed and the handle
        # closed, even if formatting raises.
        with open(binary, "w") as fp:
            session.format(
                "binary",
                fp,
                hide_headers=binary_hide_headers,
                delimiter=binary_delimiter,
                key=binary_key,
                attr=binary_attr,
                decimals=binary_decimals,
                sort_clusters=output_sort_clusters,
            )

    LOG.info("Writing summary to %s", output if output else "stdout")
    summary_options = {
        "hide_headers": output_hide_headers,
        "delimiter": output_delimiter,
        "decimals": output_decimals,
        "sort_clusters": output_sort_clusters,
    }
    if output:
        with open(output, "w") as fp:
            session.format("summary", fp=fp, **summary_options)
    else:
        # stdout is not ours to close, so it is passed as-is.
        session.format("summary", fp=sys.stdout, **summary_options)

    if plot:
        # plot=True selects plot_session's default output (output=None).
        plot = None if plot is True else plot
        plot_session(
            session,
            output=plot,
            sort_clusters=output_sort_clusters,
            max_clusters=max_plot_clusters,
            testing=testing,
        )

    LOG.info("Done.")
    return session