Example #1
0
def gne(
    session,
    output=None,
    max_gap=100000,
    samples=100,
    scale="linear",
    plot=None,
    hide_headers=False,
    delimiter=",",
    decimals=4,
):
    """Estimate gene neighbourhood statistics for a saved cblaster session.

    The session JSON file is loaded, passed to
    ``context.estimate_neighbourhood`` and the results are optionally
    summarised into a table before being handed to ``plot_gne``.

    Args:
        session (str): Path to a cblaster session JSON file.
        output: Open, writable file object for the summary table. Its
            ``name`` is logged and ``write`` is called on it; no table is
            written when this is falsy.
        max_gap (int): Forwarded to ``context.estimate_neighbourhood``.
        samples (int): Forwarded to ``context.estimate_neighbourhood``.
        scale (str): Forwarded to ``context.estimate_neighbourhood``.
        plot: Forwarded to ``plot_gne`` as its ``output`` argument.
        hide_headers (bool): Forwarded to ``summarise_gne``.
        delimiter (str): Forwarded to ``summarise_gne``.
        decimals (int): Forwarded to ``summarise_gne``.
    """
    LOG.info("Starting cblaster gene neighbourhood estimation")
    LOG.info("Loading session from: %s", session)
    with open(session) as handle:
        loaded = Session.from_json(handle)

    LOG.info("Computing gene neighbourhood statistics")
    estimate_options = {"max_gap": max_gap, "samples": samples, "scale": scale}
    results = context.estimate_neighbourhood(loaded, **estimate_options)

    # The table is only produced when a destination handle was supplied
    if output:
        LOG.info("Writing GNE table to %s", output.name)
        table_options = {
            "hide_headers": hide_headers,
            "delimiter": delimiter,
            "decimals": decimals,
        }
        output.write(summarise_gne(results, **table_options))

    # Plotting always runs, regardless of whether a table was written
    plot_gne(results, output=plot)
    LOG.info("Done.")
Example #2
0
def plot_clusters(
    session,
    cluster_numbers=None,
    score_threshold=None,
    organisms=None,
    scaffolds=None,
    plot_outfile=None,
    max_clusters=50,
    testing=False,
):
    """Plot clusters stored in a session file using clinker.

    Args:
        session (str): Path to a session JSON file.
        cluster_numbers (list): Cluster numbers to include.
        score_threshold (float): Minimum score for a cluster to be included.
        organisms (list): Organism filtering regular expressions; clusters
            from matching organisms are included.
        scaffolds (list): Clusters on these scaffolds are included.
        plot_outfile (str): Path to a file for the final plot.
        max_clusters (int): Maximum number of clusters plotted regardless
            of the other filters.
        testing (bool): When true, the clinker plotting call is skipped so
            that no dynamic plot is served during tests.
    """
    LOG.info("Starting generation of cluster plot with clinker.")
    loaded = Session.from_file(session)

    # Select clusters with the same filters the extract_clusters module uses
    cluster_filters = (
        cluster_numbers,
        score_threshold,
        organisms,
        scaffolds,
        max_clusters,
    )
    hierarchies = get_sorted_cluster_hierarchies(loaded, *cluster_filters)

    # Build the query cluster; fall back to "N.A." when no query file is stored
    query = loaded.query
    query_source = loaded.params.get("query_file", "N.A.")
    query_cluster = cblaster_to_clinker_cluster(
        query,
        cluster_label="Query Cluster",
        scaffold_accession=query_source,
    )

    # Globaligner holding the mocked clusters/alignments/links for clinker
    globaligner = clusters_to_clinker_globaligner(query_cluster, hierarchies)

    if not testing:
        clinker_plot_clusters(globaligner, plot_outfile, use_file_order=True)

    if plot_outfile:
        LOG.info(f"Plot file can be found at {plot_outfile}")

    LOG.info("Done!")
Example #3
0
def extract(
    session,
    delimiter=None,
    name_only=False,
    extract_seqs=False,
    output=None,
    queries=None,
    organisms=None,
    scaffolds=None,
):
    """Extract subject sequences from a cblaster session.

    Args:
        session (str): Path to a JSON file encoding a cblaster Session.
        delimiter (str): Sequence description delimiter character.
        name_only (bool): Do not save sequence descriptions.
        extract_seqs (bool): Run ``extract_sequences`` on the selected
            records and format the result as FASTA.
        output (str): Output file name; the text is printed when falsy.
        queries (list): Query sequence names.
        organisms (list): Organism filtering regular expressions.
        scaffolds (list): Scaffold names and ranges.
    """
    LOG.info("Starting cblaster extraction")
    LOG.info("Loading session from: %s", session)
    with open(session) as handle:
        loaded = Session.from_json(handle)

    LOG.info("Extracting subject sequences matching filters")
    record_filters = {
        "queries": queries,
        "organisms": organisms,
        "scaffolds": scaffolds,
    }
    records = extract_records(loaded, **record_filters)

    if extract_seqs:
        extract_sequences(loaded, records)

    text = format_records(
        records,
        delimiter=delimiter,
        to_fasta=extract_seqs,
        name_only=name_only,
    )

    # Without an output path the formatted text goes to standard output
    if not output:
        print(text)
    else:
        with open(output, "w") as out_handle:
            LOG.info("Writing output to %s", out_handle.name)
            out_handle.write(text)

    LOG.info("Done!")
Example #4
0
def extract(
    session,
    in_cluster=True,
    delimiter=None,
    name_only=False,
    download=False,
    output=None,
    queries=None,
    organisms=None,
    scaffolds=None,
):
    """Extract subject sequences from a cblaster session.

    Parameters:
        session (str): Path to a JSON file encoding a cblaster Session.
        in_cluster (bool): Only sequences in clusters are extracted.
        delimiter (str): Sequence description delimiter character.
        name_only (bool): Do not save sequence descriptions.
        download (bool): Download hit sequences from NCBI and format the
            output as FASTA.
        output (str): Output file name; the text is printed when falsy.
        queries (list): Query sequence names.
        organisms (list): Organism filtering regular expressions.
        scaffolds (list): Scaffold names and ranges.

    Returns:
        The records selected by ``extract_records``; when ``download`` is
        set each record also carries a ``"sequence"`` entry.
    """
    LOG.info("Starting cblaster extraction")
    LOG.info("Loading session from: %s", session)
    with open(session) as handle:
        loaded = Session.from_json(handle)

    LOG.info("Extracting subject sequences matching filters")
    record_filters = {
        "in_cluster": in_cluster,
        "queries": queries,
        "organisms": organisms,
        "scaffolds": scaffolds,
    }
    records = extract_records(loaded, **record_filters)

    if download:
        LOG.info("Fetching %i sequences from NCBI", len(records))
        fetched = efetch_sequences([entry.get("name") for entry in records])
        # Attach each fetched sequence (None when missing) to its record
        for entry in records:
            entry["sequence"] = fetched.get(entry["name"])

    # FASTA when sequences were downloaded, otherwise newline separated IDs
    text = format_records(
        records,
        delimiter=delimiter,
        to_fasta=download,
        name_only=name_only,
    )

    if not output:
        print(text)
    else:
        with open(output, "w") as out_handle:
            LOG.info("Writing output to %s", out_handle.name)
            out_handle.write(text)

    LOG.info("Done!")
    return records
Example #5
0
def plot_session_file(path, output=None):
    """Load the session JSON file at ``path`` and plot it.

    Args:
        path (str): Path to a cblaster session JSON file.
        output: Forwarded to ``plot_session`` as its ``output`` argument.
    """
    with open(path) as handle:
        loaded = Session.from_json(handle)

    plot_session(loaded, output=output)
Example #6
0
def cblaster(
    query_file=None,
    query_ids=None,
    mode=None,
    json_db=None,
    database=None,
    gap=20000,
    unique=3,
    min_hits=3,
    min_identity=30,
    min_coverage=50,
    max_evalue=0.01,
    entrez_query=None,
    output=None,
    output_hide_headers=False,
    output_delimiter=None,
    output_decimals=4,
    binary=None,
    binary_hide_headers=True,
    binary_delimiter=None,
    binary_key=len,
    binary_attr="identity",
    binary_decimals=4,
    rid=None,
    require=None,
    session_file=None,
    indent=None,
    plot=False,
    recompute=False,
    blast_file=None,
    ipg_file=None,
    hitlist_size=None,
):
    """Run cblaster.

    This function is the central workflow for the entire cblaster package.
    Existing session files are loaded (and optionally re-filtered) when all
    of them exist; otherwise a new local or remote search is performed.

    Arguments:
        query_file (str): Path to FASTA format query file
        query_ids (list): NCBI protein sequence identifiers
        mode (str): Search mode ('local' or 'remote')
        json_db (str): JSON database created with cblaster makedb
        database (str): Search database (NCBI if remote, DIAMOND if local)
        gap (int): Maximum gap between cluster hits
        unique (int): Minimum number of query sequences with hits in clusters
        min_hits (int): Minimum number of hits in clusters
        min_identity (float): Minimum identity (%) cutoff
        min_coverage (float): Minimum coverage (%) cutoff
        max_evalue (float): Maximum e-value threshold
        entrez_query (str): NCBI Entrez query to filter search database
        output (str): Path to cblaster summary output file; the summary is
            written to stdout when this is falsy
        output_hide_headers (bool): Hide headers in summary table
        output_delimiter (str): Delimiter used in summary table
        output_decimals (int): Total decimal places in hit scores in summary table
        binary (str): Path to cblaster binary output file
        binary_hide_headers (bool): Hide headers in binary table
        binary_delimiter (str): Delimiter used in binary table
        binary_key (callable): Key function used in binary table (len, max or sum)
        binary_attr (str): Hit attribute used for calculating cell values in binary table
        binary_decimals (int): Total decimal places in cell values in binary table
        rid (str): NCBI BLAST search request identifier (RID)
        require (list): Query sequences that must be in hit clusters
        session_file (list): Paths to cblaster session JSON files. All are
            loaded when every path exists; otherwise the new session is
            written to the first one
        indent (int): Total spaces to indent JSON files
        plot (bool or str): Path to cblaster plot HTML file, or True to plot
            without an explicit output path
        recompute (bool or str): True to re-filter a loaded session, or a
            path to additionally write the recomputed session JSON file
        blast_file (str): Forwarded to the local/remote search
        ipg_file (str): Forwarded to the genomic context search
        hitlist_size (int): Forwarded to the remote search
    Returns:
        Session: cblaster search Session object
    Raises:
        ValueError: If a new search is required and ``mode`` is neither
            'local' nor 'remote'
    """

    if session_file and all(Path(sf).exists() for sf in session_file):
        LOG.info("Loading session(s) %s", session_file)
        session = Session.from_files(session_file)

        if recompute:
            LOG.info("Filtering session with new thresholds")
            context.filter_session(
                session,
                min_identity,
                min_coverage,
                max_evalue,
                gap,
                unique,
                min_hits,
                require,
            )
            # recompute may be the literal True (filter only) or a file path
            if recompute is not True:
                LOG.info("Writing recomputed session to %s", recompute)
                with open(recompute, "w") as fp:
                    session.to_json(fp, indent=indent)
    else:
        session = Session(
            queries=query_ids if query_ids else [],
            sequences=helpers.get_sequences(
                query_file=query_file,
                query_ids=query_ids,
            ),
            params={
                "mode": mode,
                "database": database,
                "min_identity": min_identity,
                "min_coverage": min_coverage,
                "max_evalue": max_evalue,
            },
        )

        if query_file:
            # get_sequences() returns OrderedDict, so save keys to
            # preserve query order
            session.queries = list(session.sequences)
            session.params["query_file"] = query_file

        if json_db:
            session.params["json_db"] = json_db

        if mode == "local":
            LOG.info("Starting cblaster in local mode")
            results = local.search(
                database,
                sequences=session.sequences,
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                blast_file=blast_file,
            )
        elif mode == "remote":
            LOG.info("Starting cblaster in remote mode")
            if entrez_query:
                session.params["entrez_query"] = entrez_query
            rid, results = remote.search(
                sequences=session.sequences,
                rid=rid,
                database=database,
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                entrez_query=entrez_query,
                blast_file=blast_file,
                hitlist_size=hitlist_size,
            )
            session.params["rid"] = rid
        else:
            # Without this guard an unknown mode would surface later as an
            # UnboundLocalError on `results`
            raise ValueError(
                f"Unsupported search mode {mode!r}; expected 'local' or 'remote'"
            )

        LOG.info("Found %i hits meeting score thresholds", len(results))
        LOG.info("Fetching genomic context of hits")

        # Query order is only forwarded for GenBank/EMBL query files.
        # query_file is None when only query_ids were given, so guard
        # before calling endswith().
        ordered_extensions = (".gbk", ".gb", ".genbank", ".gbff", ".embl", ".emb")
        if query_file and query_file.endswith(ordered_extensions):
            query_sequence_order = list(session.sequences.keys())
        else:
            query_sequence_order = None

        session.organisms = context.search(
            results,
            unique=unique,
            min_hits=min_hits,
            gap=gap,
            require=require,
            json_db=json_db,
            ipg_file=ipg_file,
            query_sequence_order=query_sequence_order,
        )

        if session_file:
            LOG.info("Writing current search session to %s", session_file[0])
            if len(session_file) > 1:
                LOG.warning("Multiple session files specified, using first")
            with open(session_file[0], "w") as fp:
                session.to_json(fp, indent=indent)

    if binary:
        LOG.info("Writing binary summary table to %s", binary)
        # Context manager so the table file is flushed and closed
        with open(binary, "w") as fp:
            session.format(
                "binary",
                fp,
                hide_headers=binary_hide_headers,
                delimiter=binary_delimiter,
                key=binary_key,
                attr=binary_attr,
                decimals=binary_decimals,
            )

    # A falsy output (or sys.stdout itself) means the summary goes to stdout
    to_stdout = not output or output is sys.stdout
    LOG.info("Writing summary to %s", "stdout" if to_stdout else output)
    summary_options = {
        "hide_headers": output_hide_headers,
        "delimiter": output_delimiter,
        "decimals": output_decimals,
    }
    if to_stdout:
        session.format("summary", fp=sys.stdout, **summary_options)
    else:
        with open(output, "w") as fp:
            session.format("summary", fp=fp, **summary_options)

    if plot:
        # plot=True means "plot without an explicit output path"
        plot = None if plot is True else plot
        plot_session(session, output=plot)

    LOG.info("Done.")
    return session
Example #7
0
def cblaster(
    query_file=None,
    query_ids=None,
    query_profiles=None,
    mode=None,
    databases=None,
    database_pfam=None,
    gap=20000,
    unique=3,
    min_hits=3,
    min_identity=30,
    min_coverage=50,
    max_evalue=0.01,
    percentage=None,
    entrez_query=None,
    output=None,
    output_hide_headers=False,
    output_delimiter=None,
    output_decimals=4,
    output_sort_clusters=False,
    binary=None,
    binary_hide_headers=True,
    binary_delimiter=None,
    binary_key=len,
    binary_attr="identity",
    binary_decimals=4,
    rid=None,
    require=None,
    session_file=None,
    indent=None,
    plot=False,
    max_plot_clusters=50,
    recompute=False,
    blast_file=None,
    ipg_file=None,
    hitlist_size=None,
    cpus=None,
    intermediate_genes=False,
    intermediate_gene_distance=5000,
    intermediate_max_clusters=100,
    testing=False,
):
    """Run cblaster.

    This function is the central workflow for the entire cblaster package.
    Existing session files are loaded (and optionally re-filtered) when all
    of them exist; otherwise a new search is performed in the given mode.

    Arguments:
        query_file (str): Path to FASTA format query file
        query_ids (list): NCBI protein sequence identifiers
        query_profiles(list): Pfam profile identifiers
        mode (str): Search mode; the code handles 'local', 'remote', 'hmm',
            'combi_local' and 'combi_remote'
        databases (list): Search databases (NCBI if remote, DIAMOND if local).
            The 'combi_' modes require exactly two entries, the first being
            the database used for the HMM search
        database_pfam (str): Path to pfam db or where to download it
        gap (int): Maximum gap between cluster hits
        unique (int): Minimum number of query sequences with hits in clusters
        min_hits (int): Minimum number of hits in clusters
        min_identity (float): Minimum identity (%) cutoff
        min_coverage (float): Minimum coverage (%) cutoff
        max_evalue (float): Maximum e-value threshold
        percentage (int): % of query genes needed to be present in cluster
        entrez_query (str): NCBI Entrez query to filter search database
        output (str): Path to cblaster summary output file; the summary is
            written to stdout when this is falsy
        output_hide_headers (bool): Hide headers in summary table
        output_delimiter (str): Delimiter used in summary table
        output_decimals (int): Total decimal places in hit scores in summary table
        output_sort_clusters (bool): If the clusters in the final summary table need to be sorted
        binary (str): Path to cblaster binary output file
        binary_hide_headers (bool): Hide headers in binary table
        binary_delimiter (str): Delimiter used in binary table
        binary_key (callable): Key function used in binary table (len, max or sum)
        binary_attr (str): Hit attribute used for calculating cell values in binary table
        binary_decimals (int): Total decimal places in cell values in binary table
        rid (str): NCBI BLAST search request identifier (RID)
        require (list): Query sequences that must be in hit clusters
        session_file (list): Paths to cblaster session JSON files. All are
            loaded when every path exists; otherwise the new session is
            written to the first one
        indent (int): Total spaces to indent JSON files
        plot (bool or str): Path to cblaster plot HTML file, or True to plot
            without an explicit output path
        max_plot_clusters (int): maximum clusters that are plotted when -osc (sort on score) argument is used
        recompute (bool or str): True to re-filter a loaded session, or a
            path to additionally write the recomputed session JSON file
        blast_file (str): path to file to save blast output
        ipg_file (str): path to file to save ipg output
        hitlist_size (int): Number of database sequences to keep
        cpus (int): number of cpu's to use when blasting.
        intermediate_genes (bool): Signifies if intermediate genes have to be shown
        intermediate_gene_distance (int): the maximum allowed distance between the
         edge of a cluster and an intermediate gene.
        intermediate_max_clusters (int): the maximum amount of clusters for which intermediate
         genes will be fetched, since this can become expensive for remote searches
        testing (bool): flag to make sure certain code does not run when testing

    Returns:
        Session: cblaster search Session object

    Raises:
        RuntimeError: If a 'combi_' mode is used without exactly two databases
    """
    if session_file and all(Path(sf).exists() for sf in session_file):
        LOG.info("Loading session(s) %s", session_file)
        session = Session.from_files(session_file)

        if recompute:
            LOG.info("Filtering session with new thresholds")
            context.filter_session(
                session,
                min_identity,
                min_coverage,
                max_evalue,
                gap,
                unique,
                min_hits,
                require,
                percentage,
            )

            if intermediate_genes:
                find_intermediate_genes(session, intermediate_gene_distance,
                                        intermediate_max_clusters)

            # recompute may be the literal True (filter only) or a file path
            if recompute is not True:
                LOG.info("Writing recomputed session to %s", recompute)
                session.params["min_identity"] = min_identity
                session.params["min_coverage"] = min_coverage
                session.params["max_evalue"] = max_evalue
                session.params["require"] = require
                with open(recompute, "w") as fp:
                    session.to_json(fp, indent=indent)
    else:
        # Work on a private copy: the combined modes drop the first database
        # further down, which must not mutate the caller's list.
        if isinstance(databases, (list, tuple)):
            databases = list(databases)

        # Create a cblaster Cluster object from query input
        query = helpers.parse_query_sequences(
            query_file=query_file,
            query_ids=query_ids,
            query_profiles=query_profiles,
        )

        # Create a cblaster Session
        session = Session(
            query=query,
            queries=query.names,
            params={
                "mode": mode,
                "database": databases,
                "min_identity": min_identity,
                "min_coverage": min_coverage,
                "max_evalue": max_evalue,
                "require": require,
            },
        )

        if query_file:
            # Remember which query file this session was created from
            session.params["query_file"] = query_file

        sqlite_db = None
        session.params["rid"] = rid

        def _search_context(hits, **extra):
            """Fetch genomic context for hits and add the organisms to the session."""
            LOG.info("Fetching genomic context of hits")
            organisms = context.search(
                hits,
                unique=unique,
                min_hits=min_hits,
                gap=gap,
                require=require,
                ipg_file=ipg_file,
                # Read at call time so changes made to the session by an
                # earlier search step are picked up
                query_sequence_order=session.queries,
                percentage=percentage,
                **extra,
            )
            session.organisms.extend(organisms)

        if "combi" in mode and len(databases) != 2:
            raise RuntimeError("Expected two databases for 'combi_' modes")

        if mode in ("hmm", "combi_local", "combi_remote"):
            sqlite_db = helpers.find_sqlite_db(databases[0])
            results = hmm_search.perform_hmmer(fasta=databases[0],
                                               query_profiles=query_profiles,
                                               pfam=database_pfam,
                                               session=session)

            # Delete first (FASTA) database when doing combined searches
            # Expect .dmnd/NCBI database name for local/remote, respectively
            if "combi" in mode:
                del databases[0]

            LOG.info("Found %i hits meeting score thresholds for hmm search",
                     len(results))
            _search_context(results, sqlite_db=sqlite_db)

        # When running combined modes, run local/remote search right after HMM search
        if mode == "combi_local":
            mode = "local"
        elif mode == "combi_remote":
            mode = "remote"

        if mode == "local":
            LOG.info("Starting cblaster in local mode")
            sqlite_db = helpers.find_sqlite_db(databases[0])
            results = local.search(
                databases[0],
                sequences=session.query.sequences,
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                blast_file=blast_file,
                cpus=cpus,
            )
            LOG.info("Found %i hits meeting score thresholds for local search",
                     len(results))
            _search_context(results, sqlite_db=sqlite_db)

        elif mode == "remote":
            LOG.info("Starting cblaster in remote mode")

            if entrez_query:
                session.params["entrez_query"] = entrez_query
            rid, results = remote.search(
                sequences=session.query.sequences,
                rid=rid,
                database=databases[0],
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                entrez_query=entrez_query,
                blast_file=blast_file,
                hitlist_size=hitlist_size,
            )
            session.params["rid"] = rid
            LOG.info(
                "Found %i hits meeting score thresholds for remote search",
                len(results))
            # The remote context search is not given a sqlite database
            _search_context(results)

        if sqlite_db:
            session.params["sqlite_db"] = str(sqlite_db)

        if intermediate_genes:
            find_intermediate_genes(session, intermediate_gene_distance,
                                    intermediate_max_clusters)

        if session_file:
            LOG.info("Writing current search session to %s", session_file[0])
            if len(session_file) > 1:
                LOG.warning("Multiple session files specified, using first")
            with open(session_file[0], "w") as fp:
                session.to_json(fp, indent=indent)

    if binary:
        LOG.info("Writing binary summary table to %s", binary)
        # Context manager so the table file is flushed and closed
        with open(binary, "w") as fp:
            session.format(
                "binary",
                fp,
                hide_headers=binary_hide_headers,
                delimiter=binary_delimiter,
                key=binary_key,
                attr=binary_attr,
                decimals=binary_decimals,
                sort_clusters=output_sort_clusters,
            )

    # Use the same truthiness test for the log line and the destination
    LOG.info("Writing summary to %s", output if output else "stdout")
    summary_options = {
        "hide_headers": output_hide_headers,
        "delimiter": output_delimiter,
        "decimals": output_decimals,
        "sort_clusters": output_sort_clusters,
    }
    if output:
        with open(output, "w") as fp:
            session.format("summary", fp=fp, **summary_options)
    else:
        session.format("summary", fp=sys.stdout, **summary_options)

    if plot:
        # plot=True means "plot without an explicit output path"
        plot = None if plot is True else plot
        plot_session(
            session,
            output=plot,
            sort_clusters=output_sort_clusters,
            max_clusters=max_plot_clusters,
            testing=testing,
        )

    LOG.info("Done.")
    return session
Example #8
0
def extract_clusters(
    session,
    output_dir,
    prefix="",
    cluster_numbers=None,
    score_threshold=None,
    organisms=None,
    scaffolds=None,
    format_="genbank",
    max_clusters=50,
):
    """Extracts Cluster objects from a Session file and writes them to a file.

    If BiG-SCAPE format is chosen, a 'gene_kind' qualifier is added to each CDS
    feature to indicate what genes are part of the core of the cluster.

    Genes that are flagged as required are considered core. If no genes are flagged as
    required, all genes are considered to be core genes.

    An additional qualifier is provided called 'cluster_role'. This qualifier allows
    the identification between hits found against required genes of the query, hits
    found against any gene of the query and intermediate genes.

    Args:
        session (string): path to a session.json file
        output_dir (string): path to a directory for writing the output files
        prefix (string): string to start the file name of each cluster with
        cluster_numbers (list): cluster numbers to include
        score_threshold (float): minimum score in order for a cluster to be included
        organisms (list): Organism filtering regular expressions, clusters for
         these organisms are included
        scaffolds(list): clusters on these scaffolds are included
        format_ (str): the format that the extracted cluster should have
        max_clusters (int): the maximum amount of clusters extracted regardless of filters

    Raises:
        SystemExit: with exit code 0 when no cluster meets the filtering criteria
    """
    # Previously logged a message copied from the clinker plotting workflow
    LOG.info("Starting cblaster cluster extraction")
    LOG.info("Loading session from: %s", session)
    with open(session) as fp:
        session = Session.from_json(fp)

    LOG.info("Extracting clusters that match the filters")
    cluster_hierarchy = get_sorted_cluster_hierarchies(
        session,
        cluster_numbers,
        score_threshold,
        organisms,
        scaffolds,
        max_clusters,
    )

    LOG.info("Extracted %i clusters.", len(cluster_hierarchy))
    if len(cluster_hierarchy) == 0:
        # Nothing to write is treated as a clean exit, not an error
        LOG.info("There are no clusters that meet the filtering criteria. Exiting...")
        raise SystemExit(0)

    LOG.info("Writing genbank files")
    create_genbanks_from_clusters(
        session,
        cluster_hierarchy,
        output_dir,
        prefix,
        format_,
    )

    LOG.info("Clusters have been written to %s", output_dir)
    LOG.info("Done!")