def gne(
    session,
    output=None,
    max_gap=100000,
    samples=100,
    scale="linear",
    plot=None,
    hide_headers=False,
    delimiter=",",
    decimals=4,
):
    """Estimate gene neighbourhood statistics for a saved cblaster session.

    Args:
        session (str): Path to a cblaster session JSON file
        output (file): Open, writable file object for the GNE table; the
            table is only built and written when this is given
        max_gap: Forwarded to context.estimate_neighbourhood
        samples: Forwarded to context.estimate_neighbourhood
        scale: Forwarded to context.estimate_neighbourhood
        plot: Forwarded to plot_gne as its ``output`` argument
        hide_headers: Forwarded to summarise_gne
        delimiter: Forwarded to summarise_gne
        decimals: Forwarded to summarise_gne
    """
    LOG.info("Starting cblaster gene neighbourhood estimation")
    LOG.info("Loading session from: %s", session)
    with open(session) as handle:
        loaded = Session.from_json(handle)

    LOG.info("Computing gene neighbourhood statistics")
    results = context.estimate_neighbourhood(
        loaded,
        max_gap=max_gap,
        samples=samples,
        scale=scale,
    )

    # The table is optional; the plot is always produced.
    if output:
        LOG.info("Writing GNE table to %s", output.name)
        output.write(
            summarise_gne(
                results,
                hide_headers=hide_headers,
                delimiter=delimiter,
                decimals=decimals,
            )
        )

    plot_gne(results, output=plot)
    LOG.info("Done.")
def plot_clusters(
    session,
    cluster_numbers=None,
    score_threshold=None,
    organisms=None,
    scaffolds=None,
    plot_outfile=None,
    max_clusters=50,
    testing=False,
):
    """Plot Cluster objects from a session file using clinker.

    Args:
        session (str): path to a session.json file
        cluster_numbers (list): cluster numbers to include
        score_threshold (float): minimum score a cluster needs in order to be
            included
        organisms (list): organism filtering regular expressions; clusters
            for these organisms are included
        scaffolds (list): clusters on these scaffolds are included
        plot_outfile (str): path to a file for the final plot
        max_clusters (int): the maximum number of clusters plotted regardless
            of filters
        testing (bool): when True the clinker plot call is skipped, so no
            dynamic plot is served while running tests
    """
    LOG.info("Starting generation of cluster plot with clinker.")
    loaded = Session.from_file(session)

    # Select and order the clusters with the shared filtering helper
    hierarchies = get_sorted_cluster_hierarchies(
        loaded,
        cluster_numbers,
        score_threshold,
        organisms,
        scaffolds,
        max_clusters,
    )

    # Build the query cluster from the query stored in the session
    query_cluster = cblaster_to_clinker_cluster(
        loaded.query,
        cluster_label="Query Cluster",
        scaffold_accession=loaded.params.get("query_file", "N.A."),
    )

    # Wrap the query and the selected clusters in a clinker Globaligner
    globaligner = clusters_to_clinker_globaligner(query_cluster, hierarchies)

    if not testing:
        clinker_plot_clusters(globaligner, plot_outfile, use_file_order=True)

    if plot_outfile:
        LOG.info(f"Plot file can be found at {plot_outfile}")

    LOG.info("Done!")
def extract(
    session,
    delimiter=None,
    name_only=False,
    extract_seqs=False,
    output=None,
    queries=None,
    organisms=None,
    scaffolds=None,
):
    """Extract subject sequences from a cblaster session file.

    Args:
        session (str): path to json file encoding a cblaster Session object
        delimiter (str): Sequence description delimiter character
        name_only (bool): Do not save sequence descriptions
        extract_seqs (bool): Also extract the sequences of the matching
            records and format the result as FASTA
        output (str): Output file name; the text is printed when not given
        queries (list): Query sequence names
        organisms (list): Organism filtering regular expressions
        scaffolds (list): Scaffold names and ranges
    """
    LOG.info("Starting cblaster extraction")
    LOG.info("Loading session from: %s", session)
    with open(session) as handle:
        loaded = Session.from_json(handle)

    LOG.info("Extracting subject sequences matching filters")
    records = extract_records(
        loaded,
        queries=queries,
        organisms=organisms,
        scaffolds=scaffolds,
    )

    if extract_seqs:
        extract_sequences(loaded, records)

    text = format_records(
        records,
        delimiter=delimiter,
        to_fasta=extract_seqs,
        name_only=name_only,
    )

    # Print when no output path was requested, otherwise write to the file
    if not output:
        print(text)
    else:
        with open(output, "w") as out:
            LOG.info("Writing output to %s", out.name)
            out.write(text)

    LOG.info("Done!")
def extract(
    session,
    in_cluster=True,
    delimiter=None,
    name_only=False,
    download=False,
    output=None,
    queries=None,
    organisms=None,
    scaffolds=None,
):
    """Extract subject sequences from a cblaster session file.

    Parameters:
        session (str): path to a JSON file encoding a cblaster Session object
        in_cluster (bool): Only sequences in clusters are extracted
        delimiter (str): Sequence description delimiter character
        name_only (bool): Do not save sequence descriptions
        download (bool): Download hit sequences from NCBI
        output (str): Output file name; the text is printed when not given
        queries (list): Query sequence names
        organisms (list): Organism filtering regular expressions
        scaffolds (list): Scaffold names and ranges

    Returns:
        The records produced by extract_records; when ``download`` is set each
        record also carries a "sequence" entry.
    """
    LOG.info("Starting cblaster extraction")
    LOG.info("Loading session from: %s", session)
    with open(session) as handle:
        loaded = Session.from_json(handle)

    LOG.info("Extracting subject sequences matching filters")
    records = extract_records(
        loaded,
        in_cluster=in_cluster,
        queries=queries,
        organisms=organisms,
        scaffolds=scaffolds,
    )

    if download:
        LOG.info("Fetching %i sequences from NCBI", len(records))
        names = [entry.get("name") for entry in records]
        fetched = efetch_sequences(names)
        # Attach each fetched sequence to its record; missing names map to None
        for entry in records:
            entry["sequence"] = fetched.get(entry["name"])

    # FASTA format if downloading from NCBI, otherwise newline separated IDs
    text = format_records(
        records,
        delimiter=delimiter,
        to_fasta=download,
        name_only=name_only,
    )

    if not output:
        print(text)
    else:
        with open(output, "w") as out:
            LOG.info("Writing output to %s", out.name)
            out.write(text)

    LOG.info("Done!")
    return records
def plot_session_file(path, output=None):
    """Load a session from a JSON file and plot it.

    Args:
        path (str): Path to a session JSON file
        output: Forwarded to plot_session as its ``output`` argument
    """
    with open(path) as handle:
        loaded = Session.from_json(handle)
    plot_session(loaded, output=output)
def cblaster(
    query_file=None,
    query_ids=None,
    mode=None,
    json_db=None,
    database=None,
    gap=20000,
    unique=3,
    min_hits=3,
    min_identity=30,
    min_coverage=50,
    max_evalue=0.01,
    entrez_query=None,
    output=None,
    output_hide_headers=False,
    output_delimiter=None,
    output_decimals=4,
    binary=None,
    binary_hide_headers=True,
    binary_delimiter=None,
    binary_key=len,
    binary_attr="identity",
    binary_decimals=4,
    rid=None,
    require=None,
    session_file=None,
    indent=None,
    plot=False,
    recompute=False,
    blast_file=None,
    ipg_file=None,
    hitlist_size=None,
):
    """Run cblaster.

    This function is the central workflow for the entire cblaster package.

    When every path in ``session_file`` already exists, the session(s) are
    loaded (and optionally re-filtered) instead of running a new search.
    Otherwise a new search is run and, if ``session_file`` was given, the
    resulting session is saved to the first path in it.

    Arguments:
        query_file (str): Path to FASTA format query file
        query_ids (list): NCBI protein sequence identifiers
        mode (str): Search mode ('local' or 'remote')
        json_db (str): JSON database created with cblaster makedb
        database (str): Search database (NCBI if remote, DIAMOND if local)
        gap (int): Maximum gap between cluster hits (earlier docs say
            kilobase; the 20000 default suggests base pairs - TODO confirm)
        unique (int): Minimum number of query sequences with hits in clusters
        min_hits (int): Minimum number of hits in clusters
        min_identity (float): Minimum identity (%) cutoff
        min_coverage (float): Minimum coverage (%) cutoff
        max_evalue (float): Maximum e-value threshold
        entrez_query (str): NCBI Entrez query to filter search database
        output (str): Path to cblaster summary output file; the summary is
            written to stdout when not given
        output_hide_headers (bool): Hide headers in summary table
        output_delimiter (str): Delimiter used in summary table
        output_decimals (int): Total decimal places in hit scores in summary table
        binary (str): Path to cblaster binary output file
        binary_hide_headers (bool): Hide headers in binary table
        binary_delimiter (str): Delimiter used in binary table
        binary_key (callable): Key function used in binary table (len, max or sum)
        binary_attr (str): Hit attribute used for calculating cell values in binary table
        binary_decimals (int): Total decimal places in cell values in binary table
        rid (str): NCBI BLAST search request identifier (RID)
        require (list): Query sequences that must be in hit clusters
        session_file (list): Path(s) to cblaster session JSON file(s)
        indent (int): Total spaces to indent JSON files
        plot (bool or str): True, or path to cblaster plot HTML file
        recompute (bool or str): True, or path to recomputed session JSON file
        blast_file: Forwarded to the local/remote search function
        ipg_file: Forwarded to context.search
        hitlist_size: Forwarded to the remote search function

    Returns:
        Session: cblaster search Session object

    Raises:
        ValueError: If a new search is needed and ``mode`` is neither
            'local' nor 'remote'.
    """
    if session_file and all(Path(sf).exists() for sf in session_file):
        LOG.info("Loading session(s) %s", session_file)
        session = Session.from_files(session_file)

        if recompute:
            LOG.info("Filtering session with new thresholds")
            context.filter_session(
                session,
                min_identity,
                min_coverage,
                max_evalue,
                gap,
                unique,
                min_hits,
                require,
            )

            # recompute is either True (filter only) or a path to save to
            if recompute is not True:
                LOG.info("Writing recomputed session to %s", recompute)
                with open(recompute, "w") as fp:
                    session.to_json(fp, indent=indent)
    else:
        session = Session(
            queries=query_ids if query_ids else [],
            sequences=helpers.get_sequences(
                query_file=query_file,
                query_ids=query_ids,
            ),
            params={
                "mode": mode,
                "database": database,
                "min_identity": min_identity,
                "min_coverage": min_coverage,
                "max_evalue": max_evalue,
            },
        )

        if query_file:
            # get_sequences() returns OrderedDict, so save keys to
            # preserve query order
            session.queries = list(session.sequences)
            session.params["query_file"] = query_file

        if json_db:
            session.params["json_db"] = json_db

        if mode == "local":
            LOG.info("Starting cblaster in local mode")
            results = local.search(
                database,
                sequences=session.sequences,
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                blast_file=blast_file,
            )
        elif mode == "remote":
            LOG.info("Starting cblaster in remote mode")
            if entrez_query:
                session.params["entrez_query"] = entrez_query
            rid, results = remote.search(
                sequences=session.sequences,
                rid=rid,
                database=database,
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                entrez_query=entrez_query,
                blast_file=blast_file,
                hitlist_size=hitlist_size,
            )
            session.params["rid"] = rid
        else:
            # Without this, an unsupported mode only surfaced later as an
            # UnboundLocalError on `results`.
            raise ValueError(
                "Unsupported search mode %r, expected 'local' or 'remote'" % (mode,)
            )

        LOG.info("Found %i hits meeting score thresholds", len(results))
        LOG.info("Fetching genomic context of hits")

        # The query order is only passed on for GenBank/EMBL style query
        # files. query_file is None when searching with query_ids only, so
        # guard before calling endswith().
        genbank_extensions = (".gbk", ".gb", ".genbank", ".gbff", ".embl", ".emb")
        if query_file and query_file.endswith(genbank_extensions):
            query_sequence_order = list(session.sequences.keys())
        else:
            query_sequence_order = None

        session.organisms = context.search(
            results,
            unique=unique,
            min_hits=min_hits,
            gap=gap,
            require=require,
            json_db=json_db,
            ipg_file=ipg_file,
            query_sequence_order=query_sequence_order,
        )

        if session_file:
            LOG.info("Writing current search session to %s", session_file[0])
            if len(session_file) > 1:
                LOG.warning("Multiple session files specified, using first")
            with open(session_file[0], "w") as fp:
                session.to_json(fp, indent=indent)

    if binary:
        LOG.info("Writing binary summary table to %s", binary)
        # Context manager so the table is flushed and the handle closed
        with open(binary, "w") as fp:
            session.format(
                "binary",
                fp,
                hide_headers=binary_hide_headers,
                delimiter=binary_delimiter,
                key=binary_key,
                attr=binary_attr,
                decimals=binary_decimals,
            )

    LOG.info("Writing summary to %s", output if output else "stdout")
    if output:
        with open(output, "w") as fp:
            session.format(
                "summary",
                fp=fp,
                hide_headers=output_hide_headers,
                delimiter=output_delimiter,
                decimals=output_decimals,
            )
    else:
        session.format(
            "summary",
            fp=sys.stdout,
            hide_headers=output_hide_headers,
            delimiter=output_delimiter,
            decimals=output_decimals,
        )

    if plot:
        # plot=True means "plot without an output file"
        plot = None if plot is True else plot
        plot_session(session, output=plot)

    LOG.info("Done.")
    return session
def cblaster(
    query_file=None,
    query_ids=None,
    query_profiles=None,
    mode=None,
    databases=None,
    database_pfam=None,
    gap=20000,
    unique=3,
    min_hits=3,
    min_identity=30,
    min_coverage=50,
    max_evalue=0.01,
    percentage=None,
    entrez_query=None,
    output=None,
    output_hide_headers=False,
    output_delimiter=None,
    output_decimals=4,
    output_sort_clusters=False,
    binary=None,
    binary_hide_headers=True,
    binary_delimiter=None,
    binary_key=len,
    binary_attr="identity",
    binary_decimals=4,
    rid=None,
    require=None,
    session_file=None,
    indent=None,
    plot=False,
    max_plot_clusters=50,
    recompute=False,
    blast_file=None,
    ipg_file=None,
    hitlist_size=None,
    cpus=None,
    intermediate_genes=False,
    intermediate_gene_distance=5000,
    intermediate_max_clusters=100,
    testing=False,
):
    """Run cblaster.

    This function is the central workflow for the entire cblaster package.

    When every path in ``session_file`` already exists, the session(s) are
    loaded (and optionally re-filtered) instead of running a new search.
    Otherwise a new search is run and, if ``session_file`` was given, the
    resulting session is saved to the first path in it.

    Arguments:
        query_file (str): Path to FASTA format query file
        query_ids (list): NCBI protein sequence identifiers
        query_profiles (list): Pfam profile identifiers
        mode (str): Search mode ('local', 'remote', 'hmm', 'combi_local'
            or 'combi_remote')
        databases (list): Search database(s) (NCBI if remote, DIAMOND if
            local); 'combi_' modes expect exactly two, the first being the
            one used for the HMM search
        database_pfam (str): Path to pfam db or where to download it
        gap (int): Maximum gap between cluster hits (earlier docs say
            kilobase; the 20000 default suggests base pairs - TODO confirm)
        unique (int): Minimum number of query sequences with hits in clusters
        min_hits (int): Minimum number of hits in clusters
        min_identity (float): Minimum identity (%) cutoff
        min_coverage (float): Minimum coverage (%) cutoff
        max_evalue (float): Maximum e-value threshold
        percentage (int): % of query genes needed to be present in cluster
        entrez_query (str): NCBI Entrez query to filter search database
        output (str): Path to cblaster summary output file; the summary is
            written to stdout when not given
        output_hide_headers (bool): Hide headers in summary table
        output_delimiter (str): Delimiter used in summary table
        output_decimals (int): Total decimal places in hit scores in summary table
        output_sort_clusters (bool): If the clusters in the final summary
            table need to be sorted
        binary (str): Path to cblaster binary output file
        binary_hide_headers (bool): Hide headers in binary table
        binary_delimiter (str): Delimiter used in binary table
        binary_key (callable): Key function used in binary table (len, max or sum)
        binary_attr (str): Hit attribute used for calculating cell values in binary table
        binary_decimals (int): Total decimal places in cell values in binary table
        rid (str): NCBI BLAST search request identifier (RID)
        require (list): Query sequences that must be in hit clusters
        session_file (list): Path(s) to cblaster session JSON file(s)
        indent (int): Total spaces to indent JSON files
        plot (bool or str): True, or path to cblaster plot HTML file
        max_plot_clusters (int): maximum clusters that are plotted when -osc
            (sort on score) argument is used
        recompute (bool or str): True, or path to recomputed session JSON file
        blast_file (str): path to file to save blast output
        ipg_file (str): path to file to save ipg output
        hitlist_size (int): Number of database sequences to keep
        cpus (int): number of CPUs to use when blasting
        intermediate_genes (bool): Signifies if intermediate genes have to be shown
        intermediate_gene_distance (int): the maximum allowed distance between
            the edge of a cluster and an intermediate gene
        intermediate_max_clusters (int): the maximum number of clusters for
            which intermediate genes will be fetched, since this can become
            expensive for remote searches
        testing (bool): flag to make sure certain code does not run when testing

    Returns:
        Session: cblaster search Session object

    Raises:
        RuntimeError: If a 'combi_' mode is used without exactly two databases.
    """
    if session_file and all(Path(sf).exists() for sf in session_file):
        LOG.info("Loading session(s) %s", session_file)
        session = Session.from_files(session_file)

        if recompute:
            LOG.info("Filtering session with new thresholds")
            context.filter_session(
                session,
                min_identity,
                min_coverage,
                max_evalue,
                gap,
                unique,
                min_hits,
                require,
                percentage,
            )

            if intermediate_genes:
                find_intermediate_genes(
                    session,
                    intermediate_gene_distance,
                    intermediate_max_clusters,
                )

            # recompute is either True (filter only) or a path to save to
            if recompute is not True:
                LOG.info("Writing recomputed session to %s", recompute)
                session.params["min_identity"] = min_identity
                session.params["min_coverage"] = min_coverage
                session.params["max_evalue"] = max_evalue
                session.params["require"] = require
                with open(recompute, "w") as fp:
                    session.to_json(fp, indent=indent)
    else:
        # Create a cblaster Cluster object from query input
        query = helpers.parse_query_sequences(
            query_file=query_file,
            query_ids=query_ids,
            query_profiles=query_profiles,
        )

        # Create a cblaster Session
        session = Session(
            query=query,
            queries=query.names,
            params={
                "mode": mode,
                "database": databases,
                "min_identity": min_identity,
                "min_coverage": min_coverage,
                "max_evalue": max_evalue,
                "require": require,
            },
        )

        if query_file:
            # Keep the query file path alongside the other session parameters
            session.params["query_file"] = query_file

        sqlite_db = None
        session.params["rid"] = rid

        if "combi" in mode and len(databases) != 2:
            raise RuntimeError("Expected two databases for 'combi_' modes")

        if mode in ("hmm", "combi_local", "combi_remote"):
            sqlite_db = helpers.find_sqlite_db(databases[0])
            results = hmm_search.perform_hmmer(
                fasta=databases[0],
                query_profiles=query_profiles,
                pfam=database_pfam,
                session=session,
            )

            # Delete first (FASTA) database when doing combined searches
            # Expect .dmnd/NCBI database name for local/remote, respectively
            # NOTE: this mutates the caller's list in place; the same list
            # object is stored in session.params["database"].
            if "combi" in mode:
                del databases[0]

            LOG.info(
                "Found %i hits meeting score thresholds for hmm search",
                len(results),
            )
            LOG.info("Fetching genomic context of hits")
            organisms = context.search(
                results,
                sqlite_db=sqlite_db,
                unique=unique,
                min_hits=min_hits,
                gap=gap,
                require=require,
                ipg_file=ipg_file,
                query_sequence_order=session.queries,
                percentage=percentage,
            )
            session.organisms.extend(organisms)

            # When running combined modes, run local/remote search right
            # after HMM search
            if mode == "combi_local":
                mode = "local"
            elif mode == "combi_remote":
                mode = "remote"

        if mode == "local":
            LOG.info("Starting cblaster in local mode")
            sqlite_db = helpers.find_sqlite_db(databases[0])
            results = local.search(
                databases[0],
                sequences=session.query.sequences,
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                blast_file=blast_file,
                cpus=cpus,
            )
            LOG.info(
                "Found %i hits meeting score thresholds for local search",
                len(results),
            )
            LOG.info("Fetching genomic context of hits")
            organisms = context.search(
                results,
                sqlite_db=sqlite_db,
                unique=unique,
                min_hits=min_hits,
                gap=gap,
                require=require,
                ipg_file=ipg_file,
                query_sequence_order=session.queries,
                percentage=percentage,
            )
            session.organisms.extend(organisms)
        elif mode == "remote":
            LOG.info("Starting cblaster in remote mode")
            if entrez_query:
                session.params["entrez_query"] = entrez_query
            rid, results = remote.search(
                sequences=session.query.sequences,
                rid=rid,
                database=databases[0],
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                entrez_query=entrez_query,
                blast_file=blast_file,
                hitlist_size=hitlist_size,
            )
            session.params["rid"] = rid
            LOG.info(
                "Found %i hits meeting score thresholds for remote search",
                len(results),
            )
            LOG.info("Fetching genomic context of hits")
            organisms = context.search(
                results,
                unique=unique,
                min_hits=min_hits,
                gap=gap,
                require=require,
                ipg_file=ipg_file,
                query_sequence_order=session.queries,
                percentage=percentage,
            )
            session.organisms.extend(organisms)

        if sqlite_db:
            session.params["sqlite_db"] = str(sqlite_db)

        if intermediate_genes:
            find_intermediate_genes(
                session,
                intermediate_gene_distance,
                intermediate_max_clusters,
            )

        if session_file:
            LOG.info("Writing current search session to %s", session_file[0])
            if len(session_file) > 1:
                LOG.warning("Multiple session files specified, using first")
            with open(session_file[0], "w") as fp:
                session.to_json(fp, indent=indent)

    if binary:
        LOG.info("Writing binary summary table to %s", binary)
        # Context manager so the table is flushed and the handle closed
        with open(binary, "w") as fp:
            session.format(
                "binary",
                fp,
                hide_headers=binary_hide_headers,
                delimiter=binary_delimiter,
                key=binary_key,
                attr=binary_attr,
                decimals=binary_decimals,
                sort_clusters=output_sort_clusters,
            )

    LOG.info("Writing summary to %s", output if output else "stdout")
    if output:
        with open(output, "w") as fp:
            session.format(
                "summary",
                fp=fp,
                hide_headers=output_hide_headers,
                delimiter=output_delimiter,
                decimals=output_decimals,
                sort_clusters=output_sort_clusters,
            )
    else:
        session.format(
            "summary",
            fp=sys.stdout,
            hide_headers=output_hide_headers,
            delimiter=output_delimiter,
            decimals=output_decimals,
            sort_clusters=output_sort_clusters,
        )

    if plot:
        # plot=True means "plot without an output file"
        plot = None if plot is True else plot
        plot_session(
            session,
            output=plot,
            sort_clusters=output_sort_clusters,
            max_clusters=max_plot_clusters,
            testing=testing,
        )

    LOG.info("Done.")
    return session
def extract_clusters(
    session,
    output_dir,
    prefix="",
    cluster_numbers=None,
    score_threshold=None,
    organisms=None,
    scaffolds=None,
    format_="genbank",
    max_clusters=50,
):
    """Extracts Cluster objects from a Session file and writes them to a file.

    If BiG-SCAPE format is chosen, a 'gene_kind' qualifier is added to each
    CDS feature to indicate what genes are part of the core of the cluster.
    Genes that are flagged as required are considered core. If no genes are
    flagged as required, all genes are considered to be core genes.

    An additional qualifier is provided called 'cluster_role'. This qualifier
    allows the identification between hits found against required genes of
    the query, hits found against any gene of the query and intermediate
    genes.

    Args:
        session (str): path to a session.json file
        output_dir (str): path to a directory for writing the output files
        prefix (str): string to start the file name of each cluster with
        cluster_numbers (list): cluster numbers to include
        score_threshold (float): minimum score a cluster needs in order to be
            included
        organisms (list): organism filtering regular expressions; clusters
            for these organisms are included
        scaffolds (list): clusters on these scaffolds are included
        format_ (str): the format that the extracted cluster should have
        max_clusters (int): the maximum number of clusters extracted
            regardless of filters

    Raises:
        SystemExit: with exit code 0 when no cluster passes the filters.
    """
    # The old start-up message was copied from the clinker plotting workflow
    # and misreported what this function does.
    LOG.info("Starting cblaster cluster extraction")
    LOG.info("Loading session from: %s", session)
    with open(session) as fp:
        session = Session.from_json(fp)

    LOG.info("Extracting clusters that match the filters")
    cluster_hierarchy = get_sorted_cluster_hierarchies(
        session,
        cluster_numbers,
        score_threshold,
        organisms,
        scaffolds,
        max_clusters,
    )
    LOG.info(f"Extracted {len(cluster_hierarchy)} clusters.")

    # Nothing to write: exit cleanly rather than producing empty output
    if len(cluster_hierarchy) == 0:
        LOG.info("There are no clusters that meet the filtering criteria. Exiting...")
        raise SystemExit(0)

    LOG.info("Writing genbank files")
    create_genbanks_from_clusters(
        session,
        cluster_hierarchy,
        output_dir,
        prefix,
        format_,
    )

    LOG.info(f"Clusters have been written to {output_dir}")
    LOG.info("Done!")