def gne(
    session,
    output=None,
    max_gap=100000,
    samples=100,
    scale="linear",
    plot=None,
    hide_headers=False,
    delimiter=",",
    decimals=4,
):
    """Estimate gene neighbourhood statistics for a saved cblaster session.

    The session file is loaded, neighbourhood statistics are computed, an
    optional summary table is written, and the results are always handed
    to ``plot_gne``.

    Args:
        session (str): Path to a JSON file readable by ``Session.from_json``.
        output: Open, writable file-like object for the summary table. Its
            ``name`` attribute is logged and its ``write`` method is called.
            When falsy, no table is written.
        max_gap: Forwarded to ``context.estimate_neighbourhood``.
        samples: Forwarded to ``context.estimate_neighbourhood``.
        scale: Forwarded to ``context.estimate_neighbourhood``.
        plot: Forwarded to ``plot_gne`` as its ``output`` argument.
        hide_headers: Forwarded to ``summarise_gne``.
        delimiter: Forwarded to ``summarise_gne``.
        decimals: Forwarded to ``summarise_gne``.
    """
    LOG.info("Starting cblaster gene neighbourhood estimation")
    LOG.info("Loading session from: %s", session)
    with open(session) as handle:
        loaded = Session.from_json(handle)

    LOG.info("Computing gene neighbourhood statistics")
    estimate_options = {"max_gap": max_gap, "samples": samples, "scale": scale}
    estimates = context.estimate_neighbourhood(loaded, **estimate_options)

    if output:
        LOG.info("Writing GNE table to %s", output.name)
        table_options = {
            "hide_headers": hide_headers,
            "delimiter": delimiter,
            "decimals": decimals,
        }
        table = summarise_gne(estimates, **table_options)
        output.write(table)

    # Plotting happens whether or not a table was written.
    plot_gne(estimates, output=plot)
    LOG.info("Done.")
def extract(
    session,
    delimiter=None,
    name_only=False,
    extract_seqs=False,
    output=None,
    queries=None,
    organisms=None,
    scaffolds=None,
):
    """Extract subject sequences from a cblaster session file.

    Args:
        session (str): Path to a JSON file encoding a cblaster Session.
        delimiter (str): Sequence description delimiter character.
        name_only (bool): Do not save sequence descriptions.
        extract_seqs (bool): Also extract the sequences of the matching
            records and emit the result as FASTA.
        output (str): Output file name. When falsy, the text is printed
            instead of being written to a file.
        queries (list): Query sequence names.
        organisms (list): Organism filtering regular expressions.
        scaffolds (list): Scaffold names and ranges.
    """
    LOG.info("Starting cblaster extraction")
    LOG.info("Loading session from: %s", session)
    with open(session) as handle:
        loaded = Session.from_json(handle)

    LOG.info("Extracting subject sequences matching filters")
    filters = {
        "queries": queries,
        "organisms": organisms,
        "scaffolds": scaffolds,
    }
    matches = extract_records(loaded, **filters)

    if extract_seqs:
        extract_sequences(loaded, matches)

    rendered = format_records(
        matches,
        delimiter=delimiter,
        to_fasta=extract_seqs,
        name_only=name_only,
    )

    if not output:
        print(rendered)
    else:
        with open(output, "w") as sink:
            LOG.info("Writing output to %s", sink.name)
            sink.write(rendered)

    LOG.info("Done!")
def extract(
    session,
    in_cluster=True,
    delimiter=None,
    name_only=False,
    download=False,
    output=None,
    queries=None,
    organisms=None,
    scaffolds=None,
):
    """Extract subject sequences from a cblaster session file.

    Parameters:
        session (str): Path to a JSON file readable by Session.from_json
        in_cluster (bool): Only sequences in clusters are extracted
        delimiter (str): Sequence description delimiter character
        name_only (bool): Do not save sequence descriptions
        download (bool): Download hit sequences from NCBI
        output (str): Output file name; when falsy the text is printed
        queries (list): Query sequence names
        organisms (list): Organism filtering regular expressions
        scaffolds (list): Scaffold names and ranges

    Returns:
        The records produced by extract_records; when ``download`` is set,
        each record additionally carries a "sequence" entry.
    """
    LOG.info("Starting cblaster extraction")
    LOG.info("Loading session from: %s", session)
    with open(session) as handle:
        loaded = Session.from_json(handle)

    LOG.info("Extracting subject sequences matching filters")
    filters = {
        "in_cluster": in_cluster,
        "queries": queries,
        "organisms": organisms,
        "scaffolds": scaffolds,
    }
    matches = extract_records(loaded, **filters)

    if download:
        LOG.info("Fetching %i sequences from NCBI", len(matches))
        names = [match.get("name") for match in matches]
        fetched = efetch_sequences(names)
        for match in matches:
            # A name absent from the fetched mapping yields None here.
            match["sequence"] = fetched.get(match["name"])

    # Downloaded sequences are emitted as FASTA; otherwise the output is
    # newline separated IDs.
    rendered = format_records(
        matches,
        delimiter=delimiter,
        to_fasta=download,
        name_only=name_only,
    )

    if not output:
        print(rendered)
    else:
        with open(output, "w") as sink:
            LOG.info("Writing output to %s", sink.name)
            sink.write(rendered)

    LOG.info("Done!")
    return matches
def plot_session_file(path, output=None):
    """Load a session from a JSON file and plot it.

    Args:
        path (str): Path to the file handed to ``Session.from_json``.
        output: Forwarded to ``plot_session`` as its ``output`` argument.
    """
    with open(path) as handle:
        loaded = Session.from_json(handle)
    plot_session(loaded, output=output)
def extract_clusters(
    session,
    output_dir,
    prefix="",
    cluster_numbers=None,
    score_threshold=None,
    organisms=None,
    scaffolds=None,
    format_="genbank",
    max_clusters=50,
):
    """Extract Cluster objects from a session file and write them to files.

    If BiG-SCAPE format is chosen, a 'gene_kind' qualifier is added to each
    CDS feature to indicate which genes are part of the core of the cluster.
    Genes that are flagged as required are considered core. If no genes are
    flagged as required, all genes are considered to be core genes.

    An additional qualifier called 'cluster_role' is provided. It
    distinguishes between hits found against required genes of the query,
    hits found against any gene of the query, and intermediate genes.

    Args:
        session (str): path to a session.json file
        output_dir (str): path to a directory for writing the output files
        prefix (str): string to start the file name of each cluster with
        cluster_numbers (list): cluster numbers to include
        score_threshold (float): minimum score for a cluster to be included
        organisms (list): organism filtering regular expressions; clusters
            for these organisms are included
        scaffolds (list): clusters on these scaffolds are included
        format_ (str): the format that the extracted clusters should have
        max_clusters (int): the maximum number of clusters extracted,
            regardless of filters

    Raises:
        SystemExit: with exit code 0 when no cluster matches the filters.
    """
    # The previous start-up message described clinker plotting, which this
    # function does not do.
    LOG.info("Starting cblaster cluster extraction")
    LOG.info("Loading session from: %s", session)
    with open(session) as fp:
        session = Session.from_json(fp)

    LOG.info("Extracting clusters that match the filters")
    cluster_hierarchy = get_sorted_cluster_hierarchies(
        session,
        cluster_numbers,
        score_threshold,
        organisms,
        scaffolds,
        max_clusters,
    )

    cluster_count = len(cluster_hierarchy)
    LOG.info("Extracted %d clusters.", cluster_count)
    if cluster_count == 0:
        # The line break is written as an escape: a raw newline inside a
        # single-line string literal is a syntax error.
        LOG.info(
            "There are no clusters that meet the filtering criteria. \nExiting..."
        )
        raise SystemExit(0)

    LOG.info("Writing genbank files")
    create_genbanks_from_clusters(
        session,
        cluster_hierarchy,
        output_dir,
        prefix,
        format_,
    )

    LOG.info("Clusters have been written to %s", output_dir)
    LOG.info("Done!")