Example #1
0
def gne(
    session,
    output=None,
    max_gap=100000,
    samples=100,
    scale="linear",
    plot=None,
    hide_headers=False,
    delimiter=",",
    decimals=4,
):
    """Estimate gene neighbourhood statistics for a saved cblaster session.

    The session file is loaded, neighbourhood statistics are computed, an
    optional summary table is written and the results are handed to plot_gne.

    Args:
        session (str): path to a json file encoding a cblaster Session object
        output: open, writable file handle for the summary table; the table is
         only generated and written when this is truthy
        max_gap: forwarded unchanged to context.estimate_neighbourhood
        samples: forwarded unchanged to context.estimate_neighbourhood
        scale: forwarded unchanged to context.estimate_neighbourhood
        plot: passed to plot_gne as its ``output`` argument
        hide_headers: forwarded unchanged to summarise_gne
        delimiter: forwarded unchanged to summarise_gne
        decimals: forwarded unchanged to summarise_gne
    """
    LOG.info("Starting cblaster gene neighbourhood estimation")
    LOG.info("Loading session from: %s", session)
    with open(session) as handle:
        loaded = Session.from_json(handle)

    LOG.info("Computing gene neighbourhood statistics")
    estimate_options = dict(max_gap=max_gap, samples=samples, scale=scale)
    results = context.estimate_neighbourhood(loaded, **estimate_options)

    if output:
        LOG.info("Writing GNE table to %s", output.name)
        table_options = dict(
            hide_headers=hide_headers,
            delimiter=delimiter,
            decimals=decimals,
        )
        output.write(summarise_gne(results, **table_options))

    # Plotting always runs, whether or not a table was written.
    plot_gne(results, output=plot)
    LOG.info("Done.")
Example #2
0
def extract(
    session,
    delimiter=None,
    name_only=False,
    extract_seqs=False,
    output=None,
    queries=None,
    organisms=None,
    scaffolds=None,
):
    """Pull subject sequence records out of a saved cblaster session.

    The formatted records are written to ``output`` when it is given and
    printed to stdout otherwise.

    Args:
        session (str): path to json file encoding a cblaster Session object
        delimiter (str): Sequence description delimiter character
        name_only (bool): Do not save sequence descriptions
        extract_seqs (bool): Run extract_sequences on the matched records and
         request FASTA output from format_records
        output (str): Output file name
        queries (list): Query sequence names
        organisms (list): Organism filtering regular expressions
        scaffolds (list): Scaffold names and ranges
    """
    LOG.info("Starting cblaster extraction")
    LOG.info("Loading session from: %s", session)
    with open(session) as handle:
        loaded = Session.from_json(handle)

    LOG.info("Extracting subject sequences matching filters")
    filters = dict(queries=queries, organisms=organisms, scaffolds=scaffolds)
    records = extract_records(loaded, **filters)

    if extract_seqs:
        extract_sequences(loaded, records)

    # The same flag that triggers sequence extraction selects FASTA formatting.
    formatted = format_records(
        records,
        delimiter=delimiter,
        to_fasta=extract_seqs,
        name_only=name_only,
    )

    if not output:
        print(formatted)
    else:
        with open(output, "w") as sink:
            LOG.info("Writing output to %s", sink.name)
            sink.write(formatted)

    LOG.info("Done!")
Example #3
0
def extract(
    session,
    in_cluster=True,
    delimiter=None,
    name_only=False,
    download=False,
    output=None,
    queries=None,
    organisms=None,
    scaffolds=None,
):
    """Pull subject sequence records out of a saved cblaster session.

    The formatted records are written to ``output`` when it is given and
    printed to stdout otherwise.

    Parameters:
        session (str): path to json file encoding a cblaster Session object
        in_cluster (bool): Only sequences in clusters are extracted
        delimiter (str): Sequence description delimiter character
        name_only (bool): Do not save sequence descriptions
        download (bool): Download hit sequences from NCBI
        output (str): Output file name
        queries (list): Query sequence names
        organisms (list): Organism filtering regular expressions
        scaffolds (list): Scaffold names and ranges

    Returns:
        The records produced by extract_records; when ``download`` is set each
        record also carries a "sequence" entry (None if nothing was fetched
        for its name).
    """
    LOG.info("Starting cblaster extraction")
    LOG.info("Loading session from: %s", session)
    with open(session) as handle:
        loaded = Session.from_json(handle)

    LOG.info("Extracting subject sequences matching filters")
    filters = dict(
        in_cluster=in_cluster,
        queries=queries,
        organisms=organisms,
        scaffolds=scaffolds,
    )
    records = extract_records(loaded, **filters)

    if download:
        LOG.info("Fetching %i sequences from NCBI", len(records))
        names = []
        for entry in records:
            names.append(entry.get("name"))
        fetched = efetch_sequences(names)
        # Attach whatever was fetched for each record's name.
        for entry in records:
            entry["sequence"] = fetched.get(entry["name"])

    # FASTA output when sequences were downloaded, newline separated IDs otherwise
    formatted = format_records(
        records,
        delimiter=delimiter,
        to_fasta=download,
        name_only=name_only,
    )

    if not output:
        print(formatted)
    else:
        with open(output, "w") as sink:
            LOG.info("Writing output to %s", sink.name)
            sink.write(formatted)

    LOG.info("Done!")
    return records
Example #4
0
def plot_session_file(path, output=None):
    """Load a Session from a json file and plot it.

    Args:
        path (str): path to json file encoding a cblaster Session object
        output: passed through to plot_session as its ``output`` argument
    """
    with open(path) as handle:
        loaded = Session.from_json(handle)
    # The file is closed before plotting starts.
    plot_session(loaded, output=output)
Example #5
0
def extract_clusters(
    session,
    output_dir,
    prefix="",
    cluster_numbers=None,
    score_threshold=None,
    organisms=None,
    scaffolds=None,
    format_="genbank",
    max_clusters=50,
):
    """Extracts Cluster objects from a Session file and writes them to a file.

    If BiG-SCAPE format is chosen, a 'gene_kind' qualifier is added to each CDS
    feature to indicate what genes are part of the core of the cluster.

    Genes that are flagged as required are considered core. If no genes are flagged as
    required, all genes are considered to be core genes.

    An additional qualifier is provided called 'cluster_role'. This qualifier allows
    the identification between hits found against required genes of the query, hits
    found against any gene of the query and intermediate genes.

    Args:
        session (string): path to a session.json file
        output_dir (string): path to a directory for writing the output files
        prefix (string): string to start the file name of each cluster with
        cluster_numbers (list): cluster numbers to include
        score_threshold (float): minimum score in order for a cluster to be included
        organisms (list): Organism filtering regular expressions, clusters for
         these organisms are included
        scaffolds (list): clusters on these scaffolds are included
        format_ (str): the format that the extracted cluster should have
        max_clusters (int): the maximum amount of clusters extracted regardless of filters

    Raises:
        SystemExit: with exit code 0 when no cluster passes the filters.
    """
    # The original start-up message announced clinker plotting, a copy-paste
    # slip; this function extracts clusters and writes them to files.
    LOG.info("Starting cblaster cluster extraction")
    LOG.info("Loading session from: %s", session)
    with open(session) as fp:
        session = Session.from_json(fp)

    LOG.info("Extracting clusters that match the filters")
    cluster_hierarchy = get_sorted_cluster_hierarchies(
        session,
        cluster_numbers,
        score_threshold,
        organisms,
        scaffolds,
        max_clusters,
    )

    cluster_count = len(cluster_hierarchy)
    LOG.info("Extracted %d clusters.", cluster_count)
    if cluster_count == 0:
        LOG.info("There are no clusters that meet the filtering criteria. Exiting...")
        # Nothing to write is a clean exit, not an error.
        raise SystemExit(0)

    LOG.info("Writing genbank files")
    create_genbanks_from_clusters(
        session,
        cluster_hierarchy,
        output_dir,
        prefix,
        format_,
    )

    LOG.info("Clusters have been written to %s", output_dir)
    LOG.info("Done!")