예제 #1
0
def create_cluster_borders(anchor: str, clusters: List[ClusterPrediction],
                           record: Record) -> List[ClusterBorder]:
    """ Create the predicted ClusterBorders for one anchor gene.

        Arguments:
            anchor: the name of the anchor gene the predictions belong to
            clusters: the cluster predictions for that anchor; the first one
                      is annotated as the best prediction, every later one
                      as an alternative prediction
            record: the record whose gene features supply the locations
                    of the border genes

        Returns:
            a list with one ClusterBorder per prediction, in the same order
            as the predictions were given (empty if there are no predictions)

        Raises:
            ValueError: if the start or end gene of a prediction cannot be
                        found among the record's genes
    """
    if not clusters:
        return []

    # the record is not modified while borders are built, so fetch the
    # gene features once instead of once per prediction
    genes = record.get_genes()

    borders = []
    for i, cluster in enumerate(clusters):
        # cluster borders returned by hmmdetect are based on CDS features
        # in contrast, cluster borders returned by cassis are based on gene features
        # --> hmmdetect derived clusters have exact locations, like the CDSs have
        # --> cassis derived clusters may have fuzzy locations, like the genes have
        left_name = cluster.start.gene
        right_name = cluster.end.gene
        left = None
        right = None
        for gene in genes:
            name = gene.get_name()
            if name == left_name:
                left = gene
            if name == right_name:
                right = gene
            # compare against None explicitly so the search doesn't depend
            # on how (or whether) gene features define truthiness
            if left is not None and right is not None:
                break

        # fail with a meaningful message instead of an AttributeError on None
        if left is None or right is None:
            missing = left_name if left is None else right_name
            raise ValueError(
                "cannot create cluster border for anchor gene {!r}: "
                "gene {!r} not found in record".format(anchor, missing))

        # the border spans from the start of the left gene to the end of the right gene
        new_feature = SeqFeature(FeatureLocation(left.location.start,
                                                 right.location.end),
                                 type="cluster_border")
        new_feature.qualifiers = {
            "aStool": ["cassis"],
            "anchor": [anchor],
            "abundance": [cluster.start.abundance + cluster.end.abundance],
            "motif_score":
            ["{:.1e}".format(cluster.start.score + cluster.end.score)],
            "gene_left": [cluster.start.gene],
            "promoter_left": [cluster.start.promoter],
            "abundance_left": [cluster.start.abundance],
            "motif_left": [cluster.start.pairing_string],
            "motif_score_left": ["{:.1e}".format(cluster.start.score)],
            "gene_right": [cluster.end.gene],
            "promoter_right": [cluster.end.promoter],
            "abundance_right": [cluster.end.abundance],
            "motif_right": [cluster.end.pairing_string],
            "motif_score_right": ["{:.1e}".format(cluster.end.score)],
            "genes": [cluster.genes],
            "promoters": [cluster.promoters],
        }

        # the first prediction in the list is labelled as the best one
        if i == 0:
            new_feature.qualifiers["note"] = [
                "best prediction (most abundant) for anchor gene {}".format(
                    anchor)
            ]
        else:
            new_feature.qualifiers["note"] = [
                "alternative prediction ({}) for anchor gene {}".format(
                    i, anchor)
            ]

        new_feature = ClusterBorder.from_biopython(new_feature)
        borders.append(new_feature)
    return borders
예제 #2
0
def detect(record: Record, options: ConfigType) -> CassisResults:
    """Run CASSIS cluster detection seeded by the record's anchor genes.

    Arguments:
        record: the record to search for gene clusters
        options: the run configuration; its output_dir is where the
                 promoter sequences are written

    Returns:
        a CassisResults instance for the record; its promoters and borders
        are only filled in as far as the analysis got before stopping
    """
    logging.info("Detecting gene clusters using CASSIS")

    results = CassisResults(record.id)

    # anchor genes are the core genes found by hmmdetect, CASSIS can't run without them
    anchor_gene_names = get_anchor_gene_names(record)
    anchor_total = len(anchor_gene_names)
    logging.info("Record has %d anchor genes", anchor_total)
    if not anchor_gene_names:
        return results

    # neighbouring genes with overlapping annotations are set aside
    genes = record.get_genes()
    logging.info("Record has %d features of type 'gene'", len(genes))
    if not genes:
        return results
    candidate_genes, ignored_genes = ignore_overlapping(list(genes))

    # promoter regions are the input for the motif prediction (MEME and FIMO)
    # why these values? see "Wolf et al (2015): CASSIS and SMIPS ..."
    upstream_tss = 1000  # nucleotides upstream TSS
    downstream_tss = 50  # nucleotides downstream TSS
    try:
        promoters = get_promoters(record, candidate_genes, upstream_tss,
                                  downstream_tss)
        results.promoters = promoters
        write_promoters_to_file(options.output_dir, record.name, promoters)
    except DuplicatePromoterError:
        logging.error(
            "CASSIS discovered an error while working on the promoter sequences, skipping CASSIS analysis"
        )
        return results

    # too few promoter regions make the analysis pointless, so bail out early
    if not promoters:
        logging.debug(
            "CASSIS found zero promoter regions, skipping CASSIS analysis")
        return results
    promoter_total = len(promoters)
    if promoter_total < 3:
        logging.debug(
            "Sequence %r yields less than 3 promoter regions, skipping CASSIS analysis",
            record.name)
        return results
    if promoter_total < 40:
        # still analysed, but the user is told the results may be incomplete
        logging.debug("Sequence %r yields only %d promoter regions",
                      record.name, promoter_total)
        logging.debug(
            "Cluster detection on small sequences may lead to incomplete cluster predictions"
        )

    predicted_borders = []
    cluster_predictions = {}  # {anchor gene: cluster predictions}
    for position, anchor in enumerate(anchor_gene_names, start=1):
        logging.debug("Detecting cluster around anchor gene %r (%d of %d)",
                      anchor, position, anchor_total)
        # cluster predictions come back sorted by border abundance
        # (most abundant --> "best" prediction)
        predictions = get_predictions_for_anchor(anchor, promoters, record,
                                                 ignored_genes, options)
        if not predictions:
            continue
        cluster_predictions[anchor] = predictions
        predicted_borders.extend(
            create_cluster_borders(anchor, predictions, record))

    logging.debug("Cleaning up MEME and FIMO output directories")
    cleanup_outdir(anchor_gene_names, cluster_predictions, options)
    results.borders = predicted_borders
    return results