def create_cluster_borders(anchor: str, clusters: List[ClusterPrediction],
                           record: Record) -> List[ClusterBorder]:
    """ Create the predicted ClusterBorders for one anchor gene.

        Each prediction is turned into a "cluster_border" feature spanning
        from the start of its left border gene to the end of its right
        border gene, annotated with the CASSIS scores of both borders.

        Arguments:
            anchor: the name of the anchor gene the predictions belong to
            clusters: the cluster predictions for the anchor; the first one
                      is annotated as the best prediction, all others as
                      numbered alternatives
            record: the record providing the gene features that are looked up
                    by name to obtain the border locations

        Returns:
            a list of ClusterBorders, one per prediction and in the same order
            (empty if no predictions were given)

        Raises:
            ValueError: if a border gene of a prediction is not present
                        in the record
    """
    if not clusters:
        return []

    # the genes do not change between predictions, so fetch them only once
    genes = record.get_genes()

    borders = []
    for i, cluster in enumerate(clusters):
        # cluster borders returned by hmmdetect are based on CDS features
        # in contrast, cluster borders returned by cassis are based on gene features
        # --> hmmdetect derived clusters have exact locations, like the CDSs have
        # --> cassis derived clusters may have fuzzy locations, like the genes have
        left_name = cluster.start.gene
        right_name = cluster.end.gene
        left = None
        right = None
        for gene in genes:
            name = gene.get_name()
            if name == left_name:
                left = gene
            if name == right_name:
                right = gene
            if left is not None and right is not None:
                break

        # without both border genes there is no location to build a border
        # from; fail with a descriptive error instead of an AttributeError
        # caused by dereferencing None
        missing = [name for name, gene in ((left_name, left), (right_name, right))
                   if gene is None]
        if missing:
            raise ValueError(
                "cannot create cluster border for anchor gene {}: "
                "gene(s) {} not found in record".format(
                    anchor, ", ".join(repr(name) for name in missing)))

        new_feature = SeqFeature(
            FeatureLocation(left.location.start, right.location.end),
            type="cluster_border")
        new_feature.qualifiers = {
            "aStool": ["cassis"],
            "anchor": [anchor],
            "abundance": [cluster.start.abundance + cluster.end.abundance],
            "motif_score": ["{:.1e}".format(cluster.start.score + cluster.end.score)],
            "gene_left": [cluster.start.gene],
            "promoter_left": [cluster.start.promoter],
            "abundance_left": [cluster.start.abundance],
            "motif_left": [cluster.start.pairing_string],
            "motif_score_left": ["{:.1e}".format(cluster.start.score)],
            "gene_right": [cluster.end.gene],
            "promoter_right": [cluster.end.promoter],
            "abundance_right": [cluster.end.abundance],
            "motif_right": [cluster.end.pairing_string],
            "motif_score_right": ["{:.1e}".format(cluster.end.score)],
            "genes": [cluster.genes],
            "promoters": [cluster.promoters],
        }

        # the first prediction is the best one, the rest are alternatives
        if i == 0:
            new_feature.qualifiers["note"] = [
                "best prediction (most abundant) for anchor gene {}".format(anchor)
            ]
        else:
            new_feature.qualifiers["note"] = [
                "alternative prediction ({}) for anchor gene {}".format(i, anchor)
            ]

        borders.append(ClusterBorder.from_biopython(new_feature))
    return borders
def detect(record: Record, options: ConfigType) -> CassisResults:
    """ Run CASSIS cluster detection on a record.

        The anchor genes obtained via get_anchor_gene_names() serve as seeds:
        promoter regions are computed for all non-overlapping genes of the
        record and cluster borders are then predicted around each anchor gene.

        Arguments:
            record: the record to analyse
            options: the run configuration; its output_dir is used for writing
                     the promoter file, and it is passed on to the prediction
                     and cleanup steps

        Returns:
            a CassisResults instance for the record; promoters and borders are
            only set as far as the analysis progressed before an early exit
    """
    logging.info("Detecting gene clusters using CASSIS")

    results = CassisResults(record.id)

    # anchor genes are mandatory CASSIS input, nothing to do without them
    anchors = get_anchor_gene_names(record)
    logging.info("Record has %d anchor genes", len(anchors))
    if not anchors:
        return results

    # neighbouring genes with overlapping annotations are set aside
    all_genes = record.get_genes()
    logging.info("Record has %d features of type 'gene'", len(all_genes))
    if not all_genes:
        return results
    candidate_genes, ignored_genes = ignore_overlapping(list(all_genes))

    # promoter regions are the input for motif prediction (MEME and FIMO)
    # the window sizes follow "Wolf et al (2015): CASSIS and SMIPS ..."
    tss_upstream = 1000  # nucleotides upstream TSS
    tss_downstream = 50  # nucleotides downstream TSS
    try:
        promoters = get_promoters(record, candidate_genes, tss_upstream, tss_downstream)
        results.promoters = promoters
        write_promoters_to_file(options.output_dir, record.name, promoters)
    except DuplicatePromoterError:
        logging.error(
            "CASSIS discovered an error while working on the promoter sequences, skipping CASSIS analysis"
        )
        return results

    if not promoters:
        logging.debug("CASSIS found zero promoter regions, skipping CASSIS analysis")
        return results

    promoter_count = len(promoters)
    if promoter_count < 3:
        logging.debug(
            "Sequence %r yields less than 3 promoter regions, skipping CASSIS analysis",
            record.name)
        return results
    if promoter_count < 40:
        # still analysed, but the user should know the results may be lacking
        logging.debug("Sequence %r yields only %d promoter regions",
                      record.name, promoter_count)
        logging.debug(
            "Cluster detection on small sequences may lead to incomplete cluster predictions"
        )

    total_anchors = len(anchors)
    borders = []
    predictions_by_anchor = {}  # {anchor gene: cluster predictions}
    for position, anchor in enumerate(anchors, start=1):
        logging.debug("Detecting cluster around anchor gene %r (%d of %d)",
                      anchor, position, total_anchors)
        # predictions arrive sorted by border abundance,
        # with the most abundant ("best") prediction first
        anchor_predictions = get_predictions_for_anchor(
            anchor, promoters, record, ignored_genes, options)
        if not anchor_predictions:
            continue
        predictions_by_anchor[anchor] = anchor_predictions
        borders += create_cluster_borders(anchor, anchor_predictions, record)

    logging.debug("Cleaning up MEME and FIMO output directories")
    cleanup_outdir(anchors, predictions_by_anchor, options)

    results.borders = borders
    return results