def find_clusters(record, cds_by_cluster_type, rules_by_name) -> List[ClusterBorder]:
    """ Builds rule-based cluster borders around the identified core genes.

        For each cluster type, the core CDS features are walked in sorted
        order. A feature contained by the border currently being built, once
        that border is padded by the rule's cutoff on both sides, extends the
        border. Any other feature starts a new border of the same type.

        Arguments:
            record: the record providing the CDS features
            cds_by_cluster_type: a dictionary mapping cluster type to the
                                 names of the core CDS features of that type
            rules_by_name: a dictionary mapping cluster type to its rule,
                           of which cutoff, extent and conditions are used

        Returns:
            the list of ClusterBorders left after remove_redundant_borders()
    """
    borders = []  # type: List[ClusterBorder]
    features_by_name = record.get_cds_name_mapping()

    for product, core_names in cds_by_cluster_type.items():
        core_features = sorted(features_by_name[name] for name in core_names)
        rule = rules_by_name[product]
        cutoff = rule.cutoff
        # everything except the location is shared by all borders of this type
        border_args = {
            "tool": "rule-based-clusters",
            "cutoff": cutoff,
            "extent": rule.extent,
            "product": product,
        }

        # the first core feature always seeds a border
        seed = core_features[0]
        start, end = sorted([seed.location.start, seed.location.end])
        current = ClusterBorder(FeatureLocation(start, end), **border_args)
        assert seed.is_contained_by(current)
        assert seed in record.get_cds_features_within_location(current.location)
        borders.append(current)

        for cds in core_features[1:]:
            cds_start, cds_end = sorted([cds.location.start, cds.location.end])
            # the border under construction, padded by the cutoff on both sides
            reach = FeatureLocation(current.location.start - cutoff,
                                    current.location.end + cutoff)
            if not cds.is_contained_by(reach):
                # too far away, so this feature opens a fresh border
                start, end = cds_start, cds_end
                current = ClusterBorder(FeatureLocation(start, end), **border_args)
                borders.append(current)
                continue
            # close enough, so grow the current border to cover the feature
            start = min(cds_start, start)
            end = max(cds_end, end)
            current.location = FeatureLocation(start, end)

    for border in borders:
        border.rule = str(rules_by_name[border.product].conditions)
        # clamp to the record, flagging any border that had to be trimmed
        if border.location.start < 0:
            border.location = FeatureLocation(0, border.location.end)
            border.contig_edge = True
        record_length = len(record)
        if border.location.end > record_length:
            border.location = FeatureLocation(border.location.start, record_length)
            border.contig_edge = True

    borders = remove_redundant_borders(borders, rules_by_name)

    logging.debug("%d rule-based cluster(s) found in record", len(borders))
    return borders
def test_products_from_borders(self):
    """ Checks the products _build_products_from_borders() reports for
        combinations of high and low priority borders, in both input orders
    """
    location = FeatureLocation(1, 10)
    border1 = ClusterBorder(location, "toolA", product="A")
    assert border1.high_priority_product
    border2 = ClusterBorder(location, "toolB", product="B")
    # check the border that was just built, rather than border1 a second time
    assert border2.high_priority_product

    # both high priority: both products are kept, in input order
    assert _build_products_from_borders([border1, border2]) == ["A", "B"]
    assert _build_products_from_borders([border2, border1]) == ["B", "A"]

    # one low priority: only the high priority product is kept
    border2 = ClusterBorder(location, "toolB", product="B", high_priority_product=False)
    assert _build_products_from_borders([border1, border2]) == ["A"]
    assert _build_products_from_borders([border2, border1]) == ["A"]

    # both low priority: both products are kept again, in input order
    border1.high_priority_product = False
    assert _build_products_from_borders([border1, border2]) == ["A", "B"]
    assert _build_products_from_borders([border2, border1]) == ["B", "A"]
def from_json(json: Dict[str, Any], record: Record) -> Optional["CassisResults"]:
    """ Rebuilds a CassisResults instance from its JSON representation.

        Arguments:
            json: the JSON representation of previous results
            record: the record the results are expected to belong to

        Returns:
            a CassisResults instance, or None if the record id or either of
            the CASSIS settings stored in the JSON differ from the current ones
    """
    # throw away the results if the conditions are different
    reuse_conditions = (
        ("record_id", record.id,
         "Record identifiers don't match, discarding previous results"),
        ("max_percentage", MAX_PERCENTAGE,
         "CASSIS commonality threshold changed, discarding previous results"),
        ("max_gap_length", MAX_GAP_LENGTH,
         "CASSIS maximum island length changed, discarding previous results"),
    )
    for key, current_value, message in reuse_conditions:
        if json[key] != current_value:
            logging.debug(message)
            return None

    borders = [ClusterBorder.from_biopython(feature_from_json(border))
               for border in json["borders"]]
    # combined promoters are marked as such by their "type" entry
    promoters = [
        CombinedPromoter.from_json(promoter)
        if promoter["type"] == "CombinedPromoter"
        else Promoter.from_json(promoter)
        for promoter in json["promoters"]
    ]  # type: List[Promoter]

    results = CassisResults(record.id)
    results.borders = borders
    results.promoters = promoters
    return results
def create_cluster_borders(anchor: str, clusters: List[ClusterPrediction],
                           record: Record) -> List[ClusterBorder]:
    """ Create the predicted ClusterBorders

        Arguments:
            anchor: the name of the anchor gene the predictions were made for
            clusters: the cluster predictions, the first being annotated as
                      the best prediction and the rest as alternatives
            record: the record providing the genes named by the predictions

        Returns:
            a list of ClusterBorders, one for each prediction
    """
    if not clusters:
        return []

    borders = []
    for index, prediction in enumerate(clusters):
        # cluster borders returned by hmmdetect are based on CDS features
        # in contrast, cluster borders returned by cassis are based on gene features
        # --> hmmdetect derived clusters have exact locations, like the CDSs have
        # --> cassis derived clusters may have fuzzy locations, like the genes have
        first = prediction.start
        last = prediction.end
        left_name = first.gene
        right_name = last.gene

        left = None
        right = None
        for gene in record.get_genes():
            gene_name = gene.get_name()
            if gene_name == left_name:
                left = gene
            if gene_name == right_name:
                right = gene
            if left and right:
                break

        # NOTE(review): if either name matches no gene, left/right is still
        # None here and the attribute access below raises AttributeError
        location = FeatureLocation(left.location.start, right.location.end)
        feature = SeqFeature(location, type="cluster_border")

        if index == 0:
            note = "best prediction (most abundant) for anchor gene {}".format(anchor)
        else:
            note = "alternative prediction ({}) for anchor gene {}".format(index, anchor)

        feature.qualifiers = {
            "aStool": ["cassis"],
            "anchor": [anchor],
            "abundance": [first.abundance + last.abundance],
            "motif_score": ["{:.1e}".format(first.score + last.score)],
            "gene_left": [left_name],
            "promoter_left": [first.promoter],
            "abundance_left": [first.abundance],
            "motif_left": [first.pairing_string],
            "motif_score_left": ["{:.1e}".format(first.score)],
            "gene_right": [right_name],
            "promoter_right": [last.promoter],
            "abundance_right": [last.abundance],
            "motif_right": [last.pairing_string],
            "motif_score_right": ["{:.1e}".format(last.score)],
            "genes": [prediction.genes],
            "promoters": [prediction.promoters],
            "note": [note],
        }

        borders.append(ClusterBorder.from_biopython(feature))

    return borders
def create_border(self, rule_name, start, end):
    """ Builds a ClusterBorder for testing, spanning start to end, with the
        cutoff and extent of the named rule and the rule name as product
    """
    matching_rule = self.rules_by_name[rule_name]
    location = FeatureLocation(start, end)
    return ClusterBorder(
        location,
        tool="testing",
        cutoff=matching_rule.cutoff,
        extent=matching_rule.extent,
        product=rule_name,
    )
def generate_results(record, options) -> "ClusterFinderResults":
    """ Find and construct cluster borders

        Arguments:
            record: the record to search, which also receives the borders
                    when options.cf_create_clusters is set
            options: the config object, passed on to both cluster finders
                     and read for cf_create_clusters

        Returns:
            a ClusterFinderResults instance holding the rule-based borders
            followed by one border for each probabilistic cluster
    """
    rule_clusters = find_rule_based_clusters(record, options)
    prob_clusters = find_probabilistic_clusters(record, options)

    new_clusters = list(rule_clusters)
    # probabilistic clusters only carry the putative product, at low priority
    new_clusters.extend(
        ClusterBorder(cluster.location, tool="clusterfinder",
                      probability=cluster.probability, product=PUTATIVE_PRODUCT,
                      high_priority_product=False)
        for cluster in prob_clusters
    )

    if options.cf_create_clusters:
        for border in new_clusters:
            record.add_cluster_border(border)

    return ClusterFinderResults(record.id, new_clusters,
                                create=options.cf_create_clusters)
def test_merges(self):
    """ Checks that borders added alongside those from clusterfinder end up
        merged into the expected clusters
    """
    clusterfinder.generate_results(self.record, self.config)
    assert len(self.record.get_cluster_borders()) == 2

    # each extra border uses its start coordinate as its product
    extra_spans = [(10, 40), (1040, 1050), (110, 400)]
    for start, end in extra_spans:
        border = ClusterBorder(FeatureLocation(start, end), "testtool",
                               product=str(start))
        self.record.add_cluster_border(border)

    # clusters only exist once explicitly created from the borders
    assert not self.record.get_clusters()
    self.record.create_clusters_from_borders()

    clusters = self.record.get_clusters()
    assert len(clusters) == 2

    first = clusters[0]
    assert first.location.start == 10
    assert first.location.end == 400
    assert first.products == ("10", "110")

    # NOTE(review): 1030 and 1120 presumably come from a border generated by
    # clusterfinder on the fixture record - confirm against the test setup
    second = clusters[1]
    assert second.location.start == 1030
    assert second.location.end == 1120
    assert second.products == ("1040",)