def from_json(json: Dict[str, Any], record: Record) -> Optional["CassisResults"]:
    """ Rebuild CASSIS results from their JSON representation.

        Arguments:
            json: the dictionary holding the previously stored results
            record: the record the results are being loaded for

        Returns:
            a CassisResults instance, or None if the stored results belong
            to a different record or were generated with different settings
    """
    # stored results are only reusable when all of these still match;
    # the stored values are looked up one at a time, in this order
    reuse_conditions = (
        ("record_id", record.id,
         "Record identifiers don't match, discarding previous results"),
        ("max_percentage", MAX_PERCENTAGE,
         "CASSIS commonality threshold changed, discarding previous results"),
        ("max_gap_length", MAX_GAP_LENGTH,
         "CASSIS maximum island length changed, discarding previous results"),
    )
    for key, current, message in reuse_conditions:
        if json[key] != current:
            logging.debug(message)
            return None

    restored_subregions = [
        SubRegion.from_biopython(feature_from_json(stored))
        for stored in json["subregions"]
    ]

    restored_promoters = []  # type: List[Promoter]
    for stored in json["promoters"]:
        # combined promoters are rebuilt by their own class, all others by the base class
        if promoter_is_combined(stored):
            restored_promoters.append(CombinedPromoter.from_json(stored))
        else:
            restored_promoters.append(Promoter.from_json(stored))

    results = CassisResults(record.id)
    results.subregions = restored_subregions
    results.promoters = restored_promoters
    return results


def promoter_is_combined(stored: Dict[str, Any]) -> bool:
    """ Returns True if the stored promoter JSON describes a CombinedPromoter """
    return stored["type"] == "CombinedPromoter"
def create_subregions(anchor: str, cluster_preds: List[ClusterPrediction],
                      record: Record) -> List[SubRegion]:
    """ Build a SubRegion for each cluster prediction made for an anchor gene.

        Arguments:
            anchor: the name of the anchor gene the predictions belong to
            cluster_preds: the predictions to convert, the first of which is
                           annotated as the best prediction
            record: the record containing the genes named by the predictions

        Returns:
            a list of SubRegions, one per prediction, in the same order
    """
    if not cluster_preds:
        return []

    as_score = "{:.1e}".format
    subregions = []  # type: List[SubRegion]

    for rank, prediction in enumerate(cluster_preds):
        first = prediction.start
        last = prediction.end

        # clusters returned by hmmdetect are based on CDS features
        # in contrast, subregions returned by cassis are based on gene features
        # --> hmmdetect derived clusters have exact locations, like the CDSs have
        # --> cassis derived subregions may have fuzzy locations, like the genes have
        left_gene = None
        right_gene = None
        for gene in record.get_genes():
            gene_name = gene.get_name()
            if gene_name == first.gene:
                left_gene = gene
            if gene_name == last.gene:
                right_gene = gene
            # stop scanning as soon as both boundary genes are known
            if left_gene and right_gene:
                break
        assert left_gene and right_gene, "boundary genes no longer present in Record"

        if rank:
            note = "alternative prediction ({}) for anchor gene {}".format(rank, anchor)
        else:
            note = "best prediction (most abundant) for anchor gene {}".format(anchor)

        # the subregion spans from the start of the left gene to the end of the right gene
        span = FeatureLocation(left_gene.location.start, right_gene.location.end)
        feature = SeqFeature(span, type="subregion")
        feature.qualifiers = {
            "aStool": ["cassis"],
            "anchor": [anchor],
            "abundance": [first.abundance + last.abundance],
            "motif_score": [as_score(first.score + last.score)],
            "gene_left": [first.gene],
            "promoter_left": [first.promoter],
            "abundance_left": [first.abundance],
            "motif_left": [first.pairing_string],
            "motif_score_left": [as_score(first.score)],
            "gene_right": [last.gene],
            "promoter_right": [last.promoter],
            "abundance_right": [last.abundance],
            "motif_right": [last.pairing_string],
            "motif_score_right": [as_score(last.score)],
            "genes": [prediction.genes],
            "promoters": [prediction.promoters],
            "note": [note],
        }
        subregions.append(SubRegion.from_biopython(feature))

    return subregions
def generate_results(record: Record, options: ConfigType) -> ClusterFinderResults:
    """ Find and construct probabilistic cluster areas

        Arguments:
            record: the record to search for probabilistic clusters
            options: the configuration, passed on to the cluster search and
                     supplying the cf_create_clusters setting

        Returns:
            a ClusterFinderResults instance holding one SubRegion for each
            prediction found
    """
    areas = [
        SubRegion(hit.location, tool="clusterfinder", probability=hit.probability)
        for hit in find_probabilistic_clusters(record, options)
    ]
    return ClusterFinderResults(record.id, areas, create=options.cf_create_clusters)
def from_json(json: Dict[str, Any], record: Record) -> Optional["ClusterFinderResults"]:
    """ Rebuild ClusterFinder results from their JSON representation.

        Arguments:
            json: the dictionary holding the previously stored results
            record: the record the results are being loaded for

        Returns:
            a ClusterFinderResults instance, or None if the stored schema
            version differs from the current one
    """
    schema_matches = json.get("schema") == ClusterFinderResults.schema_version
    if not schema_matches:
        logging.warning("Dropping ClusterFinder probabilistic results, schema version has changed")
        return None

    # each stored area is indexed as: [0] location string, [1] probability
    restored = [
        SubRegion(location_from_string(stored[0]), tool="clusterfinder", probability=stored[1])
        for stored in json["areas"]
    ]
    return ClusterFinderResults(record.id, restored, create=json["created"])