def find_protoclusters(record: Record, cds_by_cluster_type: Dict[str, Set[str]],
                       rules_by_name: Dict[str, rule_parser.DetectionRule]) -> List[Protocluster]:
    """ Builds protoclusters for a record from the core CDS features found
        for each cluster type.

        For each cluster type, the named CDS features are walked in sorted
        order and merged into a single core location for as long as each one
        overlaps the current core extended by the rule's cutoff. A CDS that
        does not overlap closes the current core, which becomes a protocluster,
        and starts a new core.

        Arguments:
            record: the Record containing the CDS features
            cds_by_cluster_type: a dictionary mapping cluster type, used both
                                 as the rule name and the protocluster product,
                                 to the names of that type's core CDS features
            rules_by_name: a dictionary mapping rule name to DetectionRule

        Returns:
            the list of protoclusters as returned by
            remove_redundant_protoclusters
    """
    def neighbourhood_window(core: FeatureLocation, distance: int) -> FeatureLocation:
        """ The core extended by the distance on each side, kept within the record """
        return FeatureLocation(max(0, core.start - distance),
                               min(core.end + distance, len(record)))

    def build_protocluster(core: FeatureLocation, window: FeatureLocation, product: str,
                           rule: rule_parser.DetectionRule, cutoff: int) -> Protocluster:
        """ Wraps a core and its surrounding window in a rule-based Protocluster """
        return Protocluster(core, surrounding_location=window,
                            tool="rule-based-clusters", cutoff=cutoff,
                            neighbourhood_range=rule.neighbourhood, product=product,
                            detection_rule=str(rule.conditions))

    found: List[Protocluster] = []
    features_by_name = record.get_cds_name_mapping()

    for product, names in cds_by_cluster_type.items():
        ordered = sorted(features_by_name[name] for name in names)
        rule = rules_by_name[product]
        cutoff = rule.cutoff
        core = ordered[0].location

        for cds in ordered[1:]:
            # the area in which another core CDS still belongs to the current core,
            # only the lower bound is clamped here
            reach = FeatureLocation(max(0, core.start - cutoff), core.end + cutoff)
            if not cds.overlaps_with(reach):
                # too far away: close the current core and start a new one
                window = neighbourhood_window(core, rule.neighbourhood)
                # shrink the window to the extent of the CDS features the record
                # reports within it (presumably only fully contained ones, given
                # with_overlapping=False - verify against Record)
                enclosed = record.get_cds_features_within_location(window, with_overlapping=False)
                first_start = min(feature.location.start for feature in enclosed)
                last_end = max(feature.location.end for feature in enclosed)
                window = FeatureLocation(first_start, last_end)
                found.append(build_protocluster(core, window, product, rule, cutoff))
                core = cds.location
                continue

            # close enough: grow the core to cover this CDS as well
            core = FeatureLocation(min(cds.location.start, core.start),
                                   max(cds.location.end, core.end))
            assert core.start >= 0 and core.end <= len(record)

        # the last core of each type has no following CDS to close it
        # NOTE(review): unlike the protoclusters closed inside the loop, this
        # window is not shrunk to the enclosed CDS features - confirm intended
        window = neighbourhood_window(core, rule.neighbourhood)
        found.append(build_protocluster(core, window, product, rule, cutoff))

    # clamp any protocluster location that extends outside the record
    for cluster in found:
        clamped = FeatureLocation(max(0, cluster.location.start),
                                  min(cluster.location.end, len(record)))
        if clamped != cluster.location:
            cluster.location = clamped

    found = remove_redundant_protoclusters(found, rules_by_name)

    logging.debug("%d rule-based cluster(s) found in record", len(found))
    return found
def run_lanthi_on_genes(record: Record, focus: CDSFeature, cluster: Protocluster,
                        genes: List[CDSFeature], results: LanthiResults) -> None:
    """ Runs the lanthipeptide prediction on each candidate gene around a single
        focus gene, which is a core biosynthetic enzyme for lanthipeptides.
        Any precursors found are added to the results object.

        Does nothing if there are no candidate genes or if no lanthipeptide
        class could be predicted for the focus gene.

        Arguments:
            record: the Record instance containing the genes
            focus: a core lanthipeptide gene
            cluster: the Protocluster being analysed
            genes: a list of candidate precursor genes
            results: a LanthiResults object to update

        Returns:
            None
    """
    # without candidates there is nothing to predict
    if not genes:
        return

    detected_domains = get_detected_domains(cluster.cds_children)
    neighbours = find_neighbours_in_range(focus, cluster.cds_children)

    # one check per tailoring enzyme, in the same order as the names unpacked below
    tailoring_domains = (
        {"Flavoprotein"},
        {"Trp_halogenase"},
        {"p450"},
        {"adh_short", "adh_short_C2"},
    )
    has_flavoprotein, has_halogenase, has_oxygenase, has_dehydrogenase = [
        contains_feature_with_single_domain(neighbours, wanted)
        for wanted in tailoring_domains
    ]

    predicted_class = predict_class_from_genes(focus, cluster.cds_children)
    if not predicted_class:
        return

    for gene in genes:
        prediction = run_lanthipred(record, gene, predicted_class, detected_domains)
        if prediction is None:
            continue

        # the modifications are implied by the neighbouring enzymes
        prediction.aminovinyl_group = has_flavoprotein
        prediction.chlorinated = has_halogenase
        prediction.oxygenated = has_oxygenase
        # lactonation additionally requires the core to start with 'S'
        prediction.lactonated = has_dehydrogenase and prediction.core.startswith('S')

        motif = result_vec_to_feature(gene, prediction)
        results.motifs_by_locus[focus.get_name()].append(motif)
        results.clusters[cluster.get_protocluster_number()].add(focus.get_name())

        # candidates without a region are tracked as new CDS features
        # (e.g. those found with all_orfs)
        if gene.region is None:
            results.new_cds_features.add(gene)
def from_json(json: Dict[str, Any], record: Record) -> Optional["RuleDetectionResults"]:
    """ Rebuilds a RuleDetectionResults instance from its JSON representation

        Arguments:
            json: the JSON representation, as a dictionary
            record: the Record handed to CDSResults.from_json for each CDS result

        Returns:
            a RuleDetectionResults instance, or None if the schema version
            of the JSON (taken as 1 when absent) differs from the current one
    """
    if RuleDetectionResults.schema_version != json.get("schema_version", 1):
        return None

    def rebuild_cds_results(chunks: list) -> list:
        """ Converts each JSON chunk back into a CDSResults instance """
        return [CDSResults.from_json(chunk, record) for chunk in chunks]

    results_by_protocluster = {}
    for protocluster_json, cds_results_json in json["cds_by_protocluster"]:
        # the protocluster was stored as a serialised feature
        feature = serialiser.feature_from_json(protocluster_json)
        protocluster = Protocluster.from_biopython(feature)
        results_by_protocluster[protocluster] = rebuild_cds_results(cds_results_json)

    results_outside = rebuild_cds_results(json["outside_protoclusters"])

    return RuleDetectionResults(results_by_protocluster, json["tool"], results_outside)